diff --git a/.clang-tidy b/.clang-tidy index a00642e87b7..d7380a9a89b 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,2 +1,164 @@ -Checks: '-*,google-readability-avoid-underscore-in-googletest-name,misc-throw-by-value-catch-by-reference,misc-misplaced-const,misc-unconventional-assign-operator,modernize-avoid-bind,modernize-loop-convert,modernize-make-shared,modernize-make-unique,modernize-raw-string-literal,modernize-redundant-void-arg,modernize-replace-auto-ptr,modernize-replace-random-shuffle,modernize-use-bool-literals,modernize-use-nullptr,modernize-use-using,performance-faster-string-find,performance-for-range-copy,readability-avoid-const-params-in-decls,readability-const-return-type,readability-container-size-empty,readability-convert-member-functions-to-static,readability-delete-null-pointer,readability-deleted-default,readability-make-member-function-const,readability-misplaced-array-index,readability-non-const-parameter,readability-qualified-auto,readability-redundant-access-specifiers,readability-redundant-control-flow,readability-redundant-function-ptr-dereference,readability-redundant-smartptr-get,readability-redundant-string-cstr,readability-redundant-string-init,readability-static-definition-in-anonymous-namespace,readability-string-compare,readability-uniqueptr-delete-release,modernize-use-equals-default,modernize-use-equals-delete,bugprone-undelegated-constructor,readability-redundant-member-init,readability-simplify-subscript-expr,readability-simplify-boolean-expr,readability-inconsistent-declaration-parameter-name' +Checks: '-*, + misc-throw-by-value-catch-by-reference, + misc-misplaced-const, + misc-unconventional-assign-operator, + misc-redundant-expression, + misc-static-assert, + misc-unconventional-assign-operator, + misc-uniqueptr-reset-release, + misc-unused-alias-decls, + misc-unused-parameters, + misc-unused-using-decls, + + modernize-avoid-bind, + modernize-loop-convert, + modernize-make-shared, + modernize-make-unique, + modernize-raw-string-literal, + modernize-redundant-void-arg, + modernize-replace-auto-ptr, + modernize-replace-random-shuffle, + modernize-use-bool-literals, + modernize-use-nullptr, + modernize-use-using, + modernize-use-equals-default, + modernize-use-equals-delete, + + performance-faster-string-find, + performance-for-range-copy, + performance-implicit-conversion-in-loop, + performance-inefficient-algorithm, + performance-inefficient-vector-operation, + performance-move-constructor-init, + performance-no-automatic-move, + performance-trivially-destructible, + performance-unnecessary-copy-initialization, + + readability-avoid-const-params-in-decls, + readability-const-return-type, + readability-container-size-empty, + readability-convert-member-functions-to-static, + readability-delete-null-pointer, + readability-deleted-default, + readability-make-member-function-const, + readability-misplaced-array-index, + readability-non-const-parameter, + readability-qualified-auto, + readability-redundant-access-specifiers, + readability-redundant-control-flow, + readability-redundant-function-ptr-dereference, + readability-redundant-smartptr-get, + readability-redundant-string-cstr, + readability-redundant-string-init, + readability-static-definition-in-anonymous-namespace, + readability-string-compare, + readability-uniqueptr-delete-release, + readability-redundant-member-init, + readability-simplify-subscript-expr, + readability-simplify-boolean-expr, + readability-inconsistent-declaration-parameter-name, + + bugprone-undelegated-constructor, + 
bugprone-argument-comment, + bugprone-bad-signal-to-kill-thread, + bugprone-bool-pointer-implicit-conversion, + bugprone-copy-constructor-init, + bugprone-dangling-handle, + bugprone-forward-declaration-namespace, + bugprone-fold-init-type, + bugprone-inaccurate-erase, + bugprone-incorrect-roundings, + bugprone-infinite-loop, + bugprone-integer-division, + bugprone-macro-parentheses, + bugprone-macro-repeated-side-effects, + bugprone-misplaced-operator-in-strlen-in-alloc, + bugprone-misplaced-pointer-arithmetic-in-alloc, + bugprone-misplaced-widening-cast, + bugprone-move-forwarding-reference, + bugprone-multiple-statement-macro, + bugprone-parent-virtual-call, + bugprone-posix-return, + bugprone-reserved-identifier, + bugprone-signed-char-misuse, + bugprone-sizeof-container, + bugprone-sizeof-expression, + bugprone-string-constructor, + bugprone-string-integer-assignment, + bugprone-string-literal-with-embedded-nul, + bugprone-suspicious-enum-usage, + bugprone-suspicious-include, + bugprone-suspicious-memset-usage, + bugprone-suspicious-missing-comma, + bugprone-suspicious-string-compare, + bugprone-swapped-arguments, + bugprone-terminating-continue, + bugprone-throw-keyword-missing, + bugprone-too-small-loop-variable, + bugprone-undefined-memory-manipulation, + bugprone-unhandled-self-assignment, + bugprone-unused-raii, + bugprone-unused-return-value, + bugprone-use-after-move, + bugprone-virtual-near-miss, + + cert-dcl21-cpp, + cert-dcl50-cpp, + cert-env33-c, + cert-err34-c, + cert-err52-cpp, + cert-flp30-c, + cert-mem57-cpp, + cert-msc50-cpp, + cert-oop58-cpp, + + google-build-explicit-make-pair, + google-build-namespaces, + google-default-arguments, + google-explicit-constructor, + google-readability-casting, + google-readability-avoid-underscore-in-googletest-name, + google-runtime-int, + google-runtime-operator, + + hicpp-exception-baseclass, + + clang-analyzer-core.CallAndMessage, + clang-analyzer-core.DivideZero, + clang-analyzer-core.NonNullParamChecker, + clang-analyzer-core.NullDereference, + clang-analyzer-core.StackAddressEscape, + clang-analyzer-core.UndefinedBinaryOperatorResult, + clang-analyzer-core.VLASize, + clang-analyzer-core.uninitialized.ArraySubscript, + clang-analyzer-core.uninitialized.Assign, + clang-analyzer-core.uninitialized.Branch, + clang-analyzer-core.uninitialized.CapturedBlockVariable, + clang-analyzer-core.uninitialized.UndefReturn, + clang-analyzer-cplusplus.InnerPointer, + clang-analyzer-cplusplus.NewDelete, + clang-analyzer-cplusplus.NewDeleteLeaks, + clang-analyzer-cplusplus.PlacementNewChecker, + clang-analyzer-cplusplus.SelfAssignment, + clang-analyzer-deadcode.DeadStores, + clang-analyzer-optin.cplusplus.VirtualCall, + clang-analyzer-security.insecureAPI.UncheckedReturn, + clang-analyzer-security.insecureAPI.bcmp, + clang-analyzer-security.insecureAPI.bcopy, + clang-analyzer-security.insecureAPI.bzero, + clang-analyzer-security.insecureAPI.getpw, + clang-analyzer-security.insecureAPI.gets, + clang-analyzer-security.insecureAPI.mkstemp, + clang-analyzer-security.insecureAPI.mktemp, + clang-analyzer-security.insecureAPI.rand, + clang-analyzer-security.insecureAPI.strcpy, + clang-analyzer-unix.Malloc, + clang-analyzer-unix.MallocSizeof, + clang-analyzer-unix.MismatchedDeallocator, + clang-analyzer-unix.Vfork, + clang-analyzer-unix.cstring.BadSizeArg, + clang-analyzer-unix.cstring.NullArg, + + boost-use-to-string, +' WarningsAsErrors: '*' diff --git a/.gitignore b/.gitignore index 1e6bb1716ec..6aa331edc84 100644 --- a/.gitignore +++ b/.gitignore @@
-16,7 +16,10 @@ /docs/publish /docs/edit /docs/website +/docs/venv/ /docs/tools/venv/ +/docs/tools/translate/venv/ +/docs/tools/translate/output.md /docs/en/single.md /docs/ru/single.md /docs/zh/single.md diff --git a/CMakeLists.txt b/CMakeLists.txt index 29f97c7c0ae..8feb5d8c129 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -186,11 +186,13 @@ if (COMPILER_GCC OR COMPILER_CLANG) endif () option(WITH_COVERAGE "Build with coverage." 0) + if(WITH_COVERAGE AND COMPILER_CLANG) set(COMPILER_FLAGS "${COMPILER_FLAGS} -fprofile-instr-generate -fcoverage-mapping") # If we want to disable coverage for specific translation units set(WITHOUT_COVERAGE "-fno-profile-instr-generate -fno-coverage-mapping") endif() + if(WITH_COVERAGE AND COMPILER_GCC) set(COMPILER_FLAGS "${COMPILER_FLAGS} -fprofile-arcs -ftest-coverage") set(COVERAGE_OPTION "-lgcov") @@ -351,7 +353,6 @@ if (ENABLE_TESTS) endif () # Need to process before "contrib" dir: -include (cmake/find/cctz.cmake) include (cmake/find/mysqlclient.cmake) # When testing for memory leaks with Valgrind, don't link tcmalloc or jemalloc. diff --git a/README.md b/README.md index 919df5a0049..a6f2c29d628 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ ClickHouse is an open-source column-oriented database management system that all * [Tutorial](https://clickhouse.tech/docs/en/getting_started/tutorial/) shows how to set up and query small ClickHouse cluster. * [Documentation](https://clickhouse.tech/docs/en/) provides more in-depth information. * [YouTube channel](https://www.youtube.com/c/ClickHouseDB) has a lot of content about ClickHouse in video format. +* [Slack](https://join.slack.com/t/clickhousedb/shared_invite/enQtOTUzMjM4ODQwNTc5LWJmMjE3Yjc2YmI1ZDBlZmI4ZTc3OWY3ZTIwYTljYzY4MzBlODM3YzBjZTc1YmYyODRlZTJkYTgzYzBiNTA2Yjk) and [Telegram](https://telegram.me/clickhouse_en) allow to chat with ClickHouse users in real-time. * [Blog](https://clickhouse.yandex/blog/en/) contains various ClickHouse-related articles, as well as announces and reports about events. * [Contacts](https://clickhouse.tech/#contacts) can help to get your questions answered if there are any. * You can also [fill this form](https://forms.yandex.com/surveys/meet-yandex-clickhouse-team/) to meet Yandex ClickHouse team in person. diff --git a/base/common/CMakeLists.txt b/base/common/CMakeLists.txt index 3e831d3ee82..7d332959617 100644 --- a/base/common/CMakeLists.txt +++ b/base/common/CMakeLists.txt @@ -6,6 +6,7 @@ set (SRCS DateLUT.cpp DateLUTImpl.cpp demangle.cpp + getFQDNOrHostName.cpp getMemoryAmount.cpp getThreadId.cpp JSON.cpp @@ -20,15 +21,9 @@ set (SRCS ) if (ENABLE_REPLXX) - set (SRCS ${SRCS} - ReplxxLineReader.cpp - ReplxxLineReader.h - ) + list (APPEND SRCS ReplxxLineReader.cpp) elseif (ENABLE_READLINE) - set (SRCS ${SRCS} - ReadlineLineReader.cpp - ReadlineLineReader.h - ) + list (APPEND SRCS ReadlineLineReader.cpp) endif () if (USE_DEBUG_HELPERS) @@ -38,11 +33,13 @@ endif () add_library (common ${SRCS}) -target_include_directories(common PUBLIC .. ${CMAKE_CURRENT_BINARY_DIR}/..) +if (WITH_COVERAGE) + target_compile_definitions(common PUBLIC WITH_COVERAGE=1) +else () + target_compile_definitions(common PUBLIC WITH_COVERAGE=0) +endif () -if(CCTZ_INCLUDE_DIR) - target_include_directories(common BEFORE PRIVATE ${CCTZ_INCLUDE_DIR}) -endif() +target_include_directories(common PUBLIC .. ${CMAKE_CURRENT_BINARY_DIR}/..) 
if (NOT USE_INTERNAL_BOOST_LIBRARY) target_include_directories (common SYSTEM BEFORE PUBLIC ${Boost_INCLUDE_DIRS}) @@ -52,12 +49,6 @@ if(NOT USE_INTERNAL_POCO_LIBRARY) target_include_directories (common SYSTEM BEFORE PUBLIC ${Poco_Foundation_INCLUDE_DIR}) endif() -if(CCTZ_LIBRARY) - target_link_libraries(common PRIVATE ${CCTZ_LIBRARY}) -endif() - -target_link_libraries(common PUBLIC replxx) - # allow explicitly fallback to readline if (NOT ENABLE_REPLXX AND ENABLE_READLINE) message (STATUS "Attempt to fallback to readline explicitly") @@ -82,11 +73,16 @@ endif () target_link_libraries (common PUBLIC + ${Poco_Net_LIBRARY} ${Poco_Util_LIBRARY} ${Poco_Foundation_LIBRARY} ${CITYHASH_LIBRARIES} ${Boost_SYSTEM_LIBRARY} FastMemcpy + replxx + + PRIVATE + cctz ) if (ENABLE_TESTS) diff --git a/base/common/DateLUT.cpp b/base/common/DateLUT.cpp index 8db1458d00f..750900493aa 100644 --- a/base/common/DateLUT.cpp +++ b/base/common/DateLUT.cpp @@ -1,9 +1,10 @@ -#include +#include "DateLUT.h" -#include +#include #include #include -#include + +#include #include diff --git a/base/common/DateLUT.h b/base/common/DateLUT.h index 3cb935bc553..93c6cb403e2 100644 --- a/base/common/DateLUT.h +++ b/base/common/DateLUT.h @@ -1,20 +1,15 @@ #pragma once #include "DateLUTImpl.h" -#include -#include -#include -#include + +#include "defines.h" + #include -// Also defined in Core/Defines.h -#if !defined(ALWAYS_INLINE) -#if defined(_MSC_VER) - #define ALWAYS_INLINE __forceinline -#else - #define ALWAYS_INLINE __attribute__((__always_inline__)) -#endif -#endif +#include +#include +#include +#include /// This class provides lazy initialization and lookup of singleton DateLUTImpl objects for a given timezone. diff --git a/base/common/DateLUTImpl.cpp b/base/common/DateLUTImpl.cpp index 080f8fb6395..d7ab0046992 100644 --- a/base/common/DateLUTImpl.cpp +++ b/base/common/DateLUTImpl.cpp @@ -1,23 +1,14 @@ -#if __has_include() -#include // bundled, debian -#else -#include // freebsd -#endif +#include "DateLUTImpl.h" -#if __has_include() +#include #include -#else -#include -#endif - -#include #include -#include +#include #include #include -#include #include +#include #define DATE_LUT_MIN 0 diff --git a/base/common/DateLUTImpl.h b/base/common/DateLUTImpl.h index 2f2e431f950..d9d27c56ee3 100644 --- a/base/common/DateLUTImpl.h +++ b/base/common/DateLUTImpl.h @@ -1,8 +1,9 @@ #pragma once -#include "Types.h" #include "DayNum.h" -#include "likely.h" +#include "defines.h" +#include "types.h" + #include #include diff --git a/base/common/DayNum.h b/base/common/DayNum.h index 904a9281d64..a4ef0c43b69 100644 --- a/base/common/DayNum.h +++ b/base/common/DayNum.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include /** Represents number of days since 1970-01-01. 
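
An aside for illustration (not part of this patch): because the common target above now always defines WITH_COVERAGE to 1 or 0 through target_compile_definitions (the corresponding #cmakedefine01 entry is dropped from config_common.h.in further below), code built against it can branch with a plain preprocessor check. A minimal, hypothetical sketch:

    #include <cstdio>

    /// Hypothetical translation unit compiled as part of the `common` target.
    void reportBuildMode()
    {
    #if WITH_COVERAGE
        std::puts("built with coverage instrumentation");
    #else
        std::puts("built without coverage instrumentation");
    #endif
    }

    int main()
    {
        reportBuildMode();
        return 0;
    }
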
diff --git a/base/common/JSON.cpp b/base/common/JSON.cpp index 92de0dc2d25..10a1bab92f1 100644 --- a/base/common/JSON.cpp +++ b/base/common/JSON.cpp @@ -215,7 +215,7 @@ JSON::ElementType JSON::getType() const void JSON::checkPos(Pos pos) const { - if (pos >= ptr_end) + if (pos >= ptr_end || ptr_begin == nullptr) throw JSONException("JSON: unexpected end of data."); } @@ -776,7 +776,7 @@ JSON::iterator & JSON::iterator::operator++() return *this; } -JSON::iterator JSON::iterator::operator++(int) +JSON::iterator JSON::iterator::operator++(int) // NOLINT { iterator copy(*this); ++*this; diff --git a/base/common/JSON.h b/base/common/JSON.h index 5f3d9325626..7039036eeb3 100644 --- a/base/common/JSON.h +++ b/base/common/JSON.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include /** Очень простой класс для чтения JSON (или его кусочков). diff --git a/base/common/LineReader.cpp b/base/common/LineReader.cpp index c69690e3420..d0aedabb02b 100644 --- a/base/common/LineReader.cpp +++ b/base/common/LineReader.cpp @@ -30,7 +30,7 @@ void trim(String & s) bool hasInputData() { timeval timeout = {0, 0}; - fd_set fds; + fd_set fds{}; FD_ZERO(&fds); FD_SET(STDIN_FILENO, &fds); return select(1, &fds, nullptr, nullptr, &timeout) == 1; @@ -53,18 +53,18 @@ LineReader::Suggest::WordsRange LineReader::Suggest::getCompletions(const String /// last_word can be empty. - if (case_insensitive) + /// Only perform case sensitive completion when the prefix string contains any uppercase characters + if (std::none_of(prefix.begin(), prefix.end(), [&](auto c) { return c >= 'A' && c <= 'Z'; })) return std::equal_range( - words.begin(), words.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched) - { - return strncasecmp(s.data(), prefix_searched.data(), prefix_length) < 0; - }); + words_no_case.begin(), words_no_case.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched) + { + return strncasecmp(s.data(), prefix_searched.data(), prefix_length) < 0; + }); else - return std::equal_range( - words.begin(), words.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched) - { - return strncmp(s.data(), prefix_searched.data(), prefix_length) < 0; - }); + return std::equal_range(words.begin(), words.end(), last_word, [prefix_length](std::string_view s, std::string_view prefix_searched) + { + return strncmp(s.data(), prefix_searched.data(), prefix_length) < 0; + }); } LineReader::LineReader(const String & history_file_path_, char extender_, char delimiter_) diff --git a/base/common/LineReader.h b/base/common/LineReader.h index 66de46d5fcb..3e64bc858ad 100644 --- a/base/common/LineReader.h +++ b/base/common/LineReader.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -14,13 +14,11 @@ public: using WordsRange = std::pair; Words words; + Words words_no_case; std::atomic ready{false}; /// Get iterators for the matched range of words if any. 
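
An aside for illustration (not part of this patch): the new words_no_case list is meant to be kept sorted with a case-insensitive comparator (Suggest.cpp does this further below), so completion can pick the list and comparator depending on whether the prefix contains uppercase letters, as getCompletions now does. A minimal standalone sketch of that idea, assuming POSIX strncasecmp and pre-sorted inputs; the function and variable names here are illustrative:

    #include <algorithm>
    #include <cstring>
    #include <string>
    #include <string_view>
    #include <strings.h>   // strncasecmp (POSIX)
    #include <utility>
    #include <vector>

    using Words = std::vector<std::string>;
    using WordsRange = std::pair<Words::const_iterator, Words::const_iterator>;

    /// words must be sorted with operator<, words_no_case with a case-insensitive comparator.
    WordsRange getCompletionsSketch(const Words & words, const Words & words_no_case, const std::string & prefix)
    {
        const size_t n = prefix.size();
        const bool has_upper = std::any_of(prefix.begin(), prefix.end(), [](char c) { return c >= 'A' && c <= 'Z'; });

        if (!has_upper)
            return std::equal_range(words_no_case.begin(), words_no_case.end(), prefix,
                [n](std::string_view a, std::string_view b) { return strncasecmp(a.data(), b.data(), n) < 0; });

        return std::equal_range(words.begin(), words.end(), prefix,
            [n](std::string_view a, std::string_view b) { return strncmp(a.data(), b.data(), n) < 0; });
    }
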
WordsRange getCompletions(const String & prefix, size_t prefix_length) const; - - /// case sensitive suggestion - bool case_insensitive = false; }; LineReader(const String & history_file_path, char extender, char delimiter = 0); /// if delimiter != 0, then it's multiline mode diff --git a/base/common/ReadlineLineReader.cpp b/base/common/ReadlineLineReader.cpp index fdbb929be79..ee9a37d2168 100644 --- a/base/common/ReadlineLineReader.cpp +++ b/base/common/ReadlineLineReader.cpp @@ -104,6 +104,8 @@ ReadlineLineReader::ReadlineLineReader(const Suggest & suggest_, const String & if (signal(SIGINT, clear_prompt_or_exit) == SIG_ERR) throw std::runtime_error(std::string("Cannot set signal handler for readline: ") + strerror(errno)); + + rl_variable_bind("completion-ignore-case", "on"); } ReadlineLineReader::~ReadlineLineReader() diff --git a/base/common/StringRef.h b/base/common/StringRef.h index 54010f15085..961aab58980 100644 --- a/base/common/StringRef.h +++ b/base/common/StringRef.h @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include diff --git a/base/common/config_common.h.in b/base/common/config_common.h.in index 2c9a2b82374..29d487c3ab5 100644 --- a/base/common/config_common.h.in +++ b/base/common/config_common.h.in @@ -3,4 +3,3 @@ // .h autogenerated by cmake ! #cmakedefine01 UNBUNDLED -#cmakedefine01 WITH_COVERAGE diff --git a/base/common/defines.h b/base/common/defines.h new file mode 100644 index 00000000000..af5981023ff --- /dev/null +++ b/base/common/defines.h @@ -0,0 +1,87 @@ +#pragma once + +#if defined(_MSC_VER) +# if !defined(likely) +# define likely(x) (x) +# endif +# if !defined(unlikely) +# define unlikely(x) (x) +# endif +#else +# if !defined(likely) +# define likely(x) (__builtin_expect(!!(x), 1)) +# endif +# if !defined(unlikely) +# define unlikely(x) (__builtin_expect(!!(x), 0)) +# endif +#endif + +#if defined(_MSC_VER) +# define ALWAYS_INLINE __forceinline +# define NO_INLINE static __declspec(noinline) +# define MAY_ALIAS +#else +# define ALWAYS_INLINE __attribute__((__always_inline__)) +# define NO_INLINE __attribute__((__noinline__)) +# define MAY_ALIAS __attribute__((__may_alias__)) +#endif + +#if !defined(__x86_64__) && !defined(__aarch64__) && !defined(__PPC__) +# error "The only supported platforms are x86_64 and AArch64, PowerPC (work in progress)" +#endif + +/// Check for presence of address sanitizer +#if !defined(ADDRESS_SANITIZER) +# if defined(__has_feature) +# if __has_feature(address_sanitizer) +# define ADDRESS_SANITIZER 1 +# endif +# elif defined(__SANITIZE_ADDRESS__) +# define ADDRESS_SANITIZER 1 +# endif +#endif + +#if !defined(THREAD_SANITIZER) +# if defined(__has_feature) +# if __has_feature(thread_sanitizer) +# define THREAD_SANITIZER 1 +# endif +# elif defined(__SANITIZE_THREAD__) +# define THREAD_SANITIZER 1 +# endif +#endif + +#if !defined(MEMORY_SANITIZER) +# if defined(__has_feature) +# if __has_feature(memory_sanitizer) +# define MEMORY_SANITIZER 1 +# endif +# elif defined(__MEMORY_SANITIZER__) +# define MEMORY_SANITIZER 1 +# endif +#endif + +/// TODO: Strange enough, there is no way to detect UB sanitizer. + +/// Explicitly allow undefined behaviour for certain functions. Use it as a function attribute. +/// It is useful in case when compiler cannot see (and exploit) it, but UBSan can. +/// Example: multiplication of signed integers with possibility of overflow when both sides are from user input. 
+#if defined(__clang__) +# define NO_SANITIZE_UNDEFINED __attribute__((__no_sanitize__("undefined"))) +# define NO_SANITIZE_ADDRESS __attribute__((__no_sanitize__("address"))) +# define NO_SANITIZE_THREAD __attribute__((__no_sanitize__("thread"))) +#else /// It does not work in GCC. GCC 7 cannot recognize this attribute and GCC 8 simply ignores it. +# define NO_SANITIZE_UNDEFINED +# define NO_SANITIZE_ADDRESS +# define NO_SANITIZE_THREAD +#endif + +#if defined __GNUC__ && !defined __clang__ +# define OPTIMIZE(x) __attribute__((__optimize__(x))) +#else +# define OPTIMIZE(x) +#endif + +/// A macro for suppressing warnings about unused variables or function results. +/// Useful for structured bindings which have no standard way to declare this. +#define UNUSED(...) (void)(__VA_ARGS__) diff --git a/base/common/find_symbols.h b/base/common/find_symbols.h index 162c73251fa..fc86a72900e 100644 --- a/base/common/find_symbols.h +++ b/base/common/find_symbols.h @@ -292,3 +292,26 @@ inline char * find_last_not_symbols_or_null(char * begin, char * end) { return const_cast(detail::find_last_symbols_sse2(begin, end)); } + + +/// Slightly resembles boost::split. The drawback of boost::split is that it fires a false positive in clang static analyzer. +/// See https://github.com/boostorg/algorithm/issues/63 +/// And https://bugs.llvm.org/show_bug.cgi?id=41141 +template +inline void splitInto(To & to, const std::string & what, bool token_compress = false) +{ + const char * pos = what.data(); + const char * end = pos + what.size(); + while (pos < end) + { + const char * delimiter_or_end = find_first_symbols(pos, end); + + if (!token_compress || pos < delimiter_or_end) + to.emplace_back(pos, delimiter_or_end); + + if (delimiter_or_end < end) + pos = delimiter_or_end + 1; + else + pos = delimiter_or_end; + } +} diff --git a/dbms/src/Common/getFQDNOrHostName.cpp b/base/common/getFQDNOrHostName.cpp similarity index 91% rename from dbms/src/Common/getFQDNOrHostName.cpp rename to base/common/getFQDNOrHostName.cpp index 08ec015919e..f67b37bd71c 100644 --- a/dbms/src/Common/getFQDNOrHostName.cpp +++ b/base/common/getFQDNOrHostName.cpp @@ -1,5 +1,5 @@ #include -#include +#include namespace diff --git a/dbms/src/Common/getFQDNOrHostName.h b/base/common/getFQDNOrHostName.h similarity index 100% rename from dbms/src/Common/getFQDNOrHostName.h rename to base/common/getFQDNOrHostName.h diff --git a/base/common/getMemoryAmount.cpp b/base/common/getMemoryAmount.cpp index d1a7907b152..5e600a37351 100644 --- a/base/common/getMemoryAmount.cpp +++ b/base/common/getMemoryAmount.cpp @@ -69,13 +69,13 @@ uint64_t getMemoryAmountOrZero() #elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) /* FreeBSD, Linux, OpenBSD, and Solaris. -------------------- */ - return (uint64_t)sysconf(_SC_PHYS_PAGES) - * (uint64_t)sysconf(_SC_PAGESIZE); + return uint64_t(sysconf(_SC_PHYS_PAGES)) + *uint64_t(sysconf(_SC_PAGESIZE)); #elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGE_SIZE) /* Legacy. -------------------------------------------------- */ - return (uint64_t)sysconf(_SC_PHYS_PAGES) - * (uint64_t)sysconf(_SC_PAGE_SIZE); + return uint64_t(sysconf(_SC_PHYS_PAGES)) + * uint64_t(sysconf(_SC_PAGE_SIZE)); #elif defined(CTL_HW) && (defined(HW_PHYSMEM) || defined(HW_REALMEM)) /* DragonFly BSD, FreeBSD, NetBSD, OpenBSD, and OSX. 
-------- */ diff --git a/base/common/itoa.h b/base/common/itoa.h index a29befd9c6d..5d660ca4378 100644 --- a/base/common/itoa.h +++ b/base/common/itoa.h @@ -30,7 +30,6 @@ #include #include #include -#include "likely.h" using int128_t = __int128; using uint128_t = unsigned __int128; diff --git a/base/common/likely.h b/base/common/likely.h deleted file mode 100644 index 338498af35f..00000000000 --- a/base/common/likely.h +++ /dev/null @@ -1,15 +0,0 @@ -#if defined(_MSC_VER) -# if !defined(likely) -# define likely(x) (x) -# endif -# if !defined(unlikely) -# define unlikely(x) (x) -# endif -#else -# if !defined(likely) -# define likely(x) (__builtin_expect(!!(x), 1)) -# endif -# if !defined(unlikely) -# define unlikely(x) (__builtin_expect(!!(x), 0)) -# endif -#endif diff --git a/base/common/memory.h b/base/common/memory.h index ab96cb593b9..e2bd8d618da 100644 --- a/base/common/memory.h +++ b/base/common/memory.h @@ -1,7 +1,7 @@ #pragma once #include -#include "likely.h" +#include "defines.h" #if __has_include() #include @@ -19,27 +19,11 @@ #include #endif -// Also defined in Core/Defines.h -#if !defined(ALWAYS_INLINE) -#if defined(_MSC_VER) - #define ALWAYS_INLINE inline __forceinline -#else - #define ALWAYS_INLINE inline __attribute__((__always_inline__)) -#endif -#endif - -#if !defined(NO_INLINE) -#if defined(_MSC_VER) - #define NO_INLINE static __declspec(noinline) -#else - #define NO_INLINE __attribute__((__noinline__)) -#endif -#endif namespace Memory { -ALWAYS_INLINE void * newImpl(std::size_t size) +inline ALWAYS_INLINE void * newImpl(std::size_t size) { auto * ptr = malloc(size); if (likely(ptr != nullptr)) @@ -49,19 +33,19 @@ ALWAYS_INLINE void * newImpl(std::size_t size) throw std::bad_alloc{}; } -ALWAYS_INLINE void * newNoExept(std::size_t size) noexcept +inline ALWAYS_INLINE void * newNoExept(std::size_t size) noexcept { return malloc(size); } -ALWAYS_INLINE void deleteImpl(void * ptr) noexcept +inline ALWAYS_INLINE void deleteImpl(void * ptr) noexcept { free(ptr); } #if USE_JEMALLOC -ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size) noexcept +inline ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size) noexcept { if (unlikely(ptr == nullptr)) return; @@ -71,7 +55,7 @@ ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size) noexcept #else -ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size [[maybe_unused]]) noexcept +inline ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size [[maybe_unused]]) noexcept { free(ptr); } diff --git a/base/common/shift10.cpp b/base/common/shift10.cpp index 45f5733bd76..341ced6aa81 100644 --- a/base/common/shift10.cpp +++ b/base/common/shift10.cpp @@ -1,6 +1,6 @@ #include -#include +#include "defines.h" #include diff --git a/base/common/shift10.h b/base/common/shift10.h index 68c18f34450..c50121cfb27 100644 --- a/base/common/shift10.h +++ b/base/common/shift10.h @@ -1,6 +1,6 @@ #pragma once -#include +#include /** Almost the same as x = x * exp10(exponent), but gives more accurate result. * Example: diff --git a/base/common/terminalColors.h b/base/common/terminalColors.h index 0c33b23752b..a1ba85dc8cd 100644 --- a/base/common/terminalColors.h +++ b/base/common/terminalColors.h @@ -1,5 +1,5 @@ #include -#include +#include /** Set color in terminal based on 64-bit hash value. 
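
An aside for illustration (not part of this patch): the helpers in base/common/memory.h shown above are the kind of building blocks a replacement of the global operator new/delete would forward to; deleteSized simply ignores the size hint when no sized deallocation (jemalloc) is available. A simplified, self-contained sketch of that pattern, independent of how ClickHouse actually wires it up:

    #include <cstdlib>
    #include <new>

    namespace Memory
    {
        /// Simplified stand-ins for the helpers in base/common/memory.h.
        inline void * newImpl(std::size_t size)
        {
            if (void * ptr = std::malloc(size))
                return ptr;
            throw std::bad_alloc{};
        }

        inline void deleteImpl(void * ptr) noexcept { std::free(ptr); }

        /// Without a sized allocator (the non-jemalloc branch above), the size hint is ignored.
        inline void deleteSized(void * ptr, std::size_t) noexcept { std::free(ptr); }
    }

    void * operator new(std::size_t size) { return Memory::newImpl(size); }
    void operator delete(void * ptr) noexcept { Memory::deleteImpl(ptr); }
    void operator delete(void * ptr, std::size_t size) noexcept { Memory::deleteSized(ptr, size); }

    int main()
    {
        auto * p = new int(42);
        delete p;
        return 0;
    }
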
diff --git a/base/common/tests/date_lut2.cpp b/base/common/tests/date_lut2.cpp index f1a106a16ca..6dcf5e8adf2 100644 --- a/base/common/tests/date_lut2.cpp +++ b/base/common/tests/date_lut2.cpp @@ -43,7 +43,7 @@ void loop(time_t begin, time_t end, int step) } -int main(int argc, char ** argv) +int main(int, char **) { loop(orderedIdentifierToDate(20101031), orderedIdentifierToDate(20101101), 15 * 60); loop(orderedIdentifierToDate(20100328), orderedIdentifierToDate(20100330), 15 * 60); diff --git a/base/common/tests/date_lut3.cpp b/base/common/tests/date_lut3.cpp index c2e4c7ccf8d..411765d2b2a 100644 --- a/base/common/tests/date_lut3.cpp +++ b/base/common/tests/date_lut3.cpp @@ -53,7 +53,7 @@ void loop(time_t begin, time_t end, int step) } -int main(int argc, char ** argv) +int main(int, char **) { loop(orderedIdentifierToDate(20101031), orderedIdentifierToDate(20101101), 15 * 60); loop(orderedIdentifierToDate(20100328), orderedIdentifierToDate(20100330), 15 * 60); diff --git a/base/common/tests/date_lut4.cpp b/base/common/tests/date_lut4.cpp index 50c3ef4e3d3..a82ec25f183 100644 --- a/base/common/tests/date_lut4.cpp +++ b/base/common/tests/date_lut4.cpp @@ -2,7 +2,7 @@ #include -int main(int argc, char ** argv) +int main(int, char **) { /** В DateLUT был глюк - для времён из дня 1970-01-01, возвращался номер часа больше 23. */ static const time_t TIME = 66130; diff --git a/base/common/tests/date_lut_default_timezone.cpp b/base/common/tests/date_lut_default_timezone.cpp index aeefae3c9e0..b8e5aa08931 100644 --- a/base/common/tests/date_lut_default_timezone.cpp +++ b/base/common/tests/date_lut_default_timezone.cpp @@ -2,7 +2,7 @@ #include #include -int main(int argc, char ** argv) +int main(int, char **) { try { diff --git a/base/common/tests/date_lut_init.cpp b/base/common/tests/date_lut_init.cpp index 3b03e36b02d..48f0d6063c7 100644 --- a/base/common/tests/date_lut_init.cpp +++ b/base/common/tests/date_lut_init.cpp @@ -1,7 +1,7 @@ #include /// Позволяет проверить время инициализации DateLUT. 
-int main(int argc, char ** argv) +int main(int, char **) { DateLUT::instance(); return 0; diff --git a/base/common/tests/gtest_find_symbols.cpp b/base/common/tests/gtest_find_symbols.cpp index f18b4df832a..79a7ed032df 100644 --- a/base/common/tests/gtest_find_symbols.cpp +++ b/base/common/tests/gtest_find_symbols.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -22,4 +23,16 @@ TEST(FindSymbols, SimpleTest) ASSERT_EQ(find_last_symbols_or_null<' '>(begin, end), end - 11); ASSERT_EQ(find_last_symbols_or_null<'H'>(begin, end), begin); ASSERT_EQ((find_last_symbols_or_null<'a', 'e'>(begin, end)), end - 4); + + { + std::vector vals; + splitInto<' ', ','>(vals, "hello, world", true); + ASSERT_EQ(vals, (std::vector{"hello", "world"})); + } + + { + std::vector vals; + splitInto<' ', ','>(vals, "s String", true); + ASSERT_EQ(vals, (std::vector{"s", "String"})); + } } diff --git a/base/common/tests/gtest_strong_typedef.cpp b/base/common/tests/gtest_strong_typedef.cpp index 6bf2f1eaad1..8ee9535ce81 100644 --- a/base/common/tests/gtest_strong_typedef.cpp +++ b/base/common/tests/gtest_strong_typedef.cpp @@ -54,7 +54,7 @@ TEST(StrongTypedefSuite, NoDefaultCtor) { struct NoDefaultCtor { - NoDefaultCtor(int i) {} + NoDefaultCtor(int) {} // NOLINT }; STRONG_TYPEDEF(NoDefaultCtor, MyStruct); diff --git a/base/common/Types.h b/base/common/types.h similarity index 100% rename from base/common/Types.h rename to base/common/types.h diff --git a/base/common/ya.make b/base/common/ya.make new file mode 100644 index 00000000000..a41c8f4c583 --- /dev/null +++ b/base/common/ya.make @@ -0,0 +1,12 @@ +LIBRARY() + +PEERDIR( + contrib/libs/poco/Util +) + +SRCS( + argsToConfig.cpp + coverage.cpp +) + +END() diff --git a/base/daemon/BaseDaemon.cpp b/base/daemon/BaseDaemon.cpp index 71a11964481..c3750f6feeb 100644 --- a/base/daemon/BaseDaemon.cpp +++ b/base/daemon/BaseDaemon.cpp @@ -99,12 +99,12 @@ static void writeSignalIDtoSignalPipe(int sig) } /** Signal handler for HUP / USR1 */ -static void closeLogsSignalHandler(int sig, siginfo_t * info, void * context) +static void closeLogsSignalHandler(int sig, siginfo_t *, void *) { writeSignalIDtoSignalPipe(sig); } -static void terminateRequestedSignalHandler(int sig, siginfo_t * info, void * context) +static void terminateRequestedSignalHandler(int sig, siginfo_t *, void *) { writeSignalIDtoSignalPipe(sig); } @@ -174,6 +174,10 @@ public: { int sig = 0; DB::readBinary(sig, in); + // We may log some specific signals afterwards, with different log + // levels and more info, but for completeness we log all signals + // here at trace level. 
+ LOG_TRACE(log, "Received signal " << strsignal(sig) << " (" << sig << ")"); if (sig == Signals::StopThread) { @@ -362,19 +366,8 @@ void BaseDaemon::reloadConfiguration() } -BaseDaemon::BaseDaemon() +namespace { - checkRequiredInstructions(); -} - - -BaseDaemon::~BaseDaemon() -{ - writeSignalIDtoSignalPipe(SignalListener::StopThread); - signal_listener_thread.join(); - signal_pipe.close(); -} - enum class InstructionFail { @@ -388,7 +381,7 @@ enum class InstructionFail AVX512 = 7 }; -static std::string instructionFailToString(InstructionFail fail) +std::string instructionFailToString(InstructionFail fail) { switch (fail) { @@ -413,16 +406,16 @@ static std::string instructionFailToString(InstructionFail fail) } -static sigjmp_buf jmpbuf; +sigjmp_buf jmpbuf; -static void sigIllCheckHandler(int sig, siginfo_t * info, void * context) +void sigIllCheckHandler(int, siginfo_t *, void *) { siglongjmp(jmpbuf, 1); } /// Check if necessary sse extensions are available by trying to execute some sse instructions. /// If instruction is unavailable, SIGILL will be sent by kernel. -static void checkRequiredInstructions(volatile InstructionFail & fail) +void checkRequiredInstructionsImpl(volatile InstructionFail & fail) { #if __SSE3__ fail = InstructionFail::SSE3; @@ -463,8 +456,9 @@ static void checkRequiredInstructions(volatile InstructionFail & fail) fail = InstructionFail::NONE; } - -void BaseDaemon::checkRequiredInstructions() +/// Check SSE and others instructions availability +/// Calls exit on fail +void checkRequiredInstructions() { struct sigaction sa{}; struct sigaction sa_old{}; @@ -487,7 +481,7 @@ void BaseDaemon::checkRequiredInstructions() exit(1); } - ::checkRequiredInstructions(fail); + checkRequiredInstructionsImpl(fail); if (sigaction(signal, &sa_old, nullptr)) { @@ -496,6 +490,22 @@ void BaseDaemon::checkRequiredInstructions() } } +} + + +BaseDaemon::BaseDaemon() +{ + checkRequiredInstructions(); +} + + +BaseDaemon::~BaseDaemon() +{ + writeSignalIDtoSignalPipe(SignalListener::StopThread); + signal_listener_thread.join(); + signal_pipe.close(); +} + void BaseDaemon::terminate() { @@ -784,7 +794,9 @@ void BaseDaemon::initializeTerminationAndSignalProcessing() void BaseDaemon::logRevision() const { - Logger::root().information("Starting " + std::string{VERSION_FULL} + " with revision " + std::to_string(ClickHouseRevision::get())); + Logger::root().information("Starting " + std::string{VERSION_FULL} + + " with revision " + std::to_string(ClickHouseRevision::get()) + + ", PID " + std::to_string(getpid())); } /// Makes server shutdown if at least one Poco::Task have failed. 
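
An aside for illustration (not part of this patch): the checkRequiredInstructions logic moved into an anonymous namespace above relies on a classic POSIX probe: install a SIGILL handler, sigsetjmp, execute the instruction, and siglongjmp back if the CPU rejects it. A stripped-down standalone sketch of that pattern; the probed SSE4.2 instruction here is an arbitrary example, not the exact set ClickHouse tests:

    #include <setjmp.h>
    #include <signal.h>
    #include <cstdio>
    #include <cstdlib>

    static sigjmp_buf jmpbuf;

    static void sigIllCheckHandler(int, siginfo_t *, void *)
    {
        siglongjmp(jmpbuf, 1);   /// Jump back to the probe site instead of crashing.
    }

    int main()
    {
        struct sigaction sa{};
        sa.sa_sigaction = sigIllCheckHandler;
        sa.sa_flags = SA_SIGINFO;
        sigemptyset(&sa.sa_mask);
        if (sigaction(SIGILL, &sa, nullptr))
            std::exit(1);

        if (sigsetjmp(jmpbuf, 1))
        {
            std::fprintf(stderr, "Required instruction set is not supported by this CPU.\n");
            std::exit(1);
        }

    #if defined(__x86_64__)
        __asm__ volatile ("pcmpgtq %%xmm0, %%xmm1" ::: "xmm0", "xmm1");   /// SSE4.2 probe; raises SIGILL if unavailable.
    #endif
        std::puts("OK");
        return 0;
    }
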
diff --git a/base/daemon/BaseDaemon.h b/base/daemon/BaseDaemon.h index b7070c76e9b..f55dbb2a549 100644 --- a/base/daemon/BaseDaemon.h +++ b/base/daemon/BaseDaemon.h @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -128,7 +128,7 @@ public: /// close all process FDs except /// 0-2 -- stdin, stdout, stderr /// also doesn't close global internal pipes for signal handling - void closeFDs(); + static void closeFDs(); protected: /// Возвращает TaskManager приложения @@ -198,12 +198,6 @@ protected: std::string config_path; DB::ConfigProcessor::LoadedConfig loaded_config; Poco::Util::AbstractConfiguration * last_configuration = nullptr; - -private: - - /// Check SSE and others instructions availability - /// Calls exit on fail - void checkRequiredInstructions(); }; diff --git a/base/daemon/GraphiteWriter.cpp b/base/daemon/GraphiteWriter.cpp index eeb6b4c1705..f28019dec01 100644 --- a/base/daemon/GraphiteWriter.cpp +++ b/base/daemon/GraphiteWriter.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include diff --git a/base/ext/chrono_io.h b/base/ext/chrono_io.h index 8fa448b9e6a..40b76a96478 100644 --- a/base/ext/chrono_io.h +++ b/base/ext/chrono_io.h @@ -1,16 +1,30 @@ #pragma once #include +#include #include -#include +#include namespace ext { + inline std::string to_string(const std::time_t & time) + { + std::stringstream ss; + ss << std::put_time(std::localtime(&time), "%Y-%m-%d %X"); + return ss.str(); + } + template std::string to_string(const std::chrono::time_point & tp) { - return DateLUT::instance().timeToString(std::chrono::system_clock::to_time_t(tp)); + // Don't use DateLUT because it shows weird characters for + // TimePoint::max(). I wish we could use C++20 format, but it's not + // there yet. 
+ // return DateLUT::instance().timeToString(std::chrono::system_clock::to_time_t(tp)); + + auto in_time_t = std::chrono::system_clock::to_time_t(tp); + return to_string(in_time_t); } template > diff --git a/base/loggers/OwnSplitChannel.cpp b/base/loggers/OwnSplitChannel.cpp index 3b809c022b5..eda61e1105e 100644 --- a/base/loggers/OwnSplitChannel.cpp +++ b/base/loggers/OwnSplitChannel.cpp @@ -71,7 +71,8 @@ void OwnSplitChannel::logSplit(const Poco::Message & msg) /// Also log to system.text_log table, if message is not too noisy - if (text_log_max_priority && msg.getPriority() <= text_log_max_priority) + auto text_log_max_priority_loaded = text_log_max_priority.load(std::memory_order_relaxed); + if (text_log_max_priority_loaded && msg.getPriority() <= text_log_max_priority_loaded) { TextLogElement elem; @@ -108,7 +109,7 @@ void OwnSplitChannel::addTextLog(std::shared_ptr log, int max_prior { std::lock_guard lock(text_log_mutex); text_log = log; - text_log_max_priority = max_priority; + text_log_max_priority.store(max_priority, std::memory_order_relaxed); } } diff --git a/base/loggers/OwnSplitChannel.h b/base/loggers/OwnSplitChannel.h index 78308e97ab7..ac313b383bb 100644 --- a/base/loggers/OwnSplitChannel.h +++ b/base/loggers/OwnSplitChannel.h @@ -33,7 +33,7 @@ private: std::mutex text_log_mutex; std::weak_ptr text_log; - int text_log_max_priority = -1; + std::atomic text_log_max_priority = -1; }; } diff --git a/base/mysqlxx/Value.h b/base/mysqlxx/Value.h index 2b3465d52d1..9fdb33a442d 100644 --- a/base/mysqlxx/Value.h +++ b/base/mysqlxx/Value.h @@ -9,7 +9,7 @@ #include #include -#include +#include #include #include diff --git a/base/ya.make b/base/ya.make index 8b137891791..25ab5886b2c 100644 --- a/base/ya.make +++ b/base/ya.make @@ -1 +1,3 @@ - +RECURSE( + common +) diff --git a/cmake/find/cctz.cmake b/cmake/find/cctz.cmake deleted file mode 100644 index aae8078512d..00000000000 --- a/cmake/find/cctz.cmake +++ /dev/null @@ -1,23 +0,0 @@ -option (USE_INTERNAL_CCTZ_LIBRARY "Set to FALSE to use system cctz library instead of bundled" ${NOT_UNBUNDLED}) - -if(NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/cctz/include/cctz/time_zone.h") - if(USE_INTERNAL_CCTZ_LIBRARY) - message(WARNING "submodule contrib/cctz is missing. 
to fix try run: \n git submodule update --init --recursive") - endif() - set(USE_INTERNAL_CCTZ_LIBRARY 0) - set(MISSING_INTERNAL_CCTZ_LIBRARY 1) -endif() - -if (NOT USE_INTERNAL_CCTZ_LIBRARY) - find_library (CCTZ_LIBRARY cctz) - find_path (CCTZ_INCLUDE_DIR NAMES cctz/civil_time.h civil_time.h PATHS ${CCTZ_INCLUDE_PATHS}) -endif () - -if (CCTZ_LIBRARY AND CCTZ_INCLUDE_DIR) -elseif (NOT MISSING_INTERNAL_CCTZ_LIBRARY) - set (USE_INTERNAL_CCTZ_LIBRARY 1) - set (CCTZ_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/cctz/include") - set (CCTZ_LIBRARY cctz) -endif () - -message (STATUS "Using cctz: ${CCTZ_INCLUDE_DIR} : ${CCTZ_LIBRARY}") diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 46a9105cefe..9c0a0aac0dd 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -75,9 +75,7 @@ if (USE_INTERNAL_ZLIB_LIBRARY) endif () endif () -if (USE_INTERNAL_CCTZ_LIBRARY) - add_subdirectory (cctz-cmake) -endif () +add_subdirectory (cctz-cmake) add_subdirectory (jemalloc-cmake) diff --git a/contrib/cctz-cmake/CMakeLists.txt b/contrib/cctz-cmake/CMakeLists.txt index 9c2f6d9a658..5770b1ef09c 100644 --- a/contrib/cctz-cmake/CMakeLists.txt +++ b/contrib/cctz-cmake/CMakeLists.txt @@ -1,31 +1,50 @@ -SET(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/cctz) +option (USE_INTERNAL_CCTZ "Use internal cctz library" ${NOT_UNBUNDLED}) -add_library(cctz - ${LIBRARY_DIR}/src/civil_time_detail.cc - ${LIBRARY_DIR}/src/time_zone_fixed.cc - ${LIBRARY_DIR}/src/time_zone_format.cc - ${LIBRARY_DIR}/src/time_zone_if.cc - ${LIBRARY_DIR}/src/time_zone_impl.cc - ${LIBRARY_DIR}/src/time_zone_info.cc - ${LIBRARY_DIR}/src/time_zone_libc.cc - ${LIBRARY_DIR}/src/time_zone_lookup.cc - ${LIBRARY_DIR}/src/time_zone_posix.cc - ${LIBRARY_DIR}/src/zone_info_source.cc +if (USE_INTERNAL_CCTZ) + SET(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/cctz) - ${LIBRARY_DIR}/src/time_zone_libc.h - ${LIBRARY_DIR}/src/time_zone_if.h - ${LIBRARY_DIR}/src/tzfile.h - ${LIBRARY_DIR}/src/time_zone_impl.h - ${LIBRARY_DIR}/src/time_zone_posix.h - ${LIBRARY_DIR}/src/time_zone_info.h + SET (SRCS + ${LIBRARY_DIR}/src/civil_time_detail.cc + ${LIBRARY_DIR}/src/time_zone_fixed.cc + ${LIBRARY_DIR}/src/time_zone_format.cc + ${LIBRARY_DIR}/src/time_zone_if.cc + ${LIBRARY_DIR}/src/time_zone_impl.cc + ${LIBRARY_DIR}/src/time_zone_info.cc + ${LIBRARY_DIR}/src/time_zone_libc.cc + ${LIBRARY_DIR}/src/time_zone_lookup.cc + ${LIBRARY_DIR}/src/time_zone_posix.cc + ${LIBRARY_DIR}/src/zone_info_source.cc + ) - ${LIBRARY_DIR}/include/cctz/time_zone.h - ${LIBRARY_DIR}/include/cctz/civil_time_detail.h - ${LIBRARY_DIR}/include/cctz/civil_time.h) + add_library (cctz ${SRCS}) + target_include_directories (cctz PUBLIC ${LIBRARY_DIR}/include) -if (CMAKE_SYSTEM MATCHES "FreeBSD") - # yes, need linux, because bsd check inside linux in time_zone_libc.cc:24 - target_compile_definitions (cctz PRIVATE __USE_BSD linux _XOPEN_SOURCE=600) + if (OS_FREEBSD) + # yes, need linux, because bsd check inside linux in time_zone_libc.cc:24 + target_compile_definitions (cctz PRIVATE __USE_BSD linux _XOPEN_SOURCE=600) + endif () +else () + find_library (LIBRARY_CCTZ cctz) + find_path (INCLUDE_CCTZ NAMES cctz/civil_time.h) + + add_library (cctz UNKNOWN IMPORTED) + set_property (TARGET cctz PROPERTY IMPORTED_LOCATION ${LIBRARY_CCTZ}) + set_property (TARGET cctz PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${INCLUDE_CCTZ}) + + set(CMAKE_REQUIRED_LIBRARIES cctz) + check_cxx_source_compiles( + " + #include + int main() { + cctz::civil_day date; + } + " + EXTERNAL_CCTZ_WORKS + ) + + if (NOT 
EXTERNAL_CCTZ_WORKS) + message (FATAL_ERROR "cctz is unusable: ${LIBRARY_CCTZ} ${INCLUDE_CCTZ}") + endif () endif () -target_include_directories (cctz PUBLIC ${LIBRARY_DIR}/include) +message (STATUS "Using cctz") diff --git a/contrib/poco b/contrib/poco index 5da7ce676c2..ddca76ba495 160000 --- a/contrib/poco +++ b/contrib/poco @@ -1 +1 @@ -Subproject commit 5da7ce676c20e36ee9a2547eb48a50083b3d5237 +Subproject commit ddca76ba4956cb57150082394536cc43ff28f6fa diff --git a/contrib/replxx b/contrib/replxx index 07cbfbec550..f1332626639 160000 --- a/contrib/replxx +++ b/contrib/replxx @@ -1 +1 @@ -Subproject commit 07cbfbec550133b88c91c4073fa5af2ae2ae6a9a +Subproject commit f1332626639d6492eaf170758642da14fbbda7bf diff --git a/dbms/programs/client/Client.cpp b/dbms/programs/client/Client.cpp index 24afea67090..7f1bc49c759 100644 --- a/dbms/programs/client/Client.cpp +++ b/dbms/programs/client/Client.cpp @@ -281,7 +281,7 @@ private: } /// Should we celebrate a bit? - bool isNewYearMode() + static bool isNewYearMode() { time_t current_time = time(nullptr); @@ -294,7 +294,7 @@ private: || (now.month() == 1 && now.day() <= 5); } - bool isChineseNewYearMode(const String & local_tz) + static bool isChineseNewYearMode(const String & local_tz) { /// Days of Dec. 20 in Chinese calendar starting from year 2019 to year 2105 static constexpr UInt16 chineseNewYearIndicators[] @@ -481,8 +481,6 @@ private: if (server_revision >= Suggest::MIN_SERVER_REVISION && !config().getBool("disable_suggestion", false)) { - if (config().has("case_insensitive_suggestion")) - Suggest::instance().setCaseInsensitive(); /// Load suggestion data from the server. Suggest::instance().load(connection_parameters, config().getInt("suggestion_limit")); } @@ -1594,7 +1592,7 @@ private: std::cout << "Ok." << std::endl; } - void showClientVersion() + static void showClientVersion() { std::cout << DBMS_NAME << " client version " << VERSION_STRING << VERSION_OFFICIAL << "." << std::endl; } @@ -1720,7 +1718,6 @@ public: ("always_load_suggestion_data", "Load suggestion data even if clickhouse-client is run in non-interactive mode. Used for testing.") ("suggestion_limit", po::value()->default_value(10000), "Suggestion limit for how many databases, tables and columns to fetch.") - ("case_insensitive_suggestion", "Case sensitive suggestions.") ("multiline,m", "multiline") ("multiquery,n", "multiquery") ("format,f", po::value(), "default output format") diff --git a/dbms/programs/client/Suggest.cpp b/dbms/programs/client/Suggest.cpp index 9058bafd138..f7141449f54 100644 --- a/dbms/programs/client/Suggest.cpp +++ b/dbms/programs/client/Suggest.cpp @@ -50,16 +50,15 @@ void Suggest::load(const ConnectionParameters & connection_parameters, size_t su /// Note that keyword suggestions are available even if we cannot load data from server. 
- if (case_insensitive) - std::sort(words.begin(), words.end(), [](const std::string & str1, const std::string & str2) + std::sort(words.begin(), words.end()); + words_no_case = words; + std::sort(words_no_case.begin(), words_no_case.end(), [](const std::string & str1, const std::string & str2) + { + return std::lexicographical_compare(begin(str1), end(str1), begin(str2), end(str2), [](const char char1, const char char2) { - return std::lexicographical_compare(begin(str1), end(str1), begin(str2), end(str2), [](const char char1, const char char2) - { - return std::tolower(char1) < std::tolower(char2); - }); + return std::tolower(char1) < std::tolower(char2); }); - else - std::sort(words.begin(), words.end()); + }); ready = true; }); diff --git a/dbms/programs/client/Suggest.h b/dbms/programs/client/Suggest.h index a4d357d09b3..6c81a388ea7 100644 --- a/dbms/programs/client/Suggest.h +++ b/dbms/programs/client/Suggest.h @@ -23,9 +23,6 @@ public: return instance; } - /// Need to set before load - void setCaseInsensitive() { case_insensitive = true; } - void load(const ConnectionParameters & connection_parameters, size_t suggestion_limit); /// Older server versions cannot execute the query above. diff --git a/dbms/programs/copier/ClusterCopier.cpp b/dbms/programs/copier/ClusterCopier.cpp index 35c112da4d1..4431362913d 100644 --- a/dbms/programs/copier/ClusterCopier.cpp +++ b/dbms/programs/copier/ClusterCopier.cpp @@ -499,7 +499,10 @@ ASTPtr ClusterCopier::removeAliasColumnsFromCreateQuery(const ASTPtr & query_ast return new_query_ast; } -std::shared_ptr ClusterCopier::rewriteCreateQueryStorage(const ASTPtr & create_query_ast, const DatabaseAndTableName & new_table, const ASTPtr & new_storage_ast) + +/// Replaces ENGINE and table name in a create query +static std::shared_ptr rewriteCreateQueryStorage( + const ASTPtr & create_query_ast, const DatabaseAndTableName & new_table, const ASTPtr & new_storage_ast) { const auto & create = create_query_ast->as(); auto res = std::make_shared(create); @@ -1178,7 +1181,7 @@ String ClusterCopier::getRemoteCreateTable(const DatabaseAndTableName & table, C ASTPtr ClusterCopier::getCreateTableForPullShard(const ConnectionTimeouts & timeouts, TaskShard & task_shard) { /// Fetch and parse (possibly) new definition - auto connection_entry = task_shard.info.pool->get(timeouts, &task_cluster->settings_pull); + auto connection_entry = task_shard.info.pool->get(timeouts, &task_cluster->settings_pull, true); String create_query_pull_str = getRemoteCreateTable( task_shard.task_table.table_pull, *connection_entry, diff --git a/dbms/programs/copier/ClusterCopier.h b/dbms/programs/copier/ClusterCopier.h index 90a003a2528..2b7b4fff9e2 100644 --- a/dbms/programs/copier/ClusterCopier.h +++ b/dbms/programs/copier/ClusterCopier.h @@ -88,11 +88,6 @@ protected: /// Removes MATERIALIZED and ALIAS columns from create table query static ASTPtr removeAliasColumnsFromCreateQuery(const ASTPtr &query_ast); - /// Replaces ENGINE and table name in a create query - std::shared_ptr - rewriteCreateQueryStorage(const ASTPtr & create_query_ast, const DatabaseAndTableName & new_table, - const ASTPtr & new_storage_ast); - bool tryDropPartition(ShardPartition & task_partition, const zkutil::ZooKeeperPtr & zookeeper, const CleanStateClock & clean_state_clock); @@ -119,8 +114,7 @@ protected: Connection & connection, const Settings * settings = nullptr); - ASTPtr getCreateTableForPullShard(const ConnectionTimeouts & timeouts, - TaskShard & task_shard); + ASTPtr getCreateTableForPullShard(const 
ConnectionTimeouts & timeouts, TaskShard & task_shard); void createShardInternalTables(const ConnectionTimeouts & timeouts, TaskShard & task_shard, diff --git a/dbms/programs/copier/Internals.h b/dbms/programs/copier/Internals.h index 59184ec8c0d..c47f42417d3 100644 --- a/dbms/programs/copier/Internals.h +++ b/dbms/programs/copier/Internals.h @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/dbms/programs/local/LocalServer.cpp b/dbms/programs/local/LocalServer.cpp index eded06416f7..2d93c792350 100644 --- a/dbms/programs/local/LocalServer.cpp +++ b/dbms/programs/local/LocalServer.cpp @@ -118,6 +118,20 @@ void LocalServer::tryInitPath() } +static void attachSystemTables() +{ + DatabasePtr system_database = DatabaseCatalog::instance().tryGetDatabase(DatabaseCatalog::SYSTEM_DATABASE); + if (!system_database) + { + /// TODO: add attachTableDelayed into DatabaseMemory to speedup loading + system_database = std::make_shared(DatabaseCatalog::SYSTEM_DATABASE); + DatabaseCatalog::instance().attachDatabase(DatabaseCatalog::SYSTEM_DATABASE, system_database); + } + + attachSystemTablesLocal(*system_database); +} + + int LocalServer::main(const std::vector & /*args*/) try { @@ -248,20 +262,6 @@ std::string LocalServer::getInitialCreateTableQuery() } -void LocalServer::attachSystemTables() -{ - DatabasePtr system_database = DatabaseCatalog::instance().tryGetDatabase(DatabaseCatalog::SYSTEM_DATABASE); - if (!system_database) - { - /// TODO: add attachTableDelayed into DatabaseMemory to speedup loading - system_database = std::make_shared(DatabaseCatalog::SYSTEM_DATABASE); - DatabaseCatalog::instance().attachDatabase(DatabaseCatalog::SYSTEM_DATABASE, system_database); - } - - attachSystemTablesLocal(*system_database); -} - - void LocalServer::processQueries() { String initial_create_query = getInitialCreateTableQuery(); @@ -375,7 +375,7 @@ static void showClientVersion() std::cout << DBMS_NAME << " client version " << VERSION_STRING << VERSION_OFFICIAL << "." 
<< '\n'; } -std::string LocalServer::getHelpHeader() const +static std::string getHelpHeader() { return "usage: clickhouse-local [initial table definition] [--query ]\n" @@ -390,7 +390,7 @@ std::string LocalServer::getHelpHeader() const "Either through corresponding command line parameters --table --structure --input-format and --file."; } -std::string LocalServer::getHelpFooter() const +static std::string getHelpFooter() { return "Example printing memory used by each Unix user:\n" diff --git a/dbms/programs/local/LocalServer.h b/dbms/programs/local/LocalServer.h index a79ab484107..874319c1ea5 100644 --- a/dbms/programs/local/LocalServer.h +++ b/dbms/programs/local/LocalServer.h @@ -37,13 +37,9 @@ private: void tryInitPath(); void applyCmdOptions(); void applyCmdSettings(); - void attachSystemTables(); void processQueries(); void setupUsers(); - std::string getHelpHeader() const; - std::string getHelpFooter() const; - protected: std::unique_ptr context; diff --git a/dbms/programs/obfuscator/Obfuscator.cpp b/dbms/programs/obfuscator/Obfuscator.cpp index 7251c0a4473..0352eba0a0a 100644 --- a/dbms/programs/obfuscator/Obfuscator.cpp +++ b/dbms/programs/obfuscator/Obfuscator.cpp @@ -181,7 +181,7 @@ private: UInt64 seed; public: - UnsignedIntegerModel(UInt64 seed_) : seed(seed_) {} + explicit UnsignedIntegerModel(UInt64 seed_) : seed(seed_) {} void train(const IColumn &) override {} void finalize() override {} @@ -222,7 +222,7 @@ private: UInt64 seed; public: - SignedIntegerModel(UInt64 seed_) : seed(seed_) {} + explicit SignedIntegerModel(UInt64 seed_) : seed(seed_) {} void train(const IColumn &) override {} void finalize() override {} @@ -271,7 +271,7 @@ private: Float res_prev_value = 0; public: - FloatModel(UInt64 seed_) : seed(seed_) {} + explicit FloatModel(UInt64 seed_) : seed(seed_) {} void train(const IColumn &) override {} void finalize() override {} @@ -372,7 +372,7 @@ private: UInt64 seed; public: - FixedStringModel(UInt64 seed_) : seed(seed_) {} + explicit FixedStringModel(UInt64 seed_) : seed(seed_) {} void train(const IColumn &) override {} void finalize() override {} @@ -414,7 +414,7 @@ private: const DateLUTImpl & date_lut; public: - DateTimeModel(UInt64 seed_) : seed(seed_), date_lut(DateLUT::instance()) {} + explicit DateTimeModel(UInt64 seed_) : seed(seed_), date_lut(DateLUT::instance()) {} void train(const IColumn &) override {} void finalize() override {} @@ -529,13 +529,13 @@ private: static constexpr CodePoint END = -2; - NGramHash hashContext(const CodePoint * begin, const CodePoint * end) const + static NGramHash hashContext(const CodePoint * begin, const CodePoint * end) { return CRC32Hash()(StringRef(reinterpret_cast(begin), (end - begin) * sizeof(CodePoint))); } /// By the way, we don't have to use actual Unicode numbers. We use just arbitrary bijective mapping. - CodePoint readCodePoint(const char *& pos, const char * end) + static CodePoint readCodePoint(const char *& pos, const char * end) { size_t length = UTF8::seqLength(*pos); @@ -550,7 +550,7 @@ private: return res; } - bool writeCodePoint(CodePoint code, char *& pos, const char * end) + static bool writeCodePoint(CodePoint code, char *& pos, const char * end) { size_t length = (code & 0xFF000000) ? 
4 @@ -567,7 +567,7 @@ private: } public: - MarkovModel(MarkovModelParameters params_) + explicit MarkovModel(MarkovModelParameters params_) : params(std::move(params_)), code_points(params.order, BEGIN) {} void consume(const char * data, size_t size) @@ -677,7 +677,7 @@ public: if (!histogram.total) continue; - double average = histogram.total / histogram.buckets.size(); + double average = double(histogram.total) / histogram.buckets.size(); UInt64 new_total = 0; for (auto & bucket : histogram.buckets) @@ -836,7 +836,7 @@ private: ModelPtr nested_model; public: - ArrayModel(ModelPtr nested_model_) : nested_model(std::move(nested_model_)) {} + explicit ArrayModel(ModelPtr nested_model_) : nested_model(std::move(nested_model_)) {} void train(const IColumn & column) override { @@ -874,7 +874,7 @@ private: ModelPtr nested_model; public: - NullableModel(ModelPtr nested_model_) : nested_model(std::move(nested_model_)) {} + explicit NullableModel(ModelPtr nested_model_) : nested_model(std::move(nested_model_)) {} void train(const IColumn & column) override { diff --git a/dbms/programs/odbc-bridge/ODBCBridge.cpp b/dbms/programs/odbc-bridge/ODBCBridge.cpp index 565ee5602ca..9e1e8400cc1 100644 --- a/dbms/programs/odbc-bridge/ODBCBridge.cpp +++ b/dbms/programs/odbc-bridge/ODBCBridge.cpp @@ -5,6 +5,13 @@ #include #include #include + +#if USE_POCO_SQLODBC || USE_POCO_DATAODBC +// It doesn't make much sense to build this bridge without ODBC, but we +// still do this. +#include +#endif + #include #include #include @@ -111,7 +118,7 @@ void ODBCBridge::defineOptions(Poco::Util::OptionSet & options) .binding("help") .callback(Poco::Util::OptionCallback(this, &Me::handleHelp))); - ServerApplication::defineOptions(options); /// Don't need complex BaseDaemon's .xml config + ServerApplication::defineOptions(options); // NOLINT Don't need complex BaseDaemon's .xml config } void ODBCBridge::initialize(Application & self) @@ -126,6 +133,8 @@ void ODBCBridge::initialize(Application & self) buildLoggers(config(), logger(), self.commandName()); + BaseDaemon::logRevision(); + log = &logger(); hostname = config().getString("listen-host", "localhost"); port = config().getUInt("http-port"); @@ -138,7 +147,13 @@ void ODBCBridge::initialize(Application & self) initializeTerminationAndSignalProcessing(); - ServerApplication::initialize(self); +#if USE_POCO_SQLODBC || USE_POCO_DATAODBC + // It doesn't make much sense to build this bridge without ODBC, but we + // still do this. + Poco::Data::ODBC::Connector::registerConnector(); +#endif + + ServerApplication::initialize(self); // NOLINT } void ODBCBridge::uninitialize() diff --git a/dbms/programs/performance-test/ConfigPreprocessor.cpp b/dbms/programs/performance-test/ConfigPreprocessor.cpp index 850fd9f14c6..471a796e9f3 100644 --- a/dbms/programs/performance-test/ConfigPreprocessor.cpp +++ b/dbms/programs/performance-test/ConfigPreprocessor.cpp @@ -41,7 +41,7 @@ void ConfigPreprocessor::removeConfigurationsIf( std::vector & configs, ConfigPreprocessor::FilterType filter_type, const Strings & values, - bool leave) const + bool leave) { auto checker = [&filter_type, &values, &leave] (XMLConfigurationPtr & config) { diff --git a/dbms/programs/performance-test/ConfigPreprocessor.h b/dbms/programs/performance-test/ConfigPreprocessor.h index 375bf9503cb..dac59405799 100644 --- a/dbms/programs/performance-test/ConfigPreprocessor.h +++ b/dbms/programs/performance-test/ConfigPreprocessor.h @@ -39,11 +39,11 @@ private: /// Removes configurations that has a given value. 
/// If leave is true, the logic is reversed. - void removeConfigurationsIf( + static void removeConfigurationsIf( std::vector & configs, FilterType filter_type, const Strings & values, - bool leave = false) const; + bool leave = false); const Strings paths; }; diff --git a/dbms/programs/performance-test/PerformanceTest.cpp b/dbms/programs/performance-test/PerformanceTest.cpp index c71760a1e58..53adab75fc4 100644 --- a/dbms/programs/performance-test/PerformanceTest.cpp +++ b/dbms/programs/performance-test/PerformanceTest.cpp @@ -52,8 +52,6 @@ void waitQuery(Connection & connection) } } -namespace fs = std::filesystem; - PerformanceTest::PerformanceTest( const XMLConfigurationPtr & config_, Connection & connection_, diff --git a/dbms/programs/performance-test/ReportBuilder.cpp b/dbms/programs/performance-test/ReportBuilder.cpp index ef4417f1713..13363609ca5 100644 --- a/dbms/programs/performance-test/ReportBuilder.cpp +++ b/dbms/programs/performance-test/ReportBuilder.cpp @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include @@ -40,7 +40,7 @@ ReportBuilder::ReportBuilder(const std::string & server_version_) { } -std::string ReportBuilder::getCurrentTime() const +static std::string getCurrentTime() { return DateLUT::instance().timeToString(time(nullptr)); } @@ -119,7 +119,7 @@ std::string ReportBuilder::buildFullReport( if (statistics.sampler.size() != 0) { JSONString quantiles(4); /// here, 4 is the size of \t padding - for (double percent = 10; percent <= 90; percent += 10) + for (int percent = 10; percent <= 90; percent += 10) { std::string quantile_key = std::to_string(percent / 100.0); while (quantile_key.back() == '0') @@ -163,7 +163,7 @@ std::string ReportBuilder::buildFullReport( std::string ReportBuilder::buildCompactReport( const PerformanceTestInfo & test_info, std::vector & stats, - const std::vector & queries_to_run) const + const std::vector & queries_to_run) { FormatSettings settings; std::ostringstream output; diff --git a/dbms/programs/performance-test/ReportBuilder.h b/dbms/programs/performance-test/ReportBuilder.h index 473ba42b728..531680a6456 100644 --- a/dbms/programs/performance-test/ReportBuilder.h +++ b/dbms/programs/performance-test/ReportBuilder.h @@ -10,16 +10,16 @@ class ReportBuilder { public: ReportBuilder(const std::string & server_version_); + std::string buildFullReport( const PerformanceTestInfo & test_info, std::vector & stats, const std::vector & queries_to_run) const; - - std::string buildCompactReport( + static std::string buildCompactReport( const PerformanceTestInfo & test_info, std::vector & stats, - const std::vector & queries_to_run) const; + const std::vector & queries_to_run); private: std::string server_version; @@ -27,10 +27,6 @@ private: size_t num_cores; size_t num_threads; size_t ram; - -private: - std::string getCurrentTime() const; - }; } diff --git a/dbms/programs/performance-test/TestStats.cpp b/dbms/programs/performance-test/TestStats.cpp index 5268f8bb328..024130ce4d2 100644 --- a/dbms/programs/performance-test/TestStats.cpp +++ b/dbms/programs/performance-test/TestStats.cpp @@ -17,9 +17,9 @@ std::string TestStats::getStatisticByName(const std::string & statistic_name) { std::string result = "\n"; - for (double percent = 10; percent <= 90; percent += 10) + for (int percent = 10; percent <= 90; percent += 10) { - result += FOUR_SPACES + std::to_string((percent / 100)); + result += FOUR_SPACES + std::to_string((percent / 100.0)); result += ": " + std::to_string(sampler.quantileInterpolated(percent / 100.0)); result += 
"\n"; } diff --git a/dbms/programs/server/HTTPHandler.cpp b/dbms/programs/server/HTTPHandler.cpp index 0b0f73fcdc5..0d447a56740 100644 --- a/dbms/programs/server/HTTPHandler.cpp +++ b/dbms/programs/server/HTTPHandler.cpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/dbms/programs/server/HTTPHandler.h b/dbms/programs/server/HTTPHandler.h index fb6c9fb532c..681a3cce932 100644 --- a/dbms/programs/server/HTTPHandler.h +++ b/dbms/programs/server/HTTPHandler.h @@ -77,7 +77,7 @@ private: Poco::Net::HTTPServerResponse & response, Output & used_output); - void pushDelayedResults(Output & used_output); + static void pushDelayedResults(Output & used_output); }; } diff --git a/dbms/programs/server/MySQLHandler.h b/dbms/programs/server/MySQLHandler.h index cbd9233651a..ca5d045beb0 100644 --- a/dbms/programs/server/MySQLHandler.h +++ b/dbms/programs/server/MySQLHandler.h @@ -1,7 +1,7 @@ #pragma once #include #include -#include +#include #include #include #include "IServer.h" diff --git a/dbms/programs/server/Server.cpp b/dbms/programs/server/Server.cpp index 81c2de8ce3a..86f65fb09f1 100644 --- a/dbms/programs/server/Server.cpp +++ b/dbms/programs/server/Server.cpp @@ -27,7 +27,7 @@ #include #include #include "config_core.h" -#include +#include #include #include #include @@ -175,7 +175,7 @@ int Server::run() std::cout << DBMS_NAME << " server version " << VERSION_STRING << VERSION_OFFICIAL << "." << std::endl; return 0; } - return Application::run(); + return Application::run(); // NOLINT } void Server::initialize(Poco::Util::Application & self) diff --git a/dbms/programs/server/TCPHandler.cpp b/dbms/programs/server/TCPHandler.cpp index cb2df69d5d1..a5ecf2963ea 100644 --- a/dbms/programs/server/TCPHandler.cpp +++ b/dbms/programs/server/TCPHandler.cpp @@ -162,7 +162,7 @@ void TCPHandler::runImpl() std::optional exception; bool network_error = false; - bool send_exception_with_stack_trace = connection_context.getSettingsRef().calculate_text_stack_trace; + bool send_exception_with_stack_trace = true; try { @@ -647,8 +647,6 @@ void TCPHandler::processOrdinaryQueryWithProcessors(size_t num_threads) */ if (!isQueryCancelled()) { - pipeline.finalize(); - sendTotals(lazy_format->getTotals()); sendExtremes(lazy_format->getExtremes()); sendProfileInfo(lazy_format->getProfileInfo()); @@ -1032,7 +1030,7 @@ void TCPHandler::receiveUnexpectedData() last_block_in.header, client_revision); - Block skip_block = skip_block_in->read(); + skip_block_in->read(); throw NetException("Unexpected packet Data received from client", ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT); } diff --git a/dbms/programs/server/TCPHandler.h b/dbms/programs/server/TCPHandler.h index e3783ac282a..cac9c8dd2fe 100644 --- a/dbms/programs/server/TCPHandler.h +++ b/dbms/programs/server/TCPHandler.h @@ -2,7 +2,7 @@ #include -#include +#include #include #include #include diff --git a/dbms/src/Access/AccessRights.cpp b/dbms/src/Access/AccessRights.cpp index 5b66538e876..80de185ed8f 100644 --- a/dbms/src/Access/AccessRights.cpp +++ b/dbms/src/Access/AccessRights.cpp @@ -75,6 +75,9 @@ public: Node & operator =(const Node & src) { + if (this == &src) + return *this; + node_name = src.node_name; level = src.level; inherited_access = src.inherited_access; @@ -158,7 +161,7 @@ public: void revoke(const AccessFlags & access_to_revoke, const Helper & helper) { if constexpr (mode == NORMAL_REVOKE_MODE) - { + { // NOLINT if (level == TABLE_LEVEL) removeExplicitGrantsRec(access_to_revoke); else @@ -166,11 
+169,12 @@ public: } else if constexpr (mode == PARTIAL_REVOKE_MODE) { - AccessFlags new_partial_revokes = access_to_revoke - explicit_grants; if (level == TABLE_LEVEL) removeExplicitGrantsRec(access_to_revoke); else removeExplicitGrants(access_to_revoke); + + AccessFlags new_partial_revokes = access_to_revoke - explicit_grants; removePartialRevokesRec(new_partial_revokes); partial_revokes |= new_partial_revokes; } diff --git a/dbms/src/Access/IAccessStorage.cpp b/dbms/src/Access/IAccessStorage.cpp index 236f1325beb..3dfc3e232ba 100644 --- a/dbms/src/Access/IAccessStorage.cpp +++ b/dbms/src/Access/IAccessStorage.cpp @@ -379,7 +379,7 @@ void IAccessStorage::throwNotFound(std::type_index type, const String & name) co } -void IAccessStorage::throwBadCast(const UUID & id, std::type_index type, const String & name, std::type_index required_type) const +void IAccessStorage::throwBadCast(const UUID & id, std::type_index type, const String & name, std::type_index required_type) { throw Exception( "ID {" + toString(id) + "}: " + getTypeName(type) + backQuote(name) + " expected to be of type " + getTypeName(required_type), diff --git a/dbms/src/Access/IAccessStorage.h b/dbms/src/Access/IAccessStorage.h index 505c635aac8..30a1a6bdc32 100644 --- a/dbms/src/Access/IAccessStorage.h +++ b/dbms/src/Access/IAccessStorage.h @@ -151,8 +151,9 @@ protected: static String getTypeName(std::type_index type) { return IAccessEntity::getTypeName(type); } [[noreturn]] void throwNotFound(const UUID & id) const; [[noreturn]] void throwNotFound(std::type_index type, const String & name) const; - [[noreturn]] void throwBadCast(const UUID & id, std::type_index type, const String & name, std::type_index required_type) const; - [[noreturn]] void throwIDCollisionCannotInsert(const UUID & id, std::type_index type, const String & name, std::type_index existing_type, const String & existing_name) const; + [[noreturn]] static void throwBadCast(const UUID & id, std::type_index type, const String & name, std::type_index required_type); + [[noreturn]] void throwIDCollisionCannotInsert( + const UUID & id, std::type_index type, const String & name, std::type_index existing_type, const String & existing_name) const; [[noreturn]] void throwNameCollisionCannotInsert(std::type_index type, const String & name) const; [[noreturn]] void throwNameCollisionCannotRename(std::type_index type, const String & old_name, const String & new_name) const; [[noreturn]] void throwReadonlyCannotInsert(std::type_index type, const String & name) const; diff --git a/dbms/src/Access/QuotaContext.cpp b/dbms/src/Access/QuotaContext.cpp index 815d9440eaa..a48c41dc419 100644 --- a/dbms/src/Access/QuotaContext.cpp +++ b/dbms/src/Access/QuotaContext.cpp @@ -135,6 +135,9 @@ struct QuotaContext::Impl QuotaContext::Interval & QuotaContext::Interval::operator =(const Interval & src) { + if (this == &src) + return *this; + randomize_interval = src.randomize_interval; duration = src.duration; end_of_interval.store(src.end_of_interval.load()); diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.cpp b/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.cpp index e188116cc2c..0c1cb1d0d36 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.cpp +++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.cpp @@ -87,15 +87,11 @@ AggregateFunctionPtr createAggregateFunctionGroupArraySample(const std::string & { assertUnary(name, argument_types); - UInt64 max_elems = std::numeric_limits::max(); - UInt64 seed = 123456; - - UInt64 * 
params[2] = {&max_elems, &seed}; if (parameters.size() != 1 && parameters.size() != 2) throw Exception("Incorrect number of parameters for aggregate function " + name + ", should be 1 or 2", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - for (auto i = 0ul; i < parameters.size(); ++i) + auto get_parameter = [&](size_t i) { auto type = parameters[i].getType(); if (type != Field::Types::Int64 && type != Field::Types::UInt64) @@ -105,8 +101,16 @@ AggregateFunctionPtr createAggregateFunctionGroupArraySample(const std::string & (type == Field::Types::UInt64 && parameters[i].get() == 0)) throw Exception("Parameter for aggregate function " + name + " should be positive number", ErrorCodes::BAD_ARGUMENTS); - *params[i] = parameters[i].get(); - } + return parameters[i].get(); + }; + + UInt64 max_elems = get_parameter(0); + + UInt64 seed; + if (parameters.size() >= 2) + seed = get_parameter(1); + else + seed = thread_local_rng(); return createAggregateFunctionGroupArrayImpl>(argument_types[0], max_elems, seed); } diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.h b/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.h index afee81a8d66..2d345cff1f7 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupArray.h @@ -17,7 +17,6 @@ #include #include -#include #define AGGREGATE_FUNCTION_GROUP_ARRAY_MAX_ARRAY_SIZE 0xFFFFFF diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.cpp b/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.cpp index e7d6ea2528c..64a8200aa52 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.cpp +++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.cpp @@ -25,7 +25,8 @@ template class AggregateFunctionGroupUniqArrayDate : public AggregateFunctionGroupUniqArray { public: - AggregateFunctionGroupUniqArrayDate(const DataTypePtr & argument_type, UInt64 max_elems_ = std::numeric_limits::max()) : AggregateFunctionGroupUniqArray(argument_type, max_elems_) {} + explicit AggregateFunctionGroupUniqArrayDate(const DataTypePtr & argument_type, UInt64 max_elems_ = std::numeric_limits::max()) + : AggregateFunctionGroupUniqArray(argument_type, max_elems_) {} DataTypePtr getReturnType() const override { return std::make_shared(std::make_shared()); } }; @@ -33,7 +34,8 @@ template class AggregateFunctionGroupUniqArrayDateTime : public AggregateFunctionGroupUniqArray { public: - AggregateFunctionGroupUniqArrayDateTime(const DataTypePtr & argument_type, UInt64 max_elems_ = std::numeric_limits::max()) : AggregateFunctionGroupUniqArray(argument_type, max_elems_) {} + explicit AggregateFunctionGroupUniqArrayDateTime(const DataTypePtr & argument_type, UInt64 max_elems_ = std::numeric_limits::max()) + : AggregateFunctionGroupUniqArray(argument_type, max_elems_) {} DataTypePtr getReturnType() const override { return std::make_shared(std::make_shared()); } }; diff --git a/dbms/src/AggregateFunctions/AggregateFunctionSimpleLinearRegression.cpp b/dbms/src/AggregateFunctions/AggregateFunctionSimpleLinearRegression.cpp index 64f37cd2e14..44631d5832a 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionSimpleLinearRegression.cpp +++ b/dbms/src/AggregateFunctions/AggregateFunctionSimpleLinearRegression.cpp @@ -56,7 +56,7 @@ AggregateFunctionPtr createAggregateFunctionSimpleLinearRegression( FOR_LEASTSQR_TYPES_2(M, Float64) #define DISPATCH(T1, T2) \ if (which_x.idx == TypeIndex::T1 && which_y.idx == TypeIndex::T2) \ - return std::make_shared>( \ + 
return std::make_shared>(/* NOLINT */ \ arguments, \ params \ ); diff --git a/dbms/src/AggregateFunctions/QuantileTiming.h b/dbms/src/AggregateFunctions/QuantileTiming.h index 8dde3dd5383..d7f425ee2d7 100644 --- a/dbms/src/AggregateFunctions/QuantileTiming.h +++ b/dbms/src/AggregateFunctions/QuantileTiming.h @@ -6,7 +6,6 @@ #include #include #include -#include namespace DB diff --git a/dbms/src/AggregateFunctions/ReservoirSampler.h b/dbms/src/AggregateFunctions/ReservoirSampler.h index 648707ae940..b61027ce692 100644 --- a/dbms/src/AggregateFunctions/ReservoirSampler.h +++ b/dbms/src/AggregateFunctions/ReservoirSampler.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/dbms/src/AggregateFunctions/ReservoirSamplerDeterministic.h b/dbms/src/AggregateFunctions/ReservoirSamplerDeterministic.h index 52d0181fce1..a520b8236b7 100644 --- a/dbms/src/AggregateFunctions/ReservoirSamplerDeterministic.h +++ b/dbms/src/AggregateFunctions/ReservoirSamplerDeterministic.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/dbms/src/AggregateFunctions/UniquesHashSet.h b/dbms/src/AggregateFunctions/UniquesHashSet.h index d1df7b0df0d..13c59cd5225 100644 --- a/dbms/src/AggregateFunctions/UniquesHashSet.h +++ b/dbms/src/AggregateFunctions/UniquesHashSet.h @@ -2,7 +2,7 @@ #include -#include +#include #include #include diff --git a/dbms/src/Client/ConnectionPoolWithFailover.cpp b/dbms/src/Client/ConnectionPoolWithFailover.cpp index 03387c32099..52bd73c9de4 100644 --- a/dbms/src/Client/ConnectionPoolWithFailover.cpp +++ b/dbms/src/Client/ConnectionPoolWithFailover.cpp @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include #include diff --git a/dbms/src/Client/ConnectionPoolWithFailover.h b/dbms/src/Client/ConnectionPoolWithFailover.h index 78073b52108..bdc06656ff1 100644 --- a/dbms/src/Client/ConnectionPoolWithFailover.h +++ b/dbms/src/Client/ConnectionPoolWithFailover.h @@ -44,8 +44,8 @@ public: /** Allocates connection to work. */ Entry get(const ConnectionTimeouts & timeouts, - const Settings * settings = nullptr, - bool force_connected = true) override; /// From IConnectionPool + const Settings * settings, + bool force_connected) override; /// From IConnectionPool /** Allocates up to the specified number of connections to work. * Connections provide access to different replicas of one shard. diff --git a/dbms/src/Client/MultiplexedConnections.cpp b/dbms/src/Client/MultiplexedConnections.cpp index 9bc9303c1c5..b0ff2104ab1 100644 --- a/dbms/src/Client/MultiplexedConnections.cpp +++ b/dbms/src/Client/MultiplexedConnections.cpp @@ -1,5 +1,7 @@ #include #include +#include + namespace DB { @@ -308,10 +310,10 @@ MultiplexedConnections::ReplicaState & MultiplexedConnections::getReplicaForRead throw Exception("Timeout exceeded while reading from " + dumpAddressesUnlocked(), ErrorCodes::TIMEOUT_EXCEEDED); } - /// TODO Absolutely wrong code: read_list could be empty; rand() is not thread safe and has low quality; motivation of rand is unclear. + /// TODO Absolutely wrong code: read_list could be empty; motivation of rand is unclear. /// This code path is disabled by default. 
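// Not part of the patch: a minimal standalone sketch of the rand() ->
// thread-local PRNG swap applied in MultiplexedConnections just below
// (and in the test programs further down). It assumes the pcg_random.hpp
// header already used by this codebase; `example_rng` and `pickRandomIndex`
// are illustrative names, not code from the patch.
#include <pcg_random.hpp>
#include <random>
#include <vector>

thread_local pcg64 example_rng{std::random_device{}()};

template <typename T>
size_t pickRandomIndex(const std::vector<T> & items)
{
    // Unlike rand(), a thread_local engine has no shared hidden state, so
    // concurrent callers do not race on it. The caller must still make sure
    // the container is not empty (the TODO in the code below still applies).
    return example_rng() % items.size();
}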
- auto & socket = read_list[rand() % read_list.size()]; + auto & socket = read_list[thread_local_rng() % read_list.size()]; if (fd_to_replica_state_idx.empty()) { fd_to_replica_state_idx.reserve(replica_states.size()); diff --git a/dbms/src/Columns/ColumnAggregateFunction.cpp b/dbms/src/Columns/ColumnAggregateFunction.cpp index 507ff6c2db9..ce07acd1c0d 100644 --- a/dbms/src/Columns/ColumnAggregateFunction.cpp +++ b/dbms/src/Columns/ColumnAggregateFunction.cpp @@ -501,7 +501,7 @@ MutableColumns ColumnAggregateFunction::scatter(IColumn::ColumnIndex num_columns size_t num_rows = size(); { - size_t reserve_size = num_rows / num_columns * 1.1; /// 1.1 is just a guess. Better to use n-sigma rule. + size_t reserve_size = double(num_rows) / num_columns * 1.1; /// 1.1 is just a guess. Better to use n-sigma rule. if (reserve_size > 1) for (auto & column : columns) @@ -576,8 +576,9 @@ ColumnAggregateFunction::MutablePtr ColumnAggregateFunction::createView() const } ColumnAggregateFunction::ColumnAggregateFunction(const ColumnAggregateFunction & src_) - : foreign_arenas(concatArenas(src_.foreign_arenas, src_.my_arena)), - func(src_.func), src(src_.getPtr()), data(src_.data.begin(), src_.data.end()) + : COWHelper(src_), + foreign_arenas(concatArenas(src_.foreign_arenas, src_.my_arena)), + func(src_.func), src(src_.getPtr()), data(src_.data.begin(), src_.data.end()) { } diff --git a/dbms/src/Columns/ColumnLowCardinality.cpp b/dbms/src/Columns/ColumnLowCardinality.cpp index 23d43579bbd..aaec56f37cc 100644 --- a/dbms/src/Columns/ColumnLowCardinality.cpp +++ b/dbms/src/Columns/ColumnLowCardinality.cpp @@ -16,6 +16,12 @@ namespace ErrorCodes namespace { + void checkColumn(const IColumn & column) + { + if (!dynamic_cast(&column)) + throw Exception("ColumnUnique expected as an argument of ColumnLowCardinality.", ErrorCodes::ILLEGAL_COLUMN); + } + template PaddedPODArray * getIndexesData(IColumn & indexes) { @@ -651,13 +657,6 @@ ColumnLowCardinality::Dictionary::Dictionary(ColumnPtr column_unique_, bool is_s checkColumn(*column_unique); } -void ColumnLowCardinality::Dictionary::checkColumn(const IColumn & column) -{ - - if (!dynamic_cast(&column)) - throw Exception("ColumnUnique expected as an argument of ColumnLowCardinality.", ErrorCodes::ILLEGAL_COLUMN); -} - void ColumnLowCardinality::Dictionary::setShared(const ColumnPtr & column_unique_) { checkColumn(*column_unique_); diff --git a/dbms/src/Columns/ColumnLowCardinality.h b/dbms/src/Columns/ColumnLowCardinality.h index 621fffb4a19..a76bd98de17 100644 --- a/dbms/src/Columns/ColumnLowCardinality.h +++ b/dbms/src/Columns/ColumnLowCardinality.h @@ -275,8 +275,6 @@ private: private: WrappedPtr column_unique; bool shared = false; - - void checkColumn(const IColumn & column); }; Dictionary dictionary; diff --git a/dbms/src/Columns/ColumnVector.cpp b/dbms/src/Columns/ColumnVector.cpp index a1c86953dc9..c4339548b19 100644 --- a/dbms/src/Columns/ColumnVector.cpp +++ b/dbms/src/Columns/ColumnVector.cpp @@ -118,7 +118,7 @@ void ColumnVector::getPermutation(bool reverse, size_t limit, int nan_directi if (s >= 256 && s <= std::numeric_limits::max()) { PaddedPODArray> pairs(s); - for (UInt32 i = 0; i < s; ++i) + for (UInt32 i = 0; i < UInt32(s); ++i) pairs[i] = {data[i], i}; RadixSort>::executeLSD(pairs.data(), s); diff --git a/dbms/src/Common/Arena.h b/dbms/src/Common/Arena.h index feee0de0f76..e1556ef73c5 100644 --- a/dbms/src/Common/Arena.h +++ b/dbms/src/Common/Arena.h @@ -4,7 +4,6 @@ #include #include #include -#include #if __has_include() # include #endif diff 
--git a/dbms/src/Common/ColumnsHashing.h b/dbms/src/Common/ColumnsHashing.h index 88a4b47ec8c..10d28078d58 100644 --- a/dbms/src/Common/ColumnsHashing.h +++ b/dbms/src/Common/ColumnsHashing.h @@ -228,13 +228,13 @@ struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod /// If initialized column is nullable. bool is_nullable = false; - static const ColumnLowCardinality & getLowCardinalityColumn(const IColumn * low_cardinality_column) + static const ColumnLowCardinality & getLowCardinalityColumn(const IColumn * column) { - auto column = typeid_cast(low_cardinality_column); - if (!column) + auto low_cardinality_column = typeid_cast(column); + if (!low_cardinality_column) throw Exception("Invalid aggregation key type for HashMethodSingleLowCardinalityColumn method. " "Excepted LowCardinality, got " + column->getName(), ErrorCodes::LOGICAL_ERROR); - return *column; + return *low_cardinality_column; } HashMethodSingleLowCardinalityColumn( diff --git a/dbms/src/Common/ConcurrentBoundedQueue.h b/dbms/src/Common/ConcurrentBoundedQueue.h index 69034c512a0..b888d68a286 100644 --- a/dbms/src/Common/ConcurrentBoundedQueue.h +++ b/dbms/src/Common/ConcurrentBoundedQueue.h @@ -6,7 +6,7 @@ #include #include -#include +#include namespace detail diff --git a/dbms/src/Common/Config/ConfigProcessor.cpp b/dbms/src/Common/Config/ConfigProcessor.cpp index ce96bb70ddc..7c39518d30b 100644 --- a/dbms/src/Common/Config/ConfigProcessor.cpp +++ b/dbms/src/Common/Config/ConfigProcessor.cpp @@ -219,7 +219,7 @@ void ConfigProcessor::merge(XMLDocumentPtr config, XMLDocumentPtr with) mergeRecursive(config, config_root, with_root); } -std::string ConfigProcessor::layerFromHost() +static std::string layerFromHost() { utsname buf; if (uname(&buf)) @@ -294,7 +294,7 @@ void ConfigProcessor::doIncludesRecursive( auto process_include = [&](const Node * include_attr, const std::function & get_node, const char * error_msg) { - std::string name = include_attr->getNodeValue(); + const std::string & name = include_attr->getNodeValue(); const Node * node_to_include = get_node(name); if (!node_to_include) { diff --git a/dbms/src/Common/Config/ConfigProcessor.h b/dbms/src/Common/Config/ConfigProcessor.h index ae88234f077..b6f772f8c16 100644 --- a/dbms/src/Common/Config/ConfigProcessor.h +++ b/dbms/src/Common/Config/ConfigProcessor.h @@ -97,8 +97,8 @@ public: /// If preprocessed_dir is empty - calculate from loaded_config.path + /preprocessed_configs/ void savePreprocessedConfig(const LoadedConfig & loaded_config, std::string preprocessed_dir); - /// Set path of main config.xml . It will be cutted from all configs placed to preprocessed_configs/ - void setConfigPath(const std::string & config_path); + /// Set path of main config.xml. 
It will be cutted from all configs placed to preprocessed_configs/ + static void setConfigPath(const std::string & config_path); public: using Files = std::vector; @@ -131,8 +131,6 @@ private: void merge(XMLDocumentPtr config, XMLDocumentPtr with); - std::string layerFromHost(); - void doIncludesRecursive( XMLDocumentPtr config, XMLDocumentPtr include_from, diff --git a/dbms/src/Common/CounterInFile.h b/dbms/src/Common/CounterInFile.h index 1f483ff5f40..3ede8349a17 100644 --- a/dbms/src/Common/CounterInFile.h +++ b/dbms/src/Common/CounterInFile.h @@ -16,7 +16,7 @@ #include #include -#include +#include namespace DB diff --git a/dbms/src/Common/CurrentThread.cpp b/dbms/src/Common/CurrentThread.cpp index ba2f28a8fe9..884cf2bfa9c 100644 --- a/dbms/src/Common/CurrentThread.cpp +++ b/dbms/src/Common/CurrentThread.cpp @@ -2,7 +2,6 @@ #include "CurrentThread.h" #include -#include #include #include #include diff --git a/dbms/src/Common/CurrentThread.h b/dbms/src/Common/CurrentThread.h index 1e0140c6330..de1ad969bc4 100644 --- a/dbms/src/Common/CurrentThread.h +++ b/dbms/src/Common/CurrentThread.h @@ -3,7 +3,6 @@ #include #include -#include #include #include @@ -107,4 +106,3 @@ private: }; } - diff --git a/dbms/src/Common/DNSResolver.cpp b/dbms/src/Common/DNSResolver.cpp index 2bf35ca417a..b0b9c61bc84 100644 --- a/dbms/src/Common/DNSResolver.cpp +++ b/dbms/src/Common/DNSResolver.cpp @@ -170,8 +170,8 @@ bool DNSResolver::updateCache() { { std::lock_guard lock(impl->drop_mutex); - for (auto & host : impl->new_hosts) - impl->known_hosts.insert(std::move(host)); + for (const auto & host : impl->new_hosts) + impl->known_hosts.insert(host); impl->new_hosts.clear(); impl->host_name.emplace(Poco::Net::DNS::hostName()); diff --git a/dbms/src/Common/Exception.cpp b/dbms/src/Common/Exception.cpp index 97219379ee8..bc3d4a78969 100644 --- a/dbms/src/Common/Exception.cpp +++ b/dbms/src/Common/Exception.cpp @@ -273,7 +273,7 @@ void tryLogException(std::exception_ptr e, const char * log_name, const std::str { try { - std::rethrow_exception(std::move(e)); + std::rethrow_exception(std::move(e)); // NOLINT } catch (...) { @@ -285,7 +285,7 @@ void tryLogException(std::exception_ptr e, Poco::Logger * logger, const std::str { try { - std::rethrow_exception(std::move(e)); + std::rethrow_exception(std::move(e)); // NOLINT } catch (...) { @@ -327,7 +327,7 @@ std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace) { try { - std::rethrow_exception(std::move(e)); + std::rethrow_exception(std::move(e)); // NOLINT } catch (...) { diff --git a/dbms/src/Common/HashTable/HashTable.h b/dbms/src/Common/HashTable/HashTable.h index 5521cc043ad..ea85076fa5f 100644 --- a/dbms/src/Common/HashTable/HashTable.h +++ b/dbms/src/Common/HashTable/HashTable.h @@ -8,8 +8,6 @@ #include -#include - #include #include #include diff --git a/dbms/src/Common/HyperLogLogCounter.h b/dbms/src/Common/HyperLogLogCounter.h index 82eed74de89..0acffaaaf33 100644 --- a/dbms/src/Common/HyperLogLogCounter.h +++ b/dbms/src/Common/HyperLogLogCounter.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include #include diff --git a/dbms/src/Common/MemoryTracker.cpp b/dbms/src/Common/MemoryTracker.cpp index 084ed1af3cc..efc1ff7785e 100644 --- a/dbms/src/Common/MemoryTracker.cpp +++ b/dbms/src/Common/MemoryTracker.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include #include @@ -94,7 +93,7 @@ void MemoryTracker::alloc(Int64 size) free(size); /// Prevent recursion. 
Exception::ctor -> std::string -> new[] -> MemoryTracker::alloc - auto untrack_lock = blocker.cancel(); + auto untrack_lock = blocker.cancel(); // NOLINT std::stringstream message; message << "Memory tracker"; @@ -119,7 +118,7 @@ void MemoryTracker::alloc(Int64 size) free(size); /// Prevent recursion. Exception::ctor -> std::string -> new[] -> MemoryTracker::alloc - auto untrack_lock = blocker.cancel(); + auto untrack_lock = blocker.cancel(); // NOLINT std::stringstream message; message << "Memory limit"; diff --git a/dbms/src/Common/MemoryTracker.h b/dbms/src/Common/MemoryTracker.h index 7a203bcdf52..98f416cec40 100644 --- a/dbms/src/Common/MemoryTracker.h +++ b/dbms/src/Common/MemoryTracker.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include #include diff --git a/dbms/src/Common/NamePrompter.h b/dbms/src/Common/NamePrompter.h index 956a60f9a4e..116e2b43a0a 100644 --- a/dbms/src/Common/NamePrompter.h +++ b/dbms/src/Common/NamePrompter.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -32,15 +33,7 @@ private: size_t m = lhs.size(); size_t n = rhs.size(); - static constexpr size_t small_buffer_size = 64; - size_t small_buffer[small_buffer_size]; - std::unique_ptr alloc_buffer; - size_t * row = small_buffer; - if (n + 1 > small_buffer_size) - { - row = new size_t[n + 1]; - alloc_buffer.reset(row); - } + PODArrayWithStackMemory row(n + 1); for (size_t i = 1; i <= n; ++i) row[i] = i; diff --git a/dbms/src/Common/OpenSSLHelpers.cpp b/dbms/src/Common/OpenSSLHelpers.cpp index d482534ef24..caa843f96f6 100644 --- a/dbms/src/Common/OpenSSLHelpers.cpp +++ b/dbms/src/Common/OpenSSLHelpers.cpp @@ -23,7 +23,7 @@ String getOpenSSLErrors() SCOPE_EXIT(BIO_free(mem)); ERR_print_errors(mem); char * buf = nullptr; - long size = BIO_get_mem_data(mem, &buf); + size_t size = BIO_get_mem_data(mem, &buf); return String(buf, size); } diff --git a/dbms/src/Common/PODArray.h b/dbms/src/Common/PODArray.h index 272cbdc4fe6..8fe1f74484e 100644 --- a/dbms/src/Common/PODArray.h +++ b/dbms/src/Common/PODArray.h @@ -8,7 +8,6 @@ #include -#include #include #include diff --git a/dbms/src/Common/PoolWithFailoverBase.h b/dbms/src/Common/PoolWithFailoverBase.h index 35f9b1b136b..a923088db3d 100644 --- a/dbms/src/Common/PoolWithFailoverBase.h +++ b/dbms/src/Common/PoolWithFailoverBase.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/dbms/src/Common/ProfileEvents.h b/dbms/src/Common/ProfileEvents.h index ca327c9810b..e1b68e43e52 100644 --- a/dbms/src/Common/ProfileEvents.h +++ b/dbms/src/Common/ProfileEvents.h @@ -36,7 +36,7 @@ namespace ProfileEvents Counters(VariableContext level_ = VariableContext::Thread, Counters * parent_ = &global_counters); /// Global level static initializer - Counters(Counter * allocated_counters) + Counters(Counter * allocated_counters) noexcept : counters(allocated_counters), parent(nullptr), level(VariableContext::Global) {} Counter & operator[] (Event event) diff --git a/dbms/src/Common/ShellCommand.cpp b/dbms/src/Common/ShellCommand.cpp index 706d5bfbece..cf7ddb559c1 100644 --- a/dbms/src/Common/ShellCommand.cpp +++ b/dbms/src/Common/ShellCommand.cpp @@ -37,25 +37,49 @@ namespace ErrorCodes ShellCommand::ShellCommand(pid_t pid_, int in_fd_, int out_fd_, int err_fd_, bool terminate_in_destructor_) : pid(pid_) , terminate_in_destructor(terminate_in_destructor_) - , log(&Poco::Logger::get("ShellCommand")) , in(in_fd_) , out(out_fd_) , err(err_fd_) {} +Poco::Logger * ShellCommand::getLogger() +{ + 
return &Poco::Logger::get("ShellCommand"); +} + ShellCommand::~ShellCommand() { if (terminate_in_destructor) { + LOG_TRACE(getLogger(), "Will kill shell command pid " << pid << " with SIGTERM"); int retcode = kill(pid, SIGTERM); if (retcode != 0) - LOG_WARNING(log, "Cannot kill pid " << pid << " errno '" << errnoToString(retcode) << "'"); + LOG_WARNING(getLogger(), "Cannot kill shell command pid " << pid << " errno '" << errnoToString(retcode) << "'"); } else if (!wait_called) tryWait(); } +void ShellCommand::logCommand(const char * filename, char * const argv[]) +{ + std::stringstream log_message; + log_message << "Will start shell command '" << filename << "' with arguments "; + for (int i = 0; argv != nullptr && argv[i] != nullptr; ++i) + { + if (i > 0) + { + log_message << ", "; + } + + /// NOTE: No escaping is performed. + log_message << "'" << argv[i] << "'"; + } + LOG_TRACE(ShellCommand::getLogger(), log_message.str()); +} + std::unique_ptr ShellCommand::executeImpl(const char * filename, char * const argv[], bool pipe_stdin_only, bool terminate_in_destructor) { + logCommand(filename, argv); + /** Here it is written that with a normal call `vfork`, there is a chance of deadlock in multithreaded programs, * because of the resolving of symbols in the shared library * http://www.oracle.com/technetwork/server-storage/solaris10/subprocess-136439.html @@ -95,6 +119,13 @@ std::unique_ptr ShellCommand::executeImpl(const char * filename, c _exit(int(ReturnCodes::CANNOT_DUP_STDERR)); } + // Reset the signal mask: it may be non-empty and will be inherited + // by the child process, which might not expect this. + sigset_t mask; + sigemptyset(&mask); + sigprocmask(0, nullptr, &mask); + sigprocmask(SIG_UNBLOCK, &mask, nullptr); + execv(filename, argv); /// If the process is running, then `execv` does not return here. @@ -103,6 +134,8 @@ std::unique_ptr ShellCommand::executeImpl(const char * filename, c std::unique_ptr res(new ShellCommand(pid, pipe_stdin.fds_rw[1], pipe_stdout.fds_rw[0], pipe_stderr.fds_rw[0], terminate_in_destructor)); + LOG_TRACE(getLogger(), "Started shell command '" << filename << "' with pid " << pid); + /// Now the ownership of the file descriptors is passed to the result. pipe_stdin.fds_rw[1] = -1; pipe_stdout.fds_rw[0] = -1; @@ -155,10 +188,14 @@ int ShellCommand::tryWait() { wait_called = true; + LOG_TRACE(getLogger(), "Will wait for shell command pid " << pid); + int status = 0; if (-1 == waitpid(pid, &status, 0)) throwFromErrno("Cannot waitpid", ErrorCodes::CANNOT_WAITPID); + LOG_TRACE(getLogger(), "Wait for shell command pid " << pid << " completed with status " << status); + if (WIFEXITED(status)) return WEXITSTATUS(status); diff --git a/dbms/src/Common/ShellCommand.h b/dbms/src/Common/ShellCommand.h index 0298ec73a2b..e609a3fc85b 100644 --- a/dbms/src/Common/ShellCommand.h +++ b/dbms/src/Common/ShellCommand.h @@ -30,10 +30,13 @@ private: bool wait_called = false; bool terminate_in_destructor; - Poco::Logger * log; - ShellCommand(pid_t pid_, int in_fd_, int out_fd_, int err_fd_, bool terminate_in_destructor_); + static Poco::Logger * getLogger(); + + /// Print command name and the list of arguments to log. NOTE: No escaping of arguments is performed. 
+ static void logCommand(const char * filename, char * const argv[]); + static std::unique_ptr executeImpl(const char * filename, char * const argv[], bool pipe_stdin_only, bool terminate_in_destructor); public: diff --git a/dbms/src/Common/SimpleIncrement.h b/dbms/src/Common/SimpleIncrement.h index 29e0010b0fa..a91ef06d45b 100644 --- a/dbms/src/Common/SimpleIncrement.h +++ b/dbms/src/Common/SimpleIncrement.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include diff --git a/dbms/src/Common/SipHash.h b/dbms/src/Common/SipHash.h index 023a1d4b18c..f71da206f1d 100644 --- a/dbms/src/Common/SipHash.h +++ b/dbms/src/Common/SipHash.h @@ -13,7 +13,7 @@ * (~ 700 MB/sec, 15 million strings per second) */ -#include +#include #include #include #include @@ -151,8 +151,8 @@ public: void get128(char * out) { finalize(); - reinterpret_cast(out)[0] = v0 ^ v1; - reinterpret_cast(out)[1] = v2 ^ v3; + unalignedStore(out, v0 ^ v1); + unalignedStore(out + 8, v2 ^ v3); } /// template for avoiding 'unsigned long long' vs 'unsigned long' problem on old poco in macos diff --git a/dbms/src/Common/SmallObjectPool.h b/dbms/src/Common/SmallObjectPool.h index 4eaf6cd26ab..469c102bdae 100644 --- a/dbms/src/Common/SmallObjectPool.h +++ b/dbms/src/Common/SmallObjectPool.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include #include diff --git a/dbms/src/Common/Stopwatch.h b/dbms/src/Common/Stopwatch.h index 502cd2e9010..772caa75373 100644 --- a/dbms/src/Common/Stopwatch.h +++ b/dbms/src/Common/Stopwatch.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include diff --git a/dbms/src/Common/SymbolIndex.cpp b/dbms/src/Common/SymbolIndex.cpp index a9cdc1fa867..14c4d627d61 100644 --- a/dbms/src/Common/SymbolIndex.cpp +++ b/dbms/src/Common/SymbolIndex.cpp @@ -168,7 +168,7 @@ void collectSymbolsFromProgramHeaders(dl_phdr_info * info, const ElfW(Sym) * elf_sym = reinterpret_cast(correct_address(info->dlpi_addr, it->d_un.d_ptr)); /* Iterate over the symbol table */ - for (ElfW(Word) sym_index = 0; sym_index < sym_cnt; ++sym_index) + for (ElfW(Word) sym_index = 0; sym_index < ElfW(Word)(sym_cnt); ++sym_index) { /// We are not interested in empty symbols. 
if (!elf_sym[sym_index].st_size) @@ -186,7 +186,7 @@ void collectSymbolsFromProgramHeaders(dl_phdr_info * info, symbol.address_begin = reinterpret_cast(info->dlpi_addr + elf_sym[sym_index].st_value); symbol.address_end = reinterpret_cast(info->dlpi_addr + elf_sym[sym_index].st_value + elf_sym[sym_index].st_size); symbol.name = sym_name; - symbols.push_back(std::move(symbol)); + symbols.push_back(symbol); } break; @@ -227,7 +227,7 @@ void collectSymbolsFromELFSymbolTable( symbol.address_begin = reinterpret_cast(info->dlpi_addr + symbol_table_entry->st_value); symbol.address_end = reinterpret_cast(info->dlpi_addr + symbol_table_entry->st_value + symbol_table_entry->st_size); symbol.name = symbol_name; - symbols.push_back(std::move(symbol)); + symbols.push_back(symbol); } } diff --git a/dbms/src/Common/ThreadFuzzer.cpp b/dbms/src/Common/ThreadFuzzer.cpp index 1348842e4fb..0176934cbe9 100644 --- a/dbms/src/Common/ThreadFuzzer.cpp +++ b/dbms/src/Common/ThreadFuzzer.cpp @@ -162,7 +162,7 @@ static void injection( { int migrate_to = std::uniform_int_distribution<>(0, num_cpus_loaded - 1)(thread_local_rng); - cpu_set_t set; + cpu_set_t set{}; CPU_ZERO(&set); CPU_SET(migrate_to, &set); diff --git a/dbms/src/Common/ThreadPool.cpp b/dbms/src/Common/ThreadPool.cpp index c1cad465ed2..2b57ee5dbb6 100644 --- a/dbms/src/Common/ThreadPool.cpp +++ b/dbms/src/Common/ThreadPool.cpp @@ -28,8 +28,11 @@ ThreadPoolImpl::ThreadPoolImpl(size_t max_threads_) } template -ThreadPoolImpl::ThreadPoolImpl(size_t max_threads_, size_t max_free_threads_, size_t queue_size_) - : max_threads(max_threads_), max_free_threads(max_free_threads_), queue_size(queue_size_) +ThreadPoolImpl::ThreadPoolImpl(size_t max_threads_, size_t max_free_threads_, size_t queue_size_, bool shutdown_on_exception_) + : max_threads(max_threads_) + , max_free_threads(max_free_threads_) + , queue_size(queue_size_) + , shutdown_on_exception(shutdown_on_exception_) { } @@ -225,10 +228,20 @@ void ThreadPoolImpl::worker(typename std::list::iterator thread_ { std::unique_lock lock(mutex); if (!first_exception) - first_exception = std::current_exception(); - shutdown = true; + first_exception = std::current_exception(); // NOLINT + if (shutdown_on_exception) + shutdown = true; --scheduled_jobs; } + + DB::tryLogCurrentException("ThreadPool", + std::string("Exception in ThreadPool(") + + "max_threads: " + std::to_string(max_threads) + + ", max_free_threads: " + std::to_string(max_free_threads) + + ", queue_size: " + std::to_string(queue_size) + + ", shutdown_on_exception: " + std::to_string(shutdown_on_exception) + + ")."); + job_finished.notify_all(); new_job_or_shutdown.notify_all(); return; @@ -257,11 +270,11 @@ template class ThreadPoolImpl; template class ThreadPoolImpl; -void ExceptionHandler::setException(std::exception_ptr && exception) +void ExceptionHandler::setException(std::exception_ptr exception) { std::unique_lock lock(mutex); if (!first_exception) - first_exception = std::move(exception); + first_exception = std::move(exception); // NOLINT } void ExceptionHandler::throwIfException() diff --git a/dbms/src/Common/ThreadPool.h b/dbms/src/Common/ThreadPool.h index 662d34afadd..3a5e4b9a81e 100644 --- a/dbms/src/Common/ThreadPool.h +++ b/dbms/src/Common/ThreadPool.h @@ -33,7 +33,7 @@ public: explicit ThreadPoolImpl(size_t max_threads_); /// queue_size - maximum number of running plus scheduled jobs. It can be greater than max_threads. Zero means unlimited. 
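// Not part of the patch: a standalone sketch of the behaviour added by the new
// shutdown_on_exception_ flag in the ThreadPool change just below. The real
// pool also rethrows first_exception to the scheduler and notifies waiters;
// `TinyWorker` and `runJob` are illustrative names only.
#include <exception>
#include <functional>
#include <iostream>

struct TinyWorker
{
    bool shutdown_on_exception = true;
    bool shutdown = false;
    std::exception_ptr first_exception;

    void runJob(const std::function<void()> & job)
    {
        try
        {
            job();
        }
        catch (...)
        {
            if (!first_exception)
                first_exception = std::current_exception();
            // The GlobalThreadPool now passes false here, so one failed job is
            // logged but does not stop the shared pool; local pools keep the
            // old behaviour by default.
            if (shutdown_on_exception)
                shutdown = true;
            std::cerr << "Exception in worker, shutdown = " << shutdown << '\n';
        }
    }
};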
- ThreadPoolImpl(size_t max_threads_, size_t max_free_threads_, size_t queue_size_); + ThreadPoolImpl(size_t max_threads_, size_t max_free_threads_, size_t queue_size_, bool shutdown_on_exception_ = true); /// Add new job. Locks until number of scheduled jobs is less than maximum or exception in one of threads was thrown. /// If any thread was throw an exception, first exception will be rethrown from this method, @@ -79,6 +79,7 @@ private: size_t scheduled_jobs = 0; bool shutdown = false; + const bool shutdown_on_exception = true; struct JobWithPriority { @@ -128,7 +129,7 @@ using FreeThreadPool = ThreadPoolImpl; class GlobalThreadPool : public FreeThreadPool, private boost::noncopyable { public: - GlobalThreadPool() : FreeThreadPool(10000, 1000, 10000) {} + GlobalThreadPool() : FreeThreadPool(10000, 1000, 10000, false) {} static GlobalThreadPool & instance(); }; @@ -221,7 +222,7 @@ using ThreadPool = ThreadPoolImpl; class ExceptionHandler { public: - void setException(std::exception_ptr && exception); + void setException(std::exception_ptr exception); void throwIfException(); private: diff --git a/dbms/src/Common/UTF8Helpers.cpp b/dbms/src/Common/UTF8Helpers.cpp index c18cce6df4e..d393ee2328b 100644 --- a/dbms/src/Common/UTF8Helpers.cpp +++ b/dbms/src/Common/UTF8Helpers.cpp @@ -75,15 +75,11 @@ static int wcwidth(wchar_t wc) switch (width) { case widechar_nonprint: - [[fallthrough]]; case widechar_combining: - [[fallthrough]]; case widechar_unassigned: return 0; case widechar_ambiguous: - [[fallthrough]]; case widechar_private_use: - [[fallthrough]]; case widechar_widened_in_9: return 1; default: diff --git a/dbms/src/Common/ZooKeeper/TestKeeper.cpp b/dbms/src/Common/ZooKeeper/TestKeeper.cpp index cb53ae52cc3..0bcef6cd75f 100644 --- a/dbms/src/Common/ZooKeeper/TestKeeper.cpp +++ b/dbms/src/Common/ZooKeeper/TestKeeper.cpp @@ -67,7 +67,7 @@ static void processWatchesImpl(const String & path, TestKeeper::Watches & watche struct TestKeeperCreateRequest final : CreateRequest, TestKeeperRequest { TestKeeperCreateRequest() = default; - TestKeeperCreateRequest(const CreateRequest & base) : CreateRequest(base) {} + explicit TestKeeperCreateRequest(const CreateRequest & base) : CreateRequest(base) {} ResponsePtr createResponse() const override; ResponsePtr process(TestKeeper::Container & container, int64_t zxid) const override; @@ -80,7 +80,7 @@ struct TestKeeperCreateRequest final : CreateRequest, TestKeeperRequest struct TestKeeperRemoveRequest final : RemoveRequest, TestKeeperRequest { TestKeeperRemoveRequest() = default; - TestKeeperRemoveRequest(const RemoveRequest & base) : RemoveRequest(base) {} + explicit TestKeeperRemoveRequest(const RemoveRequest & base) : RemoveRequest(base) {} bool isMutable() const override { return true; } ResponsePtr createResponse() const override; ResponsePtr process(TestKeeper::Container & container, int64_t zxid) const override; @@ -107,7 +107,7 @@ struct TestKeeperGetRequest final : GetRequest, TestKeeperRequest struct TestKeeperSetRequest final : SetRequest, TestKeeperRequest { TestKeeperSetRequest() = default; - TestKeeperSetRequest(const SetRequest & base) : SetRequest(base) {} + explicit TestKeeperSetRequest(const SetRequest & base) : SetRequest(base) {} bool isMutable() const override { return true; } ResponsePtr createResponse() const override; ResponsePtr process(TestKeeper::Container & container, int64_t zxid) const override; @@ -127,14 +127,14 @@ struct TestKeeperListRequest final : ListRequest, TestKeeperRequest struct TestKeeperCheckRequest final : 
CheckRequest, TestKeeperRequest { TestKeeperCheckRequest() = default; - TestKeeperCheckRequest(const CheckRequest & base) : CheckRequest(base) {} + explicit TestKeeperCheckRequest(const CheckRequest & base) : CheckRequest(base) {} ResponsePtr createResponse() const override; ResponsePtr process(TestKeeper::Container & container, int64_t zxid) const override; }; struct TestKeeperMultiRequest final : MultiRequest, TestKeeperRequest { - TestKeeperMultiRequest(const Requests & generic_requests) + explicit TestKeeperMultiRequest(const Requests & generic_requests) { requests.reserve(generic_requests.size()); diff --git a/dbms/src/Common/ZooKeeper/Types.h b/dbms/src/Common/ZooKeeper/Types.h index 8577061c1ea..33be2853068 100644 --- a/dbms/src/Common/ZooKeeper/Types.h +++ b/dbms/src/Common/ZooKeeper/Types.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include #include diff --git a/dbms/src/Common/ZooKeeper/ZooKeeper.cpp b/dbms/src/Common/ZooKeeper/ZooKeeper.cpp index f5a630d6a5b..bac7ed0bf80 100644 --- a/dbms/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/dbms/src/Common/ZooKeeper/ZooKeeper.cpp @@ -5,9 +5,9 @@ #include #include -#include #include +#include #include #include #include @@ -61,7 +61,7 @@ void ZooKeeper::init(const std::string & implementation, const std::string & hos throw KeeperException("No addresses passed to ZooKeeper constructor.", Coordination::ZBADARGUMENTS); std::vector addresses_strings; - boost::split(addresses_strings, hosts, boost::is_any_of(",")); + splitInto<','>(addresses_strings, hosts); Coordination::ZooKeeper::Addresses addresses; addresses.reserve(addresses_strings.size()); diff --git a/dbms/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/dbms/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 0525ebd377b..c55a2738a2a 100644 --- a/dbms/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/dbms/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -481,7 +481,7 @@ struct ZooKeeperCloseResponse final : ZooKeeperResponse struct ZooKeeperCreateRequest final : CreateRequest, ZooKeeperRequest { ZooKeeperCreateRequest() = default; - ZooKeeperCreateRequest(const CreateRequest & base) : CreateRequest(base) {} + explicit ZooKeeperCreateRequest(const CreateRequest & base) : CreateRequest(base) {} ZooKeeper::OpNum getOpNum() const override { return 1; } void writeImpl(WriteBuffer & out) const override @@ -513,7 +513,7 @@ struct ZooKeeperCreateResponse final : CreateResponse, ZooKeeperResponse struct ZooKeeperRemoveRequest final : RemoveRequest, ZooKeeperRequest { ZooKeeperRemoveRequest() = default; - ZooKeeperRemoveRequest(const RemoveRequest & base) : RemoveRequest(base) {} + explicit ZooKeeperRemoveRequest(const RemoveRequest & base) : RemoveRequest(base) {} ZooKeeper::OpNum getOpNum() const override { return 2; } void writeImpl(WriteBuffer & out) const override @@ -571,7 +571,7 @@ struct ZooKeeperGetResponse final : GetResponse, ZooKeeperResponse struct ZooKeeperSetRequest final : SetRequest, ZooKeeperRequest { ZooKeeperSetRequest() = default; - ZooKeeperSetRequest(const SetRequest & base) : SetRequest(base) {} + explicit ZooKeeperSetRequest(const SetRequest & base) : SetRequest(base) {} ZooKeeper::OpNum getOpNum() const override { return 5; } void writeImpl(WriteBuffer & out) const override @@ -614,7 +614,7 @@ struct ZooKeeperListResponse final : ListResponse, ZooKeeperResponse struct ZooKeeperCheckRequest final : CheckRequest, ZooKeeperRequest { ZooKeeperCheckRequest() = default; - ZooKeeperCheckRequest(const CheckRequest & base) : CheckRequest(base) {} + explicit ZooKeeperCheckRequest(const 
CheckRequest & base) : CheckRequest(base) {} ZooKeeper::OpNum getOpNum() const override { return 13; } void writeImpl(WriteBuffer & out) const override @@ -710,7 +710,7 @@ struct ZooKeeperMultiRequest final : MultiRequest, ZooKeeperRequest struct ZooKeeperMultiResponse final : MultiResponse, ZooKeeperResponse { - ZooKeeperMultiResponse(const Requests & requests) + explicit ZooKeeperMultiResponse(const Requests & requests) { responses.reserve(requests.size()); diff --git a/dbms/src/Common/ZooKeeper/tests/zkutil_test_async.cpp b/dbms/src/Common/ZooKeeper/tests/zkutil_test_async.cpp index 59c203a4683..17258c529ff 100644 --- a/dbms/src/Common/ZooKeeper/tests/zkutil_test_async.cpp +++ b/dbms/src/Common/ZooKeeper/tests/zkutil_test_async.cpp @@ -24,6 +24,7 @@ try while (true) { std::vector> futures; + futures.reserve(nodes.size()); for (auto & node : nodes) futures.push_back(zookeeper.asyncGet("/tmp/" + node)); diff --git a/dbms/src/Common/ZooKeeper/tests/zkutil_test_commands_new_lib.cpp b/dbms/src/Common/ZooKeeper/tests/zkutil_test_commands_new_lib.cpp index 1050584a138..aa348163adf 100644 --- a/dbms/src/Common/ZooKeeper/tests/zkutil_test_commands_new_lib.cpp +++ b/dbms/src/Common/ZooKeeper/tests/zkutil_test_commands_new_lib.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include using namespace Coordination; @@ -25,7 +25,7 @@ try std::string addresses_arg = argv[1]; std::vector addresses_strings; - boost::split(addresses_strings, addresses_arg, boost::is_any_of(",")); + splitInto<','>(addresses_strings, addresses_arg); ZooKeeper::Addresses addresses; addresses.reserve(addresses_strings.size()); for (const auto & address_string : addresses_strings) diff --git a/dbms/src/Common/formatIPv6.h b/dbms/src/Common/formatIPv6.h index 2e687414224..2529d6dc796 100644 --- a/dbms/src/Common/formatIPv6.h +++ b/dbms/src/Common/formatIPv6.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include #include diff --git a/dbms/src/Common/hex.h b/dbms/src/Common/hex.h index 81fa725e17d..db094e1dfd1 100644 --- a/dbms/src/Common/hex.h +++ b/dbms/src/Common/hex.h @@ -21,7 +21,7 @@ inline char hexDigitLowercase(unsigned char c) #include #include -#include +#include /// Maps 0..255 to 00..FF or 00..ff correspondingly diff --git a/dbms/src/Common/isLocalAddress.h b/dbms/src/Common/isLocalAddress.h index 63de5e000a9..3d0db2d9550 100644 --- a/dbms/src/Common/isLocalAddress.h +++ b/dbms/src/Common/isLocalAddress.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include diff --git a/dbms/src/Common/malloc.cpp b/dbms/src/Common/malloc.cpp index 4e57fbbebdc..ec472d5d1d6 100644 --- a/dbms/src/Common/malloc.cpp +++ b/dbms/src/Common/malloc.cpp @@ -30,15 +30,15 @@ static void dummyFunctionForInterposing() __attribute__((used)); static void dummyFunctionForInterposing() { void* dummy; - /// Suppression for PVS-Studio. - free(nullptr); // -V575 - ignore(malloc(0)); // -V575 - ignore(calloc(0, 0)); // -V575 - ignore(realloc(nullptr, 0)); // -V575 - ignore(posix_memalign(&dummy, 0, 0)); // -V575 - ignore(aligned_alloc(0, 0)); // -V575 - ignore(valloc(0)); // -V575 - ignore(memalign(0, 0)); // -V575 - ignore(pvalloc(0)); // -V575 + /// Suppression for PVS-Studio and clang-tidy. 
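// Not part of the patch: a reminder of the clang-tidy suppression syntax that
// the `// -V575 NOLINT` comments below rely on, shown on hypothetical lines
// (the check names here are only examples).
#include <cstdlib>

void exampleSuppressions()
{
    // "// NOLINT" silences every enabled check on that single line,
    // "// NOLINT(check-name)" silences only the named check, and
    // "// NOLINTNEXTLINE(check-name)" applies to the following line instead.
    void * p = std::malloc(16); // NOLINT
    std::free(p);               // NOLINT(cppcoreguidelines-no-malloc)
    // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
    void * q = std::malloc(8);
    std::free(q);
}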
+ free(nullptr); // -V575 NOLINT + ignore(malloc(0)); // -V575 NOLINT + ignore(calloc(0, 0)); // -V575 NOLINT + ignore(realloc(nullptr, 0)); // -V575 NOLINT + ignore(posix_memalign(&dummy, 0, 0)); // -V575 NOLINT + ignore(aligned_alloc(0, 0)); // -V575 NOLINT + ignore(valloc(0)); // -V575 NOLINT + ignore(memalign(0, 0)); // -V575 NOLINT + ignore(pvalloc(0)); // -V575 NOLINT } #endif diff --git a/dbms/src/Common/new_delete.cpp b/dbms/src/Common/new_delete.cpp index 9791a53470c..0aa5f8aacce 100644 --- a/dbms/src/Common/new_delete.cpp +++ b/dbms/src/Common/new_delete.cpp @@ -19,7 +19,7 @@ namespace Memory { -ALWAYS_INLINE void trackMemory(std::size_t size) +inline ALWAYS_INLINE void trackMemory(std::size_t size) { #if USE_JEMALLOC /// The nallocx() function allocates no memory, but it performs the same size computation as the mallocx() function @@ -31,7 +31,7 @@ ALWAYS_INLINE void trackMemory(std::size_t size) #endif } -ALWAYS_INLINE bool trackMemoryNoExcept(std::size_t size) noexcept +inline ALWAYS_INLINE bool trackMemoryNoExcept(std::size_t size) noexcept { try { @@ -45,7 +45,7 @@ ALWAYS_INLINE bool trackMemoryNoExcept(std::size_t size) noexcept return true; } -ALWAYS_INLINE void untrackMemory(void * ptr [[maybe_unused]], std::size_t size [[maybe_unused]] = 0) noexcept +inline ALWAYS_INLINE void untrackMemory(void * ptr [[maybe_unused]], std::size_t size [[maybe_unused]] = 0) noexcept { try { diff --git a/dbms/src/Common/parseAddress.cpp b/dbms/src/Common/parseAddress.cpp index 1e266ad503f..664e0ca2f59 100644 --- a/dbms/src/Common/parseAddress.cpp +++ b/dbms/src/Common/parseAddress.cpp @@ -19,7 +19,7 @@ std::pair parseAddress(const std::string & str, UInt16 defa const char * begin = str.data(); const char * end = begin + str.size(); - const char * port = end; + const char * port = end; // NOLINT if (begin[0] == '[') { diff --git a/dbms/src/Common/parseAddress.h b/dbms/src/Common/parseAddress.h index f7357b60cd4..602a9adc0b2 100644 --- a/dbms/src/Common/parseAddress.h +++ b/dbms/src/Common/parseAddress.h @@ -2,7 +2,7 @@ #include #include -#include +#include namespace DB diff --git a/dbms/src/Common/parseGlobs.cpp b/dbms/src/Common/parseGlobs.cpp index b7595d4591c..79b441441bc 100644 --- a/dbms/src/Common/parseGlobs.cpp +++ b/dbms/src/Common/parseGlobs.cpp @@ -3,6 +3,8 @@ #include #include #include +#include + namespace DB { @@ -38,10 +40,12 @@ std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_glob if (buffer.find(',') == std::string::npos) { - size_t range_begin, range_end; + size_t range_begin = 0; + size_t range_end = 0; char point; std::istringstream iss_range(buffer); iss_range >> range_begin >> point >> point >> range_end; + assert(iss_range.good()); oss_for_replacing << range_begin; for (size_t i = range_begin + 1; i <= range_end; ++i) { diff --git a/dbms/src/Common/tests/AvalancheTest.cpp b/dbms/src/Common/tests/AvalancheTest.cpp deleted file mode 100644 index b77ee2fa056..00000000000 --- a/dbms/src/Common/tests/AvalancheTest.cpp +++ /dev/null @@ -1,26 +0,0 @@ -/// Taken from SMHasher. 
- -#include "AvalancheTest.h" - -//----------------------------------------------------------------------------- - -double maxBias(std::vector & counts, int reps) -{ - double worst = 0; - - for (auto count : counts) - { - double c = static_cast(count) / static_cast(reps); - - double d = fabs(c * 2 - 1); - - if (d > worst) - { - worst = d; - } - } - - return worst; -} - -//----------------------------------------------------------------------------- diff --git a/dbms/src/Common/tests/AvalancheTest.h b/dbms/src/Common/tests/AvalancheTest.h deleted file mode 100644 index 3c457fa5466..00000000000 --- a/dbms/src/Common/tests/AvalancheTest.h +++ /dev/null @@ -1,264 +0,0 @@ -/// Taken from SMHasher. - -//----------------------------------------------------------------------------- -// Flipping a single bit of a key should cause an "avalanche" of changes in -// the hash function's output. Ideally, each output bits should flip 50% of -// the time - if the probability of an output bit flipping is not 50%, that bit -// is "biased". Too much bias means that patterns applied to the input will -// cause "echoes" of the patterns in the output, which in turn can cause the -// hash function to fail to create an even, random distribution of hash values. - - -#pragma once - -#include "Random.h" - -#include -#include -#include - -// Avalanche fails if a bit is biased by more than 1% - -#define AVALANCHE_FAIL 0.01 - -double maxBias(std::vector & counts, int reps); - -typedef void (*pfHash)(const void * blob, const int len, const uint32_t seed, void * out); - - -inline uint32_t getbit(const void * block, int len, uint32_t bit) -{ - uint8_t * b = reinterpret_cast(const_cast(block)); - - int byte = bit >> 3; - bit = bit & 0x7; - - if (byte < len) - return (b[byte] >> bit) & 1; - - return 0; -} - -template -inline uint32_t getbit(T & blob, uint32_t bit) -{ - return getbit(&blob, sizeof(blob), bit); -} - -inline void flipbit(void * block, int len, uint32_t bit) -{ - uint8_t * b = reinterpret_cast(block); - - int byte = bit >> 3; - bit = bit & 0x7; - - if (byte < len) - b[byte] ^= (1 << bit); -} - -template -inline void flipbit(T & blob, uint32_t bit) -{ - flipbit(&blob, sizeof(blob), bit); -} - -//----------------------------------------------------------------------------- - -template -void calcBias(pfHash hash, std::vector & counts, int reps, Rand & r) -{ - const int keybytes = sizeof(keytype); - const int hashbytes = sizeof(hashtype); - - const int keybits = keybytes * 8; - const int hashbits = hashbytes * 8; - - keytype K; - hashtype A, B; - - for (int irep = 0; irep < reps; irep++) - { - if (irep % (reps / 10) == 0) - printf("."); - - r.rand_p(&K, keybytes); - - hash(&K, keybytes, 0, &A); - - int * cursor = counts.data(); - - for (int iBit = 0; iBit < keybits; iBit++) - { - flipbit(&K, keybytes, iBit); - hash(&K, keybytes, 0, &B); - flipbit(&K, keybytes, iBit); - - for (int iOut = 0; iOut < hashbits; iOut++) - { - int bitA = getbit(&A, hashbytes, iOut); - int bitB = getbit(&B, hashbytes, iOut); - - (*cursor++) += (bitA ^ bitB); - } - } - } -} - -//----------------------------------------------------------------------------- - -template -bool AvalancheTest(pfHash hash, const int reps) -{ - Rand r(48273); - - const int keybytes = sizeof(keytype); - const int hashbytes = sizeof(hashtype); - - const int keybits = keybytes * 8; - const int hashbits = hashbytes * 8; - - printf("Testing %3d-bit keys -> %3d-bit hashes, %8d reps", keybits, hashbits, reps); - - //---------- - - std::vector bins(keybits * hashbits, 0); - 
- calcBias(hash, bins, reps, r); - - //---------- - - bool result = true; - - double b = maxBias(bins, reps); - - printf(" worst bias is %f%%", b * 100.0); - - if (b > AVALANCHE_FAIL) - { - printf(" !!!!! "); - result = false; - } - - printf("\n"); - - return result; -} - - -//----------------------------------------------------------------------------- -// BIC test variant - store all intermediate data in a table, draw diagram -// afterwards (much faster) - -template -void BicTest3(pfHash hash, const int reps, bool verbose = true) -{ - const int keybytes = sizeof(keytype); - const int keybits = keybytes * 8; - const int hashbytes = sizeof(hashtype); - const int hashbits = hashbytes * 8; - const int pagesize = hashbits * hashbits * 4; - - Rand r(11938); - - double maxBias = 0; - int maxK = 0; - int maxA = 0; - int maxB = 0; - - keytype key; - hashtype h1, h2; - - std::vector bins(keybits * pagesize, 0); - - for (int keybit = 0; keybit < keybits; keybit++) - { - if (keybit % (keybits / 10) == 0) - printf("."); - - int * page = &bins[keybit * pagesize]; - - for (int irep = 0; irep < reps; irep++) - { - r.rand_p(&key, keybytes); - hash(&key, keybytes, 0, &h1); - flipbit(key, keybit); - hash(&key, keybytes, 0, &h2); - - hashtype d = h1 ^ h2; - - for (int out1 = 0; out1 < hashbits - 1; out1++) - for (int out2 = out1 + 1; out2 < hashbits; out2++) - { - int * b = &page[(out1 * hashbits + out2) * 4]; - - uint32_t x = getbit(d, out1) | (getbit(d, out2) << 1); - - b[x]++; - } - } - } - - printf("\n"); - - for (int out1 = 0; out1 < hashbits - 1; out1++) - { - for (int out2 = out1 + 1; out2 < hashbits; out2++) - { - if (verbose) - printf("(%3d,%3d) - ", out1, out2); - - for (int keybit = 0; keybit < keybits; keybit++) - { - int * page = &bins[keybit * pagesize]; - int * bins_in_page = &page[(out1 * hashbits + out2) * 4]; - - double bias = 0; - - for (int b = 0; b < 4; b++) - { - double b2 = static_cast(bins_in_page[b]) / static_cast(reps / 2); - b2 = fabs(b2 * 2 - 1); - - if (b2 > bias) - bias = b2; - } - - if (bias > maxBias) - { - maxBias = bias; - maxK = keybit; - maxA = out1; - maxB = out2; - } - - if (verbose) - { - if (bias < 0.01) - printf("."); - else if (bias < 0.05) - printf("o"); - else if (bias < 0.33) - printf("O"); - else - printf("X"); - } - } - - // Finished keybit - - if (verbose) - printf("\n"); - } - - if (verbose) - { - for (int i = 0; i < keybits + 12; i++) - printf("-"); - printf("\n"); - } - } - - printf("Max bias %f - (%3d : %3d,%3d)\n", maxBias, maxK, maxA, maxB); -} - -//----------------------------------------------------------------------------- diff --git a/dbms/src/Common/tests/CMakeLists.txt b/dbms/src/Common/tests/CMakeLists.txt index 6f1c825227c..4ed21ba4ef2 100644 --- a/dbms/src/Common/tests/CMakeLists.txt +++ b/dbms/src/Common/tests/CMakeLists.txt @@ -4,9 +4,6 @@ if(OPENSSL_CRYPTO_LIBRARY) target_link_libraries (hashes_test PRIVATE ${OPENSSL_CRYPTO_LIBRARY}) endif() -add_executable (sip_hash sip_hash.cpp) -target_link_libraries (sip_hash PRIVATE clickhouse_common_io) - add_executable (sip_hash_perf sip_hash_perf.cpp) target_link_libraries (sip_hash_perf PRIVATE clickhouse_common_io) @@ -25,7 +22,7 @@ target_link_libraries (parallel_aggregation PRIVATE dbms) add_executable (parallel_aggregation2 parallel_aggregation2.cpp) target_link_libraries (parallel_aggregation2 PRIVATE dbms) -add_executable (int_hashes_perf int_hashes_perf.cpp AvalancheTest.cpp Random.cpp) +add_executable (int_hashes_perf int_hashes_perf.cpp) target_link_libraries (int_hashes_perf PRIVATE 
clickhouse_common_io) add_executable (simple_cache simple_cache.cpp) diff --git a/dbms/src/Common/tests/Random.cpp b/dbms/src/Common/tests/Random.cpp deleted file mode 100644 index 5264a2d9adc..00000000000 --- a/dbms/src/Common/tests/Random.cpp +++ /dev/null @@ -1,9 +0,0 @@ -/// Taken from SMHasher. -#include "Random.h" - -Rand g_rand1(1); -Rand g_rand2(2); -Rand g_rand3(3); -Rand g_rand4(4); - -//----------------------------------------------------------------------------- diff --git a/dbms/src/Common/tests/Random.h b/dbms/src/Common/tests/Random.h deleted file mode 100644 index ea341b55e3a..00000000000 --- a/dbms/src/Common/tests/Random.h +++ /dev/null @@ -1,129 +0,0 @@ -/// Taken from SMHasher. - -#pragma once - -#include - -//----------------------------------------------------------------------------- -// Xorshift RNG based on code by George Marsaglia -// http://en.wikipedia.org/wiki/Xorshift - -struct Rand -{ - uint32_t x; - uint32_t y; - uint32_t z; - uint32_t w; - - Rand() - { - reseed(static_cast(0)); - } - - explicit Rand(uint32_t seed) - { - reseed(seed); - } - - void reseed(uint32_t seed) - { - x = 0x498b3bc5 ^ seed; - y = 0; - z = 0; - w = 0; - - for (int i = 0; i < 10; i++) - mix(); - } - - void reseed(uint64_t seed) - { - x = 0x498b3bc5 ^ static_cast(seed >> 0); - y = 0x5a05089a ^ static_cast(seed >> 32); - z = 0; - w = 0; - - for (int i = 0; i < 10; i++) - mix(); - } - - //----------------------------------------------------------------------------- - - void mix(void) - { - uint32_t t = x ^ (x << 11); - x = y; - y = z; - z = w; - w = w ^ (w >> 19) ^ t ^ (t >> 8); - } - - uint32_t rand_u32(void) - { - mix(); - - return x; - } - - uint64_t rand_u64(void) - { - mix(); - - uint64_t a = x; - uint64_t b = y; - - return (a << 32) | b; - } - - void rand_p(void * blob, int bytes) - { - uint32_t * blocks = reinterpret_cast(blob); - - while (bytes >= 4) - { - blocks[0] = rand_u32(); - blocks++; - bytes -= 4; - } - - uint8_t * tail = reinterpret_cast(blocks); - - for (int i = 0; i < bytes; i++) - { - tail[i] = static_cast(rand_u32()); - } - } -}; - -//----------------------------------------------------------------------------- - -extern Rand g_rand1; - -inline uint32_t rand_u32(void) -{ - return g_rand1.rand_u32(); -} -inline uint64_t rand_u64(void) -{ - return g_rand1.rand_u64(); -} - -inline void rand_p(void * blob, int bytes) -{ - uint32_t * blocks = static_cast(blob); - - while (bytes >= 4) - { - *blocks++ = rand_u32(); - bytes -= 4; - } - - uint8_t * tail = reinterpret_cast(blocks); - - for (int i = 0; i < bytes; i++) - { - tail[i] = static_cast(rand_u32()); - } -} - -//----------------------------------------------------------------------------- diff --git a/dbms/src/Common/tests/auto_array.cpp b/dbms/src/Common/tests/auto_array.cpp index bbb533b65e8..9e9a18ffec9 100644 --- a/dbms/src/Common/tests/auto_array.cpp +++ b/dbms/src/Common/tests/auto_array.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -12,6 +13,8 @@ int main(int argc, char ** argv) { + pcg64 rng; + { size_t n = 10; using T = std::string; @@ -43,7 +46,7 @@ int main(int argc, char ** argv) Arr arr2 = std::move(arr); - std::cerr << arr.size() << ", " << arr2.size() << std::endl; + std::cerr << arr.size() << ", " << arr2.size() << std::endl; // NOLINT for (auto & elem : arr2) std::cerr << elem << std::endl; @@ -63,7 +66,7 @@ int main(int argc, char ** argv) { Arr key(n); for (size_t j = 0; j < n; ++j) - key[j] = DB::toString(rand()); + key[j] = DB::toString(rng()); map[std::move(key)] = 
"Hello, world! " + DB::toString(i); } @@ -107,7 +110,7 @@ int main(int argc, char ** argv) { Arr key(n); for (size_t j = 0; j < n; ++j) - key[j] = DB::toString(rand()); + key[j] = DB::toString(rng()); vec.push_back(std::move(key)); } @@ -152,7 +155,7 @@ int main(int argc, char ** argv) Map::LookupResult it; bool inserted; - map.emplace(rand(), it, inserted); + map.emplace(rng(), it, inserted); if (inserted) { new (&it->getMapped()) Arr(n); @@ -182,7 +185,7 @@ int main(int argc, char ** argv) } arr2 = std::move(arr1); - arr1.resize(n); + arr1.resize(n); // NOLINT std::cerr << "arr1.size(): " << arr1.size() << ", arr2.size(): " << arr2.size() << std::endl diff --git a/dbms/src/Common/tests/cow_columns.cpp b/dbms/src/Common/tests/cow_columns.cpp index 7b629e264e5..b4c3637be5a 100644 --- a/dbms/src/Common/tests/cow_columns.cpp +++ b/dbms/src/Common/tests/cow_columns.cpp @@ -28,7 +28,7 @@ private: friend class COWHelper; int data; - ConcreteColumn(int data_) : data(data_) {} + explicit ConcreteColumn(int data_) : data(data_) {} ConcreteColumn(const ConcreteColumn &) = default; MutableColumnPtr test() const override diff --git a/dbms/src/Common/tests/cow_compositions.cpp b/dbms/src/Common/tests/cow_compositions.cpp index 8d0110a0290..0335693d1bd 100644 --- a/dbms/src/Common/tests/cow_compositions.cpp +++ b/dbms/src/Common/tests/cow_compositions.cpp @@ -30,7 +30,7 @@ private: friend class COWHelper; int data; - ConcreteColumn(int data_) : data(data_) {} + explicit ConcreteColumn(int data_) : data(data_) {} ConcreteColumn(const ConcreteColumn &) = default; public: @@ -45,7 +45,7 @@ private: ConcreteColumn::WrappedPtr wrapped; - ColumnComposition(int data) : wrapped(ConcreteColumn::create(data)) {} + explicit ColumnComposition(int data) : wrapped(ConcreteColumn::create(data)) {} ColumnComposition(const ColumnComposition &) = default; IColumn::MutablePtr deepMutate() const override diff --git a/dbms/src/Common/tests/gtest_rw_lock.cpp b/dbms/src/Common/tests/gtest_rw_lock.cpp index 81bc0d38a56..dec4c732fd5 100644 --- a/dbms/src/Common/tests/gtest_rw_lock.cpp +++ b/dbms/src/Common/tests/gtest_rw_lock.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/dbms/src/Common/tests/gtest_sensitive_data_masker.cpp b/dbms/src/Common/tests/gtest_sensitive_data_masker.cpp index 9b9af39f6ea..d5133b2ef95 100644 --- a/dbms/src/Common/tests/gtest_sensitive_data_masker.cpp +++ b/dbms/src/Common/tests/gtest_sensitive_data_masker.cpp @@ -63,7 +63,7 @@ TEST(Common, SensitiveDataMasker) #ifndef NDEBUG // simple benchmark auto start = std::chrono::high_resolution_clock::now(); - constexpr unsigned long int iterations = 200000; + static constexpr size_t iterations = 200000; for (int i = 0; i < iterations; ++i) { std::string query2 = "SELECT id FROM mysql('localhost:3308', 'database', 'table', 'root', 'qwerty123') WHERE ssn='123-45-6789' or " diff --git a/dbms/src/Common/tests/int_hashes_perf.cpp b/dbms/src/Common/tests/int_hashes_perf.cpp index 6792b22dfce..7e8495cef27 100644 --- a/dbms/src/Common/tests/int_hashes_perf.cpp +++ b/dbms/src/Common/tests/int_hashes_perf.cpp @@ -8,7 +8,6 @@ #include #include #include -#include "AvalancheTest.h" /// Taken from SMHasher. static void setAffinity() @@ -253,28 +252,13 @@ static inline void test(size_t n, const UInt64 * data, const char * name) std::cerr << "Latency of "; report(name, n, watch.elapsedSeconds(), tsc_diff, res); } - - /// quality. Methods are taken from SMHasher. 
- { - auto wrapper = [](const void * blob, const int, const uint32_t, void * out) - { - *reinterpret_cast(out) = Func(*reinterpret_cast(blob)); - }; - - std::cerr << "Avalanche: " << std::endl; - AvalancheTest(wrapper, 300000); - // std::cerr << "Bit Independence Criteria: " << std::endl; - // BicTest3(wrapper, 2000000); - - std::cerr << std::endl; - } } int main(int argc, char ** argv) { - size_t n = (atoi(argv[1]) + (BUF_SIZE - 1)) / BUF_SIZE * BUF_SIZE; - size_t method = argc <= 2 ? 0 : atoi(argv[2]); + size_t n = (std::stol(argv[1]) + (BUF_SIZE - 1)) / BUF_SIZE * BUF_SIZE; + size_t method = argc <= 2 ? 0 : std::stol(argv[2]); std::cerr << std::fixed << std::setprecision(2); diff --git a/dbms/src/Common/tests/integer_hash_tables_and_hashes.cpp b/dbms/src/Common/tests/integer_hash_tables_and_hashes.cpp index 29e4a31bfb3..b5ba0be6420 100644 --- a/dbms/src/Common/tests/integer_hash_tables_and_hashes.cpp +++ b/dbms/src/Common/tests/integer_hash_tables_and_hashes.cpp @@ -343,8 +343,8 @@ int main(int argc, char ** argv) return 1; } - size_t n = atoi(argv[1]); -// size_t m = atoi(argv[2]); + size_t n = std::stol(argv[1]); +// size_t m = std::stol(argv[2]); std::cerr << std::fixed << std::setprecision(3); diff --git a/dbms/src/Common/tests/parallel_aggregation.cpp b/dbms/src/Common/tests/parallel_aggregation.cpp index 4533ff3a88e..5b70386fd9f 100644 --- a/dbms/src/Common/tests/parallel_aggregation.cpp +++ b/dbms/src/Common/tests/parallel_aggregation.cpp @@ -82,6 +82,7 @@ static void aggregate12(Map & map, Source::const_iterator begin, Source::const_i { if (prev_it != end && *it == *prev_it) { + assert(found != nullptr); ++found->getMapped(); continue; } @@ -89,6 +90,7 @@ static void aggregate12(Map & map, Source::const_iterator begin, Source::const_i bool inserted; map.emplace(*it, found, inserted); + assert(found != nullptr); ++found->getMapped(); } } @@ -107,6 +109,7 @@ static void aggregate22(MapTwoLevel & map, Source::const_iterator begin, Source: { if (*it == *prev_it) { + assert(found != nullptr); ++found->getMapped(); continue; } @@ -114,6 +117,7 @@ static void aggregate22(MapTwoLevel & map, Source::const_iterator begin, Source: bool inserted; map.emplace(*it, found, inserted); + assert(found != nullptr); ++found->getMapped(); } } @@ -246,9 +250,9 @@ void aggregate5(Map & local_map, MapSmallLocks & global_map, Source::const_itera int main(int argc, char ** argv) { - size_t n = atoi(argv[1]); - size_t num_threads = atoi(argv[2]); - size_t method = argc <= 3 ? 0 : atoi(argv[3]); + size_t n = std::stol(argv[1]); + size_t num_threads = std::stol(argv[2]); + size_t method = argc <= 3 ? 0 : std::stol(argv[3]); std::cerr << std::fixed << std::setprecision(2); diff --git a/dbms/src/Common/tests/parallel_aggregation2.cpp b/dbms/src/Common/tests/parallel_aggregation2.cpp index 30c7f05222c..1f8e598b122 100644 --- a/dbms/src/Common/tests/parallel_aggregation2.cpp +++ b/dbms/src/Common/tests/parallel_aggregation2.cpp @@ -93,6 +93,7 @@ struct AggregateIndependentWithSequentialKeysOptimization { if (it != begin && *it == prev_key) { + assert(place != nullptr); updater(place->getMapped()); continue; } @@ -100,6 +101,7 @@ struct AggregateIndependentWithSequentialKeysOptimization bool inserted; map.emplace(*it, place, inserted); + assert(place != nullptr); if (inserted) creator(place->getMapped()); @@ -285,9 +287,9 @@ struct Merger int main(int argc, char ** argv) { - size_t n = atoi(argv[1]); - size_t num_threads = atoi(argv[2]); - size_t method = argc <= 3 ? 
0 : atoi(argv[3]); + size_t n = std::stol(argv[1]); + size_t num_threads = std::stol(argv[2]); + size_t method = argc <= 3 ? 0 : std::stol(argv[3]); std::cerr << std::fixed << std::setprecision(2); diff --git a/dbms/src/Common/tests/pod_array.cpp b/dbms/src/Common/tests/pod_array.cpp index 2a3093b3de7..de15b485411 100644 --- a/dbms/src/Common/tests/pod_array.cpp +++ b/dbms/src/Common/tests/pod_array.cpp @@ -409,7 +409,7 @@ static void test3() Array arr2{std::move(arr)}; - ASSERT_CHECK((arr.empty()), res); + ASSERT_CHECK((arr.empty()), res); // NOLINT ASSERT_CHECK((arr2.size() == 3), res); ASSERT_CHECK((arr2[0] == 1), res); @@ -428,7 +428,7 @@ static void test3() Array arr2{std::move(arr)}; - ASSERT_CHECK((arr.empty()), res); + ASSERT_CHECK((arr.empty()), res); // NOLINT ASSERT_CHECK((arr2.size() == 5), res); ASSERT_CHECK((arr2[0] == 1), res); diff --git a/dbms/src/Common/tests/radix_sort.cpp b/dbms/src/Common/tests/radix_sort.cpp index 44225d2b218..a7313d05cec 100644 --- a/dbms/src/Common/tests/radix_sort.cpp +++ b/dbms/src/Common/tests/radix_sort.cpp @@ -1,6 +1,7 @@ #if !defined(__APPLE__) && !defined(__FreeBSD__) #include #endif +#include #include #include #include @@ -31,6 +32,8 @@ static void NO_INLINE sort3(Key * data, size_t size) int main(int argc, char ** argv) { + pcg64 rng; + if (argc < 3) { std::cerr << "Usage: program n method\n"; @@ -48,7 +51,7 @@ int main(int argc, char ** argv) Stopwatch watch; for (auto & elem : data) - elem = rand(); + elem = rng(); watch.stop(); double elapsed = watch.elapsedSeconds(); diff --git a/dbms/src/Common/tests/sip_hash.cpp b/dbms/src/Common/tests/sip_hash.cpp deleted file mode 100644 index 046ea0edc15..00000000000 --- a/dbms/src/Common/tests/sip_hash.cpp +++ /dev/null @@ -1,154 +0,0 @@ -#include -#include - -#include -#include - -#include - -/// Adapted version https://www.131002.net/siphash/siphash24.c - -/* - SipHash-2-4 output with - k = 00 01 02 ... - and - in = (empty string) - in = 00 (1 byte) - in = 00 01 (2 bytes) - in = 00 01 02 (3 bytes) - ... - in = 00 01 02 ... 
3e (63 bytes) -*/ -uint8_t vectors[64][8] = -{ - { 0x31, 0x0e, 0x0e, 0xdd, 0x47, 0xdb, 0x6f, 0x72, }, - { 0xfd, 0x67, 0xdc, 0x93, 0xc5, 0x39, 0xf8, 0x74, }, - { 0x5a, 0x4f, 0xa9, 0xd9, 0x09, 0x80, 0x6c, 0x0d, }, - { 0x2d, 0x7e, 0xfb, 0xd7, 0x96, 0x66, 0x67, 0x85, }, - { 0xb7, 0x87, 0x71, 0x27, 0xe0, 0x94, 0x27, 0xcf, }, - { 0x8d, 0xa6, 0x99, 0xcd, 0x64, 0x55, 0x76, 0x18, }, - { 0xce, 0xe3, 0xfe, 0x58, 0x6e, 0x46, 0xc9, 0xcb, }, - { 0x37, 0xd1, 0x01, 0x8b, 0xf5, 0x00, 0x02, 0xab, }, - { 0x62, 0x24, 0x93, 0x9a, 0x79, 0xf5, 0xf5, 0x93, }, - { 0xb0, 0xe4, 0xa9, 0x0b, 0xdf, 0x82, 0x00, 0x9e, }, - { 0xf3, 0xb9, 0xdd, 0x94, 0xc5, 0xbb, 0x5d, 0x7a, }, - { 0xa7, 0xad, 0x6b, 0x22, 0x46, 0x2f, 0xb3, 0xf4, }, - { 0xfb, 0xe5, 0x0e, 0x86, 0xbc, 0x8f, 0x1e, 0x75, }, - { 0x90, 0x3d, 0x84, 0xc0, 0x27, 0x56, 0xea, 0x14, }, - { 0xee, 0xf2, 0x7a, 0x8e, 0x90, 0xca, 0x23, 0xf7, }, - { 0xe5, 0x45, 0xbe, 0x49, 0x61, 0xca, 0x29, 0xa1, }, - { 0xdb, 0x9b, 0xc2, 0x57, 0x7f, 0xcc, 0x2a, 0x3f, }, - { 0x94, 0x47, 0xbe, 0x2c, 0xf5, 0xe9, 0x9a, 0x69, }, - { 0x9c, 0xd3, 0x8d, 0x96, 0xf0, 0xb3, 0xc1, 0x4b, }, - { 0xbd, 0x61, 0x79, 0xa7, 0x1d, 0xc9, 0x6d, 0xbb, }, - { 0x98, 0xee, 0xa2, 0x1a, 0xf2, 0x5c, 0xd6, 0xbe, }, - { 0xc7, 0x67, 0x3b, 0x2e, 0xb0, 0xcb, 0xf2, 0xd0, }, - { 0x88, 0x3e, 0xa3, 0xe3, 0x95, 0x67, 0x53, 0x93, }, - { 0xc8, 0xce, 0x5c, 0xcd, 0x8c, 0x03, 0x0c, 0xa8, }, - { 0x94, 0xaf, 0x49, 0xf6, 0xc6, 0x50, 0xad, 0xb8, }, - { 0xea, 0xb8, 0x85, 0x8a, 0xde, 0x92, 0xe1, 0xbc, }, - { 0xf3, 0x15, 0xbb, 0x5b, 0xb8, 0x35, 0xd8, 0x17, }, - { 0xad, 0xcf, 0x6b, 0x07, 0x63, 0x61, 0x2e, 0x2f, }, - { 0xa5, 0xc9, 0x1d, 0xa7, 0xac, 0xaa, 0x4d, 0xde, }, - { 0x71, 0x65, 0x95, 0x87, 0x66, 0x50, 0xa2, 0xa6, }, - { 0x28, 0xef, 0x49, 0x5c, 0x53, 0xa3, 0x87, 0xad, }, - { 0x42, 0xc3, 0x41, 0xd8, 0xfa, 0x92, 0xd8, 0x32, }, - { 0xce, 0x7c, 0xf2, 0x72, 0x2f, 0x51, 0x27, 0x71, }, - { 0xe3, 0x78, 0x59, 0xf9, 0x46, 0x23, 0xf3, 0xa7, }, - { 0x38, 0x12, 0x05, 0xbb, 0x1a, 0xb0, 0xe0, 0x12, }, - { 0xae, 0x97, 0xa1, 0x0f, 0xd4, 0x34, 0xe0, 0x15, }, - { 0xb4, 0xa3, 0x15, 0x08, 0xbe, 0xff, 0x4d, 0x31, }, - { 0x81, 0x39, 0x62, 0x29, 0xf0, 0x90, 0x79, 0x02, }, - { 0x4d, 0x0c, 0xf4, 0x9e, 0xe5, 0xd4, 0xdc, 0xca, }, - { 0x5c, 0x73, 0x33, 0x6a, 0x76, 0xd8, 0xbf, 0x9a, }, - { 0xd0, 0xa7, 0x04, 0x53, 0x6b, 0xa9, 0x3e, 0x0e, }, - { 0x92, 0x59, 0x58, 0xfc, 0xd6, 0x42, 0x0c, 0xad, }, - { 0xa9, 0x15, 0xc2, 0x9b, 0xc8, 0x06, 0x73, 0x18, }, - { 0x95, 0x2b, 0x79, 0xf3, 0xbc, 0x0a, 0xa6, 0xd4, }, - { 0xf2, 0x1d, 0xf2, 0xe4, 0x1d, 0x45, 0x35, 0xf9, }, - { 0x87, 0x57, 0x75, 0x19, 0x04, 0x8f, 0x53, 0xa9, }, - { 0x10, 0xa5, 0x6c, 0xf5, 0xdf, 0xcd, 0x9a, 0xdb, }, - { 0xeb, 0x75, 0x09, 0x5c, 0xcd, 0x98, 0x6c, 0xd0, }, - { 0x51, 0xa9, 0xcb, 0x9e, 0xcb, 0xa3, 0x12, 0xe6, }, - { 0x96, 0xaf, 0xad, 0xfc, 0x2c, 0xe6, 0x66, 0xc7, }, - { 0x72, 0xfe, 0x52, 0x97, 0x5a, 0x43, 0x64, 0xee, }, - { 0x5a, 0x16, 0x45, 0xb2, 0x76, 0xd5, 0x92, 0xa1, }, - { 0xb2, 0x74, 0xcb, 0x8e, 0xbf, 0x87, 0x87, 0x0a, }, - { 0x6f, 0x9b, 0xb4, 0x20, 0x3d, 0xe7, 0xb3, 0x81, }, - { 0xea, 0xec, 0xb2, 0xa3, 0x0b, 0x22, 0xa8, 0x7f, }, - { 0x99, 0x24, 0xa4, 0x3c, 0xc1, 0x31, 0x57, 0x24, }, - { 0xbd, 0x83, 0x8d, 0x3a, 0xaf, 0xbf, 0x8d, 0xb7, }, - { 0x0b, 0x1a, 0x2a, 0x32, 0x65, 0xd5, 0x1a, 0xea, }, - { 0x13, 0x50, 0x79, 0xa3, 0x23, 0x1c, 0xe6, 0x60, }, - { 0x93, 0x2b, 0x28, 0x46, 0xe4, 0xd7, 0x06, 0x66, }, - { 0xe1, 0x91, 0x5f, 0x5c, 0xb1, 0xec, 0xa4, 0x6c, }, - { 0xf3, 0x25, 0x96, 0x5c, 0xa1, 0x6d, 0x62, 0x9f, }, - { 0x57, 0x5f, 0xf2, 0x8e, 0x60, 0x38, 0x1b, 0xe5, }, - { 0x72, 0x45, 0x06, 0xeb, 0x4c, 0x32, 
0x8a, 0x95, } -}; - - -static int test_vectors() -{ -#define MAXLEN 64 - char in[MAXLEN]; - - union - { - char out[16]; - uint64_t out64[2]; - }; - - union - { - char k[16]; - uint64_t k64[2]; - }; - - int i; - int ok = 1; - - for (i = 0; i < 16; ++i) - k[i] = i; - - for (i = 0; i < MAXLEN; ++i) - { - in[i] = i; - - size_t part = i == 0 ? 0 : (rand() % i); - - SipHash hash(k64[0], k64[1]); - - hash.update(in, part); - hash.update(in + part, i - part); - - hash.get128(out); - - uint64_t test_vector; - memcpy(&test_vector, vectors[i], 8); - - if ((out64[0] ^ out64[1]) != test_vector) - { - std::cerr << "test vector failed for " << i << " bytes" << std::endl; - ok = 0; - } - } - - return ok; -} - - -int main(int, char **) -{ - size_t n = 100000; - - size_t i = 0; - for (; i < n; ++i) - if (!test_vectors()) - break; - - if (i == n) - std::cerr << "test vectors ok" << std::endl; - - return 0; -} diff --git a/dbms/src/Compression/CompressedReadBuffer.cpp b/dbms/src/Compression/CompressedReadBuffer.cpp index bae56e2a283..a5eb59e1f59 100644 --- a/dbms/src/Compression/CompressedReadBuffer.cpp +++ b/dbms/src/Compression/CompressedReadBuffer.cpp @@ -14,7 +14,12 @@ bool CompressedReadBuffer::nextImpl() if (!size_compressed) return false; - memory.resize(size_decompressed + codec->getAdditionalSizeAtTheEndOfBuffer()); + auto additional_size_at_the_end_of_buffer = codec->getAdditionalSizeAtTheEndOfBuffer(); + + /// This is for clang static analyzer. + assert(size_decompressed + additional_size_at_the_end_of_buffer > 0); + + memory.resize(size_decompressed + additional_size_at_the_end_of_buffer); working_buffer = Buffer(memory.data(), &memory[size_decompressed]); decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum); @@ -39,8 +44,10 @@ size_t CompressedReadBuffer::readBig(char * to, size_t n) if (!readCompressedData(size_decompressed, size_compressed_without_checksum)) return bytes_read; - /// If the decompressed block is placed entirely where it needs to be copied. - if (size_decompressed + codec->getAdditionalSizeAtTheEndOfBuffer() <= n - bytes_read) + auto additional_size_at_the_end_of_buffer = codec->getAdditionalSizeAtTheEndOfBuffer(); + + /// If the decompressed block fits entirely where it needs to be copied. + if (size_decompressed + additional_size_at_the_end_of_buffer <= n - bytes_read) { decompress(to + bytes_read, size_decompressed, size_compressed_without_checksum); bytes_read += size_decompressed; @@ -49,7 +56,11 @@ size_t CompressedReadBuffer::readBig(char * to, size_t n) else { bytes += offset(); - memory.resize(size_decompressed + codec->getAdditionalSizeAtTheEndOfBuffer()); + + /// This is for clang static analyzer. 
+ assert(size_decompressed + additional_size_at_the_end_of_buffer > 0); + + memory.resize(size_decompressed + additional_size_at_the_end_of_buffer); working_buffer = Buffer(memory.data(), &memory[size_decompressed]); pos = working_buffer.begin(); diff --git a/dbms/src/Compression/CompressedReadBufferBase.cpp b/dbms/src/Compression/CompressedReadBufferBase.cpp index 1cc22c4f5af..25efa6cdfb5 100644 --- a/dbms/src/Compression/CompressedReadBufferBase.cpp +++ b/dbms/src/Compression/CompressedReadBufferBase.cpp @@ -1,7 +1,8 @@ #include "CompressedReadBufferBase.h" #include -#include +#include +#include #include #include #include @@ -118,6 +119,9 @@ size_t CompressedReadBufferBase::readCompressedData(size_t & size_decompressed, size_compressed_without_checksum = ICompressionCodec::readCompressedBlockSize(own_compressed_buffer.data()); size_decompressed = ICompressionCodec::readDecompressedBlockSize(own_compressed_buffer.data()); + /// This is for clang static analyzer. + assert(size_decompressed > 0); + if (size_compressed_without_checksum > DBMS_MAX_COMPRESSED_SIZE) throw Exception("Too large size_compressed_without_checksum: " + toString(size_compressed_without_checksum) @@ -130,9 +134,11 @@ size_t CompressedReadBufferBase::readCompressedData(size_t & size_decompressed, ProfileEvents::increment(ProfileEvents::ReadCompressedBytes, size_compressed_without_checksum + sizeof(Checksum)); + auto additional_size_at_the_end_of_buffer = codec->getAdditionalSizeAtTheEndOfBuffer(); + /// Is whole compressed block located in 'compressed_in->' buffer? if (compressed_in->offset() >= header_size && - compressed_in->position() + size_compressed_without_checksum + codec->getAdditionalSizeAtTheEndOfBuffer() - header_size <= compressed_in->buffer().end()) + compressed_in->position() + size_compressed_without_checksum + additional_size_at_the_end_of_buffer - header_size <= compressed_in->buffer().end()) { compressed_in->position() -= header_size; compressed_buffer = compressed_in->position(); @@ -140,7 +146,7 @@ size_t CompressedReadBufferBase::readCompressedData(size_t & size_decompressed, } else { - own_compressed_buffer.resize(size_compressed_without_checksum + codec->getAdditionalSizeAtTheEndOfBuffer()); + own_compressed_buffer.resize(size_compressed_without_checksum + additional_size_at_the_end_of_buffer); compressed_buffer = own_compressed_buffer.data(); compressed_in->readStrict(compressed_buffer + header_size, size_compressed_without_checksum - header_size); } diff --git a/dbms/src/Compression/CompressedReadBufferFromFile.cpp b/dbms/src/Compression/CompressedReadBufferFromFile.cpp index b87ab5f1528..ddd8bba686f 100644 --- a/dbms/src/Compression/CompressedReadBufferFromFile.cpp +++ b/dbms/src/Compression/CompressedReadBufferFromFile.cpp @@ -1,3 +1,5 @@ +#include + #include "CompressedReadBufferFromFile.h" #include @@ -16,13 +18,18 @@ namespace ErrorCodes bool CompressedReadBufferFromFile::nextImpl() { - size_t size_decompressed; + size_t size_decompressed = 0; size_t size_compressed_without_checksum; size_compressed = readCompressedData(size_decompressed, size_compressed_without_checksum); if (!size_compressed) return false; - memory.resize(size_decompressed + codec->getAdditionalSizeAtTheEndOfBuffer()); + auto additional_size_at_the_end_of_buffer = codec->getAdditionalSizeAtTheEndOfBuffer(); + + /// This is for clang static analyzer. 
+ assert(size_decompressed + additional_size_at_the_end_of_buffer > 0); + + memory.resize(size_decompressed + additional_size_at_the_end_of_buffer); working_buffer = Buffer(memory.data(), &memory[size_decompressed]); decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum); @@ -95,8 +102,10 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n) if (!new_size_compressed) return bytes_read; + auto additional_size_at_the_end_of_buffer = codec->getAdditionalSizeAtTheEndOfBuffer(); + /// If the decompressed block fits entirely where it needs to be copied. - if (size_decompressed + codec->getAdditionalSizeAtTheEndOfBuffer() <= n - bytes_read) + if (size_decompressed + additional_size_at_the_end_of_buffer <= n - bytes_read) { decompress(to + bytes_read, size_decompressed, size_compressed_without_checksum); bytes_read += size_decompressed; @@ -106,7 +115,11 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n) { size_compressed = new_size_compressed; bytes += offset(); - memory.resize(size_decompressed + codec->getAdditionalSizeAtTheEndOfBuffer()); + + /// This is for clang static analyzer. + assert(size_decompressed + additional_size_at_the_end_of_buffer > 0); + + memory.resize(size_decompressed + additional_size_at_the_end_of_buffer); working_buffer = Buffer(memory.data(), &memory[size_decompressed]); pos = working_buffer.begin(); diff --git a/dbms/src/Compression/CompressionCodecMultiple.cpp b/dbms/src/Compression/CompressionCodecMultiple.cpp index 4f5d45606c2..4a43650a8a7 100644 --- a/dbms/src/Compression/CompressionCodecMultiple.cpp +++ b/dbms/src/Compression/CompressionCodecMultiple.cpp @@ -98,18 +98,20 @@ void CompressionCodecMultiple::doDecompressData(const char * source, UInt32 sour /// Insert all data into compressed buf source_size -= (compression_methods_size + 1); - for (long idx = compression_methods_size - 1; idx >= 0; --idx) + for (int idx = compression_methods_size - 1; idx >= 0; --idx) { UInt8 compression_method = source[idx + 1]; const auto codec = CompressionCodecFactory::instance().get(compression_method); - compressed_buf.resize(compressed_buf.size() + codec->getAdditionalSizeAtTheEndOfBuffer()); + auto additional_size_at_the_end_of_buffer = codec->getAdditionalSizeAtTheEndOfBuffer(); + + compressed_buf.resize(compressed_buf.size() + additional_size_at_the_end_of_buffer); UInt32 uncompressed_size = ICompressionCodec::readDecompressedBlockSize(compressed_buf.data()); if (idx == 0 && uncompressed_size != decompressed_size) throw Exception("Wrong final decompressed size in codec Multiple, got " + toString(uncompressed_size) + ", expected " + toString(decompressed_size), ErrorCodes::CORRUPTED_DATA); - uncompressed_buf.resize(uncompressed_size + codec->getAdditionalSizeAtTheEndOfBuffer()); + uncompressed_buf.resize(uncompressed_size + additional_size_at_the_end_of_buffer); codec->decompress(compressed_buf.data(), source_size, uncompressed_buf.data()); uncompressed_buf.swap(compressed_buf); source_size = uncompressed_size; diff --git a/dbms/src/Compression/LZ4_decompress_faster.cpp b/dbms/src/Compression/LZ4_decompress_faster.cpp index 989b34b97bf..801b4925013 100644 --- a/dbms/src/Compression/LZ4_decompress_faster.cpp +++ b/dbms/src/Compression/LZ4_decompress_faster.cpp @@ -6,8 +6,7 @@ #include #include #include -#include -#include +#include #include #ifdef __SSE2__ diff --git a/dbms/src/Compression/tests/gtest_compressionCodec.cpp b/dbms/src/Compression/tests/gtest_compressionCodec.cpp index e3b226c302d..df9471718d1 
100644 --- a/dbms/src/Compression/tests/gtest_compressionCodec.cpp +++ b/dbms/src/Compression/tests/gtest_compressionCodec.cpp @@ -95,7 +95,7 @@ std::string bin(const T & value, size_t bits = sizeof(T)*8) static const uint8_t MAX_BITS = sizeof(T)*8; assert(bits <= MAX_BITS); - return std::bitset(static_cast(value)) + return std::bitset(static_cast(value)) .to_string().substr(MAX_BITS - bits, bits); } @@ -182,7 +182,7 @@ public: return *this; } - operator bool() const + explicit operator bool() const { return ItemsLeft() > 0; } @@ -519,7 +519,7 @@ public: CODEC_WITHOUT_DATA_TYPE, }; - CompressionCodecPtr makeCodec(MakeCodecParam with_data_type) + static CompressionCodecPtr makeCodec(MakeCodecParam with_data_type) { const auto & codec_string = std::get<0>(GetParam()).codec_statement; const auto & data_type = with_data_type == CODEC_WITH_DATA_TYPE ? std::get<1>(GetParam()).data_type : nullptr; @@ -527,7 +527,7 @@ public: return ::makeCodec(codec_string, data_type); } - void testTranscoding(ICompressionCodec & codec) + static void testTranscoding(ICompressionCodec & codec) { NoOpTimer timer; ::testTranscoding(timer, codec, std::get<1>(GetParam()), std::get<0>(GetParam()).expected_compression_ratio); @@ -706,9 +706,9 @@ typename std::conditional_t, std::uniform_real_distr template -struct MonotonicGenerator +struct MonotonicGenerator // NOLINT { - MonotonicGenerator(T stride_ = 1, T max_step = 10) + explicit MonotonicGenerator(T stride_ = 1, T max_step = 10) // NOLINT : prev_value(0), stride(stride_), random_engine(0), @@ -732,7 +732,7 @@ private: template struct RandomGenerator { - RandomGenerator(T seed = 0, T value_min = std::numeric_limits::min(), T value_max = std::numeric_limits::max()) + explicit RandomGenerator(T seed = 0, T value_min = std::numeric_limits::min(), T value_max = std::numeric_limits::max()) : random_engine(seed), distribution(value_min, value_max) { diff --git a/dbms/src/Core/Defines.h b/dbms/src/Core/Defines.h index ce3a8122ead..f2d4a517712 100644 --- a/dbms/src/Core/Defines.h +++ b/dbms/src/Core/Defines.h @@ -1,5 +1,7 @@ #pragma once +#include + #define DBMS_DEFAULT_HOST "localhost" #define DBMS_DEFAULT_PORT 9000 #define DBMS_DEFAULT_SECURE_PORT 9440 @@ -81,76 +83,6 @@ // more aliases: https://mailman.videolan.org/pipermail/x264-devel/2014-May/010660.html -#if defined(_MSC_VER) - #define ALWAYS_INLINE __forceinline - #define NO_INLINE static __declspec(noinline) - #define MAY_ALIAS -#else - #define ALWAYS_INLINE __attribute__((__always_inline__)) - #define NO_INLINE __attribute__((__noinline__)) - #define MAY_ALIAS __attribute__((__may_alias__)) -#endif - - -#define PLATFORM_NOT_SUPPORTED "The only supported platforms are x86_64 and AArch64, PowerPC (work in progress)" - -#if !defined(__x86_64__) && !defined(__aarch64__) && !defined(__PPC__) - #error PLATFORM_NOT_SUPPORTED -#endif - -/// Check for presence of address sanitizer -#if !defined(ADDRESS_SANITIZER) -#if defined(__has_feature) - #if __has_feature(address_sanitizer) - #define ADDRESS_SANITIZER 1 - #endif -#elif defined(__SANITIZE_ADDRESS__) - #define ADDRESS_SANITIZER 1 -#endif -#endif - -#if !defined(THREAD_SANITIZER) -#if defined(__has_feature) - #if __has_feature(thread_sanitizer) - #define THREAD_SANITIZER 1 - #endif -#elif defined(__SANITIZE_THREAD__) - #define THREAD_SANITIZER 1 -#endif -#endif - -#if !defined(MEMORY_SANITIZER) -#if defined(__has_feature) - #if __has_feature(memory_sanitizer) - #define MEMORY_SANITIZER 1 - #endif -#elif defined(__MEMORY_SANITIZER__) - #define MEMORY_SANITIZER 1 -#endif 
-#endif - -/// TODO Strange enough, there is no way to detect UB sanitizer. - -/// Explicitly allow undefined behaviour for certain functions. Use it as a function attribute. -/// It is useful in case when compiler cannot see (and exploit) it, but UBSan can. -/// Example: multiplication of signed integers with possibility of overflow when both sides are from user input. -#if defined(__clang__) - #define NO_SANITIZE_UNDEFINED __attribute__((__no_sanitize__("undefined"))) - #define NO_SANITIZE_ADDRESS __attribute__((__no_sanitize__("address"))) - #define NO_SANITIZE_THREAD __attribute__((__no_sanitize__("thread"))) -#else - /// It does not work in GCC. GCC 7 cannot recognize this attribute and GCC 8 simply ignores it. - #define NO_SANITIZE_UNDEFINED - #define NO_SANITIZE_ADDRESS - #define NO_SANITIZE_THREAD -#endif - -#if defined __GNUC__ && !defined __clang__ - #define OPTIMIZE(x) __attribute__((__optimize__(x))) -#else - #define OPTIMIZE(x) -#endif - /// Marks that extra information is sent to a shard. It could be any magic numbers. #define DBMS_DISTRIBUTED_SIGNATURE_HEADER 0xCAFEDACEull #define DBMS_DISTRIBUTED_SIGNATURE_HEADER_OLD_FORMAT 0xCAFECABEull @@ -159,7 +91,3 @@ # define ASAN_UNPOISON_MEMORY_REGION(a, b) # define ASAN_POISON_MEMORY_REGION(a, b) #endif - -/// A macro for suppressing warnings about unused variables or function results. -/// Useful for structured bindings which have no standard way to declare this. -#define UNUSED(...) (void)(__VA_ARGS__) diff --git a/dbms/src/Core/ExternalTable.cpp b/dbms/src/Core/ExternalTable.cpp index 49da7890d76..62a99cea97e 100644 --- a/dbms/src/Core/ExternalTable.cpp +++ b/dbms/src/Core/ExternalTable.cpp @@ -1,5 +1,4 @@ #include -#include #include #include #include @@ -8,13 +7,13 @@ #include #include #include -#include #include #include - -#include #include #include +#include +#include +#include namespace DB @@ -42,11 +41,11 @@ ExternalTableDataPtr BaseExternalTable::getData(const Context & context) void BaseExternalTable::clean() { - name = ""; - file = ""; - format = ""; + name.clear(); + file.clear(); + format.clear(); structure.clear(); - sample_block = Block(); + sample_block.clear(); read_buffer.reset(); } @@ -58,22 +57,16 @@ void BaseExternalTable::write() std::cerr << "format " << format << std::endl; std::cerr << "structure: \n"; for (const auto & elem : structure) - std::cerr << "\t" << elem.first << " " << elem.second << std::endl; -} - -std::vector BaseExternalTable::split(const std::string & s, const std::string & d) -{ - std::vector res; - boost::split(res, s, boost::algorithm::is_any_of(d), boost::algorithm::token_compress_on); - return res; + std::cerr << '\t' << elem.first << ' ' << elem.second << std::endl; } void BaseExternalTable::parseStructureFromStructureField(const std::string & argument) { - std::vector vals = split(argument, " ,"); + std::vector vals; + splitInto<' ', ','>(vals, argument, true); - if (vals.size() & 1) - throw Exception("Odd number of attributes in section structure", ErrorCodes::BAD_ARGUMENTS); + if (vals.size() % 2 != 0) + throw Exception("Odd number of attributes in section structure: " + std::to_string(vals.size()), ErrorCodes::BAD_ARGUMENTS); for (size_t i = 0; i < vals.size(); i += 2) structure.emplace_back(vals[i], vals[i + 1]); @@ -81,7 +74,8 @@ void BaseExternalTable::parseStructureFromStructureField(const std::string & arg void BaseExternalTable::parseStructureFromTypesField(const std::string & argument) { - std::vector vals = split(argument, " ,"); + std::vector vals; + splitInto<' ', 
','>(vals, argument, true); for (size_t i = 0; i < vals.size(); ++i) structure.emplace_back("_" + toString(i + 1), vals[i]); diff --git a/dbms/src/Core/ExternalTable.h b/dbms/src/Core/ExternalTable.h index 8fa9bcbb6b7..f26af1cc6ca 100644 --- a/dbms/src/Core/ExternalTable.h +++ b/dbms/src/Core/ExternalTable.h @@ -66,8 +66,6 @@ protected: /// Function for debugging information output void write(); - static std::vector split(const std::string & s, const std::string & d); - /// Construct the `structure` vector from the text field `structure` virtual void parseStructureFromStructureField(const std::string & argument); diff --git a/dbms/src/Core/SettingsCollection.cpp b/dbms/src/Core/SettingsCollection.cpp index f2ac331b028..d45c082eb0b 100644 --- a/dbms/src/Core/SettingsCollection.cpp +++ b/dbms/src/Core/SettingsCollection.cpp @@ -208,7 +208,7 @@ void SettingMaxThreads::setAuto() is_auto = true; } -UInt64 SettingMaxThreads::getAutoValue() const +UInt64 SettingMaxThreads::getAutoValue() { static auto res = getNumberOfPhysicalCPUCores(); return res; @@ -465,7 +465,7 @@ void SettingURI::deserialize(ReadBuffer & buf, SettingsBinaryFormat) case static_cast(EnumType::NAME): return IO_NAME; #define IMPLEMENT_SETTING_ENUM_FROM_STRING_HELPER_(NAME, IO_NAME) \ - if (s == IO_NAME) \ + if (s == (IO_NAME)) \ { \ set(EnumType::NAME); \ return; \ @@ -474,7 +474,7 @@ void SettingURI::deserialize(ReadBuffer & buf, SettingsBinaryFormat) #define IMPLEMENT_SETTING_ENUM_CONCAT_NAMES_HELPER_(NAME, IO_NAME) \ if (!all_io_names.empty()) \ all_io_names += ", "; \ - all_io_names += String("'") + IO_NAME + "'"; + all_io_names += String("'") + (IO_NAME) + "'"; #define LOAD_BALANCING_LIST_OF_NAMES(M) \ diff --git a/dbms/src/Core/SettingsCollection.h b/dbms/src/Core/SettingsCollection.h index 700e96f0d40..466b80b1fd3 100644 --- a/dbms/src/Core/SettingsCollection.h +++ b/dbms/src/Core/SettingsCollection.h @@ -91,7 +91,7 @@ struct SettingMaxThreads void deserialize(ReadBuffer & buf, SettingsBinaryFormat format); void setAuto(); - UInt64 getAutoValue() const; + static UInt64 getAutoValue(); }; diff --git a/dbms/src/Core/Types.h b/dbms/src/Core/Types.h index a7863511e69..208da48797e 100644 --- a/dbms/src/Core/Types.h +++ b/dbms/src/Core/Types.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include diff --git a/dbms/src/Core/tests/string_pool.cpp b/dbms/src/Core/tests/string_pool.cpp index a2ee0ec3ea5..358381c4c0c 100644 --- a/dbms/src/Core/tests/string_pool.cpp +++ b/dbms/src/Core/tests/string_pool.cpp @@ -27,7 +27,7 @@ int main(int argc, char ** argv) std::ofstream devnull("/dev/null"); DB::ReadBufferFromFileDescriptor in(STDIN_FILENO); - size_t n = atoi(argv[1]); + size_t n = std::stol(argv[1]); size_t elems_show = 1; using Vec = std::vector; diff --git a/dbms/src/DataStreams/AddingDefaultsBlockInputStream.cpp b/dbms/src/DataStreams/AddingDefaultsBlockInputStream.cpp index d685b0225e6..d2df3dbc496 100644 --- a/dbms/src/DataStreams/AddingDefaultsBlockInputStream.cpp +++ b/dbms/src/DataStreams/AddingDefaultsBlockInputStream.cpp @@ -32,6 +32,101 @@ namespace ErrorCodes } +static void checkCalculated(const ColumnWithTypeAndName & col_read, + const ColumnWithTypeAndName & col_defaults, + size_t defaults_needed) +{ + size_t column_size = col_read.column->size(); + + if (column_size != col_defaults.column->size()) + throw Exception("Mismatch column sizes while adding defaults", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + + if (column_size < defaults_needed) + throw Exception("Unexpected defaults count", 
ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + + if (!col_read.type->equals(*col_defaults.type)) + throw Exception("Mismach column types while adding defaults", ErrorCodes::TYPE_MISMATCH); +} + +static void mixNumberColumns( + TypeIndex type_idx, + MutableColumnPtr & column_mixed, + const ColumnPtr & col_defaults, + const BlockMissingValues::RowsBitMask & defaults_mask) +{ + auto call = [&](const auto & types) + { + using Types = std::decay_t; + using DataType = typename Types::LeftType; + + if constexpr (!std::is_same_v && !std::is_same_v) + { + using FieldType = typename DataType::FieldType; + using ColVecType = std::conditional_t, ColumnDecimal, ColumnVector>; + + auto col_read = typeid_cast(column_mixed.get()); + if (!col_read) + return false; + + typename ColVecType::Container & dst = col_read->getData(); + + if (auto const_col_defs = checkAndGetColumnConst(col_defaults.get())) + { + FieldType value = checkAndGetColumn(const_col_defs->getDataColumnPtr().get())->getData()[0]; + + for (size_t i = 0; i < defaults_mask.size(); ++i) + if (defaults_mask[i]) + dst[i] = value; + + return true; + } + else if (auto col_defs = checkAndGetColumn(col_defaults.get())) + { + auto & src = col_defs->getData(); + for (size_t i = 0; i < defaults_mask.size(); ++i) + if (defaults_mask[i]) + dst[i] = src[i]; + + return true; + } + } + + return false; + }; + + if (!callOnIndexAndDataType(type_idx, call)) + throw Exception("Unexpected type on mixNumberColumns", ErrorCodes::LOGICAL_ERROR); +} + +static MutableColumnPtr mixColumns(const ColumnWithTypeAndName & col_read, + const ColumnWithTypeAndName & col_defaults, + const BlockMissingValues::RowsBitMask & defaults_mask) +{ + size_t column_size = col_read.column->size(); + size_t defaults_needed = defaults_mask.size(); + + MutableColumnPtr column_mixed = col_read.column->cloneEmpty(); + + for (size_t i = 0; i < defaults_needed; ++i) + { + if (defaults_mask[i]) + { + if (isColumnConst(*col_defaults.column)) + column_mixed->insert((*col_defaults.column)[i]); + else + column_mixed->insertFrom(*col_defaults.column, i); + } + else + column_mixed->insertFrom(*col_read.column, i); + } + + for (size_t i = defaults_needed; i < column_size; ++i) + column_mixed->insertFrom(*col_read.column, i); + + return column_mixed; +} + + AddingDefaultsBlockInputStream::AddingDefaultsBlockInputStream(const BlockInputStreamPtr & input, const ColumnDefaults & column_defaults_, const Context & context_) @@ -124,95 +219,4 @@ Block AddingDefaultsBlockInputStream::readImpl() return res; } -void AddingDefaultsBlockInputStream::checkCalculated(const ColumnWithTypeAndName & col_read, - const ColumnWithTypeAndName & col_defaults, - size_t defaults_needed) const -{ - size_t column_size = col_read.column->size(); - - if (column_size != col_defaults.column->size()) - throw Exception("Mismatch column sizes while adding defaults", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); - - if (column_size < defaults_needed) - throw Exception("Unexpected defaults count", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); - - if (!col_read.type->equals(*col_defaults.type)) - throw Exception("Mismach column types while adding defaults", ErrorCodes::TYPE_MISMATCH); -} - -void AddingDefaultsBlockInputStream::mixNumberColumns(TypeIndex type_idx, MutableColumnPtr & column_mixed, const ColumnPtr & col_defaults, - const BlockMissingValues::RowsBitMask & defaults_mask) const -{ - auto call = [&](const auto & types) -> bool - { - using Types = std::decay_t; - using DataType = typename Types::LeftType; - - if constexpr 
(!std::is_same_v && !std::is_same_v) - { - using FieldType = typename DataType::FieldType; - using ColVecType = std::conditional_t, ColumnDecimal, ColumnVector>; - - auto col_read = typeid_cast(column_mixed.get()); - if (!col_read) - return false; - - typename ColVecType::Container & dst = col_read->getData(); - - if (auto const_col_defs = checkAndGetColumnConst(col_defaults.get())) - { - FieldType value = checkAndGetColumn(const_col_defs->getDataColumnPtr().get())->getData()[0]; - - for (size_t i = 0; i < defaults_mask.size(); ++i) - if (defaults_mask[i]) - dst[i] = value; - - return true; - } - else if (auto col_defs = checkAndGetColumn(col_defaults.get())) - { - auto & src = col_defs->getData(); - for (size_t i = 0; i < defaults_mask.size(); ++i) - if (defaults_mask[i]) - dst[i] = src[i]; - - return true; - } - } - - return false; - }; - - if (!callOnIndexAndDataType(type_idx, call)) - throw Exception("Unexpected type on mixNumberColumns", ErrorCodes::LOGICAL_ERROR); -} - -MutableColumnPtr AddingDefaultsBlockInputStream::mixColumns(const ColumnWithTypeAndName & col_read, - const ColumnWithTypeAndName & col_defaults, - const BlockMissingValues::RowsBitMask & defaults_mask) const -{ - size_t column_size = col_read.column->size(); - size_t defaults_needed = defaults_mask.size(); - - MutableColumnPtr column_mixed = col_read.column->cloneEmpty(); - - for (size_t i = 0; i < defaults_needed; ++i) - { - if (defaults_mask[i]) - { - if (isColumnConst(*col_defaults.column)) - column_mixed->insert((*col_defaults.column)[i]); - else - column_mixed->insertFrom(*col_defaults.column, i); - } - else - column_mixed->insertFrom(*col_read.column, i); - } - - for (size_t i = defaults_needed; i < column_size; ++i) - column_mixed->insertFrom(*col_read.column, i); - - return column_mixed; -} - } diff --git a/dbms/src/DataStreams/AddingDefaultsBlockInputStream.h b/dbms/src/DataStreams/AddingDefaultsBlockInputStream.h index 0d6f36861a4..cf16ec3fb75 100644 --- a/dbms/src/DataStreams/AddingDefaultsBlockInputStream.h +++ b/dbms/src/DataStreams/AddingDefaultsBlockInputStream.h @@ -27,12 +27,6 @@ private: Block header; const ColumnDefaults column_defaults; const Context & context; - - void checkCalculated(const ColumnWithTypeAndName & col_read, const ColumnWithTypeAndName & col_defaults, size_t needed) const; - MutableColumnPtr mixColumns(const ColumnWithTypeAndName & col_read, const ColumnWithTypeAndName & col_defaults, - const BlockMissingValues::RowsBitMask & defaults_mask) const; - void mixNumberColumns(TypeIndex type_idx, MutableColumnPtr & column_mixed, const ColumnPtr & col_defaults, - const BlockMissingValues::RowsBitMask & defaults_mask) const; }; } diff --git a/dbms/src/DataStreams/CollapsingFinalBlockInputStream.cpp b/dbms/src/DataStreams/CollapsingFinalBlockInputStream.cpp index c81b72abec4..a90d41b23ba 100644 --- a/dbms/src/DataStreams/CollapsingFinalBlockInputStream.cpp +++ b/dbms/src/DataStreams/CollapsingFinalBlockInputStream.cpp @@ -10,17 +10,7 @@ namespace DB CollapsingFinalBlockInputStream::~CollapsingFinalBlockInputStream() { - /// You must cancel all `MergingBlockPtr` so that they do not try to put blocks in `output_blocks`. 
- previous.block.cancel(); - last_positive.block.cancel(); - - while (!queue.empty()) - { - Cursor c = queue.top(); - queue.pop(); - c.block.cancel(); - } - + queue = {}; for (auto & block : output_blocks) delete block; } diff --git a/dbms/src/DataStreams/CollapsingFinalBlockInputStream.h b/dbms/src/DataStreams/CollapsingFinalBlockInputStream.h index 6f198dbab8c..c237d5465c3 100644 --- a/dbms/src/DataStreams/CollapsingFinalBlockInputStream.h +++ b/dbms/src/DataStreams/CollapsingFinalBlockInputStream.h @@ -1,10 +1,14 @@ #pragma once + #include #include #include #include #include #include +#include +#include + namespace DB { @@ -13,6 +17,7 @@ namespace ErrorCodes extern const int BAD_TYPE_OF_FIELD; } + /// Collapses the same rows with the opposite sign roughly like CollapsingSortedBlockInputStream. /// Outputs the rows in random order (the input streams must still be ordered). /// Outputs only rows with a positive sign. @@ -37,10 +42,7 @@ public: Block getHeader() const override { return children.at(0)->getHeader(); } -protected: - Block readImpl() override; -private: struct MergingBlock; using BlockPlainPtrs = std::vector; @@ -95,74 +97,11 @@ private: BlockPlainPtrs * output_blocks; }; +private: + Block readImpl() override; + /// When deleting the last block reference, adds a block to `output_blocks`. - class MergingBlockPtr - { - public: - MergingBlockPtr() : ptr() {} - - explicit MergingBlockPtr(MergingBlock * ptr_) : ptr(ptr_) - { - if (ptr) - ++ptr->refcount; - } - - MergingBlockPtr(const MergingBlockPtr & rhs) : ptr(rhs.ptr) - { - if (ptr) - ++ptr->refcount; - } - - MergingBlockPtr & operator=(const MergingBlockPtr & rhs) - { - destroy(); - ptr = rhs.ptr; - if (ptr) - ++ptr->refcount; - return *this; - } - - ~MergingBlockPtr() - { - destroy(); - } - - /// Zero the pointer and do not add a block to output_blocks. 
- void cancel() - { - if (ptr) - { - --ptr->refcount; - if (!ptr->refcount) - delete ptr; - ptr = nullptr; - } - } - - MergingBlock & operator*() const { return *ptr; } - MergingBlock * operator->() const { return ptr; } - operator bool() const { return !!ptr; } - bool operator!() const { return !ptr; } - - private: - MergingBlock * ptr; - - void destroy() - { - if (ptr) - { - --ptr->refcount; - if (!ptr->refcount) - { - if (std::uncaught_exceptions()) - delete ptr; - else - ptr->output_blocks->push_back(ptr); - } - ptr = nullptr; - } - } - }; + using MergingBlockPtr = boost::intrusive_ptr; struct Cursor { @@ -257,4 +196,16 @@ private: void reportBadSign(Int8 sign); }; + +inline void intrusive_ptr_add_ref(CollapsingFinalBlockInputStream::MergingBlock * ptr) +{ + ++ptr->refcount; +} + +inline void intrusive_ptr_release(CollapsingFinalBlockInputStream::MergingBlock * ptr) +{ + if (0 == --ptr->refcount) + ptr->output_blocks->push_back(ptr); +} + } diff --git a/dbms/src/DataStreams/MergingSortedBlockInputStream.cpp b/dbms/src/DataStreams/MergingSortedBlockInputStream.cpp index 0b8c20abe5e..10822485d52 100644 --- a/dbms/src/DataStreams/MergingSortedBlockInputStream.cpp +++ b/dbms/src/DataStreams/MergingSortedBlockInputStream.cpp @@ -2,6 +2,8 @@ #include #include +#include + #include #include @@ -21,6 +23,7 @@ MergingSortedBlockInputStream::MergingSortedBlockInputStream( : description(description_), max_block_size(max_block_size_), limit(limit_), quiet(quiet_) , average_block_sizes(average_block_sizes_), source_blocks(inputs_.size()) , cursors(inputs_.size()), out_row_sources_buf(out_row_sources_buf_) + , log(&Logger::get("MergingSortedBlockInputStream")) { children.insert(children.end(), inputs_.begin(), inputs_.end()); header = children.at(0)->getHeader(); diff --git a/dbms/src/DataStreams/MergingSortedBlockInputStream.h b/dbms/src/DataStreams/MergingSortedBlockInputStream.h index e6c2b257013..d9c9f1c26cc 100644 --- a/dbms/src/DataStreams/MergingSortedBlockInputStream.h +++ b/dbms/src/DataStreams/MergingSortedBlockInputStream.h @@ -1,8 +1,5 @@ #pragma once -#include - -#include #include #include @@ -14,6 +11,9 @@ #include +namespace Poco { class Logger; } + + namespace DB { @@ -176,7 +176,7 @@ private: template void merge(MutableColumns & merged_columns, TSortingHeap & queue); - Logger * log = &Logger::get("MergingSortedBlockInputStream"); + Poco::Logger * log; /// Read is finished. bool finished = false; diff --git a/dbms/src/DataStreams/ParallelInputsProcessor.h b/dbms/src/DataStreams/ParallelInputsProcessor.h index 505bfac567c..a786dac7497 100644 --- a/dbms/src/DataStreams/ParallelInputsProcessor.h +++ b/dbms/src/DataStreams/ParallelInputsProcessor.h @@ -206,6 +206,8 @@ private: } loop(thread_num); + + handler.onFinishThread(thread_num); } catch (...) { @@ -217,8 +219,6 @@ private: handler.onException(exception, thread_num); } - handler.onFinishThread(thread_num); - /// The last thread on the output indicates that there is no more data. if (0 == --active_threads) { @@ -242,7 +242,19 @@ private: } } - handler.onFinish(); /// TODO If in `onFinish` or `onFinishThread` there is an exception, then std::terminate is called. + try + { + handler.onFinish(); + } + catch (...) 
+ { + exception = std::current_exception(); + } + + if (exception) + { + handler.onException(exception, thread_num); + } } } diff --git a/dbms/src/DataStreams/SizeLimits.h b/dbms/src/DataStreams/SizeLimits.h index 1779831a86f..48fd2b9343f 100644 --- a/dbms/src/DataStreams/SizeLimits.h +++ b/dbms/src/DataStreams/SizeLimits.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace DB diff --git a/dbms/src/DataStreams/SummingSortedBlockInputStream.cpp b/dbms/src/DataStreams/SummingSortedBlockInputStream.cpp index fbb1622326e..ed5b1b820b4 100644 --- a/dbms/src/DataStreams/SummingSortedBlockInputStream.cpp +++ b/dbms/src/DataStreams/SummingSortedBlockInputStream.cpp @@ -37,6 +37,89 @@ namespace return false; } + + /// Returns true if merge result is not empty + bool mergeMap(const SummingSortedBlockInputStream::MapDescription & desc, Row & row, SortCursor & cursor) + { + /// Strongly non-optimal. + + Row & left = row; + Row right(left.size()); + + for (size_t col_num : desc.key_col_nums) + right[col_num] = (*cursor->all_columns[col_num])[cursor->pos].template get(); + + for (size_t col_num : desc.val_col_nums) + right[col_num] = (*cursor->all_columns[col_num])[cursor->pos].template get(); + + auto at_ith_column_jth_row = [&](const Row & matrix, size_t i, size_t j) -> const Field & + { + return matrix[i].get()[j]; + }; + + auto tuple_of_nth_columns_at_jth_row = [&](const Row & matrix, const ColumnNumbers & col_nums, size_t j) -> Array + { + size_t size = col_nums.size(); + Array res(size); + for (size_t col_num_index = 0; col_num_index < size; ++col_num_index) + res[col_num_index] = at_ith_column_jth_row(matrix, col_nums[col_num_index], j); + return res; + }; + + std::map merged; + + auto accumulate = [](Array & dst, const Array & src) + { + bool has_non_zero = false; + size_t size = dst.size(); + for (size_t i = 0; i < size; ++i) + if (applyVisitor(FieldVisitorSum(src[i]), dst[i])) + has_non_zero = true; + return has_non_zero; + }; + + auto merge = [&](const Row & matrix) + { + size_t rows = matrix[desc.key_col_nums[0]].get().size(); + + for (size_t j = 0; j < rows; ++j) + { + Array key = tuple_of_nth_columns_at_jth_row(matrix, desc.key_col_nums, j); + Array value = tuple_of_nth_columns_at_jth_row(matrix, desc.val_col_nums, j); + + auto it = merged.find(key); + if (merged.end() == it) + merged.emplace(std::move(key), std::move(value)); + else + { + if (!accumulate(it->second, value)) + merged.erase(it); + } + } + }; + + merge(left); + merge(right); + + for (size_t col_num : desc.key_col_nums) + row[col_num] = Array(merged.size()); + for (size_t col_num : desc.val_col_nums) + row[col_num] = Array(merged.size()); + + size_t row_num = 0; + for (const auto & key_value : merged) + { + for (size_t col_num_index = 0, size = desc.key_col_nums.size(); col_num_index < size; ++col_num_index) + row[desc.key_col_nums[col_num_index]].get()[row_num] = key_value.first[col_num_index]; + + for (size_t col_num_index = 0, size = desc.val_col_nums.size(); col_num_index < size; ++col_num_index) + row[desc.val_col_nums[col_num_index]].get()[row_num] = key_value.second[col_num_index]; + + ++row_num; + } + + return row_num != 0; + } } @@ -46,7 +129,8 @@ SummingSortedBlockInputStream::SummingSortedBlockInputStream( /// List of columns to be summed. If empty, all numeric columns that are not in the description are taken. 
const Names & column_names_to_sum, size_t max_block_size_) - : MergingSortedBlockInputStream(inputs_, description_, max_block_size_) + : MergingSortedBlockInputStream(inputs_, description_, max_block_size_), + log(&Logger::get("SummingSortedBlockInputStream")) { current_row.resize(num_columns); @@ -401,89 +485,6 @@ void SummingSortedBlockInputStream::merge(MutableColumns & merged_columns, Sorti } -bool SummingSortedBlockInputStream::mergeMap(const MapDescription & desc, Row & row, SortCursor & cursor) -{ - /// Strongly non-optimal. - - Row & left = row; - Row right(left.size()); - - for (size_t col_num : desc.key_col_nums) - right[col_num] = (*cursor->all_columns[col_num])[cursor->pos].template get(); - - for (size_t col_num : desc.val_col_nums) - right[col_num] = (*cursor->all_columns[col_num])[cursor->pos].template get(); - - auto at_ith_column_jth_row = [&](const Row & matrix, size_t i, size_t j) -> const Field & - { - return matrix[i].get()[j]; - }; - - auto tuple_of_nth_columns_at_jth_row = [&](const Row & matrix, const ColumnNumbers & col_nums, size_t j) -> Array - { - size_t size = col_nums.size(); - Array res(size); - for (size_t col_num_index = 0; col_num_index < size; ++col_num_index) - res[col_num_index] = at_ith_column_jth_row(matrix, col_nums[col_num_index], j); - return res; - }; - - std::map merged; - - auto accumulate = [](Array & dst, const Array & src) - { - bool has_non_zero = false; - size_t size = dst.size(); - for (size_t i = 0; i < size; ++i) - if (applyVisitor(FieldVisitorSum(src[i]), dst[i])) - has_non_zero = true; - return has_non_zero; - }; - - auto merge = [&](const Row & matrix) - { - size_t rows = matrix[desc.key_col_nums[0]].get().size(); - - for (size_t j = 0; j < rows; ++j) - { - Array key = tuple_of_nth_columns_at_jth_row(matrix, desc.key_col_nums, j); - Array value = tuple_of_nth_columns_at_jth_row(matrix, desc.val_col_nums, j); - - auto it = merged.find(key); - if (merged.end() == it) - merged.emplace(std::move(key), std::move(value)); - else - { - if (!accumulate(it->second, value)) - merged.erase(it); - } - } - }; - - merge(left); - merge(right); - - for (size_t col_num : desc.key_col_nums) - row[col_num] = Array(merged.size()); - for (size_t col_num : desc.val_col_nums) - row[col_num] = Array(merged.size()); - - size_t row_num = 0; - for (const auto & key_value : merged) - { - for (size_t col_num_index = 0, size = desc.key_col_nums.size(); col_num_index < size; ++col_num_index) - row[desc.key_col_nums[col_num_index]].get()[row_num] = key_value.first[col_num_index]; - - for (size_t col_num_index = 0, size = desc.val_col_nums.size(); col_num_index < size; ++col_num_index) - row[desc.val_col_nums[col_num_index]].get()[row_num] = key_value.second[col_num_index]; - - ++row_num; - } - - return row_num != 0; -} - - void SummingSortedBlockInputStream::addRow(SortCursor & cursor) { for (auto & desc : columns_to_aggregate) diff --git a/dbms/src/DataStreams/SummingSortedBlockInputStream.h b/dbms/src/DataStreams/SummingSortedBlockInputStream.h index bdd68c991cc..09bced85b1d 100644 --- a/dbms/src/DataStreams/SummingSortedBlockInputStream.h +++ b/dbms/src/DataStreams/SummingSortedBlockInputStream.h @@ -10,6 +10,8 @@ #include +namespace Poco { class Logger; } + namespace DB { @@ -35,12 +37,19 @@ public: String getName() const override { return "SummingSorted"; } + /// Stores numbers of key-columns and value-columns. + struct MapDescription + { + std::vector key_col_nums; + std::vector val_col_nums; + }; + protected: /// Can return 1 more records than max_block_size. 
Block readImpl() override; private: - Logger * log = &Logger::get("SummingSortedBlockInputStream"); + Poco::Logger * log; /// Read up to the end. bool finished = false; @@ -120,13 +129,6 @@ private: AggregateDescription(const AggregateDescription &) = delete; }; - /// Stores numbers of key-columns and value-columns. - struct MapDescription - { - std::vector key_col_nums; - std::vector val_col_nums; - }; - std::vector columns_to_aggregate; std::vector maps_to_sum; @@ -146,9 +148,6 @@ private: /// Insert the summed row for the current group into the result and updates some of per-block flags if the row is not "zero". void insertCurrentRowIfNeeded(MutableColumns & merged_columns); - /// Returns true if merge result is not empty - bool mergeMap(const MapDescription & desc, Row & row, SortCursor & cursor); - // Add the row under the cursor to the `row`. void addRow(SortCursor & cursor); }; diff --git a/dbms/src/DataStreams/tests/finish_sorting_stream.cpp b/dbms/src/DataStreams/tests/finish_sorting_stream.cpp index 861965e1e1c..cfc9ba217b3 100644 --- a/dbms/src/DataStreams/tests/finish_sorting_stream.cpp +++ b/dbms/src/DataStreams/tests/finish_sorting_stream.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -11,6 +12,7 @@ #include + using namespace DB; namespace DB @@ -24,12 +26,12 @@ namespace DB int main(int argc, char ** argv) { - srand(123456); + pcg64 rng; try { - size_t m = argc >= 2 ? atoi(argv[1]) : 2; - size_t n = argc >= 3 ? atoi(argv[2]) : 10; + size_t m = argc >= 2 ? std::stol(argv[1]) : 2; + size_t n = argc >= 3 ? std::stol(argv[2]) : 10; Blocks blocks; for (size_t t = 0; t < m; ++t) @@ -46,7 +48,7 @@ int main(int argc, char ** argv) vec.resize(n); for (size_t j = 0; j < n; ++j) - vec[j] = rand() % 10; + vec[j] = rng() % 10; column.column = std::move(col); block.insert(column); diff --git a/dbms/src/DataTypes/DataTypeDecimalBase.h b/dbms/src/DataTypes/DataTypeDecimalBase.h index bb5b0ca4ee8..2cf73467644 100644 --- a/dbms/src/DataTypes/DataTypeDecimalBase.h +++ b/dbms/src/DataTypes/DataTypeDecimalBase.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include #include diff --git a/dbms/src/DataTypes/DataTypeLowCardinality.cpp b/dbms/src/DataTypes/DataTypeLowCardinality.cpp index 417c988e5b9..ebec1571e9e 100644 --- a/dbms/src/DataTypes/DataTypeLowCardinality.cpp +++ b/dbms/src/DataTypes/DataTypeLowCardinality.cpp @@ -77,7 +77,7 @@ struct KeysSerializationVersion throw Exception("Invalid version for DataTypeLowCardinality key column.", ErrorCodes::LOGICAL_ERROR); } - KeysSerializationVersion(UInt64 version) : value(static_cast(version)) { checkVersion(version); } + explicit KeysSerializationVersion(UInt64 version) : value(static_cast(version)) { checkVersion(version); } }; /// Version is stored at the start of each granule. It's used to store indexes type and flags. 
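Editorial sketch (not part of the patch): a recurring change in the test sources above (auto_array.cpp, radix_sort.cpp, finish_sorting_stream.cpp) is replacing the libc srand()/rand() pair with a locally constructed pcg64 engine from pcg_random.hpp, presumably because rand() is flagged by the security-oriented analyzer checks. The standalone example below shows only the usage pattern taken from those hunks, under the assumption that pcg_random.hpp is on the include path; the loop bound 5 and the modulus 10 are illustrative, not values from the patch.

#include <pcg_random.hpp>

#include <cstddef>
#include <cstdint>
#include <iostream>

int main()
{
    /// Default-constructed engine, exactly as the patched tests do.
    pcg64 rng;

    /// rng() returns a uint64_t, so the old "rand() % 10" idiom carries over unchanged.
    for (size_t i = 0; i < 5; ++i)
        std::cout << rng() % 10 << ' ';
    std::cout << '\n';

    uint64_t value = rng();
    std::cout << value << '\n';

    return 0;
}

Keeping the engine local to each test's main() (an explicit seed can also be passed for deterministic runs) avoids the shared global state g_rand1..g_rand4 that the deleted Random.h/Random.cpp used to expose.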
diff --git a/dbms/src/Databases/DatabaseDictionary.h b/dbms/src/Databases/DatabaseDictionary.h index cd5dde3177c..b586cb1403f 100644 --- a/dbms/src/Databases/DatabaseDictionary.h +++ b/dbms/src/Databases/DatabaseDictionary.h @@ -19,7 +19,7 @@ namespace DB /* Database to store StorageDictionary tables * automatically creates tables for all dictionaries */ -class DatabaseDictionary : public IDatabase +class DatabaseDictionary final : public IDatabase { public: DatabaseDictionary(const String & name_); @@ -37,7 +37,7 @@ public: const Context & context, const String & table_name) const override; - DatabaseTablesIteratorPtr getTablesIterator(const Context & context, const FilterByNameFunction & filter_by_table_name = {}) override; + DatabaseTablesIteratorPtr getTablesIterator(const Context & context, const FilterByNameFunction & filter_by_table_name) override; bool empty(const Context & context) const override; diff --git a/dbms/src/Databases/DatabaseLazy.h b/dbms/src/Databases/DatabaseLazy.h index 8d1f20c068d..cc4b19d2036 100644 --- a/dbms/src/Databases/DatabaseLazy.h +++ b/dbms/src/Databases/DatabaseLazy.h @@ -15,7 +15,7 @@ class DatabaseLazyIterator; * Works like DatabaseOrdinary, but stores in memory only cache. * Can be used only with *Log engines. */ -class DatabaseLazy : public DatabaseOnDisk +class DatabaseLazy final : public DatabaseOnDisk { public: DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, const Context & context_); @@ -60,7 +60,7 @@ public: bool empty(const Context & context) const override; - DatabaseTablesIteratorPtr getTablesIterator(const Context & context, const FilterByNameFunction & filter_by_table_name = {}) override; + DatabaseTablesIteratorPtr getTablesIterator(const Context & context, const FilterByNameFunction & filter_by_table_name) override; void attachTable(const String & table_name, const StoragePtr & table) override; diff --git a/dbms/src/Databases/DatabaseMemory.h b/dbms/src/Databases/DatabaseMemory.h index 63fa896fefa..941cede3d67 100644 --- a/dbms/src/Databases/DatabaseMemory.h +++ b/dbms/src/Databases/DatabaseMemory.h @@ -16,7 +16,7 @@ namespace DB * All tables are created by calling code. * TODO: Maybe DatabaseRuntime is more suitable class name. */ -class DatabaseMemory : public DatabaseWithOwnTablesBase +class DatabaseMemory final : public DatabaseWithOwnTablesBase { public: DatabaseMemory(const String & name_); diff --git a/dbms/src/Databases/DatabaseMySQL.h b/dbms/src/Databases/DatabaseMySQL.h index f0112481661..8755d13c671 100644 --- a/dbms/src/Databases/DatabaseMySQL.h +++ b/dbms/src/Databases/DatabaseMySQL.h @@ -17,7 +17,7 @@ namespace DB * It doesn't make any manipulations with filesystem. 
* All tables are created by calling code after real-time pull-out structure from remote MySQL */ -class DatabaseMySQL : public IDatabase +class DatabaseMySQL final : public IDatabase { public: ~DatabaseMySQL() override; @@ -30,7 +30,7 @@ public: bool empty(const Context & context) const override; - DatabaseTablesIteratorPtr getTablesIterator(const Context & context, const FilterByNameFunction & filter_by_table_name = {}) override; + DatabaseTablesIteratorPtr getTablesIterator(const Context & context, const FilterByNameFunction & filter_by_table_name) override; ASTPtr getCreateDatabaseQuery(const Context & /*context*/) const override; diff --git a/dbms/src/Databases/DatabaseOrdinary.h b/dbms/src/Databases/DatabaseOrdinary.h index 41c03b5103e..edac2007def 100644 --- a/dbms/src/Databases/DatabaseOrdinary.h +++ b/dbms/src/Databases/DatabaseOrdinary.h @@ -11,7 +11,7 @@ namespace DB * It stores tables list in filesystem using list of .sql files, * that contain declaration of table represented by SQL ATTACH TABLE query. */ -class DatabaseOrdinary : public DatabaseWithDictionaries //DatabaseWithOwnTablesBase +class DatabaseOrdinary final : public DatabaseWithDictionaries { public: DatabaseOrdinary(const String & name_, const String & metadata_path_, const Context & context); diff --git a/dbms/src/Databases/DatabaseWithDictionaries.h b/dbms/src/Databases/DatabaseWithDictionaries.h index 5ec37bdbb1a..e47ab6206c5 100644 --- a/dbms/src/Databases/DatabaseWithDictionaries.h +++ b/dbms/src/Databases/DatabaseWithDictionaries.h @@ -20,9 +20,9 @@ public: StoragePtr tryGetTable(const Context & context, const String & table_name) const override; - DatabaseTablesIteratorPtr getTablesWithDictionaryTablesIterator(const Context & context, const FilterByNameFunction & filter_by_dictionary_name = {}) override; + DatabaseTablesIteratorPtr getTablesWithDictionaryTablesIterator(const Context & context, const FilterByNameFunction & filter_by_dictionary_name) override; - DatabaseDictionariesIteratorPtr getDictionariesIterator(const Context & context, const FilterByNameFunction & filter_by_dictionary_name = {}) override; + DatabaseDictionariesIteratorPtr getDictionariesIterator(const Context & context, const FilterByNameFunction & filter_by_dictionary_name) override; bool isDictionaryExist(const Context & context, const String & dictionary_name) const override; diff --git a/dbms/src/Databases/DatabasesCommon.h b/dbms/src/Databases/DatabasesCommon.h index 1cefb8949bc..3bf7460da01 100644 --- a/dbms/src/Databases/DatabasesCommon.h +++ b/dbms/src/Databases/DatabasesCommon.h @@ -33,7 +33,7 @@ public: StoragePtr detachTable(const String & table_name) override; - DatabaseTablesIteratorPtr getTablesIterator(const Context & context, const FilterByNameFunction & filter_by_table_name = {}) override; + DatabaseTablesIteratorPtr getTablesIterator(const Context & context, const FilterByNameFunction & filter_by_table_name) override; void shutdown() override; diff --git a/dbms/src/Databases/IDatabase.h b/dbms/src/Databases/IDatabase.h index ed449ba9591..1dc1f9eb36b 100644 --- a/dbms/src/Databases/IDatabase.h +++ b/dbms/src/Databases/IDatabase.h @@ -44,7 +44,7 @@ public: }; /// Copies list of tables and iterates through such snapshot. 
-class DatabaseTablesSnapshotIterator : public IDatabaseTablesIterator +class DatabaseTablesSnapshotIterator final : public IDatabaseTablesIterator { private: Tables tables; diff --git a/dbms/src/Dictionaries/CacheDictionary.cpp b/dbms/src/Dictionaries/CacheDictionary.cpp index 2294c99c111..36a8c704f4f 100644 --- a/dbms/src/Dictionaries/CacheDictionary.cpp +++ b/dbms/src/Dictionaries/CacheDictionary.cpp @@ -447,8 +447,8 @@ CacheDictionary::Attribute CacheDictionary::createAttributeWithType(const Attrib { #define DISPATCH(TYPE) \ case AttributeUnderlyingType::ut##TYPE: \ - attr.null_values = TYPE(null_value.get>()); \ - attr.arrays = std::make_unique>(size); \ + attr.null_values = TYPE(null_value.get>()); /* NOLINT */ \ + attr.arrays = std::make_unique>(size); /* NOLINT */ \ bytes_allocated += size * sizeof(TYPE); \ break; DISPATCH(UInt8) diff --git a/dbms/src/Dictionaries/ComplexKeyCacheDictionary_createAttributeWithType.cpp b/dbms/src/Dictionaries/ComplexKeyCacheDictionary_createAttributeWithType.cpp index e15a6fb3014..ba9f8d014fd 100644 --- a/dbms/src/Dictionaries/ComplexKeyCacheDictionary_createAttributeWithType.cpp +++ b/dbms/src/Dictionaries/ComplexKeyCacheDictionary_createAttributeWithType.cpp @@ -11,8 +11,8 @@ ComplexKeyCacheDictionary::createAttributeWithType(const AttributeUnderlyingType { #define DISPATCH(TYPE) \ case AttributeUnderlyingType::ut##TYPE: \ - attr.null_values = TYPE(null_value.get>()); \ - attr.arrays = std::make_unique>(size); \ + attr.null_values = TYPE(null_value.get>()); /* NOLINT */ \ + attr.arrays = std::make_unique>(size); /* NOLINT */ \ bytes_allocated += size * sizeof(TYPE); \ break; DISPATCH(UInt8) diff --git a/dbms/src/Dictionaries/DictionarySourceHelpers.h b/dbms/src/Dictionaries/DictionarySourceHelpers.h index e3d7bd521c3..99d5df5bda7 100644 --- a/dbms/src/Dictionaries/DictionarySourceHelpers.h +++ b/dbms/src/Dictionaries/DictionarySourceHelpers.h @@ -2,7 +2,7 @@ #include #include -#include +#include namespace DB diff --git a/dbms/src/Dictionaries/Embedded/GeoDictionariesLoader.h b/dbms/src/Dictionaries/Embedded/GeoDictionariesLoader.h index 3231a92b724..d09e69cf561 100644 --- a/dbms/src/Dictionaries/Embedded/GeoDictionariesLoader.h +++ b/dbms/src/Dictionaries/Embedded/GeoDictionariesLoader.h @@ -10,7 +10,6 @@ class GeoDictionariesLoader { public: - std::unique_ptr reloadRegionsHierarchies(const Poco::Util::AbstractConfiguration & config); - - std::unique_ptr reloadRegionsNames(const Poco::Util::AbstractConfiguration & config); + static std::unique_ptr reloadRegionsHierarchies(const Poco::Util::AbstractConfiguration & config); + static std::unique_ptr reloadRegionsNames(const Poco::Util::AbstractConfiguration & config); }; diff --git a/dbms/src/Dictionaries/Embedded/GeodataProviders/Types.h b/dbms/src/Dictionaries/Embedded/GeodataProviders/Types.h index e823ce23e0b..5f291ee7abd 100644 --- a/dbms/src/Dictionaries/Embedded/GeodataProviders/Types.h +++ b/dbms/src/Dictionaries/Embedded/GeodataProviders/Types.h @@ -1,6 +1,6 @@ #pragma once -#include +#include using RegionID = UInt32; diff --git a/dbms/src/Dictionaries/Embedded/RegionsHierarchy.h b/dbms/src/Dictionaries/Embedded/RegionsHierarchy.h index 63f5ce99a59..25625284e32 100644 --- a/dbms/src/Dictionaries/Embedded/RegionsHierarchy.h +++ b/dbms/src/Dictionaries/Embedded/RegionsHierarchy.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include "GeodataProviders/IHierarchiesProvider.h" diff --git a/dbms/src/Dictionaries/Embedded/RegionsNames.h b/dbms/src/Dictionaries/Embedded/RegionsNames.h index 
1b00fd508d2..7216f238156 100644 --- a/dbms/src/Dictionaries/Embedded/RegionsNames.h +++ b/dbms/src/Dictionaries/Embedded/RegionsNames.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include "GeodataProviders/INamesProvider.h" diff --git a/dbms/src/Dictionaries/FileDictionarySource.cpp b/dbms/src/Dictionaries/FileDictionarySource.cpp index de2718e64a8..1de8cf6a057 100644 --- a/dbms/src/Dictionaries/FileDictionarySource.cpp +++ b/dbms/src/Dictionaries/FileDictionarySource.cpp @@ -48,6 +48,7 @@ FileDictionarySource::FileDictionarySource(const FileDictionarySource & other) BlockInputStreamPtr FileDictionarySource::loadAll() { + LOG_TRACE(&Poco::Logger::get("FileDictionary"), "loadAll " + toString()); auto in_ptr = std::make_unique(filepath); auto stream = context.getInputFormat(format, *in_ptr, sample_block, max_block_size); last_modification = getLastModification(); diff --git a/dbms/src/Dictionaries/FileDictionarySource.h b/dbms/src/Dictionaries/FileDictionarySource.h index 396a04503d6..3d00c026e07 100644 --- a/dbms/src/Dictionaries/FileDictionarySource.h +++ b/dbms/src/Dictionaries/FileDictionarySource.h @@ -39,7 +39,14 @@ public: throw Exception{"Method loadKeys is unsupported for FileDictionarySource", ErrorCodes::NOT_IMPLEMENTED}; } - bool isModified() const override { return getLastModification() > last_modification; } + bool isModified() const override + { + // We can't count on that the mtime increases or that it has + // a particular relation to system time, so just check for strict + // equality. + return getLastModification() != last_modification; + } + bool supportsSelectiveLoad() const override { return false; } ///Not supported for FileDictionarySource diff --git a/dbms/src/Dictionaries/HashedDictionary.cpp b/dbms/src/Dictionaries/HashedDictionary.cpp index 722a6e3584c..c401142584c 100644 --- a/dbms/src/Dictionaries/HashedDictionary.cpp +++ b/dbms/src/Dictionaries/HashedDictionary.cpp @@ -444,9 +444,9 @@ void HashedDictionary::addAttributeSize(const Attribute & attribute) bucket_count = map_ref->bucket_count(); /** TODO: more accurate calculation */ - bytes_allocated += sizeof(CollectionType); + bytes_allocated += sizeof(SparseCollectionType); bytes_allocated += bucket_count; - bytes_allocated += map_ref->size() * sizeof(Key) * sizeof(T); + bytes_allocated += map_ref->size() * (sizeof(Key) + sizeof(T)); } } diff --git a/dbms/src/Dictionaries/PolygonDictionary.cpp b/dbms/src/Dictionaries/PolygonDictionary.cpp index 0c763e23db4..b7d9ed41402 100644 --- a/dbms/src/Dictionaries/PolygonDictionary.cpp +++ b/dbms/src/Dictionaries/PolygonDictionary.cpp @@ -187,8 +187,7 @@ void IPolygonDictionary::createAttributes() appendNullValue(attr.underlying_type, attr.null_value); if (attr.hierarchical) - throw Exception{name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(), - ErrorCodes::TYPE_MISMATCH}; + throw Exception{name + ": hierarchical attributes not supported for dictionary of polygonal type", ErrorCodes::TYPE_MISMATCH}; } } diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index 8bd6ff06054..9b331269b23 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -50,9 +50,9 @@ namespace DB using ValueType = ExternalResultDescription::ValueType; template - inline void insert(IColumn & column, const String & stringValue) + inline void insert(IColumn & column, const String & string_value) { - assert_cast 
&>(column).insertValue(parse(stringValue)); + assert_cast &>(column).insertValue(parse(string_value)); } void insertValue(IColumn & column, const ValueType type, const Poco::Redis::BulkString & bulk_string) @@ -60,50 +60,50 @@ namespace DB if (bulk_string.isNull()) throw Exception{"Type mismatch, expected not Null String", ErrorCodes::TYPE_MISMATCH}; - String stringValue = bulk_string.value(); + const String & string_value = bulk_string.value(); switch (type) { case ValueType::vtUInt8: - insert(column, stringValue); + insert(column, string_value); break; case ValueType::vtUInt16: - insert(column, stringValue); + insert(column, string_value); break; case ValueType::vtUInt32: - insert(column, stringValue); + insert(column, string_value); break; case ValueType::vtUInt64: - insert(column, stringValue); + insert(column, string_value); break; case ValueType::vtInt8: - insert(column, stringValue); + insert(column, string_value); break; case ValueType::vtInt16: - insert(column, stringValue); + insert(column, string_value); break; case ValueType::vtInt32: - insert(column, stringValue); + insert(column, string_value); break; case ValueType::vtInt64: - insert(column, stringValue); + insert(column, string_value); break; case ValueType::vtFloat32: - insert(column, stringValue); + insert(column, string_value); break; case ValueType::vtFloat64: - insert(column, stringValue); + insert(column, string_value); break; case ValueType::vtString: - assert_cast(column).insert(parse(stringValue)); + assert_cast(column).insert(parse(string_value)); break; case ValueType::vtDate: - assert_cast(column).insertValue(parse(stringValue).getDayNum()); + assert_cast(column).insertValue(parse(string_value).getDayNum()); break; case ValueType::vtDateTime: - assert_cast(column).insertValue(static_cast(parse(stringValue))); + assert_cast(column).insertValue(static_cast(parse(string_value))); break; case ValueType::vtUUID: - assert_cast(column).insertValue(parse(stringValue)); + assert_cast(column).insertValue(parse(string_value)); break; } } diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index c51e5cdadd4..61b0227df65 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -96,7 +96,10 @@ namespace DB if (db_index != 0) { RedisCommand command("SELECT"); - command << static_cast(db_index); + // Use poco's Int64, because it is defined as long long, and on + // MacOS, for the purposes of template instantiation, this type is + // distinct from int64_t, which is our Int64. + command << static_cast(db_index); String reply = client->execute(command); if (reply != "+OK\r\n") throw Exception{"Selecting database with index " + DB::toString(db_index) @@ -183,7 +186,7 @@ namespace DB /// Do not store more than max_block_size values for one request. 
if (primary_with_secondary.size() == max_block_size + 1) { - hkeys.add(std::move(primary_with_secondary)); + hkeys.add(primary_with_secondary); primary_with_secondary.clear(); primary_with_secondary.addRedisType(key); } diff --git a/dbms/src/Dictionaries/TrieDictionary.cpp b/dbms/src/Dictionaries/TrieDictionary.cpp index dcefc873b4f..d5ab7dc31ce 100644 --- a/dbms/src/Dictionaries/TrieDictionary.cpp +++ b/dbms/src/Dictionaries/TrieDictionary.cpp @@ -34,6 +34,18 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } +static void validateKeyTypes(const DataTypes & key_types) +{ + if (key_types.size() != 1) + throw Exception{"Expected a single IP address", ErrorCodes::TYPE_MISMATCH}; + + const auto & actual_type = key_types[0]->getName(); + + if (actual_type != "UInt32" && actual_type != "FixedString(16)") + throw Exception{"Key does not match, expected either UInt32 or FixedString(16)", ErrorCodes::TYPE_MISMATCH}; +} + + TrieDictionary::TrieDictionary( const std::string & database_, const std::string & name_, @@ -416,17 +428,6 @@ void TrieDictionary::calculateBytesAllocated() bytes_allocated += btrie_allocated(trie); } -void TrieDictionary::validateKeyTypes(const DataTypes & key_types) const -{ - if (key_types.size() != 1) - throw Exception{"Expected a single IP address", ErrorCodes::TYPE_MISMATCH}; - - const auto & actual_type = key_types[0]->getName(); - - if (actual_type != "UInt32" && actual_type != "FixedString(16)") - throw Exception{"Key does not match, expected either UInt32 or FixedString(16)", ErrorCodes::TYPE_MISMATCH}; -} - template void TrieDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) diff --git a/dbms/src/Dictionaries/TrieDictionary.h b/dbms/src/Dictionaries/TrieDictionary.h index 81f5a02a00b..59f946ebe71 100644 --- a/dbms/src/Dictionaries/TrieDictionary.h +++ b/dbms/src/Dictionaries/TrieDictionary.h @@ -207,8 +207,6 @@ private: void calculateBytesAllocated(); - void validateKeyTypes(const DataTypes & key_types) const; - template void createAttributeImpl(Attribute & attribute, const Field & null_value); diff --git a/dbms/src/Dictionaries/getDictionaryConfigurationFromAST.cpp b/dbms/src/Dictionaries/getDictionaryConfigurationFromAST.cpp index a1d898b2621..2a1f4933ba2 100644 --- a/dbms/src/Dictionaries/getDictionaryConfigurationFromAST.cpp +++ b/dbms/src/Dictionaries/getDictionaryConfigurationFromAST.cpp @@ -31,12 +31,11 @@ namespace using NamesToTypeNames = std::unordered_map; /// Get value from field and convert it to string. /// Also remove quotes from strings. 
-String getUnescapedFieldString(const Field & field) +String getFieldAsString(const Field & field) { - String string = applyVisitor(FieldVisitorToString(), field); - if (!string.empty() && string.front() == '\'' && string.back() == '\'') - return string.substr(1, string.size() - 2); - return string; + if (field.getType() == Field::Types::Which::String) + return field.get(); + return applyVisitor(FieldVisitorToString(), field); } @@ -180,13 +179,13 @@ void buildSingleAttribute( type_element->appendChild(type); attribute_element->appendChild(type_element); - AutoPtr null_value_element(doc->createElement("null_value")); - String null_value_str; - if (dict_attr->default_value) - null_value_str = getUnescapedFieldString(dict_attr->default_value->as()->value); - AutoPtr null_value(doc->createTextNode(null_value_str)); - null_value_element->appendChild(null_value); - attribute_element->appendChild(null_value_element); + AutoPtr null_value_element(doc->createElement("null_value")); + String null_value_str; + if (dict_attr->default_value) + null_value_str = getFieldAsString(dict_attr->default_value->as()->value); + AutoPtr null_value(doc->createTextNode(null_value_str)); + null_value_element->appendChild(null_value); + attribute_element->appendChild(null_value_element); if (dict_attr->expression != nullptr) { @@ -197,7 +196,7 @@ void buildSingleAttribute( if (const auto * literal = dict_attr->expression->as(); literal && literal->value.getType() == Field::Types::String) { - expression_str = getUnescapedFieldString(literal->value); + expression_str = getFieldAsString(literal->value); } else expression_str = queryToString(dict_attr->expression); @@ -346,7 +345,7 @@ void buildConfigurationFromFunctionWithKeyValueArguments( } else if (auto literal = pair->second->as(); literal) { - AutoPtr value(doc->createTextNode(getUnescapedFieldString(literal->value))); + AutoPtr value(doc->createTextNode(getFieldAsString(literal->value))); current_xml_element->appendChild(value); } else if (auto list = pair->second->as(); list) diff --git a/dbms/src/Disks/DiskLocal.cpp b/dbms/src/Disks/DiskLocal.cpp index 418ce966955..a1c2641e2f3 100644 --- a/dbms/src/Disks/DiskLocal.cpp +++ b/dbms/src/Disks/DiskLocal.cpp @@ -1,4 +1,5 @@ #include "DiskLocal.h" +#include #include "DiskFactory.h" #include @@ -11,7 +12,6 @@ namespace DB { - namespace ErrorCodes { extern const int UNKNOWN_ELEMENT_IN_CONFIG; @@ -254,6 +254,33 @@ Poco::Timestamp DiskLocal::getLastModified(const String & path) return Poco::File(disk_path + path).getLastModified(); } +void DiskLocal::createHardLink(const String & src_path, const String & dst_path) +{ + DB::createHardLink(disk_path + src_path, disk_path + dst_path); +} + +void DiskLocal::createFile(const String & path) +{ + Poco::File(disk_path + path).createFile(); +} + +void DiskLocal::setReadOnly(const String & path) +{ + Poco::File(disk_path + path).setReadOnly(true); +} + +bool inline isSameDiskType(const IDisk & one, const IDisk & another) +{ + return typeid(one) == typeid(another); +} + +void DiskLocal::copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path) +{ + if (isSameDiskType(*this, *to_disk)) + Poco::File(disk_path + from_path).copyTo(to_disk->getPath() + to_path); /// Use more optimal way. + else + IDisk::copy(from_path, to_disk, to_path); /// Copy files through buffers. 
+} void DiskLocalReservation::update(UInt64 new_size) { diff --git a/dbms/src/Disks/DiskLocal.h b/dbms/src/Disks/DiskLocal.h index 0bca5dc72d4..61a3994b655 100644 --- a/dbms/src/Disks/DiskLocal.h +++ b/dbms/src/Disks/DiskLocal.h @@ -61,27 +61,31 @@ public: DiskDirectoryIteratorPtr iterateDirectory(const String & path) override; + void createFile(const String & path) override; + void moveFile(const String & from_path, const String & to_path) override; void replaceFile(const String & from_path, const String & to_path) override; void copyFile(const String & from_path, const String & to_path) override; + void copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path) override; + void listFiles(const String & path, std::vector & file_names) override; std::unique_ptr readFile( const String & path, - size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, - size_t estimated_size = 0, - size_t aio_threshold = 0, - size_t mmap_threshold = 0) const override; + size_t buf_size, + size_t estimated_size, + size_t aio_threshold, + size_t mmap_threshold) const override; std::unique_ptr writeFile( const String & path, - size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, - WriteMode mode = WriteMode::Rewrite, - size_t estimated_size = 0, - size_t aio_threshold = 0) override; + size_t buf_size, + WriteMode mode, + size_t estimated_size, + size_t aio_threshold) override; void remove(const String & path) override; @@ -91,6 +95,10 @@ public: Poco::Timestamp getLastModified(const String & path) override; + void setReadOnly(const String & path) override; + + void createHardLink(const String & src_path, const String & dst_path) override; + private: bool tryReserve(UInt64 bytes); diff --git a/dbms/src/Disks/DiskMemory.cpp b/dbms/src/Disks/DiskMemory.cpp index 15b2b2152b1..05ae59600ca 100644 --- a/dbms/src/Disks/DiskMemory.cpp +++ b/dbms/src/Disks/DiskMemory.cpp @@ -21,7 +21,7 @@ namespace ErrorCodes } -class DiskMemoryDirectoryIterator : public IDiskDirectoryIterator +class DiskMemoryDirectoryIterator final : public IDiskDirectoryIterator { public: explicit DiskMemoryDirectoryIterator(std::vector && dir_file_paths_) @@ -42,8 +42,9 @@ private: std::vector::iterator iter; }; + /// Adapter with actual behaviour as ReadBufferFromString. -class ReadIndirectBuffer : public ReadBufferFromFileBase +class ReadIndirectBuffer final : public ReadBufferFromFileBase { public: ReadIndirectBuffer(String path_, const String & data_) @@ -71,8 +72,9 @@ private: const String path; }; + /// This class is responsible to update files metadata after buffer is finalized. 
-class WriteIndirectBuffer : public WriteBufferFromFileBase +class WriteIndirectBuffer final : public WriteBufferFromFileBase { public: WriteIndirectBuffer(DiskMemory * disk_, String path_, WriteMode mode_, size_t buf_size) @@ -386,10 +388,27 @@ void DiskMemory::removeRecursive(const String & path) void DiskMemory::listFiles(const String & path, std::vector & file_names) { + std::lock_guard lock(mutex); + for (auto it = iterateDirectory(path); it->isValid(); it->next()) file_names.push_back(it->name()); } +void DiskMemory::createHardLink(const String &, const String &) +{ + throw Exception("Method createHardLink is not implemented for memory disks", ErrorCodes::NOT_IMPLEMENTED); +} + +void DiskMemory::createFile(const String &) +{ + throw Exception("Method createFile is not implemented for memory disks", ErrorCodes::NOT_IMPLEMENTED); +} + +void DiskMemory::setReadOnly(const String &) +{ + throw Exception("Method setReadOnly is not implemented for memory disks", ErrorCodes::NOT_IMPLEMENTED); +} + using DiskMemoryPtr = std::shared_ptr; diff --git a/dbms/src/Disks/DiskMemory.h b/dbms/src/Disks/DiskMemory.h index f67a361a948..b0c1d30c61d 100644 --- a/dbms/src/Disks/DiskMemory.h +++ b/dbms/src/Disks/DiskMemory.h @@ -54,6 +54,8 @@ public: DiskDirectoryIteratorPtr iterateDirectory(const String & path) override; + void createFile(const String & path) override; + void moveFile(const String & from_path, const String & to_path) override; void replaceFile(const String & from_path, const String & to_path) override; @@ -64,26 +66,30 @@ public: std::unique_ptr readFile( const String & path, - size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, - size_t estimated_size = 0, - size_t aio_threshold = 0, - size_t mmap_threshold = 0) const override; + size_t buf_size, + size_t estimated_size, + size_t aio_threshold, + size_t mmap_threshold) const override; std::unique_ptr writeFile( const String & path, - size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, - WriteMode mode = WriteMode::Rewrite, - size_t estimated_size = 0, - size_t aio_threshold = 0) override; + size_t buf_size, + WriteMode mode, + size_t estimated_size, + size_t aio_threshold) override; void remove(const String & path) override; void removeRecursive(const String & path) override; - void setLastModified(const String &, const Poco::Timestamp &) override { } + void setLastModified(const String &, const Poco::Timestamp &) override {} Poco::Timestamp getLastModified(const String &) override { return Poco::Timestamp(); } + void setReadOnly(const String & path) override; + + void createHardLink(const String & src_path, const String & dst_path) override; + private: void createDirectoriesImpl(const String & path); void replaceFileImpl(const String & from_path, const String & to_path); diff --git a/dbms/src/Disks/DiskS3.cpp b/dbms/src/Disks/DiskS3.cpp index b610a1fc42f..762056498c7 100644 --- a/dbms/src/Disks/DiskS3.cpp +++ b/dbms/src/Disks/DiskS3.cpp @@ -34,6 +34,15 @@ namespace ErrorCodes namespace { + String getRandomName() + { + std::uniform_int_distribution distribution('a', 'z'); + String res(32, ' '); /// The number of bits of entropy should be not less than 128. + for (auto & c : res) + c = distribution(thread_local_rng); + return res; + } + template void throwIfError(Aws::Utils::Outcome && response) { @@ -138,7 +147,7 @@ namespace }; /// Reads data from S3 using stored paths in metadata. 
- class ReadIndirectBufferFromS3 : public ReadBufferFromFileBase + class ReadIndirectBufferFromS3 final : public ReadBufferFromFileBase { public: ReadIndirectBufferFromS3( @@ -251,7 +260,7 @@ namespace }; /// Stores data in S3 and adds the object key (S3 path) and object size to metadata file on local FS. - class WriteIndirectBufferFromS3 : public WriteBufferFromFileBase + class WriteIndirectBufferFromS3 final : public WriteBufferFromFileBase { public: WriteIndirectBufferFromS3( @@ -294,7 +303,12 @@ namespace finalized = true; } - void sync() override { metadata.save(true); } + void sync() override + { + if (finalized) + metadata.save(true); + } + std::string getFileName() const override { return metadata.metadata_file_path; } private: @@ -319,7 +333,7 @@ namespace } -class DiskS3DirectoryIterator : public IDiskDirectoryIterator +class DiskS3DirectoryIterator final : public IDiskDirectoryIterator { public: DiskS3DirectoryIterator(const String & full_path, const String & folder_path_) : iter(full_path), folder_path(folder_path_) {} @@ -346,7 +360,7 @@ private: using DiskS3Ptr = std::shared_ptr; -class DiskS3Reservation : public IReservation +class DiskS3Reservation final : public IReservation { public: DiskS3Reservation(const DiskS3Ptr & disk_, UInt64 size_) @@ -471,14 +485,12 @@ void DiskS3::copyFile(const String & from_path, const String & to_path) Metadata from(metadata_path + from_path); Metadata to(metadata_path + to_path, true); - for (UInt32 i = 0; i < from.s3_objects_count; ++i) + for (const auto & [path, size] : from.s3_objects) { - auto path = from.s3_objects[i].first; - auto size = from.s3_objects[i].second; auto new_path = s3_root_path + getRandomName(); Aws::S3::Model::CopyObjectRequest req; + req.SetCopySource(bucket + "/" + path); req.SetBucket(bucket); - req.SetCopySource(path); req.SetKey(new_path); throwIfError(client->CopyObject(req)); @@ -570,14 +582,6 @@ void DiskS3::removeRecursive(const String & path) } } -String DiskS3::getRandomName() const -{ - std::uniform_int_distribution distribution('a', 'z'); - String res(32, ' '); /// The number of bits of entropy should be not less than 128. - for (auto & c : res) - c = distribution(thread_local_rng); - return res; -} bool DiskS3::tryReserve(UInt64 bytes) { @@ -620,6 +624,27 @@ Poco::Timestamp DiskS3::getLastModified(const String & path) return Poco::File(metadata_path + path).getLastModified(); } +void DiskS3::createHardLink(const String & src_path, const String & dst_path) +{ + /** + * TODO: Replace with optimal implementation: + * Store links into a list in metadata file. + * Hardlink creation is adding new link to list and just metadata file copy. + */ + copyFile(src_path, dst_path); +} + +void DiskS3::createFile(const String & path) +{ + /// Create empty metadata file. 
+ Metadata metadata(metadata_path + path, true); + metadata.save(); +} + +void DiskS3::setReadOnly(const String & path) +{ + Poco::File(metadata_path + path).setReadOnly(true); +} DiskS3Reservation::~DiskS3Reservation() { @@ -647,24 +672,29 @@ DiskS3Reservation::~DiskS3Reservation() } } -inline void checkWriteAccess(std::shared_ptr & disk) +namespace { - auto file = disk->writeFile("test_acl", DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite); + +void checkWriteAccess(IDisk & disk) +{ + auto file = disk.writeFile("test_acl", DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite); file->write("test", 4); } -inline void checkReadAccess(const String & disk_name, std::shared_ptr & disk) +void checkReadAccess(const String & disk_name, IDisk & disk) { - auto file = disk->readFile("test_acl", DBMS_DEFAULT_BUFFER_SIZE); + auto file = disk.readFile("test_acl", DBMS_DEFAULT_BUFFER_SIZE); String buf(4, '0'); file->readStrict(buf.data(), 4); if (buf != "test") throw Exception("No read access to S3 bucket in disk " + disk_name, ErrorCodes::PATH_ACCESS_DENIED); } -inline void checkRemoveAccess(std::shared_ptr & disk) +void checkRemoveAccess(IDisk & disk) { - disk->remove("test_acl"); + disk.remove("test_acl"); +} + } void registerDiskS3(DiskFactory & factory) @@ -691,9 +721,9 @@ void registerDiskS3(DiskFactory & factory) = std::make_shared(name, client, uri.bucket, uri.key, metadata_path, context.getSettingsRef().s3_min_upload_part_size); /// This code is used only to check access to the corresponding disk. - checkWriteAccess(s3disk); - checkReadAccess(name, s3disk); - checkRemoveAccess(s3disk); + checkWriteAccess(*s3disk); + checkReadAccess(name, *s3disk); + checkRemoveAccess(*s3disk); return s3disk; }; diff --git a/dbms/src/Disks/DiskS3.h b/dbms/src/Disks/DiskS3.h index 5b59089ffd1..1b61ed1cde3 100644 --- a/dbms/src/Disks/DiskS3.h +++ b/dbms/src/Disks/DiskS3.h @@ -31,7 +31,7 @@ public: const String & getName() const override { return name; } - const String & getPath() const override { return s3_root_path; } + const String & getPath() const override { return metadata_path; } ReservationPtr reserve(UInt64 bytes) override; @@ -71,29 +71,33 @@ public: std::unique_ptr readFile( const String & path, - size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, - size_t estimated_size = 0, - size_t aio_threshold = 0, - size_t mmap_threshold = 0) const override; + size_t buf_size, + size_t estimated_size, + size_t aio_threshold, + size_t mmap_threshold) const override; std::unique_ptr writeFile( const String & path, - size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, - WriteMode mode = WriteMode::Rewrite, - size_t estimated_size = 0, - size_t aio_threshold = 0) override; + size_t buf_size, + WriteMode mode, + size_t estimated_size, + size_t aio_threshold) override; void remove(const String & path) override; void removeRecursive(const String & path) override; + void createHardLink(const String & src_path, const String & dst_path) override; + void setLastModified(const String & path, const Poco::Timestamp & timestamp) override; Poco::Timestamp getLastModified(const String & path) override; -private: - String getRandomName() const; + void createFile(const String & path) override; + void setReadOnly(const String & path) override; + +private: bool tryReserve(UInt64 bytes); private: diff --git a/dbms/src/Disks/IDisk.cpp b/dbms/src/Disks/IDisk.cpp index 48b080e1704..36ab2a49573 100644 --- a/dbms/src/Disks/IDisk.cpp +++ b/dbms/src/Disks/IDisk.cpp @@ -1,4 +1,9 @@ #include "IDisk.h" +#include +#include +#include +#include +#include namespace DB { @@ 
-7,4 +12,36 @@ bool IDisk::isDirectoryEmpty(const String & path) { return !iterateDirectory(path)->isValid(); } + +void copyFile(IDisk & from_disk, const String & from_path, IDisk & to_disk, const String & to_path) +{ + LOG_DEBUG( + &Poco::Logger::get("IDisk"), + "Copying from " << from_disk.getName() << " " << from_path << " to " << to_disk.getName() << " " << to_path); + + auto in = from_disk.readFile(from_path); + auto out = to_disk.writeFile(to_path); + copyData(*in, *out); +} + +void IDisk::copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path) +{ + if (isFile(from_path)) + { + DB::copyFile(*this, from_path, *to_disk, to_path + fileName(from_path)); + } + else + { + Poco::Path path(from_path); + const String & dir_name = path.directory(path.depth() - 1); + const String dest = to_path + dir_name + "/"; + to_disk->createDirectories(dest); + + for (auto it = iterateDirectory(from_path); it->isValid(); it->next()) + { + copy(it->path(), to_disk, dest); + } + } +} + } diff --git a/dbms/src/Disks/IDisk.h b/dbms/src/Disks/IDisk.h index 877c6f84706..7d0b429720e 100644 --- a/dbms/src/Disks/IDisk.h +++ b/dbms/src/Disks/IDisk.h @@ -111,6 +111,9 @@ public: /// Return `true` if the specified directory is empty. bool isDirectoryEmpty(const String & path); + /// Create empty file at `path`. + virtual void createFile(const String & path) = 0; + /// Move the file from `from_path` to `to_path`. /// If a file with `to_path` path already exists, an exception will be thrown . virtual void moveFile(const String & from_path, const String & to_path) = 0; @@ -122,6 +125,9 @@ public: /// Copy the file from `from_path` to `to_path`. virtual void copyFile(const String & from_path, const String & to_path) = 0; + /// Recursively copy data containing at `from_path` to `to_path` located at `to_disk`. + virtual void copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path); + /// List files at `path` and add their names to `file_names` virtual void listFiles(const String & path, std::vector & file_names) = 0; @@ -147,11 +153,24 @@ public: /// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists. virtual void removeRecursive(const String & path) = 0; + /// Remove file or directory if it exists. + void removeIfExists(const String & path) + { + if (exists(path)) + remove(path); + } + /// Set last modified time to file or directory at `path`. virtual void setLastModified(const String & path, const Poco::Timestamp & timestamp) = 0; /// Get last modified time of file or directory at `path`. virtual Poco::Timestamp getLastModified(const String & path) = 0; + + /// Set file at `path` as read-only. + virtual void setReadOnly(const String & path) = 0; + + /// Create hardlink from `src_path` to `dst_path`. 
+ virtual void createHardLink(const String & src_path, const String & dst_path) = 0; }; using DiskPtr = std::shared_ptr; diff --git a/dbms/src/Formats/ProtobufReader.cpp b/dbms/src/Formats/ProtobufReader.cpp index 3874ec3e447..5426e8fac62 100644 --- a/dbms/src/Formats/ProtobufReader.cpp +++ b/dbms/src/Formats/ProtobufReader.cpp @@ -273,30 +273,35 @@ UInt64 ProtobufReader::SimpleReader::continueReadingVarint(UInt64 first_byte) char c; #define PROTOBUF_READER_READ_VARINT_BYTE(byteNo) \ - in.readStrict(c); \ - ++cursor; \ - if constexpr (byteNo < 10) \ + do \ { \ - result |= static_cast(static_cast(c)) << (7 * (byteNo - 1)); \ - if (likely(!(c & 0x80))) \ - return result; \ - } \ - else \ - { \ - if (likely(c == 1)) \ - return result; \ - } \ - if constexpr (byteNo < 9) \ - result &= ~(static_cast(0x80) << (7 * (byteNo - 1))); - PROTOBUF_READER_READ_VARINT_BYTE(2) - PROTOBUF_READER_READ_VARINT_BYTE(3) - PROTOBUF_READER_READ_VARINT_BYTE(4) - PROTOBUF_READER_READ_VARINT_BYTE(5) - PROTOBUF_READER_READ_VARINT_BYTE(6) - PROTOBUF_READER_READ_VARINT_BYTE(7) - PROTOBUF_READER_READ_VARINT_BYTE(8) - PROTOBUF_READER_READ_VARINT_BYTE(9) - PROTOBUF_READER_READ_VARINT_BYTE(10) + in.readStrict(c); \ + ++cursor; \ + if constexpr ((byteNo) < 10) \ + { \ + result |= static_cast(static_cast(c)) << (7 * ((byteNo) - 1)); \ + if (likely(!(c & 0x80))) \ + return result; \ + } \ + else \ + { \ + if (likely(c == 1)) \ + return result; \ + } \ + if constexpr ((byteNo) < 9) \ + result &= ~(static_cast(0x80) << (7 * ((byteNo) - 1))); \ + } while (false) + + PROTOBUF_READER_READ_VARINT_BYTE(2); + PROTOBUF_READER_READ_VARINT_BYTE(3); + PROTOBUF_READER_READ_VARINT_BYTE(4); + PROTOBUF_READER_READ_VARINT_BYTE(5); + PROTOBUF_READER_READ_VARINT_BYTE(6); + PROTOBUF_READER_READ_VARINT_BYTE(7); + PROTOBUF_READER_READ_VARINT_BYTE(8); + PROTOBUF_READER_READ_VARINT_BYTE(9); + PROTOBUF_READER_READ_VARINT_BYTE(10); + #undef PROTOBUF_READER_READ_VARINT_BYTE throwUnknownFormat(); @@ -307,28 +312,32 @@ void ProtobufReader::SimpleReader::ignoreVarint() char c; #define PROTOBUF_READER_IGNORE_VARINT_BYTE(byteNo) \ - in.readStrict(c); \ - ++cursor; \ - if constexpr (byteNo < 10) \ + do \ { \ - if (likely(!(c & 0x80))) \ - return; \ - } \ - else \ - { \ - if (likely(c == 1)) \ - return; \ - } - PROTOBUF_READER_IGNORE_VARINT_BYTE(1) - PROTOBUF_READER_IGNORE_VARINT_BYTE(2) - PROTOBUF_READER_IGNORE_VARINT_BYTE(3) - PROTOBUF_READER_IGNORE_VARINT_BYTE(4) - PROTOBUF_READER_IGNORE_VARINT_BYTE(5) - PROTOBUF_READER_IGNORE_VARINT_BYTE(6) - PROTOBUF_READER_IGNORE_VARINT_BYTE(7) - PROTOBUF_READER_IGNORE_VARINT_BYTE(8) - PROTOBUF_READER_IGNORE_VARINT_BYTE(9) - PROTOBUF_READER_IGNORE_VARINT_BYTE(10) + in.readStrict(c); \ + ++cursor; \ + if constexpr ((byteNo) < 10) \ + { \ + if (likely(!(c & 0x80))) \ + return; \ + } \ + else \ + { \ + if (likely(c == 1)) \ + return; \ + } \ + } while (false) + + PROTOBUF_READER_IGNORE_VARINT_BYTE(1); + PROTOBUF_READER_IGNORE_VARINT_BYTE(2); + PROTOBUF_READER_IGNORE_VARINT_BYTE(3); + PROTOBUF_READER_IGNORE_VARINT_BYTE(4); + PROTOBUF_READER_IGNORE_VARINT_BYTE(5); + PROTOBUF_READER_IGNORE_VARINT_BYTE(6); + PROTOBUF_READER_IGNORE_VARINT_BYTE(7); + PROTOBUF_READER_IGNORE_VARINT_BYTE(8); + PROTOBUF_READER_IGNORE_VARINT_BYTE(9); + PROTOBUF_READER_IGNORE_VARINT_BYTE(10); #undef PROTOBUF_READER_IGNORE_VARINT_BYTE throwUnknownFormat(); @@ -846,7 +855,7 @@ private: std::unique_ptr ProtobufReader::createConverter( \ const google::protobuf::FieldDescriptor * field) \ { \ - return std::make_unique>(simple_reader, field); \ + return 
std::make_unique>(simple_reader, field); /* NOLINT */ \ } PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_INT32, Int64); PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_SINT32, Int64); diff --git a/dbms/src/Formats/ProtobufSchemas.cpp b/dbms/src/Formats/ProtobufSchemas.cpp index 11afbffe694..f4973263bc8 100644 --- a/dbms/src/Formats/ProtobufSchemas.cpp +++ b/dbms/src/Formats/ProtobufSchemas.cpp @@ -24,7 +24,7 @@ ProtobufSchemas & ProtobufSchemas::instance() class ProtobufSchemas::ImporterWithSourceTree : public google::protobuf::compiler::MultiFileErrorCollector { public: - ImporterWithSourceTree(const String & schema_directory) : importer(&disk_source_tree, this) + explicit ImporterWithSourceTree(const String & schema_directory) : importer(&disk_source_tree, this) { disk_source_tree.MapPath("", schema_directory); } diff --git a/dbms/src/Functions/CRC.cpp b/dbms/src/Functions/CRC.cpp index 22814b2c26c..e5730e6610f 100644 --- a/dbms/src/Functions/CRC.cpp +++ b/dbms/src/Functions/CRC.cpp @@ -11,15 +11,13 @@ template struct CRCBase { T tab[256]; - CRCBase(T polynomial) + explicit CRCBase(T polynomial) { for (size_t i = 0; i < 256; ++i) { T c = i; for (size_t j = 0; j < 8; ++j) - { c = c & 1 ? polynomial ^ (c >> 1) : c >> 1; - } tab[i] = c; } } @@ -34,13 +32,9 @@ struct CRCImpl { static CRCBase base(polynomial); - T i, crc; - - crc = 0; - for (i = 0; i < size; i++) - { + T crc = 0; + for (size_t i = 0; i < size; i++) crc = base.tab[(crc ^ buf[i]) & 0xff] ^ (crc >> 8); - } return crc; } }; @@ -63,10 +57,12 @@ struct CRC32ZLIBImpl static constexpr auto name = "CRC32"; static UInt32 make_crc(const unsigned char *buf, size_t size) - { return crc32_z(0L, buf, size); } + { + return crc32_z(0L, buf, size); + } }; -} // \anonymous +} namespace DB { diff --git a/dbms/src/Functions/DivisionUtils.h b/dbms/src/Functions/DivisionUtils.h index 9bf153d4d6e..df3b86f721d 100644 --- a/dbms/src/Functions/DivisionUtils.h +++ b/dbms/src/Functions/DivisionUtils.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include #include diff --git a/dbms/src/Functions/FunctionFQDN.cpp b/dbms/src/Functions/FunctionFQDN.cpp index ed49b43632e..12be3508e40 100644 --- a/dbms/src/Functions/FunctionFQDN.cpp +++ b/dbms/src/Functions/FunctionFQDN.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include diff --git a/dbms/src/Functions/FunctionHelpers.cpp b/dbms/src/Functions/FunctionHelpers.cpp index fde1774695c..2210be65b32 100644 --- a/dbms/src/Functions/FunctionHelpers.cpp +++ b/dbms/src/Functions/FunctionHelpers.cpp @@ -167,7 +167,7 @@ void validateFunctionArgumentTypes(const IFunction & func, { if (arguments.size() < mandatory_args.size() || arguments.size() > mandatory_args.size() + optional_args.size()) { - auto joinArgumentTypes = [](const auto & args, const String sep = ", ") -> String + auto joinArgumentTypes = [](const auto & args, const String sep = ", ") { String result; for (const auto & a : args) diff --git a/dbms/src/Functions/FunctionsConsistentHashing.h b/dbms/src/Functions/FunctionsConsistentHashing.h index 7ddf1c676ff..59bc8e2f521 100644 --- a/dbms/src/Functions/FunctionsConsistentHashing.h +++ b/dbms/src/Functions/FunctionsConsistentHashing.h @@ -6,7 +6,6 @@ #include #include #include -#include namespace DB diff --git a/dbms/src/Functions/FunctionsLogical.cpp b/dbms/src/Functions/FunctionsLogical.cpp index 8b8f03a0c89..b5fe1ff59e1 100644 --- a/dbms/src/Functions/FunctionsLogical.cpp 
+++ b/dbms/src/Functions/FunctionsLogical.cpp @@ -152,7 +152,7 @@ class AssociativeApplierImpl public: /// Remembers the last N columns from `in`. - AssociativeApplierImpl(const UInt8ColumnPtrs & in) + explicit AssociativeApplierImpl(const UInt8ColumnPtrs & in) : vec(in[in.size() - N]->getData()), next(in) {} /// Returns a combination of values in the i-th row of all columns stored in the constructor. @@ -176,7 +176,7 @@ class AssociativeApplierImpl using ResultValueType = typename Op::ResultType; public: - AssociativeApplierImpl(const UInt8ColumnPtrs & in) + explicit AssociativeApplierImpl(const UInt8ColumnPtrs & in) : vec(in[in.size() - 1]->getData()) {} inline ResultValueType apply(const size_t i) const { return vec[i]; } @@ -239,7 +239,7 @@ class AssociativeGenericApplierImpl public: /// Remembers the last N columns from `in`. - AssociativeGenericApplierImpl(const ColumnRawPtrs & in) + explicit AssociativeGenericApplierImpl(const ColumnRawPtrs & in) : val_getter{ValueGetterBuilder::build(in[in.size() - N])}, next{in} {} /// Returns a combination of values in the i-th row of all columns stored in the constructor. @@ -265,7 +265,7 @@ class AssociativeGenericApplierImpl public: /// Remembers the last N columns from `in`. - AssociativeGenericApplierImpl(const ColumnRawPtrs & in) + explicit AssociativeGenericApplierImpl(const ColumnRawPtrs & in) : val_getter{ValueGetterBuilder::build(in[in.size() - 1])} {} inline ResultValueType apply(const size_t i) const { return val_getter(i); } diff --git a/dbms/src/Functions/FunctionsStringArray.h b/dbms/src/Functions/FunctionsStringArray.h index b206f07522e..6460ec61804 100644 --- a/dbms/src/Functions/FunctionsStringArray.h +++ b/dbms/src/Functions/FunctionsStringArray.h @@ -214,9 +214,6 @@ public: ErrorCodes::ILLEGAL_COLUMN); sep = col->getValue(); - - if (sep.empty()) - throw Exception("Illegal separator for function " + getName() + ". Must be not empty.", ErrorCodes::BAD_ARGUMENTS); } /// Returns the position of the argument that is the column of strings @@ -235,19 +232,32 @@ public: /// Get the next token, if any, or return false. 
bool get(Pos & token_begin, Pos & token_end) { - if (!pos) - return false; - - token_begin = pos; - pos = reinterpret_cast(memmem(pos, end - pos, sep.data(), sep.size())); - - if (pos) + if (sep.empty()) { + if (pos == end) + return false; + + token_begin = pos; + pos += 1; token_end = pos; - pos += sep.size(); } else - token_end = end; + { + if (!pos) + return false; + + token_begin = pos; + + pos = reinterpret_cast(memmem(pos, end - pos, sep.data(), sep.size())); + + if (pos) + { + token_end = pos; + pos += sep.size(); + } + else + token_end = end; + } return true; } diff --git a/dbms/src/Functions/FunctionsStringRegex.cpp b/dbms/src/Functions/FunctionsStringRegex.cpp index 186e58f83fa..4f89c3ae305 100644 --- a/dbms/src/Functions/FunctionsStringRegex.cpp +++ b/dbms/src/Functions/FunctionsStringRegex.cpp @@ -306,8 +306,8 @@ struct MultiMatchAnyImpl MultiRegexps::ScratchPtr smart_scratch(scratch); auto on_match = []([[maybe_unused]] unsigned int id, - unsigned long long /* from */, - unsigned long long /* to */, + unsigned long long /* from */, // NOLINT + unsigned long long /* to */, // NOLINT unsigned int /* flags */, void * context) -> int { @@ -407,8 +407,8 @@ struct MultiMatchAllIndicesImpl MultiRegexps::ScratchPtr smart_scratch(scratch); auto on_match = [](unsigned int id, - unsigned long long /* from */, - unsigned long long /* to */, + unsigned long long /* from */, // NOLINT + unsigned long long /* to */, // NOLINT unsigned int /* flags */, void * context) -> int { diff --git a/dbms/src/Functions/GatherUtils/concat.cpp b/dbms/src/Functions/GatherUtils/concat.cpp index 070f8477259..0962baed603 100644 --- a/dbms/src/Functions/GatherUtils/concat.cpp +++ b/dbms/src/Functions/GatherUtils/concat.cpp @@ -1,3 +1,5 @@ +#ifndef __clang_analyzer__ // It's too hard to analyze. + #include "GatherUtils.h" #include "Selectors.h" #include "Algorithms.h" @@ -46,6 +48,9 @@ void concat(const std::vector> & sources, IArraySi throw Exception("Concat function should get at least 1 ArraySource", ErrorCodes::LOGICAL_ERROR); return ArrayConcat::select(*sources.front(), sink, sources); } + } } + +#endif diff --git a/dbms/src/Functions/GatherUtils/resizeConstantSize.cpp b/dbms/src/Functions/GatherUtils/resizeConstantSize.cpp index 5b1e9b59548..e88e51e5231 100644 --- a/dbms/src/Functions/GatherUtils/resizeConstantSize.cpp +++ b/dbms/src/Functions/GatherUtils/resizeConstantSize.cpp @@ -1,3 +1,5 @@ +#ifndef __clang_analyzer__ // It's too hard to analyze. + #include "GatherUtils.h" #include "Selectors.h" #include "Algorithms.h" @@ -21,3 +23,5 @@ void resizeConstantSize(IArraySource & array_source, IValueSource & value_source ArrayResizeConstant::select(sink, array_source, value_source, size); } } + +#endif diff --git a/dbms/src/Functions/GatherUtils/resizeDynamicSize.cpp b/dbms/src/Functions/GatherUtils/resizeDynamicSize.cpp index 2f601f2354d..207f97d1290 100644 --- a/dbms/src/Functions/GatherUtils/resizeDynamicSize.cpp +++ b/dbms/src/Functions/GatherUtils/resizeDynamicSize.cpp @@ -1,3 +1,5 @@ +#ifndef __clang_analyzer__ // It's too hard to analyze. 
+ #include "GatherUtils.h" #include "Selectors.h" #include "Algorithms.h" @@ -21,3 +23,5 @@ void resizeDynamicSize(IArraySource & array_source, IValueSource & value_source, ArrayResizeDynamic::select(sink, array_source, value_source, size_column); } } + +#endif diff --git a/dbms/src/Functions/GatherUtils/sliceDynamicOffsetBounded.cpp b/dbms/src/Functions/GatherUtils/sliceDynamicOffsetBounded.cpp index b266a253eeb..e4ea70dd09e 100644 --- a/dbms/src/Functions/GatherUtils/sliceDynamicOffsetBounded.cpp +++ b/dbms/src/Functions/GatherUtils/sliceDynamicOffsetBounded.cpp @@ -1,3 +1,5 @@ +#ifndef __clang_analyzer__ // It's too hard to analyze. + #include "GatherUtils.h" #include "Selectors.h" #include "Algorithms.h" @@ -18,3 +20,5 @@ void sliceDynamicOffsetBounded(IArraySource & src, IArraySink & sink, const ICol SliceDynamicOffsetBoundedSelectArraySource::select(src, sink, offset_column, length_column); } } + +#endif diff --git a/dbms/src/Functions/GatherUtils/sliceDynamicOffsetUnbounded.cpp b/dbms/src/Functions/GatherUtils/sliceDynamicOffsetUnbounded.cpp index 9cb2a74367e..ba7d6835830 100644 --- a/dbms/src/Functions/GatherUtils/sliceDynamicOffsetUnbounded.cpp +++ b/dbms/src/Functions/GatherUtils/sliceDynamicOffsetUnbounded.cpp @@ -1,3 +1,5 @@ +#ifndef __clang_analyzer__ // It's too hard to analyze. + #include "GatherUtils.h" #include "Selectors.h" #include "Algorithms.h" @@ -19,3 +21,5 @@ void sliceDynamicOffsetUnbounded(IArraySource & src, IArraySink & sink, const IC SliceDynamicOffsetUnboundedSelectArraySource::select(src, sink, offset_column); } } + +#endif diff --git a/dbms/src/Functions/GatherUtils/sliceFromLeftConstantOffsetBounded.cpp b/dbms/src/Functions/GatherUtils/sliceFromLeftConstantOffsetBounded.cpp index 5b86fdff682..d2f5082ad55 100644 --- a/dbms/src/Functions/GatherUtils/sliceFromLeftConstantOffsetBounded.cpp +++ b/dbms/src/Functions/GatherUtils/sliceFromLeftConstantOffsetBounded.cpp @@ -1,3 +1,5 @@ +#ifndef __clang_analyzer__ // It's too hard to analyze. + #include "GatherUtils.h" #include "Selectors.h" #include "Algorithms.h" @@ -19,3 +21,5 @@ void sliceFromLeftConstantOffsetBounded(IArraySource & src, IArraySink & sink, s SliceFromLeftConstantOffsetBoundedSelectArraySource::select(src, sink, offset, length); } } + +#endif diff --git a/dbms/src/Functions/GatherUtils/sliceFromLeftConstantOffsetUnbounded.cpp b/dbms/src/Functions/GatherUtils/sliceFromLeftConstantOffsetUnbounded.cpp index d2f7dcc0789..6f283d0dfec 100644 --- a/dbms/src/Functions/GatherUtils/sliceFromLeftConstantOffsetUnbounded.cpp +++ b/dbms/src/Functions/GatherUtils/sliceFromLeftConstantOffsetUnbounded.cpp @@ -1,3 +1,5 @@ +#ifndef __clang_analyzer__ // It's too hard to analyze. + #include "GatherUtils.h" #include "Selectors.h" #include "Algorithms.h" @@ -19,3 +21,5 @@ void sliceFromLeftConstantOffsetUnbounded(IArraySource & src, IArraySink & sink, SliceFromLeftConstantOffsetUnboundedSelectArraySource::select(src, sink, offset); } } + +#endif diff --git a/dbms/src/Functions/GatherUtils/sliceFromRightConstantOffsetBounded.cpp b/dbms/src/Functions/GatherUtils/sliceFromRightConstantOffsetBounded.cpp index b46547ddff8..1a6385924f4 100644 --- a/dbms/src/Functions/GatherUtils/sliceFromRightConstantOffsetBounded.cpp +++ b/dbms/src/Functions/GatherUtils/sliceFromRightConstantOffsetBounded.cpp @@ -1,3 +1,5 @@ +#ifndef __clang_analyzer__ // It's too hard to analyze. 
+ #include "GatherUtils.h" #include "Selectors.h" #include "Algorithms.h" @@ -19,3 +21,5 @@ void sliceFromRightConstantOffsetBounded(IArraySource & src, IArraySink & sink, SliceFromRightConstantOffsetBoundedSelectArraySource::select(src, sink, offset, length); } } + +#endif diff --git a/dbms/src/Functions/GatherUtils/sliceFromRightConstantOffsetUnbounded.cpp b/dbms/src/Functions/GatherUtils/sliceFromRightConstantOffsetUnbounded.cpp index 9e95a338ab6..e669c5d50b8 100644 --- a/dbms/src/Functions/GatherUtils/sliceFromRightConstantOffsetUnbounded.cpp +++ b/dbms/src/Functions/GatherUtils/sliceFromRightConstantOffsetUnbounded.cpp @@ -1,3 +1,5 @@ +#ifndef __clang_analyzer__ // It's too hard to analyze. + #include "GatherUtils.h" #include "Selectors.h" #include "Algorithms.h" @@ -19,3 +21,5 @@ void sliceFromRightConstantOffsetUnbounded(IArraySource & src, IArraySink & sink SliceFromRightConstantOffsetUnboundedSelectArraySource::select(src, sink, offset); } } + +#endif diff --git a/dbms/src/Functions/GeoUtils.cpp b/dbms/src/Functions/GeoUtils.cpp index 488a102e208..382c02e7008 100644 --- a/dbms/src/Functions/GeoUtils.cpp +++ b/dbms/src/Functions/GeoUtils.cpp @@ -132,17 +132,17 @@ inline std::tuple split(const Encoded & combined, uint8_t prec lat.fill(0); lon.fill(0); - uint8_t i = 0; + size_t i = 0; for (; i < precision * BITS_PER_SYMBOL - 1; i += 2) { // longitude is even bits - lon[i/2] = combined[i]; - lat[i/2] = combined[i + 1]; + lon[i / 2] = combined[i]; + lat[i / 2] = combined[i + 1]; } // precision is even, read the last bit as lat. if (precision & 0x1) { - lon[i/2] = combined[precision * BITS_PER_SYMBOL - 1]; + lon[i / 2] = combined[precision * BITS_PER_SYMBOL - 1]; } return std::tie(lon, lat); @@ -152,7 +152,7 @@ inline void base32Encode(const Encoded & binary, uint8_t precision, char * out) { extern const char geohash_base32_encode_lookup_table[32]; - for (uint8_t i = 0; i < precision * BITS_PER_SYMBOL; i += BITS_PER_SYMBOL) + for (size_t i = 0; i < precision * BITS_PER_SYMBOL; i += BITS_PER_SYMBOL) { uint8_t v = binary[i]; v <<= 1; @@ -319,9 +319,9 @@ UInt64 geohashesInBox(const GeohashesInBoxPreparedArgs & args, char * out) } UInt64 items = 0; - for (auto lon = args.longitude_min; lon < args.longitude_max; lon += args.longitude_step) + for (auto lon = args.longitude_min; lon < args.longitude_max; lon += args.longitude_step) // NOLINT { - for (auto lat = args.latitude_min; lat < args.latitude_max; lat += args.latitude_step) + for (auto lat = args.latitude_min; lat < args.latitude_max; lat += args.latitude_step) // NOLINT { assert(items <= args.items_count); diff --git a/dbms/src/Functions/URL/URLHierarchy.cpp b/dbms/src/Functions/URL/URLHierarchy.cpp index 0ed7a11f41c..776b70c23cf 100644 --- a/dbms/src/Functions/URL/URLHierarchy.cpp +++ b/dbms/src/Functions/URL/URLHierarchy.cpp @@ -32,7 +32,7 @@ public: void init(Block & /*block*/, const ColumnNumbers & /*arguments*/) {} /// Returns the position of the argument that is the column of rows - size_t getStringsArgumentPosition() + static size_t getStringsArgumentPosition() { return 0; } diff --git a/dbms/src/Functions/URL/URLPathHierarchy.cpp b/dbms/src/Functions/URL/URLPathHierarchy.cpp index 5bea296a9ca..d695877b1c5 100644 --- a/dbms/src/Functions/URL/URLPathHierarchy.cpp +++ b/dbms/src/Functions/URL/URLPathHierarchy.cpp @@ -31,7 +31,7 @@ public: void init(Block & /*block*/, const ColumnNumbers & /*arguments*/) {} /// Returns the position of the argument that is the column of rows - size_t getStringsArgumentPosition() + static size_t 
getStringsArgumentPosition() { return 0; } diff --git a/dbms/src/Functions/URL/extractURLParameterNames.cpp b/dbms/src/Functions/URL/extractURLParameterNames.cpp index aa9b87d69f6..0f7966fadeb 100644 --- a/dbms/src/Functions/URL/extractURLParameterNames.cpp +++ b/dbms/src/Functions/URL/extractURLParameterNames.cpp @@ -29,7 +29,7 @@ public: } /// Returns the position of the argument that is the column of rows - size_t getStringsArgumentPosition() + static size_t getStringsArgumentPosition() { return 0; } diff --git a/dbms/src/Functions/URL/extractURLParameters.cpp b/dbms/src/Functions/URL/extractURLParameters.cpp index ab0aef2c039..1c1a155fd65 100644 --- a/dbms/src/Functions/URL/extractURLParameters.cpp +++ b/dbms/src/Functions/URL/extractURLParameters.cpp @@ -31,7 +31,7 @@ public: void init(Block & /*block*/, const ColumnNumbers & /*arguments*/) {} /// Returns the position of the argument that is the column of rows - size_t getStringsArgumentPosition() + static size_t getStringsArgumentPosition() { return 0; } diff --git a/dbms/src/Functions/array/array.cpp b/dbms/src/Functions/array/array.cpp index d517ced8203..aa4b945055a 100644 --- a/dbms/src/Functions/array/array.cpp +++ b/dbms/src/Functions/array/array.cpp @@ -19,7 +19,7 @@ public: return std::make_shared(context); } - FunctionArray(const Context & context_) + explicit FunctionArray(const Context & context_) : context(context_) { } diff --git a/dbms/src/Functions/array/arrayConcat.cpp b/dbms/src/Functions/array/arrayConcat.cpp index 30da20c7766..f96584e3f54 100644 --- a/dbms/src/Functions/array/arrayConcat.cpp +++ b/dbms/src/Functions/array/arrayConcat.cpp @@ -27,7 +27,7 @@ class FunctionArrayConcat : public IFunction public: static constexpr auto name = "arrayConcat"; static FunctionPtr create(const Context & context) { return std::make_shared(context); } - FunctionArrayConcat(const Context & context_) : context(context_) {} + explicit FunctionArrayConcat(const Context & context_) : context(context_) {} String getName() const override { return name; } diff --git a/dbms/src/Functions/array/arrayDistinct.cpp b/dbms/src/Functions/array/arrayDistinct.cpp index 920ee6d39fd..d24de638865 100644 --- a/dbms/src/Functions/array/arrayDistinct.cpp +++ b/dbms/src/Functions/array/arrayDistinct.cpp @@ -62,21 +62,21 @@ private: static constexpr size_t INITIAL_SIZE_DEGREE = 9; template - bool executeNumber( + static bool executeNumber( const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data_col, ColumnArray::Offsets & res_offsets, const ColumnNullable * nullable_col); - bool executeString( + static bool executeString( const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data_col, ColumnArray::Offsets & res_offsets, const ColumnNullable * nullable_col); - void executeHashed( + static void executeHashed( const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data_col, diff --git a/dbms/src/Functions/array/arrayElement.cpp b/dbms/src/Functions/array/arrayElement.cpp index 2921a4bd02a..6113a16ddfa 100644 --- a/dbms/src/Functions/array/arrayElement.cpp +++ b/dbms/src/Functions/array/arrayElement.cpp @@ -53,29 +53,29 @@ private: ArrayImpl::NullMapBuilder & builder, size_t input_rows_count); template - bool executeNumberConst(Block & block, const ColumnNumbers & arguments, size_t result, const Field & index, + static bool executeNumberConst(Block & block, const ColumnNumbers & arguments, size_t result, const Field & index, ArrayImpl::NullMapBuilder & builder); template - 
bool executeNumber(Block & block, const ColumnNumbers & arguments, size_t result, const PaddedPODArray & indices, + static bool executeNumber(Block & block, const ColumnNumbers & arguments, size_t result, const PaddedPODArray & indices, ArrayImpl::NullMapBuilder & builder); - bool executeStringConst(Block & block, const ColumnNumbers & arguments, size_t result, const Field & index, + static bool executeStringConst(Block & block, const ColumnNumbers & arguments, size_t result, const Field & index, ArrayImpl::NullMapBuilder & builder); template - bool executeString(Block & block, const ColumnNumbers & arguments, size_t result, const PaddedPODArray & indices, + static bool executeString(Block & block, const ColumnNumbers & arguments, size_t result, const PaddedPODArray & indices, ArrayImpl::NullMapBuilder & builder); - bool executeGenericConst(Block & block, const ColumnNumbers & arguments, size_t result, const Field & index, + static bool executeGenericConst(Block & block, const ColumnNumbers & arguments, size_t result, const Field & index, ArrayImpl::NullMapBuilder & builder); template - bool executeGeneric(Block & block, const ColumnNumbers & arguments, size_t result, const PaddedPODArray & indices, + static bool executeGeneric(Block & block, const ColumnNumbers & arguments, size_t result, const PaddedPODArray & indices, ArrayImpl::NullMapBuilder & builder); template - bool executeConst(Block & block, const ColumnNumbers & arguments, size_t result, + static bool executeConst(Block & block, const ColumnNumbers & arguments, size_t result, const PaddedPODArray & indices, ArrayImpl::NullMapBuilder & builder, size_t input_rows_count); @@ -95,7 +95,7 @@ namespace ArrayImpl class NullMapBuilder { public: - operator bool() const { return src_null_map; } + explicit operator bool() const { return src_null_map; } bool operator!() const { return !src_null_map; } void initSource(const UInt8 * src_null_map_) diff --git a/dbms/src/Functions/array/arrayEnumerateRanked.cpp b/dbms/src/Functions/array/arrayEnumerateRanked.cpp index 7be0cbc44ce..dc7bd6d5dea 100644 --- a/dbms/src/Functions/array/arrayEnumerateRanked.cpp +++ b/dbms/src/Functions/array/arrayEnumerateRanked.cpp @@ -36,10 +36,7 @@ ArraysDepths getArraysDepths(const ColumnsWithTypeAndName & arguments) if (type_array) { if (depths.size() < array_num && prev_array_depth) - { depths.emplace_back(prev_array_depth); - prev_array_depth = 0; - } prev_array_depth = type_array->getNumberOfDimensions(); ++array_num; diff --git a/dbms/src/Functions/array/arrayIntersect.cpp b/dbms/src/Functions/array/arrayIntersect.cpp index 4673f4a7a05..ffeb6e99222 100644 --- a/dbms/src/Functions/array/arrayIntersect.cpp +++ b/dbms/src/Functions/array/arrayIntersect.cpp @@ -39,7 +39,7 @@ class FunctionArrayIntersect : public IFunction public: static constexpr auto name = "arrayIntersect"; static FunctionPtr create(const Context & context) { return std::make_shared(context); } - FunctionArrayIntersect(const Context & context_) : context(context_) {} + explicit FunctionArrayIntersect(const Context & context_) : context(context_) {} String getName() const override { return name; } diff --git a/dbms/src/Functions/array/arrayPushBack.cpp b/dbms/src/Functions/array/arrayPushBack.cpp index a9c4ed88a7a..74d9596dcd2 100644 --- a/dbms/src/Functions/array/arrayPushBack.cpp +++ b/dbms/src/Functions/array/arrayPushBack.cpp @@ -10,7 +10,7 @@ class FunctionArrayPushBack : public FunctionArrayPush public: static constexpr auto name = "arrayPushBack"; static FunctionPtr create(const Context & 
context) { return std::make_shared(context); } - FunctionArrayPushBack(const Context & context_) : FunctionArrayPush(context_, false, name) {} + explicit FunctionArrayPushBack(const Context & context_) : FunctionArrayPush(context_, false, name) {} }; void registerFunctionArrayPushBack(FunctionFactory & factory) diff --git a/dbms/src/Functions/array/arrayPushFront.cpp b/dbms/src/Functions/array/arrayPushFront.cpp index e0cc56c8ae2..ab8535b6672 100644 --- a/dbms/src/Functions/array/arrayPushFront.cpp +++ b/dbms/src/Functions/array/arrayPushFront.cpp @@ -11,7 +11,7 @@ class FunctionArrayPushFront : public FunctionArrayPush public: static constexpr auto name = "arrayPushFront"; static FunctionPtr create(const Context & context) { return std::make_shared(context); } - FunctionArrayPushFront(const Context & context_) : FunctionArrayPush(context_, true, name) {} + explicit FunctionArrayPushFront(const Context & context_) : FunctionArrayPush(context_, true, name) {} }; diff --git a/dbms/src/Functions/array/arrayResize.cpp b/dbms/src/Functions/array/arrayResize.cpp index 903a39aa4ab..9e34e7ccd92 100644 --- a/dbms/src/Functions/array/arrayResize.cpp +++ b/dbms/src/Functions/array/arrayResize.cpp @@ -26,7 +26,7 @@ class FunctionArrayResize : public IFunction public: static constexpr auto name = "arrayResize"; static FunctionPtr create(const Context & context) { return std::make_shared(context); } - FunctionArrayResize(const Context & context_) : context(context_) {} + explicit FunctionArrayResize(const Context & context_) : context(context_) {} String getName() const override { return name; } diff --git a/dbms/src/Functions/array/arrayReverse.cpp b/dbms/src/Functions/array/arrayReverse.cpp index 81ddb8e8e5e..8913b27db4c 100644 --- a/dbms/src/Functions/array/arrayReverse.cpp +++ b/dbms/src/Functions/array/arrayReverse.cpp @@ -45,11 +45,11 @@ public: private: template - bool executeNumber(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data); + static bool executeNumber(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data); - bool executeFixedString(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data); - bool executeString(const IColumn & src_data, const ColumnArray::Offsets & src_array_offsets, IColumn & res_data); - bool executeGeneric(const IColumn & src_data, const ColumnArray::Offsets & src_array_offsets, IColumn & res_data); + static bool executeFixedString(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data); + static bool executeString(const IColumn & src_data, const ColumnArray::Offsets & src_array_offsets, IColumn & res_data); + static bool executeGeneric(const IColumn & src_data, const ColumnArray::Offsets & src_array_offsets, IColumn & res_data); }; diff --git a/dbms/src/Functions/array/arraySort.cpp b/dbms/src/Functions/array/arraySort.cpp index 17a711e8902..478c7e52614 100644 --- a/dbms/src/Functions/array/arraySort.cpp +++ b/dbms/src/Functions/array/arraySort.cpp @@ -23,7 +23,7 @@ struct ArraySortImpl { const IColumn & column; - Less(const IColumn & column_) : column(column_) {} + explicit Less(const IColumn & column_) : column(column_) {} bool operator()(size_t lhs, size_t rhs) const { diff --git a/dbms/src/Functions/array/arrayUniq.cpp b/dbms/src/Functions/array/arrayUniq.cpp index d5aedb20883..d94efc47970 100644 --- a/dbms/src/Functions/array/arrayUniq.cpp +++ b/dbms/src/Functions/array/arrayUniq.cpp @@ -214,7 +214,7 @@ void 
FunctionArrayUniq::executeMethodImpl( for (ColumnArray::Offset j = prev_off; j < off; ++j) { if constexpr (has_null_map) - { + { // NOLINT if ((*null_map)[j]) { found_null = true; diff --git a/dbms/src/Functions/array/hasAll.cpp b/dbms/src/Functions/array/hasAll.cpp index 6ae1640e382..8d833adb5f5 100644 --- a/dbms/src/Functions/array/hasAll.cpp +++ b/dbms/src/Functions/array/hasAll.cpp @@ -10,7 +10,7 @@ class FunctionArrayHasAll : public FunctionArrayHasAllAny public: static constexpr auto name = "hasAll"; static FunctionPtr create(const Context & context) { return std::make_shared(context); } - FunctionArrayHasAll(const Context & context_) : FunctionArrayHasAllAny(context_, true, name) {} + explicit FunctionArrayHasAll(const Context & context_) : FunctionArrayHasAllAny(context_, true, name) {} }; void registerFunctionHasAll(FunctionFactory & factory) diff --git a/dbms/src/Functions/array/hasAny.cpp b/dbms/src/Functions/array/hasAny.cpp index 756e5311b50..84a3a736364 100644 --- a/dbms/src/Functions/array/hasAny.cpp +++ b/dbms/src/Functions/array/hasAny.cpp @@ -10,7 +10,7 @@ class FunctionArrayHasAny : public FunctionArrayHasAllAny public: static constexpr auto name = "hasAny"; static FunctionPtr create(const Context & context) { return std::make_shared(context); } - FunctionArrayHasAny(const Context & context_) : FunctionArrayHasAllAny(context_, false, name) {} + explicit FunctionArrayHasAny(const Context & context_) : FunctionArrayHasAllAny(context_, false, name) {} }; void registerFunctionHasAny(FunctionFactory & factory) diff --git a/dbms/src/Functions/array/range.cpp b/dbms/src/Functions/array/range.cpp index b04dcce7519..283eb760fcf 100644 --- a/dbms/src/Functions/array/range.cpp +++ b/dbms/src/Functions/array/range.cpp @@ -28,7 +28,7 @@ public: static constexpr auto name = "range"; static constexpr size_t max_elements = 100'000'000; static FunctionPtr create(const Context & context_) { return std::make_shared(context_); } - FunctionRange(const Context & context_) : context(context_) {} + explicit FunctionRange(const Context & context_) : context(context_) {} private: const Context & context; diff --git a/dbms/src/Functions/bitCount.cpp b/dbms/src/Functions/bitCount.cpp index 17805254c02..73df2c680da 100644 --- a/dbms/src/Functions/bitCount.cpp +++ b/dbms/src/Functions/bitCount.cpp @@ -26,7 +26,7 @@ struct BitCountImpl if constexpr (std::is_same_v) return __builtin_popcount(static_cast(a)); else - return __builtin_popcountll(ext::bit_cast(a)); + return __builtin_popcountll(ext::bit_cast(a)); } #if USE_EMBEDDED_COMPILER diff --git a/dbms/src/Functions/blockSerializedSize.cpp b/dbms/src/Functions/blockSerializedSize.cpp index 0a13e8ab013..0e8833d795f 100644 --- a/dbms/src/Functions/blockSerializedSize.cpp +++ b/dbms/src/Functions/blockSerializedSize.cpp @@ -39,7 +39,7 @@ public: input_rows_count, size)->convertToFullColumnIfConst(); } - UInt64 blockSerializedSizeOne(const ColumnWithTypeAndName & elem) const + static UInt64 blockSerializedSizeOne(const ColumnWithTypeAndName & elem) { ColumnPtr full_column = elem.column->convertToFullColumnIfConst(); diff --git a/dbms/src/Functions/caseWithExpression.cpp b/dbms/src/Functions/caseWithExpression.cpp index e1e124fd1ef..dc48536b4a7 100644 --- a/dbms/src/Functions/caseWithExpression.cpp +++ b/dbms/src/Functions/caseWithExpression.cpp @@ -21,7 +21,7 @@ public: static FunctionPtr create(const Context & context_) { return std::make_shared(context_); } public: - FunctionCaseWithExpression(const Context & context_) : context(context_) {} + explicit 
FunctionCaseWithExpression(const Context & context_) : context(context_) {} bool isVariadic() const override { return true; } size_t getNumberOfArguments() const override { return 0; } String getName() const override { return name; } diff --git a/dbms/src/Functions/coalesce.cpp b/dbms/src/Functions/coalesce.cpp index 6560aa88cc3..9d5d4df4a7f 100644 --- a/dbms/src/Functions/coalesce.cpp +++ b/dbms/src/Functions/coalesce.cpp @@ -26,7 +26,7 @@ public: return std::make_shared(context); } - FunctionCoalesce(const Context & context_) : context(context_) {} + explicit FunctionCoalesce(const Context & context_) : context(context_) {} std::string getName() const override { diff --git a/dbms/src/Functions/concat.cpp b/dbms/src/Functions/concat.cpp index 224dc32ca33..fd9448a5327 100644 --- a/dbms/src/Functions/concat.cpp +++ b/dbms/src/Functions/concat.cpp @@ -32,7 +32,7 @@ class ConcatImpl : public IFunction { public: static constexpr auto name = Name::name; - ConcatImpl(const Context & context_) : context(context_) {} + explicit ConcatImpl(const Context & context_) : context(context_) {} static FunctionPtr create(const Context & context) { return std::make_shared(context); } String getName() const override { return name; } diff --git a/dbms/src/Functions/convertCharset.cpp b/dbms/src/Functions/convertCharset.cpp index 00ccc8ae2c0..372bf08a0d2 100644 --- a/dbms/src/Functions/convertCharset.cpp +++ b/dbms/src/Functions/convertCharset.cpp @@ -78,13 +78,13 @@ private: /// Separate converter is created for each thread. using Pool = ObjectPoolMap; - Pool::Pointer getConverter(const String & charset) + static Pool::Pointer getConverter(const String & charset) { static Pool pool; return pool.get(charset, [&charset] { return new Converter(charset); }); } - void convert(const String & from_charset, const String & to_charset, + static void convert(const String & from_charset, const String & to_charset, const ColumnString::Chars & from_chars, const ColumnString::Offsets & from_offsets, ColumnString::Chars & to_chars, ColumnString::Offsets & to_offsets) { diff --git a/dbms/src/Functions/evalMLMethod.cpp b/dbms/src/Functions/evalMLMethod.cpp index e9fe3c087c7..4f6ca87a17b 100644 --- a/dbms/src/Functions/evalMLMethod.cpp +++ b/dbms/src/Functions/evalMLMethod.cpp @@ -35,7 +35,7 @@ public: { return std::make_shared(context); } - FunctionEvalMLMethod(const Context & context_) : context(context_) + explicit FunctionEvalMLMethod(const Context & context_) : context(context_) {} String getName() const override diff --git a/dbms/src/Functions/formatDateTime.cpp b/dbms/src/Functions/formatDateTime.cpp index 5a3ee7b1c48..5273348d00e 100644 --- a/dbms/src/Functions/formatDateTime.cpp +++ b/dbms/src/Functions/formatDateTime.cpp @@ -93,7 +93,7 @@ private: Func func; size_t shift; - Action(Func func_, size_t shift_ = 0) : func(func_), shift(shift_) {} + explicit Action(Func func_, size_t shift_ = 0) : func(func_), shift(shift_) {} void perform(char *& target, Time source, const DateLUTImpl & timezone) { diff --git a/dbms/src/Functions/getMacro.cpp b/dbms/src/Functions/getMacro.cpp index 02dec99d9d8..ca0c135d38a 100644 --- a/dbms/src/Functions/getMacro.cpp +++ b/dbms/src/Functions/getMacro.cpp @@ -33,7 +33,7 @@ public: return std::make_shared(context.getMacros()); } - FunctionGetMacro(MultiVersion::Version macros_) : macros(std::move(macros_)) {} + explicit FunctionGetMacro(MultiVersion::Version macros_) : macros(std::move(macros_)) {} String getName() const override { diff --git a/dbms/src/Functions/getScalar.cpp 
b/dbms/src/Functions/getScalar.cpp index d6c0d79557c..a6e9d00148e 100644 --- a/dbms/src/Functions/getScalar.cpp +++ b/dbms/src/Functions/getScalar.cpp @@ -27,7 +27,7 @@ public: return std::make_shared(context); } - FunctionGetScalar(const Context & context_) : context(context_) {} + explicit FunctionGetScalar(const Context & context_) : context(context_) {} String getName() const override { diff --git a/dbms/src/Functions/if.cpp b/dbms/src/Functions/if.cpp index 7ae1042c22c..dfafbe36471 100644 --- a/dbms/src/Functions/if.cpp +++ b/dbms/src/Functions/if.cpp @@ -173,7 +173,7 @@ class FunctionIf : public FunctionIfBase public: static constexpr auto name = "if"; static FunctionPtr create(const Context & context) { return std::make_shared(context); } - FunctionIf(const Context & context_) : context(context_) {} + explicit FunctionIf(const Context & context_) : context(context_) {} private: template @@ -416,7 +416,7 @@ private: return true; } - bool executeString(const ColumnUInt8 * cond_col, Block & block, const ColumnNumbers & arguments, size_t result) + static bool executeString(const ColumnUInt8 * cond_col, Block & block, const ColumnNumbers & arguments, size_t result) { const IColumn * col_then_untyped = block.getByPosition(arguments[1]).column.get(); const IColumn * col_else_untyped = block.getByPosition(arguments[2]).column.get(); @@ -494,7 +494,7 @@ private: return false; } - bool executeGenericArray(const ColumnUInt8 * cond_col, Block & block, const ColumnNumbers & arguments, size_t result) + static bool executeGenericArray(const ColumnUInt8 * cond_col, Block & block, const ColumnNumbers & arguments, size_t result) { /// For generic implementation, arrays must be of same type. if (!block.getByPosition(arguments[1]).type->equals(*block.getByPosition(arguments[2]).type)) diff --git a/dbms/src/Functions/ifNotFinite.cpp b/dbms/src/Functions/ifNotFinite.cpp index 605eeddf515..184c08877b9 100644 --- a/dbms/src/Functions/ifNotFinite.cpp +++ b/dbms/src/Functions/ifNotFinite.cpp @@ -15,7 +15,7 @@ class FunctionIfNotFinite : public IFunction public: static constexpr auto name = "ifNotFinite"; - FunctionIfNotFinite(const Context & context_) : context(context_) {} + explicit FunctionIfNotFinite(const Context & context_) : context(context_) {} static FunctionPtr create(const Context & context) { diff --git a/dbms/src/Functions/ifNull.cpp b/dbms/src/Functions/ifNull.cpp index 05e9ded387e..1be8916d795 100644 --- a/dbms/src/Functions/ifNull.cpp +++ b/dbms/src/Functions/ifNull.cpp @@ -19,7 +19,7 @@ class FunctionIfNull : public IFunction public: static constexpr auto name = "ifNull"; - FunctionIfNull(const Context & context_) : context(context_) {} + explicit FunctionIfNull(const Context & context_) : context(context_) {} static FunctionPtr create(const Context & context) { diff --git a/dbms/src/Functions/multiIf.cpp b/dbms/src/Functions/multiIf.cpp index 68609af9102..2340f7826c7 100644 --- a/dbms/src/Functions/multiIf.cpp +++ b/dbms/src/Functions/multiIf.cpp @@ -34,7 +34,7 @@ class FunctionMultiIf final : public FunctionIfBase public: static constexpr auto name = "multiIf"; static FunctionPtr create(const Context & context) { return std::make_shared(context); } - FunctionMultiIf(const Context & context_) : context(context_) {} + explicit FunctionMultiIf(const Context & context_) : context(context_) {} public: String getName() const override { return name; } diff --git a/dbms/src/Functions/neighbor.cpp b/dbms/src/Functions/neighbor.cpp index 1080507ced5..dc83aeb0d31 100644 --- 
a/dbms/src/Functions/neighbor.cpp +++ b/dbms/src/Functions/neighbor.cpp @@ -29,7 +29,7 @@ public: static constexpr auto name = "neighbor"; static FunctionPtr create(const Context & context) { return std::make_shared(context); } - FunctionNeighbor(const Context & context_) : context(context_) {} + explicit FunctionNeighbor(const Context & context_) : context(context_) {} /// Get the name of the function. String getName() const override { return name; } diff --git a/dbms/src/Functions/nullIf.cpp b/dbms/src/Functions/nullIf.cpp index 12b9124863e..34655ebece6 100644 --- a/dbms/src/Functions/nullIf.cpp +++ b/dbms/src/Functions/nullIf.cpp @@ -25,7 +25,7 @@ public: return std::make_shared(context); } - FunctionNullIf(const Context & context_) : context(context_) {} + explicit FunctionNullIf(const Context & context_) : context(context_) {} std::string getName() const override { diff --git a/dbms/src/Functions/reinterpretAsFixedString.cpp b/dbms/src/Functions/reinterpretAsFixedString.cpp index 3be84c7810a..b56d97708d2 100644 --- a/dbms/src/Functions/reinterpretAsFixedString.cpp +++ b/dbms/src/Functions/reinterpretAsFixedString.cpp @@ -39,7 +39,7 @@ public: throw Exception("Cannot reinterpret " + type.getName() + " as FixedString because it is not fixed size and contiguous in memory", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); } - void NO_INLINE executeToFixedString(const IColumn & src, ColumnFixedString & dst, size_t n) + static void NO_INLINE executeToFixedString(const IColumn & src, ColumnFixedString & dst, size_t n) { size_t rows = src.size(); ColumnFixedString::Chars & data_to = dst.getChars(); @@ -54,7 +54,7 @@ public: } } - void NO_INLINE executeContiguousToFixedString(const IColumn & src, ColumnFixedString & dst, size_t n) + static void NO_INLINE executeContiguousToFixedString(const IColumn & src, ColumnFixedString & dst, size_t n) { size_t rows = src.size(); ColumnFixedString::Chars & data_to = dst.getChars(); diff --git a/dbms/src/Functions/reinterpretAsString.cpp b/dbms/src/Functions/reinterpretAsString.cpp index 09ae1a20dee..b86c9b14c5a 100644 --- a/dbms/src/Functions/reinterpretAsString.cpp +++ b/dbms/src/Functions/reinterpretAsString.cpp @@ -39,7 +39,7 @@ public: throw Exception("Cannot reinterpret " + type.getName() + " as String because it is not contiguous in memory", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); } - void executeToString(const IColumn & src, ColumnString & dst) + static void executeToString(const IColumn & src, ColumnString & dst) { size_t rows = src.size(); ColumnString::Chars & data_to = dst.getChars(); diff --git a/dbms/src/Functions/transform.cpp b/dbms/src/Functions/transform.cpp index 42111899320..0ad3c6874bd 100644 --- a/dbms/src/Functions/transform.cpp +++ b/dbms/src/Functions/transform.cpp @@ -188,7 +188,7 @@ public: } private: - void executeConst(Block & block, const ColumnNumbers & arguments, const size_t result, size_t input_rows_count) + static void executeConst(Block & block, const ColumnNumbers & arguments, const size_t result, size_t input_rows_count) { /// Materialize the input column and compute the function as usual. 
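A recurring change in the Functions/ diffs above is marking single-argument constructors explicit (FunctionArray, FunctionArrayConcat, FunctionIf, ConcatImpl, the Less helper in arraySort, and so on), presumably to satisfy clang-tidy's explicit-constructor checking. A minimal, self-contained sketch of the implicit conversion this blocks; the Widget and use names below are purely illustrative and not part of this patch:

#include <iostream>

struct Widget
{
    // Without 'explicit', a plain int silently converts to Widget wherever one is expected.
    explicit Widget(int size_) : size(size_) {}
    int size;
};

static void use(const Widget & w) { std::cout << w.size << '\n'; }

int main()
{
    use(Widget(42));   // conversion is spelled out, compiles
    // use(42);        // accepted before 'explicit', now rejected at compile time
    return 0;
}

The same reasoning applies to the conversion operator in arrayElement.cpp above, where operator bool() becomes explicit operator bool().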
diff --git a/dbms/src/Functions/trim.cpp b/dbms/src/Functions/trim.cpp index 46f69530005..f674afbd310 100644 --- a/dbms/src/Functions/trim.cpp +++ b/dbms/src/Functions/trim.cpp @@ -79,14 +79,14 @@ private: const char * char_end = char_data + size; if constexpr (mode::trim_left) - { + { // NOLINT const char * found = find_first_not_symbols<' '>(char_data, char_end); size_t num_chars = found - char_data; char_data += num_chars; } if constexpr (mode::trim_right) - { + { // NOLINT const char * found = find_last_not_symbols_or_null<' '>(char_data, char_end); if (found) char_end = found + 1; diff --git a/dbms/src/IO/AIO.cpp b/dbms/src/IO/AIO.cpp index ed22b263a94..33fb79fcf95 100644 --- a/dbms/src/IO/AIO.cpp +++ b/dbms/src/IO/AIO.cpp @@ -30,12 +30,12 @@ int io_destroy(aio_context_t ctx) return syscall(__NR_io_destroy, ctx); } -int io_submit(aio_context_t ctx, long nr, struct iocb * iocbpp[]) +int io_submit(aio_context_t ctx, long nr, struct iocb * iocbpp[]) // NOLINT { return syscall(__NR_io_submit, ctx, nr, iocbpp); } -int io_getevents(aio_context_t ctx, long min_nr, long max_nr, io_event * events, struct timespec * timeout) +int io_getevents(aio_context_t ctx, long min_nr, long max_nr, io_event * events, struct timespec * timeout) // NOLINT { return syscall(__NR_io_getevents, ctx, min_nr, max_nr, events, timeout); } diff --git a/dbms/src/IO/BitHelpers.h b/dbms/src/IO/BitHelpers.h index 321fb4d254e..35fe630029c 100644 --- a/dbms/src/IO/BitHelpers.h +++ b/dbms/src/IO/BitHelpers.h @@ -4,7 +4,8 @@ #include #include -#include +#include +#include #if defined(__OpenBSD__) || defined(__FreeBSD__) # include @@ -115,6 +116,8 @@ private: template inline UInt64 getBitsFromBitBuffer(UInt8 bits_to_read) { + assert(bits_to_read > 0); + // push down the high-bits const UInt64 result = static_cast(bits_buffer >> (sizeof(bits_buffer) * 8 - bits_to_read)); @@ -186,6 +189,8 @@ public: // write `bits_to_write` low-bits of `value` to the buffer inline void writeBits(UInt8 bits_to_write, UInt64 value) { + assert(bits_to_write > 0); + UInt32 capacity = BIT_BUFFER_SIZE - bits_count; if (capacity < bits_to_write) { diff --git a/dbms/src/IO/MemoryReadWriteBuffer.cpp b/dbms/src/IO/MemoryReadWriteBuffer.cpp index f9b5547bb13..0b0d9704de6 100644 --- a/dbms/src/IO/MemoryReadWriteBuffer.cpp +++ b/dbms/src/IO/MemoryReadWriteBuffer.cpp @@ -1,5 +1,4 @@ #include -#include #include diff --git a/dbms/src/IO/Progress.h b/dbms/src/IO/Progress.h index b15b2695e39..a3efb96db98 100644 --- a/dbms/src/IO/Progress.h +++ b/dbms/src/IO/Progress.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include diff --git a/dbms/src/IO/ReadBufferAIO.h b/dbms/src/IO/ReadBufferAIO.h index 2dd11b44095..77274c47073 100644 --- a/dbms/src/IO/ReadBufferAIO.h +++ b/dbms/src/IO/ReadBufferAIO.h @@ -25,7 +25,7 @@ namespace DB /** Class for asynchronous data reading. 
*/ -class ReadBufferAIO : public ReadBufferFromFileBase +class ReadBufferAIO final : public ReadBufferFromFileBase { public: ReadBufferAIO(const std::string & filename_, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, int flags_ = -1, diff --git a/dbms/src/IO/ReadBufferFromHDFS.cpp b/dbms/src/IO/ReadBufferFromHDFS.cpp index 42419be3117..6d40f8326c2 100644 --- a/dbms/src/IO/ReadBufferFromHDFS.cpp +++ b/dbms/src/IO/ReadBufferFromHDFS.cpp @@ -22,7 +22,7 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl HDFSBuilderPtr builder; HDFSFSPtr fs; - ReadBufferFromHDFSImpl(const std::string & hdfs_name_) + explicit ReadBufferFromHDFSImpl(const std::string & hdfs_name_) : hdfs_uri(hdfs_name_) , builder(createHDFSBuilder(hdfs_uri)) , fs(createHDFSFS(builder.get())) diff --git a/dbms/src/IO/ReadHelpers.cpp b/dbms/src/IO/ReadHelpers.cpp index eba724f2193..7c3c99ecd7d 100644 --- a/dbms/src/IO/ReadHelpers.cpp +++ b/dbms/src/IO/ReadHelpers.cpp @@ -752,9 +752,9 @@ ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf) UInt16 year = 0; if (!append_digit(year) - || !append_digit(year) - || !append_digit(year) - || !append_digit(year)) + || !append_digit(year) // NOLINT + || !append_digit(year) // NOLINT + || !append_digit(year)) // NOLINT return error(); if (!ignore_delimiter()) diff --git a/dbms/src/IO/ReadHelpers.h b/dbms/src/IO/ReadHelpers.h index e17ddae9566..266b5ae1917 100644 --- a/dbms/src/IO/ReadHelpers.h +++ b/dbms/src/IO/ReadHelpers.h @@ -1009,7 +1009,7 @@ readTextWithSizeSuffix(T & x, ReadBuffer & buf) } else if (*buf.position() == 'i') { - x = (x << power_of_two); /// For binary suffixes, such as ki, Mi, Gi, etc. + x = (x << power_of_two); // NOLINT /// For binary suffixes, such as ki, Mi, Gi, etc. ++buf.position(); } return; diff --git a/dbms/src/IO/S3Common.cpp b/dbms/src/IO/S3Common.cpp index f41d5c3b253..e169eff7846 100644 --- a/dbms/src/IO/S3Common.cpp +++ b/dbms/src/IO/S3Common.cpp @@ -36,7 +36,7 @@ public: Aws::Utils::Logging::LogLevel GetLogLevel() const final { return Aws::Utils::Logging::LogLevel::Trace; } - void Log(Aws::Utils::Logging::LogLevel log_level, const char * tag, const char * format_str, ...) final + void Log(Aws::Utils::Logging::LogLevel log_level, const char * tag, const char * format_str, ...) final // NOLINT { auto & [level, prio] = convertLogLevel(log_level); LOG_SIMPLE(log, std::string(tag) + ": " + format_str, level, prio); @@ -84,7 +84,7 @@ namespace S3 return ret; } - std::shared_ptr ClientFactory::create( + std::shared_ptr ClientFactory::create( // NOLINT const String & endpoint, const String & access_key_id, const String & secret_access_key) diff --git a/dbms/src/IO/WriteBufferAIO.h b/dbms/src/IO/WriteBufferAIO.h index 57d035a8ef3..4fdeac9e9b9 100644 --- a/dbms/src/IO/WriteBufferAIO.h +++ b/dbms/src/IO/WriteBufferAIO.h @@ -24,7 +24,7 @@ namespace DB /** Class for asynchronous data writing. 
*/ -class WriteBufferAIO : public WriteBufferFromFileBase +class WriteBufferAIO final : public WriteBufferFromFileBase { public: WriteBufferAIO(const std::string & filename_, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, int flags_ = -1, mode_t mode_ = 0666, diff --git a/dbms/src/IO/WriteBufferFromHDFS.cpp b/dbms/src/IO/WriteBufferFromHDFS.cpp index 9733d761ee4..0793a966559 100644 --- a/dbms/src/IO/WriteBufferFromHDFS.cpp +++ b/dbms/src/IO/WriteBufferFromHDFS.cpp @@ -26,7 +26,7 @@ struct WriteBufferFromHDFS::WriteBufferFromHDFSImpl HDFSBuilderPtr builder; HDFSFSPtr fs; - WriteBufferFromHDFSImpl(const std::string & hdfs_name_) + explicit WriteBufferFromHDFSImpl(const std::string & hdfs_name_) : hdfs_uri(hdfs_name_) , builder(createHDFSBuilder(hdfs_uri)) , fs(createHDFSFS(builder.get())) diff --git a/dbms/src/IO/WriteBufferFromHTTPServerResponse.h b/dbms/src/IO/WriteBufferFromHTTPServerResponse.h index 1d38562ab1f..1307c007656 100644 --- a/dbms/src/IO/WriteBufferFromHTTPServerResponse.h +++ b/dbms/src/IO/WriteBufferFromHTTPServerResponse.h @@ -41,7 +41,7 @@ namespace DB /// Also this class write and flush special X-ClickHouse-Progress HTTP headers /// if no data was sent at the time of progress notification. /// This allows to implement progress bar in HTTP clients. -class WriteBufferFromHTTPServerResponse : public BufferWithOwnMemory +class WriteBufferFromHTTPServerResponse final : public BufferWithOwnMemory { private: Poco::Net::HTTPServerRequest & request; diff --git a/dbms/src/IO/WriteBufferFromVector.h b/dbms/src/IO/WriteBufferFromVector.h index 45ae39779da..c68bc9438bf 100644 --- a/dbms/src/IO/WriteBufferFromVector.h +++ b/dbms/src/IO/WriteBufferFromVector.h @@ -65,7 +65,7 @@ public: set(reinterpret_cast(vector.data() + old_size), (size - old_size) * sizeof(typename VectorType::value_type)); } - void finalize() override + void finalize() override final { if (is_finished) return; diff --git a/dbms/src/IO/WriteBufferValidUTF8.cpp b/dbms/src/IO/WriteBufferValidUTF8.cpp index b40424fc463..0e04aa4c22d 100644 --- a/dbms/src/IO/WriteBufferValidUTF8.cpp +++ b/dbms/src/IO/WriteBufferValidUTF8.cpp @@ -81,7 +81,7 @@ void WriteBufferValidUTF8::nextImpl() size_t len = length_of_utf8_sequence[static_cast(*p)]; if (len > 4) - { + { // NOLINT /// Invalid start of sequence. Skip one byte. putValid(valid_start, p - valid_start); putReplacement(); diff --git a/dbms/src/IO/WriteBufferValidUTF8.h b/dbms/src/IO/WriteBufferValidUTF8.h index 1d934be3779..6b92a2286ca 100644 --- a/dbms/src/IO/WriteBufferValidUTF8.h +++ b/dbms/src/IO/WriteBufferValidUTF8.h @@ -11,7 +11,7 @@ namespace DB * If the valid UTF-8 is already written, it works faster. * Note: before using the resulting string, destroy this object. 
*/ -class WriteBufferValidUTF8 : public BufferWithOwnMemory +class WriteBufferValidUTF8 final : public BufferWithOwnMemory { private: WriteBuffer & output_buffer; diff --git a/dbms/src/IO/WriteIntText.h b/dbms/src/IO/WriteIntText.h index 4e18826600e..15276bba63f 100644 --- a/dbms/src/IO/WriteIntText.h +++ b/dbms/src/IO/WriteIntText.h @@ -3,7 +3,6 @@ #include #include #include -#include /// 40 digits or 39 digits and a sign #define WRITE_HELPERS_MAX_INT_WIDTH 40U diff --git a/dbms/src/IO/parseDateTimeBestEffort.cpp b/dbms/src/IO/parseDateTimeBestEffort.cpp index 59fb39ed846..6e747b13b3f 100644 --- a/dbms/src/IO/parseDateTimeBestEffort.cpp +++ b/dbms/src/IO/parseDateTimeBestEffort.cpp @@ -68,7 +68,7 @@ inline void readDecimalNumber(T & res, const char * src) template inline void readDecimalNumber(T & res, size_t num_digits, const char * src) { -#define READ_DECIMAL_NUMBER(N) res *= common::exp10_i32(N); readDecimalNumber(res, src); src += N; num_digits -= N; break +#define READ_DECIMAL_NUMBER(N) do { res *= common::exp10_i32(N); readDecimalNumber(res, src); src += (N); num_digits -= (N); } while (false) while (num_digits) { @@ -77,7 +77,7 @@ inline void readDecimalNumber(T & res, size_t num_digits, const char * src) case 3: READ_DECIMAL_NUMBER(3); break; case 2: READ_DECIMAL_NUMBER(2); break; case 1: READ_DECIMAL_NUMBER(1); break; - default: READ_DECIMAL_NUMBER(4); + default: READ_DECIMAL_NUMBER(4); break; } } #undef DECIMAL_NUMBER_CASE @@ -479,12 +479,12 @@ ReturnType parseDateTimeBestEffortImpl(time_t & res, ReadBuffer & in, const Date if (read_alpha_month(alpha)) { } - else if (0 == strncasecmp(alpha, "UTC", 3)) has_time_zone_offset = true; + else if (0 == strncasecmp(alpha, "UTC", 3)) has_time_zone_offset = true; // NOLINT else if (0 == strncasecmp(alpha, "GMT", 3)) has_time_zone_offset = true; else if (0 == strncasecmp(alpha, "MSK", 3)) { has_time_zone_offset = true; time_zone_offset_hour = 3; } else if (0 == strncasecmp(alpha, "MSD", 3)) { has_time_zone_offset = true; time_zone_offset_hour = 4; } - else if (0 == strncasecmp(alpha, "Mon", 3)) has_day_of_week = true; + else if (0 == strncasecmp(alpha, "Mon", 3)) has_day_of_week = true; // NOLINT else if (0 == strncasecmp(alpha, "Tue", 3)) has_day_of_week = true; else if (0 == strncasecmp(alpha, "Wed", 3)) has_day_of_week = true; else if (0 == strncasecmp(alpha, "Thu", 3)) has_day_of_week = true; diff --git a/dbms/src/IO/readFloatText.h b/dbms/src/IO/readFloatText.h index 1b9da8db49c..fc3ffc43a91 100644 --- a/dbms/src/IO/readFloatText.h +++ b/dbms/src/IO/readFloatText.h @@ -2,7 +2,6 @@ #include #include #include -#include #include #include diff --git a/dbms/src/IO/tests/gtest_DateTime64_parsing_and_writing.cpp b/dbms/src/IO/tests/gtest_DateTime64_parsing_and_writing.cpp index fd94c9c9eee..3e5a1998380 100644 --- a/dbms/src/IO/tests/gtest_DateTime64_parsing_and_writing.cpp +++ b/dbms/src/IO/tests/gtest_DateTime64_parsing_and_writing.cpp @@ -37,7 +37,7 @@ TEST_P(DateTime64StringParseTest, readDateTime64Text) const auto & param = GetParam(); ReadBufferFromMemory read_buffer(param.string.data(), param.string.size()); - DateTime64 actual; + DateTime64 actual{}; EXPECT_TRUE(tryReadDateTime64Text(actual, param.scale, read_buffer, param.timezone)); EXPECT_EQ(param.dt64, actual); diff --git a/dbms/src/IO/tests/gtest_aio_seek_back_after_eof.cpp b/dbms/src/IO/tests/gtest_aio_seek_back_after_eof.cpp index 22cfffdda05..5f1c1c0b945 100644 --- a/dbms/src/IO/tests/gtest_aio_seek_back_after_eof.cpp +++ b/dbms/src/IO/tests/gtest_aio_seek_back_after_eof.cpp 
@@ -5,9 +5,11 @@ #include #include #include +#include #include #include + namespace { std::string createTmpFileForEOFtest() @@ -21,7 +23,7 @@ std::string createTmpFileForEOFtest() { /// We have no tmp in docker /// So we have to use root - std::string almost_rand_dir = std::string{"/"} + std::to_string(rand()) + "foo"; + std::string almost_rand_dir = std::string{"/"} + std::to_string(randomSeed()) + "foo"; return almost_rand_dir; } diff --git a/dbms/src/IO/tests/gtest_bit_io.cpp b/dbms/src/IO/tests/gtest_bit_io.cpp index 5291dddd25e..98539ea85a3 100644 --- a/dbms/src/IO/tests/gtest_bit_io.cpp +++ b/dbms/src/IO/tests/gtest_bit_io.cpp @@ -32,7 +32,7 @@ std::string bin(const T & value, size_t bits = sizeof(T) * 8) static const uint8_t MAX_BITS = sizeof(T)*8; assert(bits <= MAX_BITS); - return std::bitset(static_cast(value)) + return std::bitset(static_cast(value)) .to_string().substr(MAX_BITS - bits, bits); } @@ -112,7 +112,7 @@ struct TestCaseParameter std::vector> bits_and_vals; std::string expected_buffer_binary; - TestCaseParameter(std::vector> vals, std::string binary = std::string{}) + TestCaseParameter(std::vector> vals, std::string binary = std::string{}) // NOLINT : bits_and_vals(std::move(vals)), expected_buffer_binary(binary) {} diff --git a/dbms/src/IO/tests/hashing_read_buffer.cpp b/dbms/src/IO/tests/hashing_read_buffer.cpp index cb6108d15d8..be31d5b05d3 100644 --- a/dbms/src/IO/tests/hashing_read_buffer.cpp +++ b/dbms/src/IO/tests/hashing_read_buffer.cpp @@ -3,14 +3,18 @@ #include #include "hashing_buffer.h" #include +#include + static void test(size_t data_size) { + pcg64 rng; + std::vector vec(data_size); char * data = vec.data(); for (size_t i = 0; i < data_size; ++i) - data[i] = rand() & 255; + data[i] = rng() & 255; CityHash_v1_0_2::uint128 reference = referenceHash(data, data_size); diff --git a/dbms/src/IO/tests/hashing_write_buffer.cpp b/dbms/src/IO/tests/hashing_write_buffer.cpp index cf7c18d1c77..461c39139c4 100644 --- a/dbms/src/IO/tests/hashing_write_buffer.cpp +++ b/dbms/src/IO/tests/hashing_write_buffer.cpp @@ -1,15 +1,18 @@ #include #include +#include #include "hashing_buffer.h" static void test(size_t data_size) { + pcg64 rng; + std::vector vec(data_size); char * data = vec.data(); for (size_t i = 0; i < data_size; ++i) - data[i] = rand() & 255; + data[i] = rng() & 255; CityHash_v1_0_2::uint128 reference = referenceHash(data, data_size); @@ -20,14 +23,14 @@ static void test(size_t data_size) for (size_t pos = 0; pos < data_size;) { - size_t len = std::min(static_cast(rand() % 10000 + 1), data_size - pos); + size_t len = std::min(static_cast(rng() % 10000 + 1), data_size - pos); buf.write(data + pos, len); buf.next(); pos += len; } if (buf.getHash() != reference) - FAIL("failed on data size " << data_size << " writing random chunks of up to 10000 bytes"); + FAIL("failed on data size " << data_size << " writing random chunks of up to 10000 bytes"); } { @@ -35,14 +38,14 @@ static void test(size_t data_size) for (size_t pos = 0; pos < data_size;) { - size_t len = std::min(static_cast(rand() % 5 + 1), data_size - pos); + size_t len = std::min(static_cast(rng() % 5 + 1), data_size - pos); buf.write(data + pos, len); buf.next(); pos += len; } if (buf.getHash() != reference) - FAIL("failed on data size " << data_size << " writing random chunks of up to 5 bytes"); + FAIL("failed on data size " << data_size << " writing random chunks of up to 5 bytes"); } { @@ -50,14 +53,14 @@ static void test(size_t data_size) for (size_t pos = 0; pos < data_size;) { - size_t len =

std::min(static_cast(2048 + rand() % 3 - 1), data_size - pos); + size_t len = std::min(static_cast(2048 + rng() % 3 - 1), data_size - pos); buf.write(data + pos, len); buf.next(); pos += len; } if (buf.getHash() != reference) - FAIL("failed on data size " << data_size << " writing random chunks of 2048 +-1 bytes"); + FAIL("failed on data size " << data_size << " writing random chunks of 2048 +-1 bytes"); } { diff --git a/dbms/src/IO/tests/parse_int_perf.cpp b/dbms/src/IO/tests/parse_int_perf.cpp index 11558289d24..266dee96fb6 100644 --- a/dbms/src/IO/tests/parse_int_perf.cpp +++ b/dbms/src/IO/tests/parse_int_perf.cpp @@ -1,7 +1,7 @@ #include #include -#include +#include #include #include @@ -40,7 +40,9 @@ int main(int argc, char ** argv) using T = UInt8; - size_t n = atoi(argv[1]); + size_t n = std::stol(argv[1]); + assert(n > 0); + std::vector data(n); std::vector data2(n); diff --git a/dbms/src/IO/tests/ryu_test.cpp b/dbms/src/IO/tests/ryu_test.cpp index d8c385f2d0b..0713a01960f 100644 --- a/dbms/src/IO/tests/ryu_test.cpp +++ b/dbms/src/IO/tests/ryu_test.cpp @@ -6,7 +6,7 @@ struct DecomposedFloat64 { - DecomposedFloat64(double x) + explicit DecomposedFloat64(double x) { memcpy(&x_uint, &x, sizeof(x)); } @@ -43,7 +43,7 @@ struct DecomposedFloat64 struct DecomposedFloat32 { - DecomposedFloat32(float x) + explicit DecomposedFloat32(float x) { memcpy(&x_uint, &x, sizeof(x)); } diff --git a/dbms/src/IO/tests/valid_utf8_perf.cpp b/dbms/src/IO/tests/valid_utf8_perf.cpp index b483c333934..ce9edcbb382 100644 --- a/dbms/src/IO/tests/valid_utf8_perf.cpp +++ b/dbms/src/IO/tests/valid_utf8_perf.cpp @@ -12,7 +12,7 @@ int main(int argc, char ** argv) { int repeats = 1; if (argc >= 2) - repeats = atoi(argv[1]); + repeats = std::stol(argv[1]); std::string text((std::istreambuf_iterator(std::cin)), std::istreambuf_iterator()); diff --git a/dbms/src/Interpreters/ActionsVisitor.cpp b/dbms/src/Interpreters/ActionsVisitor.cpp index bfb786dc850..3e07105c2f1 100644 --- a/dbms/src/Interpreters/ActionsVisitor.cpp +++ b/dbms/src/Interpreters/ActionsVisitor.cpp @@ -496,18 +496,17 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data & { /// If the argument is not a lambda expression, call it recursively and find out its type.
visit(child, data); - std::string name = child_column_name; - if (data.hasColumn(name)) + if (data.hasColumn(child_column_name)) { - argument_types.push_back(data.getSampleBlock().getByName(name).type); - argument_names.push_back(name); + argument_types.push_back(data.getSampleBlock().getByName(child_column_name).type); + argument_names.push_back(child_column_name); } else { if (data.only_consts) arguments_present = false; else - throw Exception("Unknown identifier: " + name, ErrorCodes::UNKNOWN_IDENTIFIER); + throw Exception("Unknown identifier: " + child_column_name, ErrorCodes::UNKNOWN_IDENTIFIER); } } } diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 30d2afdb2d8..1a40b7cefc3 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -82,8 +82,8 @@ void AggregatedDataVariants::convertToTwoLevel() { #define M(NAME) \ case Type::NAME: \ - NAME ## _two_level = std::make_unique(*NAME); \ - NAME.reset(); \ + NAME ## _two_level = std::make_unique(*(NAME)); \ + (NAME).reset(); \ type = Type::NAME ## _two_level; \ break; @@ -510,7 +510,7 @@ void NO_INLINE Aggregator::executeWithoutKeyImpl( AggregatedDataWithoutKey & res, size_t rows, AggregateFunctionInstruction * aggregate_instructions, - Arena * arena) const + Arena * arena) { /// Adding values for (AggregateFunctionInstruction * inst = aggregate_instructions; inst->that; ++inst) diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index 0fa9bd509bd..bf6fb88ca64 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -1082,11 +1082,11 @@ protected: AggregateFunctionInstruction * aggregate_instructions) const; /// For case when there are no keys (all aggregate into one row). - void executeWithoutKeyImpl( + static void executeWithoutKeyImpl( AggregatedDataWithoutKey & res, size_t rows, AggregateFunctionInstruction * aggregate_instructions, - Arena * arena) const; + Arena * arena); template void writeToTemporaryFileImpl( diff --git a/dbms/src/Interpreters/CatBoostModel.cpp b/dbms/src/Interpreters/CatBoostModel.cpp index 552905cfd5a..ca286aa5ab5 100644 --- a/dbms/src/Interpreters/CatBoostModel.cpp +++ b/dbms/src/Interpreters/CatBoostModel.cpp @@ -54,7 +54,7 @@ struct CatBoostWrapperAPI double * result, size_t resultSize); int (* GetStringCatFeatureHash)(const char * data, size_t size); - int (* GetIntegerCatFeatureHash)(long long val); + int (* GetIntegerCatFeatureHash)(uint64_t val); size_t (* GetFloatFeaturesCount)(ModelCalcerHandle* calcer); size_t (* GetCatFeaturesCount)(ModelCalcerHandle* calcer); @@ -218,7 +218,7 @@ private: /// Buffer should be allocated with features_count * column->size() elements. /// Place string pointers in positions buffer[0], buffer[features_count], ... , buffer[size * features_count] - void placeStringColumn(const ColumnString & column, const char ** buffer, size_t features_count) const + static void placeStringColumn(const ColumnString & column, const char ** buffer, size_t features_count) { size_t size = column.size(); for (size_t i = 0; i < size; ++i) @@ -231,8 +231,8 @@ private: /// Buffer should be allocated with features_count * column->size() elements. /// Place string pointers in positions buffer[0], buffer[features_count], ... , buffer[size * features_count] /// Returns PODArray which holds data (because ColumnFixedString doesn't store terminating zero). 
- PODArray placeFixedStringColumn( - const ColumnFixedString & column, const char ** buffer, size_t features_count) const + static PODArray placeFixedStringColumn( + const ColumnFixedString & column, const char ** buffer, size_t features_count) { size_t size = column.size(); size_t str_size = column.getN(); @@ -281,8 +281,8 @@ private: /// Place columns into buffer, returns data which was used for fixed string columns. /// Buffer should contains column->size() values, each value contains size strings. - std::vector> placeStringColumns( - const ColumnRawPtrs & columns, size_t offset, size_t size, const char ** buffer) const + static std::vector> placeStringColumns( + const ColumnRawPtrs & columns, size_t offset, size_t size, const char ** buffer) { if (size == 0) return {}; diff --git a/dbms/src/Interpreters/ClientInfo.cpp b/dbms/src/Interpreters/ClientInfo.cpp index 743fe934dc3..ed806e5ad57 100644 --- a/dbms/src/Interpreters/ClientInfo.cpp +++ b/dbms/src/Interpreters/ClientInfo.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/dbms/src/Interpreters/Context.cpp b/dbms/src/Interpreters/Context.cpp index 74340027dec..ab9b4a2c31b 100644 --- a/dbms/src/Interpreters/Context.cpp +++ b/dbms/src/Interpreters/Context.cpp @@ -1321,7 +1321,18 @@ BackgroundProcessingPool & Context::getBackgroundPool() { auto lock = getLock(); if (!shared->background_pool) - shared->background_pool.emplace(settings.background_pool_size); + { + BackgroundProcessingPool::PoolSettings pool_settings; + auto & config = getConfigRef(); + pool_settings.thread_sleep_seconds = config.getDouble("background_processing_pool_thread_sleep_seconds", 10); + pool_settings.thread_sleep_seconds_random_part = config.getDouble("background_processing_pool_thread_sleep_seconds_random_part", 1.0); + pool_settings.thread_sleep_seconds_if_nothing_to_do = config.getDouble("background_processing_pool_thread_sleep_seconds_if_nothing_to_do", 0.1); + pool_settings.task_sleep_seconds_when_no_work_min = config.getDouble("background_processing_pool_task_sleep_seconds_when_no_work_min", 10); + pool_settings.task_sleep_seconds_when_no_work_max = config.getDouble("background_processing_pool_task_sleep_seconds_when_no_work_max", 600); + pool_settings.task_sleep_seconds_when_no_work_multiplier = config.getDouble("background_processing_pool_task_sleep_seconds_when_no_work_multiplier", 1.1); + pool_settings.task_sleep_seconds_when_no_work_random_part = config.getDouble("background_processing_pool_task_sleep_seconds_when_no_work_random_part", 1.0); + shared->background_pool.emplace(settings.background_pool_size, pool_settings); + } return *shared->background_pool; } diff --git a/dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp b/dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp index c05e617f84d..4e249ecb44a 100644 --- a/dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp +++ b/dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp @@ -31,7 +31,7 @@ namespace struct JoinedElement { - JoinedElement(const ASTTablesInSelectQueryElement & table_element) + explicit JoinedElement(const ASTTablesInSelectQueryElement & table_element) : element(table_element) { if (element.table_join) diff --git a/dbms/src/Interpreters/DDLWorker.cpp b/dbms/src/Interpreters/DDLWorker.cpp index 8dafdbb4e37..8fe87eb9f3b 100644 --- a/dbms/src/Interpreters/DDLWorker.cpp +++ b/dbms/src/Interpreters/DDLWorker.cpp @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -668,7 +668,7 @@ void 
DDLWorker::processTask(DDLTask & task, const ZooKeeperPtr & zookeeper) } -bool DDLWorker::taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, const StoragePtr storage) const +bool DDLWorker::taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, const StoragePtr storage) { /// Pure DROP queries have to be executed on each node separately if (auto query = ast_ddl->as(); query && query->kind != ASTDropQuery::Kind::Truncate) @@ -794,7 +794,7 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( /// Does nothing if wasn't previously locked lock->unlock(); - std::this_thread::sleep_for(std::chrono::milliseconds(std::uniform_int_distribution(0, 1000)(rng))); + std::this_thread::sleep_for(std::chrono::milliseconds(std::uniform_int_distribution(0, 1000)(rng))); } /// Not executed by leader so was not executed at all diff --git a/dbms/src/Interpreters/DDLWorker.h b/dbms/src/Interpreters/DDLWorker.h index 39ae768d7d8..32b7cd5f172 100644 --- a/dbms/src/Interpreters/DDLWorker.h +++ b/dbms/src/Interpreters/DDLWorker.h @@ -59,7 +59,7 @@ private: void processTask(DDLTask & task, const ZooKeeperPtr & zookeeper); /// Check that query should be executed on leader replica only - bool taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, StoragePtr storage) const; + static bool taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, StoragePtr storage); /// Check that shard has consistent config with table void checkShardConfig(const String & table, const DDLTask & task, StoragePtr storage) const; @@ -84,7 +84,7 @@ private: void cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zookeeper); /// Init task node - void createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper); + static void createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper); void runMainThread(); diff --git a/dbms/src/Interpreters/ExpressionJIT.cpp b/dbms/src/Interpreters/ExpressionJIT.cpp index 656df317554..9b60323a885 100644 --- a/dbms/src/Interpreters/ExpressionJIT.cpp +++ b/dbms/src/Interpreters/ExpressionJIT.cpp @@ -137,7 +137,7 @@ struct SymbolResolver : public llvm::orc::SymbolResolver { llvm::LegacyJITSymbolResolver & impl; - SymbolResolver(llvm::LegacyJITSymbolResolver & impl_) : impl(impl_) {} + explicit SymbolResolver(llvm::LegacyJITSymbolResolver & impl_) : impl(impl_) {} llvm::orc::SymbolNameSet getResponsibilitySet(const llvm::orc::SymbolNameSet & symbols) final { diff --git a/dbms/src/Interpreters/ExternalLoader.cpp b/dbms/src/Interpreters/ExternalLoader.cpp index 5a012bfb5a2..471c88e0ecd 100644 --- a/dbms/src/Interpreters/ExternalLoader.cpp +++ b/dbms/src/Interpreters/ExternalLoader.cpp @@ -257,13 +257,22 @@ private: auto update_time_from_repository = repository.getUpdateTime(path); - /// Actually it can't be less, but for sure we check less or equal - if (update_time_from_repository <= file_info.last_update_time) + // We can't count on the mtime increasing or on it having + // a particular relation to system time, so just check for strict + // equality. + // Note that on 1.x versions of Poco, the granularity of update + // time is one second, so the window where we can miss the changes + // is that wide (i.e. when we read the file and after that it + // is updated, but in the same second). + // The solution to this is probably switching to std::filesystem + // -- the work is underway to do so.
+ if (update_time_from_repository == file_info.last_update_time) { file_info.in_use = true; return false; } + LOG_TRACE(log, "Loading config file '" << path << "'."); auto file_contents = repository.load(path); /// get all objects' definitions @@ -411,6 +420,8 @@ public: if (configs == new_configs) return; + LOG_TRACE(log, "Configuration of reloadable objects has changed"); + configs = new_configs; std::vector removed_names; @@ -418,7 +429,10 @@ public: { auto new_config_it = new_configs->find(name); if (new_config_it == new_configs->end()) + { + LOG_TRACE(log, "Reloadable object '" << name << "' is removed"); removed_names.emplace_back(name); + } else { const auto & new_config = new_config_it->second; @@ -427,12 +441,17 @@ public: if (!config_is_same) { /// Configuration has been changed. + LOG_TRACE(log, "Configuration has changed for reloadable " + "object '" << info.name << "'"); info.object_config = new_config; if (info.triedToLoad()) { /// The object has been tried to load before, so it is currently in use or was in use /// and we should try to reload it with the new config. + LOG_TRACE(log, "Will reload '" << name << "'" + " because its configuration has changed and" + " there were attempts to load it before"); startLoading(info, true); } } @@ -446,7 +465,11 @@ public: { Info & info = infos.emplace(name, Info{name, config}).first->second; if (always_load_everything) + { + LOG_TRACE(log, "Will reload new object '" << name << "'" + " because always_load_everything flag is set."); startLoading(info); + } } } @@ -640,6 +663,10 @@ public: if (!should_update_flag) { info.next_update_time = calculateNextUpdateTime(info.object, info.error_count); + LOG_TRACE(log, "Object '" << info.name << "'" + " not modified, will not reload. " + "Next update at " + << ext::to_string(info.next_update_time)); continue; } @@ -651,6 +678,8 @@ public: /// Object was never loaded successfully and should be reloaded. startLoading(info); } + LOG_TRACE(log, "Object '" << info.name << "' is neither" + " loaded nor failed, so it will not be reloaded as outdated."); } } } @@ -844,8 +873,14 @@ private: { if (info.is_loading()) { + LOG_TRACE(log, "The object '" << info.name << + "' is already being loaded, force = " << forced_to_reload << "."); + if (!forced_to_reload) + { return; + } + cancelLoading(info); } @@ -855,6 +890,12 @@ private: info.loading_start_time = std::chrono::system_clock::now(); info.loading_end_time = TimePoint{}; + LOG_TRACE(log, "Will load the object '" << info.name << "' " + << (enable_async_loading ? std::string("in background") + : "immediately") + << ", force = " << forced_to_reload + << ", loading_id = " << info.loading_id); + if (enable_async_loading) { /// Put a job to the thread pool for the loading. @@ -868,7 +909,7 @@ private: } } - void cancelLoading(Info & info) + static void cancelLoading(Info & info) { if (!info.is_loading()) return; @@ -882,6 +923,7 @@ private: /// Does the loading, possibly in the separate thread. void doLoading(const String & name, size_t loading_id, bool forced_to_reload, size_t min_id_to_finish_loading_dependencies_, bool async) { + LOG_TRACE(log, "Start loading object '" << name << "'"); try { /// Prepare for loading. 
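The ExternalLoader.cpp hunk above replaces the "less or equal" comparison of modification times with a strict equality check, for the reasons given in the new comment. A simplified sketch of that decision logic; FileInfo and needsReload here are illustrative stand-ins, not the actual ExternalLoader types:

#include <ctime>

struct FileInfo
{
    std::time_t last_update_time = 0;
    bool in_use = false;
};

// Returns true when the configuration file should be re-read.
static bool needsReload(FileInfo & info, std::time_t mtime_from_repository)
{
    // Any difference counts as a change: mtimes are not assumed to be monotonic
    // or tied to the local clock, so ordering comparisons carry no meaning here.
    if (mtime_from_repository == info.last_update_time)
    {
        info.in_use = true;   // unchanged since the last read
        return false;
    }
    info.last_update_time = mtime_from_repository;
    return true;
}

With one-second mtime granularity a rewrite within the same second is still missed, which is exactly the window the comment describes.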
@@ -890,7 +932,11 @@ private: LoadingGuardForAsyncLoad lock(async, mutex); info = prepareToLoadSingleObject(name, loading_id, min_id_to_finish_loading_dependencies_, lock); if (!info) + { + LOG_TRACE(log, "Could not lock object '" << name + << "' for loading"); return; + } } /// Previous version can be used as the base for new loading, enabling loading only part of data. @@ -989,8 +1035,22 @@ private: /// We should check if this is still the same loading as we were doing. /// This is necessary because the object could be removed or load with another config while the `mutex` was unlocked. - if (!info || !info->is_loading() || (info->loading_id != loading_id)) + if (!info) + { + LOG_TRACE(log, "Next update time for '" << name << "' will not be set because this object was not found."); return; + } + if (!info->is_loading()) + { + LOG_TRACE(log, "Next update time for '" << name << "' will not be set because this object is not currently loading."); + return; + } + if (info->loading_id != loading_id) + { + LOG_TRACE(log, "Next update time for '" << name << "' will not be set because this object's current loading_id " + << info->loading_id << " is different from the specified " << loading_id << "."); + return; + } if (new_exception) { @@ -1018,6 +1078,8 @@ private: info->last_successful_update_time = current_time; info->state_id = info->loading_id; info->next_update_time = next_update_time; + LOG_TRACE(log, "Next update time for '" << info->name + << "' was set to " << ext::to_string(next_update_time)); } /// Removes the references to the loading thread from the maps. @@ -1046,21 +1108,50 @@ private: if (loaded_object) { if (!loaded_object->supportUpdates()) + { + LOG_TRACE(log, "Supposed update time for " + "'" + loaded_object->getLoadableName() + "'" + " is never (loaded, does not support updates)"); + return never; + } /// do not update loadable objects with zero as lifetime const auto & lifetime = loaded_object->getLifetime(); if (lifetime.min_sec == 0 && lifetime.max_sec == 0) + { + LOG_TRACE(log, "Supposed update time for " + "'" + loaded_object->getLoadableName() + "'" + " is never (loaded, lifetime 0)"); return never; + } if (!error_count) { std::uniform_int_distribution distribution{lifetime.min_sec, lifetime.max_sec}; - return std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)}; + auto result = std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)}; + LOG_TRACE(log, "Supposed update time for " + "'" << loaded_object->getLoadableName() << "'" + " is " << ext::to_string(result) + << " (loaded, lifetime [" << lifetime.min_sec + << ", " << lifetime.max_sec << "], no errors)"); + return result; } - } - return std::chrono::system_clock::now() + std::chrono::seconds(calculateDurationWithBackoff(rnd_engine, error_count)); + auto result = std::chrono::system_clock::now() + std::chrono::seconds(calculateDurationWithBackoff(rnd_engine, error_count)); + LOG_TRACE(log, "Supposed update time for '" << loaded_object->getLoadableName() << "'" + " is " << ext::to_string(result) + << " (backoff, " << error_count << " errors)"); + return result; + } + else + { + auto result = std::chrono::system_clock::now() + std::chrono::seconds(calculateDurationWithBackoff(rnd_engine, error_count)); + LOG_TRACE(log, "Supposed update time for unspecified object " + " is " << ext::to_string(result) + << " (backoff, " << error_count << " errors."); + return result; + } } const CreateObjectFunction create_object; diff --git 
a/dbms/src/Interpreters/InterpreterDropQuery.cpp b/dbms/src/Interpreters/InterpreterDropQuery.cpp index bb62d704d2d..12a39e5aac4 100644 --- a/dbms/src/Interpreters/InterpreterDropQuery.cpp +++ b/dbms/src/Interpreters/InterpreterDropQuery.cpp @@ -25,6 +25,12 @@ namespace ErrorCodes } +static DatabasePtr tryGetDatabase(const String & database_name, bool if_exists) +{ + return if_exists ? DatabaseCatalog::instance().tryGetDatabase(database_name) : DatabaseCatalog::instance().getDatabase(database_name); +} + + InterpreterDropQuery::InterpreterDropQuery(const ASTPtr & query_ptr_, Context & context_) : query_ptr(query_ptr_), context(context_) {} @@ -227,6 +233,7 @@ BlockIO InterpreterDropQuery::executeToTemporaryTable(const String & table_name, return {}; } + BlockIO InterpreterDropQuery::executeToDatabase(const String & database_name, ASTDropQuery::Kind kind, bool if_exists) { auto ddl_guard = DatabaseCatalog::instance().getDDLGuard(database_name, ""); @@ -263,10 +270,6 @@ BlockIO InterpreterDropQuery::executeToDatabase(const String & database_name, AS return {}; } -DatabasePtr InterpreterDropQuery::tryGetDatabase(const String & database_name, bool if_exists) -{ - return if_exists ? DatabaseCatalog::instance().tryGetDatabase(database_name) : DatabaseCatalog::instance().getDatabase(database_name); -} DatabaseAndTable InterpreterDropQuery::tryGetDatabaseAndTable(const String & database_name, const String & table_name, bool if_exists) { diff --git a/dbms/src/Interpreters/InterpreterDropQuery.h b/dbms/src/Interpreters/InterpreterDropQuery.h index 6a9c249973e..fd6f369e876 100644 --- a/dbms/src/Interpreters/InterpreterDropQuery.h +++ b/dbms/src/Interpreters/InterpreterDropQuery.h @@ -34,8 +34,6 @@ private: BlockIO executeToDictionary(const String & database_name, const String & dictionary_name, ASTDropQuery::Kind kind, bool if_exists, bool is_temporary, bool no_ddl_lock); - DatabasePtr tryGetDatabase(const String & database_name, bool exists); - DatabaseAndTable tryGetDatabaseAndTable(const String & database_name, const String & table_name, bool if_exists); BlockIO executeToTemporaryTable(const String & table_name, ASTDropQuery::Kind kind); diff --git a/dbms/src/Interpreters/InterpreterKillQueryQuery.cpp b/dbms/src/Interpreters/InterpreterKillQueryQuery.cpp index f384e005e3c..81a093f4eae 100644 --- a/dbms/src/Interpreters/InterpreterKillQueryQuery.cpp +++ b/dbms/src/Interpreters/InterpreterKillQueryQuery.cpp @@ -102,7 +102,7 @@ static QueryDescriptors extractQueriesExceptMeAndCheckAccess(const Block & proce res.emplace_back(std::move(query_id), std::move(query_user), i, false); } - if (res.empty() && !query_user.empty()) + if (res.empty() && !query_user.empty()) // NOLINT throw Exception("User " + my_client.current_user + " attempts to kill query created by " + query_user, ErrorCodes::ACCESS_DENIED); return res; diff --git a/dbms/src/Interpreters/InterpreterSelectQuery.cpp b/dbms/src/Interpreters/InterpreterSelectQuery.cpp index 3adc2eadbfc..085ebe52557 100644 --- a/dbms/src/Interpreters/InterpreterSelectQuery.cpp +++ b/dbms/src/Interpreters/InterpreterSelectQuery.cpp @@ -2046,10 +2046,12 @@ void InterpreterSelectQuery::executeOrder(QueryPipeline & pipeline, InputSorting if (need_finish_sorting) { - pipeline.addSimpleTransform([&](const Block & header, QueryPipeline::StreamType stream_type) + pipeline.addSimpleTransform([&](const Block & header, QueryPipeline::StreamType stream_type) -> ProcessorPtr { - bool do_count_rows = stream_type == QueryPipeline::StreamType::Main; - return 
std::make_shared(header, output_order_descr, limit, do_count_rows); + if (stream_type != QueryPipeline::StreamType::Main) + return nullptr; + + return std::make_shared(header, output_order_descr, limit); }); pipeline.addSimpleTransform([&](const Block & header) -> ProcessorPtr @@ -2063,10 +2065,12 @@ void InterpreterSelectQuery::executeOrder(QueryPipeline & pipeline, InputSorting return; } - pipeline.addSimpleTransform([&](const Block & header, QueryPipeline::StreamType stream_type) + pipeline.addSimpleTransform([&](const Block & header, QueryPipeline::StreamType stream_type) -> ProcessorPtr { - bool do_count_rows = stream_type == QueryPipeline::StreamType::Main; - return std::make_shared(header, output_order_descr, limit, do_count_rows); + if (stream_type != QueryPipeline::StreamType::Main) + return nullptr; + + return std::make_shared(header, output_order_descr, limit); }); /// Merge the sorted blocks. diff --git a/dbms/src/Interpreters/InterpreterSelectQuery.h b/dbms/src/Interpreters/InterpreterSelectQuery.h index c33dc5bdb57..d3007813218 100644 --- a/dbms/src/Interpreters/InterpreterSelectQuery.h +++ b/dbms/src/Interpreters/InterpreterSelectQuery.h @@ -168,7 +168,7 @@ private: void executeMergeAggregated(Pipeline & pipeline, bool overflow_row, bool final); void executeTotalsAndHaving(Pipeline & pipeline, bool has_having, const ExpressionActionsPtr & expression, bool overflow_row, bool final); void executeHaving(Pipeline & pipeline, const ExpressionActionsPtr & expression); - void executeExpression(Pipeline & pipeline, const ExpressionActionsPtr & expression); + static void executeExpression(Pipeline & pipeline, const ExpressionActionsPtr & expression); void executeOrder(Pipeline & pipeline, InputSortingInfoPtr sorting_info); void executeWithFill(Pipeline & pipeline); void executeMergeSorted(Pipeline & pipeline); @@ -176,7 +176,7 @@ private: void executeUnion(Pipeline & pipeline, Block header); void executeLimitBy(Pipeline & pipeline); void executeLimit(Pipeline & pipeline); - void executeProjection(Pipeline & pipeline, const ExpressionActionsPtr & expression); + static void executeProjection(Pipeline & pipeline, const ExpressionActionsPtr & expression); void executeDistinct(Pipeline & pipeline, bool before_order, Names columns); void executeExtremes(Pipeline & pipeline); void executeSubqueriesInSetsAndJoins(Pipeline & pipeline, const std::unordered_map & subqueries_for_sets); @@ -187,14 +187,14 @@ private: void executeMergeAggregated(QueryPipeline & pipeline, bool overflow_row, bool final); void executeTotalsAndHaving(QueryPipeline & pipeline, bool has_having, const ExpressionActionsPtr & expression, bool overflow_row, bool final); void executeHaving(QueryPipeline & pipeline, const ExpressionActionsPtr & expression); - void executeExpression(QueryPipeline & pipeline, const ExpressionActionsPtr & expression); + static void executeExpression(QueryPipeline & pipeline, const ExpressionActionsPtr & expression); void executeOrder(QueryPipeline & pipeline, InputSortingInfoPtr sorting_info); void executeWithFill(QueryPipeline & pipeline); void executeMergeSorted(QueryPipeline & pipeline); void executePreLimit(QueryPipeline & pipeline, bool do_not_skip_offset); void executeLimitBy(QueryPipeline & pipeline); void executeLimit(QueryPipeline & pipeline); - void executeProjection(QueryPipeline & pipeline, const ExpressionActionsPtr & expression); + static void executeProjection(QueryPipeline & pipeline, const ExpressionActionsPtr & expression); void executeDistinct(QueryPipeline & pipeline, bool 
before_order, Names columns); void executeExtremes(QueryPipeline & pipeline); void executeSubqueriesInSetsAndJoins(QueryPipeline & pipeline, const std::unordered_map & subqueries_for_sets); diff --git a/dbms/src/Interpreters/InterpreterWatchQuery.cpp b/dbms/src/Interpreters/InterpreterWatchQuery.cpp index f9be3e22c55..91e7bfac2a3 100644 --- a/dbms/src/Interpreters/InterpreterWatchQuery.cpp +++ b/dbms/src/Interpreters/InterpreterWatchQuery.cpp @@ -28,10 +28,6 @@ namespace ErrorCodes extern const int SUPPORT_IS_DISABLED; } -BlockInputStreamPtr InterpreterWatchQuery::executeImpl() -{ - return std::make_shared(Block()); -} BlockIO InterpreterWatchQuery::execute() { diff --git a/dbms/src/Interpreters/InterpreterWatchQuery.h b/dbms/src/Interpreters/InterpreterWatchQuery.h index 9315ee1f889..7a5d57a1cf5 100644 --- a/dbms/src/Interpreters/InterpreterWatchQuery.h +++ b/dbms/src/Interpreters/InterpreterWatchQuery.h @@ -39,7 +39,6 @@ private: ASTPtr query_ptr; Context & context; - BlockInputStreamPtr executeImpl(); /// Table from where to read data, if not subquery. StoragePtr storage; /// Streams of read data diff --git a/dbms/src/Interpreters/Join.h b/dbms/src/Interpreters/Join.h index 64c6342cf04..d9f0cfb55cb 100644 --- a/dbms/src/Interpreters/Join.h +++ b/dbms/src/Interpreters/Join.h @@ -153,7 +153,7 @@ public: /** Add block of data from right hand of JOIN to the map. * Returns false, if some limit was exceeded and you should not insert more data. */ - bool addJoinedBlock(const Block & block, bool check_limits = true) override; + bool addJoinedBlock(const Block & block, bool check_limits) override; /** Join data from the map (that was previously built by calls to addJoinedBlock) to the block with data from "left" table. * Could be called from different threads in parallel. diff --git a/dbms/src/Interpreters/JoinSwitcher.h b/dbms/src/Interpreters/JoinSwitcher.h index 5e677b5205f..ecf042fb7ac 100644 --- a/dbms/src/Interpreters/JoinSwitcher.h +++ b/dbms/src/Interpreters/JoinSwitcher.h @@ -20,7 +20,7 @@ public: /// Add block of data from right hand of JOIN into current join object. /// If join-in-memory memory limit exceeded switches to join-on-disk and continue with it. /// @returns false, if join-on-disk disk limit exceeded - bool addJoinedBlock(const Block & block, bool check_limits = true) override; + bool addJoinedBlock(const Block & block, bool check_limits) override; void joinBlock(Block & block, std::shared_ptr & not_processed) override { diff --git a/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp b/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp index 39037613dda..dbe58cfc86a 100644 --- a/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp +++ b/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp @@ -129,7 +129,7 @@ struct ColumnAliasesMatcher std::vector> compound_identifiers; std::set allowed_long_names; /// original names allowed as aliases '--t.x as t.x' (select expressions only). - Data(const std::vector && tables_) + explicit Data(const std::vector && tables_) : tables(tables_) , public_names(false) {} diff --git a/dbms/src/Interpreters/LogicalExpressionsOptimizer.h b/dbms/src/Interpreters/LogicalExpressionsOptimizer.h index fa5289b3f5f..1a04b199a13 100644 --- a/dbms/src/Interpreters/LogicalExpressionsOptimizer.h +++ b/dbms/src/Interpreters/LogicalExpressionsOptimizer.h @@ -81,7 +81,7 @@ private: bool mayOptimizeDisjunctiveEqualityChain(const DisjunctiveEqualityChain & chain) const; /// Insert the IN expression into the OR chain. 
- void addInExpression(const DisjunctiveEqualityChain & chain); + static void addInExpression(const DisjunctiveEqualityChain & chain); /// Delete the equalities that were replaced by the IN expressions. void cleanupOrExpressions(); diff --git a/dbms/src/Interpreters/MergeJoin.h b/dbms/src/Interpreters/MergeJoin.h index 7d934aed06a..74a11fc05e4 100644 --- a/dbms/src/Interpreters/MergeJoin.h +++ b/dbms/src/Interpreters/MergeJoin.h @@ -50,7 +50,7 @@ class MergeJoin : public IJoin public: MergeJoin(std::shared_ptr table_join_, const Block & right_sample_block); - bool addJoinedBlock(const Block & block, bool check_limits = true) override; + bool addJoinedBlock(const Block & block, bool check_limits) override; void joinBlock(Block &, ExtraBlockPtr & not_processed) override; void joinTotals(Block &) const override; void setTotals(const Block &) override; diff --git a/dbms/src/Interpreters/PartLog.cpp b/dbms/src/Interpreters/PartLog.cpp index d77bb3fed59..c797ad8a8a9 100644 --- a/dbms/src/Interpreters/PartLog.cpp +++ b/dbms/src/Interpreters/PartLog.cpp @@ -50,6 +50,7 @@ Block PartLogElement::createBlock() {ColumnUInt64::create(), std::make_shared(), "bytes_uncompressed"}, // Result bytes {ColumnUInt64::create(), std::make_shared(), "read_rows"}, {ColumnUInt64::create(), std::make_shared(), "read_bytes"}, + {ColumnUInt64::create(), std::make_shared(), "peak_memory_usage"}, /// Is there an error during the execution or commit {ColumnUInt16::create(), std::make_shared(), "error"}, @@ -87,6 +88,7 @@ void PartLogElement::appendToBlock(Block & block) const columns[i++]->insert(bytes_uncompressed); columns[i++]->insert(rows_read); columns[i++]->insert(bytes_read_uncompressed); + columns[i++]->insert(peak_memory_usage); columns[i++]->insert(error); columns[i++]->insert(exception); diff --git a/dbms/src/Interpreters/PartLog.h b/dbms/src/Interpreters/PartLog.h index 4c4930ccefa..b84138159a2 100644 --- a/dbms/src/Interpreters/PartLog.h +++ b/dbms/src/Interpreters/PartLog.h @@ -40,11 +40,13 @@ struct PartLogElement UInt64 bytes_uncompressed = 0; UInt64 rows_read = 0; UInt64 bytes_read_uncompressed = 0; + UInt64 peak_memory_usage = 0; /// Was the operation successful? 
UInt16 error = 0; String exception; + static std::string name() { return "PartLog"; } static Block createBlock(); diff --git a/dbms/src/Interpreters/ProcessList.cpp b/dbms/src/Interpreters/ProcessList.cpp index d0d020e2ba2..cf8001ab2c4 100644 --- a/dbms/src/Interpreters/ProcessList.cpp +++ b/dbms/src/Interpreters/ProcessList.cpp @@ -484,4 +484,32 @@ ProcessList::Info ProcessList::getInfo(bool get_thread_list, bool get_profile_ev ProcessListForUser::ProcessListForUser() = default; +ProcessListForUserInfo ProcessListForUser::getInfo(bool get_profile_events) const +{ + ProcessListForUserInfo res; + + res.memory_usage = user_memory_tracker.get(); + res.peak_memory_usage = user_memory_tracker.getPeak(); + + if (get_profile_events) + res.profile_counters = std::make_shared(user_performance_counters.getPartiallyAtomicSnapshot()); + + return res; +} + + +ProcessList::UserInfo ProcessList::getUserInfo(bool get_profile_events) const +{ + UserInfo per_user_infos; + + std::lock_guard lock(mutex); + + per_user_infos.reserve(user_to_queries.size()); + + for (const auto & [user, user_queries] : user_to_queries) + per_user_infos.emplace(user, user_queries.getInfo(get_profile_events)); + + return per_user_infos; +} + } diff --git a/dbms/src/Interpreters/ProcessList.h b/dbms/src/Interpreters/ProcessList.h index a5edf7f90d8..da6799c5eca 100644 --- a/dbms/src/Interpreters/ProcessList.h +++ b/dbms/src/Interpreters/ProcessList.h @@ -197,6 +197,17 @@ public: }; +/// Information of process list for user. +struct ProcessListForUserInfo +{ + Int64 memory_usage; + Int64 peak_memory_usage; + + // Optional field, filled by request. + std::shared_ptr profile_counters; +}; + + /// Data about queries for one user. struct ProcessListForUser { @@ -213,6 +224,8 @@ struct ProcessListForUser /// Count network usage for all simultaneously running queries of single user. ThrottlerPtr user_throttler; + ProcessListForUserInfo getInfo(bool get_profile_events = false) const; + /// Clears MemoryTracker for the user. /// Sometimes it is important to reset the MemoryTracker, because it may accumulate skew /// due to the fact that there are cases when memory can be allocated while processing the query, but released later. @@ -261,6 +274,8 @@ public: /// list, for iterators not to invalidate. NOTE: could replace with cyclic buffer, but not worth. using Container = std::list; using Info = std::vector; + using UserInfo = std::unordered_map; + /// User -> queries using UserToQueries = std::unordered_map; @@ -307,6 +322,9 @@ public: /// Get current state of process list. Info getInfo(bool get_thread_list = false, bool get_profile_events = false, bool get_settings = false) const; + /// Get current state of process list per user. 
+ UserInfo getUserInfo(bool get_profile_events = false) const; + void setMaxSize(size_t max_size_) { std::lock_guard lock(mutex); diff --git a/dbms/src/Interpreters/QueryNormalizer.cpp b/dbms/src/Interpreters/QueryNormalizer.cpp index cb084ee2cbc..568b08b8f5a 100644 --- a/dbms/src/Interpreters/QueryNormalizer.cpp +++ b/dbms/src/Interpreters/QueryNormalizer.cpp @@ -26,7 +26,7 @@ namespace ErrorCodes class CheckASTDepth { public: - CheckASTDepth(QueryNormalizer::Data & data_) + explicit CheckASTDepth(QueryNormalizer::Data & data_) : data(data_) { if (data.level > data.settings.max_ast_depth) @@ -47,7 +47,7 @@ private: class RestoreAliasOnExitScope { public: - RestoreAliasOnExitScope(String & alias_) + explicit RestoreAliasOnExitScope(String & alias_) : alias(alias_) , copy(alias_) {} diff --git a/dbms/src/Interpreters/SelectQueryOptions.h b/dbms/src/Interpreters/SelectQueryOptions.h index a49245f5609..ed6f759a7d5 100644 --- a/dbms/src/Interpreters/SelectQueryOptions.h +++ b/dbms/src/Interpreters/SelectQueryOptions.h @@ -45,9 +45,9 @@ struct SelectQueryOptions return out; } - SelectQueryOptions & analyze(bool value = true) + SelectQueryOptions & analyze(bool dry_run = true) { - only_analyze = value; + only_analyze = dry_run; return *this; } diff --git a/dbms/src/Interpreters/SetVariants.cpp b/dbms/src/Interpreters/SetVariants.cpp index 56f2ff04230..52f54d2442a 100644 --- a/dbms/src/Interpreters/SetVariants.cpp +++ b/dbms/src/Interpreters/SetVariants.cpp @@ -23,7 +23,7 @@ void SetVariantsTemplate::init(Type type_) case Type::EMPTY: break; #define M(NAME) \ - case Type::NAME: NAME = std::make_unique(); break; + case Type::NAME: (NAME) = std::make_unique(); break; APPLY_FOR_SET_VARIANTS(M) #undef M } @@ -37,7 +37,7 @@ size_t SetVariantsTemplate::getTotalRowCount() const case Type::EMPTY: return 0; #define M(NAME) \ - case Type::NAME: return NAME->data.size(); + case Type::NAME: return (NAME)->data.size(); APPLY_FOR_SET_VARIANTS(M) #undef M } @@ -53,7 +53,7 @@ size_t SetVariantsTemplate::getTotalByteCount() const case Type::EMPTY: return 0; #define M(NAME) \ - case Type::NAME: return NAME->data.getBufferSizeInBytes(); + case Type::NAME: return (NAME)->data.getBufferSizeInBytes(); APPLY_FOR_SET_VARIANTS(M) #undef M } diff --git a/dbms/src/Interpreters/SyntaxAnalyzer.cpp b/dbms/src/Interpreters/SyntaxAnalyzer.cpp index ce6f163ab5b..5e35c03a577 100644 --- a/dbms/src/Interpreters/SyntaxAnalyzer.cpp +++ b/dbms/src/Interpreters/SyntaxAnalyzer.cpp @@ -806,6 +806,7 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyzeSelect( /// TODO: Remove unneeded conversion std::vector tables_with_column_names; + tables_with_column_names.reserve(tables_with_columns.size()); for (const auto & table : tables_with_columns) tables_with_column_names.emplace_back(table.removeTypes()); @@ -887,7 +888,7 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyze(ASTPtr & query, const NamesAndTy return std::make_shared(result); } -void SyntaxAnalyzer::normalize(ASTPtr & query, Aliases & aliases, const Settings & settings) const +void SyntaxAnalyzer::normalize(ASTPtr & query, Aliases & aliases, const Settings & settings) { CustomizeFunctionsVisitor::Data data{settings.count_distinct_implementation}; CustomizeFunctionsVisitor(data).visit(query); diff --git a/dbms/src/Interpreters/SyntaxAnalyzer.h b/dbms/src/Interpreters/SyntaxAnalyzer.h index 671e91b663f..1578b261571 100644 --- a/dbms/src/Interpreters/SyntaxAnalyzer.h +++ b/dbms/src/Interpreters/SyntaxAnalyzer.h @@ -99,7 +99,7 @@ public: private: const Context & context; - void 
normalize(ASTPtr & query, Aliases & aliases, const Settings & settings) const; + static void normalize(ASTPtr & query, Aliases & aliases, const Settings & settings); }; } diff --git a/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.cpp b/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.cpp index 27542b97691..a0f411dcc96 100644 --- a/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.cpp +++ b/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.cpp @@ -34,7 +34,7 @@ bool TranslateQualifiedNamesMatcher::Data::unknownColumn(size_t table_pos, const auto nested1 = IdentifierSemantic::extractNestedName(identifier, table.table); auto nested2 = IdentifierSemantic::extractNestedName(identifier, table.alias); - String short_name = identifier.shortName(); + const String & short_name = identifier.shortName(); const Names & column_names = tables[table_pos].columns; for (auto & known_name : column_names) { diff --git a/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.h b/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.h index bcd032938aa..51c6c2c42f0 100644 --- a/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.h +++ b/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.h @@ -64,7 +64,7 @@ struct RestoreQualifiedNamesData { using TypeToVisit = ASTIdentifier; - void visit(ASTIdentifier & identifier, ASTPtr & ast); + static void visit(ASTIdentifier & identifier, ASTPtr & ast); }; using RestoreQualifiedNamesMatcher = OneTypeMatcher; diff --git a/dbms/src/Interpreters/executeQuery.cpp b/dbms/src/Interpreters/executeQuery.cpp index cfd588078ff..fefca6b580f 100644 --- a/dbms/src/Interpreters/executeQuery.cpp +++ b/dbms/src/Interpreters/executeQuery.cpp @@ -748,8 +748,6 @@ void executeQuery( auto executor = pipeline.execute(); executor->execute(context.getSettingsRef().max_threads); } - - pipeline.finalize(); } } catch (...) diff --git a/dbms/src/Interpreters/tests/aggregate.cpp b/dbms/src/Interpreters/tests/aggregate.cpp index df498d6039d..9959bca7aac 100644 --- a/dbms/src/Interpreters/tests/aggregate.cpp +++ b/dbms/src/Interpreters/tests/aggregate.cpp @@ -20,7 +20,7 @@ int main(int argc, char ** argv) try { - size_t n = argc == 2 ? atoi(argv[1]) : 10; + size_t n = argc == 2 ? std::stol(argv[1]) : 10; Block block; diff --git a/dbms/src/Interpreters/tests/expression.cpp b/dbms/src/Interpreters/tests/expression.cpp index bf0058e5289..8d64b4f64ce 100644 --- a/dbms/src/Interpreters/tests/expression.cpp +++ b/dbms/src/Interpreters/tests/expression.cpp @@ -63,7 +63,7 @@ int main(int argc, char ** argv) chain.finalize(); ExpressionActionsPtr expression = chain.getLastActions(); - size_t n = argc == 2 ? atoi(argv[1]) : 10; + size_t n = argc == 2 ? std::stol(argv[1]) : 10; Block block; diff --git a/dbms/src/Interpreters/tests/hash_map.cpp b/dbms/src/Interpreters/tests/hash_map.cpp index 7c52953fa9f..bc35bea16bb 100644 --- a/dbms/src/Interpreters/tests/hash_map.cpp +++ b/dbms/src/Interpreters/tests/hash_map.cpp @@ -92,8 +92,8 @@ int main(int argc, char ** argv) using Value = std::vector; #endif - size_t n = argc < 2 ? 10000000 : atoi(argv[1]); - //size_t m = atoi(argv[2]); + size_t n = argc < 2 ? 
10000000 : std::stol(argv[1]); + //size_t m = std::stol(argv[2]); AggregateFunctionFactory factory; DataTypes data_types_empty; @@ -149,7 +149,7 @@ int main(int argc, char ** argv) << std::endl; } - if (argc < 3 || atoi(argv[2]) == 1) + if (argc < 3 || std::stol(argv[2]) == 1) { Stopwatch watch; @@ -179,7 +179,7 @@ int main(int argc, char ** argv) << std::endl; } - if (argc < 3 || atoi(argv[2]) == 2) + if (argc < 3 || std::stol(argv[2]) == 2) { Stopwatch watch; @@ -211,7 +211,7 @@ int main(int argc, char ** argv) } #if defined(__x86_64__) - if (argc < 3 || atoi(argv[2]) == 3) + if (argc < 3 || std::stol(argv[2]) == 3) { Stopwatch watch; @@ -243,7 +243,7 @@ int main(int argc, char ** argv) } #endif - if (argc < 3 || atoi(argv[2]) == 4) + if (argc < 3 || std::stol(argv[2]) == 4) { Stopwatch watch; @@ -263,7 +263,7 @@ int main(int argc, char ** argv) << std::endl; } - if (argc < 3 || atoi(argv[2]) == 5) + if (argc < 3 || std::stol(argv[2]) == 5) { Stopwatch watch; @@ -284,7 +284,7 @@ int main(int argc, char ** argv) << std::endl; } - if (argc < 3 || atoi(argv[2]) == 6) + if (argc < 3 || std::stol(argv[2]) == 6) { Stopwatch watch; diff --git a/dbms/src/Interpreters/tests/hash_map_lookup.cpp b/dbms/src/Interpreters/tests/hash_map_lookup.cpp index 13ff3234e8b..926e6b6766e 100644 --- a/dbms/src/Interpreters/tests/hash_map_lookup.cpp +++ b/dbms/src/Interpreters/tests/hash_map_lookup.cpp @@ -97,8 +97,8 @@ int main(int argc, char ** argv) return 1; } - size_t n = atoi(argv[1]); - size_t m = atoi(argv[2]); + size_t n = std::stol(argv[1]); + size_t m = std::stol(argv[2]); std::vector data(n); diff --git a/dbms/src/Interpreters/tests/hash_map_string.cpp b/dbms/src/Interpreters/tests/hash_map_string.cpp index 6c1f6b813ff..26f69f95e5e 100644 --- a/dbms/src/Interpreters/tests/hash_map_string.cpp +++ b/dbms/src/Interpreters/tests/hash_map_string.cpp @@ -277,7 +277,7 @@ struct Grower : public HashTableGrower<> } /// Set the buffer size by the number of elements in the hash table. Used when deserializing a hash table. 
- [[noreturn]] void set(size_t /*num_elems*/) + [[noreturn]] static void set(size_t /*num_elems*/) { throw Poco::Exception(__PRETTY_FUNCTION__); } @@ -292,8 +292,8 @@ int main(int argc, char ** argv) return 1; } - size_t n = atoi(argv[1]); - size_t m = atoi(argv[2]); + size_t n = std::stol(argv[1]); + size_t m = std::stol(argv[2]); DB::Arena pool; std::vector data(n); diff --git a/dbms/src/Interpreters/tests/hash_map_string_2.cpp b/dbms/src/Interpreters/tests/hash_map_string_2.cpp index 32b723c1187..5f6954cb0b0 100644 --- a/dbms/src/Interpreters/tests/hash_map_string_2.cpp +++ b/dbms/src/Interpreters/tests/hash_map_string_2.cpp @@ -54,16 +54,16 @@ struct STRUCT : public StringRef {}; \ namespace ZeroTraits \ { \ template <> \ - inline bool check(STRUCT x) { return 0 == x.size; } \ + inline bool check(STRUCT x) { return 0 == x.size; } /* NOLINT */ \ \ template <> \ - inline void set(STRUCT & x) { x.size = 0; } \ + inline void set(STRUCT & x) { x.size = 0; } /* NOLINT */ \ } \ \ template <> \ struct DefaultHash \ { \ - size_t operator() (STRUCT x) const \ + size_t operator() (STRUCT x) const /* NOLINT */ \ { \ return CityHash_v1_0_2::CityHash64(x.data, x.size); \ } \ @@ -614,8 +614,8 @@ int main(int argc, char ** argv) return 1; } - size_t n = atoi(argv[1]); - size_t m = atoi(argv[2]); + size_t n = std::stol(argv[1]); + size_t m = std::stol(argv[2]); DB::Arena pool; std::vector data(n); diff --git a/dbms/src/Interpreters/tests/hash_map_string_3.cpp b/dbms/src/Interpreters/tests/hash_map_string_3.cpp index 62ed0584d3f..3dfbe5fb0f2 100644 --- a/dbms/src/Interpreters/tests/hash_map_string_3.cpp +++ b/dbms/src/Interpreters/tests/hash_map_string_3.cpp @@ -57,10 +57,10 @@ struct STRUCT : public StringRef {}; \ namespace ZeroTraits \ { \ template <> \ - inline bool check(STRUCT x) { return nullptr == x.data; } \ + inline bool check(STRUCT x) { return nullptr == x.data; } /* NOLINT */ \ \ template <> \ - inline void set(STRUCT & x) { x.data = nullptr; } \ + inline void set(STRUCT & x) { x.data = nullptr; } /* NOLINT */ \ } \ \ template <> \ @@ -466,8 +466,8 @@ int main(int argc, char ** argv) return 1; } - size_t n = atoi(argv[1]); - size_t m = atoi(argv[2]); + size_t n = std::stol(argv[1]); + size_t m = std::stol(argv[2]); DB::Arena pool; std::vector data(n); diff --git a/dbms/src/Interpreters/tests/hash_map_string_small.cpp b/dbms/src/Interpreters/tests/hash_map_string_small.cpp index 18ee1eb0531..7dac9691dc9 100644 --- a/dbms/src/Interpreters/tests/hash_map_string_small.cpp +++ b/dbms/src/Interpreters/tests/hash_map_string_small.cpp @@ -102,8 +102,8 @@ int main(int argc, char ** argv) return 1; } - size_t n = atoi(argv[1]); - size_t m = atoi(argv[2]); + size_t n = std::stol(argv[1]); + size_t m = std::stol(argv[2]); DB::Arena pool; std::vector data(n); diff --git a/dbms/src/Interpreters/tests/in_join_subqueries_preprocessor.cpp b/dbms/src/Interpreters/tests/in_join_subqueries_preprocessor.cpp index 017ca2e1739..47dcb406114 100644 --- a/dbms/src/Interpreters/tests/in_join_subqueries_preprocessor.cpp +++ b/dbms/src/Interpreters/tests/in_join_subqueries_preprocessor.cpp @@ -28,7 +28,7 @@ namespace DB /// Simplified version of the StorageDistributed class. 
-class StorageDistributedFake : public ext::shared_ptr_helper, public DB::IStorage +class StorageDistributedFake final : public ext::shared_ptr_helper, public DB::IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Interpreters/tests/string_hash_map.cpp b/dbms/src/Interpreters/tests/string_hash_map.cpp index 37fbefbe987..3969458fced 100644 --- a/dbms/src/Interpreters/tests/string_hash_map.cpp +++ b/dbms/src/Interpreters/tests/string_hash_map.cpp @@ -211,8 +211,8 @@ int main(int argc, char ** argv) return 1; } - size_t n = atoi(argv[1]); - size_t m = atoi(argv[2]); + size_t n = std::stol(argv[1]); + size_t m = std::stol(argv[2]); DB::Arena pool(128 * 1024 * 1024); std::vector data(n); diff --git a/dbms/src/Interpreters/tests/two_level_hash_map.cpp b/dbms/src/Interpreters/tests/two_level_hash_map.cpp index e1370cd3932..604f0de2976 100644 --- a/dbms/src/Interpreters/tests/two_level_hash_map.cpp +++ b/dbms/src/Interpreters/tests/two_level_hash_map.cpp @@ -31,7 +31,7 @@ int main(int argc, char ** argv) return 1; } - size_t n = atoi(argv[1]); + size_t n = std::stol(argv[1]); std::vector data(n); diff --git a/dbms/src/Parsers/ASTTablesInSelectQuery.cpp b/dbms/src/Parsers/ASTTablesInSelectQuery.cpp index b085f5a28ae..60cb0475be7 100644 --- a/dbms/src/Parsers/ASTTablesInSelectQuery.cpp +++ b/dbms/src/Parsers/ASTTablesInSelectQuery.cpp @@ -11,7 +11,7 @@ do \ { \ if (member) \ { \ - res->member = member->clone(); \ + res->member = (member)->clone(); \ res->children.push_back(res->member); \ } \ } \ diff --git a/dbms/src/Parsers/ASTWithAlias.cpp b/dbms/src/Parsers/ASTWithAlias.cpp index 0239d0b34cd..ad93102e1b7 100644 --- a/dbms/src/Parsers/ASTWithAlias.cpp +++ b/dbms/src/Parsers/ASTWithAlias.cpp @@ -6,11 +6,11 @@ namespace DB { -void ASTWithAlias::writeAlias(const String & name, const FormatSettings & settings) const +static void writeAlias(const String & name, const ASTWithAlias::FormatSettings & settings) { - settings.ostr << (settings.hilite ? hilite_keyword : "") << " AS " << (settings.hilite ? hilite_alias : ""); + settings.ostr << (settings.hilite ? IAST::hilite_keyword : "") << " AS " << (settings.hilite ? IAST::hilite_alias : ""); settings.writeIdentifier(name); - settings.ostr << (settings.hilite ? hilite_none : ""); + settings.ostr << (settings.hilite ? 
IAST::hilite_none : ""); } diff --git a/dbms/src/Parsers/ASTWithAlias.h b/dbms/src/Parsers/ASTWithAlias.h index f7a777e0a43..7a272a157e2 100644 --- a/dbms/src/Parsers/ASTWithAlias.h +++ b/dbms/src/Parsers/ASTWithAlias.h @@ -32,8 +32,6 @@ public: protected: virtual void appendColumnNameImpl(WriteBuffer & ostr) const = 0; - - void writeAlias(const String & name, const FormatSettings & settings) const; }; /// helper for setting aliases and chaining result to other functions diff --git a/dbms/src/Parsers/CommonParsers.cpp b/dbms/src/Parsers/CommonParsers.cpp index ddbf1b17966..47868f5df48 100644 --- a/dbms/src/Parsers/CommonParsers.cpp +++ b/dbms/src/Parsers/CommonParsers.cpp @@ -50,7 +50,7 @@ bool ParserKeyword::parseImpl(Pos & pos, ASTPtr & /*node*/, Expected & expected) if (word_length != pos->size()) return false; - if (strncasecmp(pos->begin, current_word, word_length)) + if (0 != strncasecmp(pos->begin, current_word, word_length)) return false; ++pos; diff --git a/dbms/src/Parsers/ParserDropQuery.cpp b/dbms/src/Parsers/ParserDropQuery.cpp index c82b4d4997c..b4ff0cb3804 100644 --- a/dbms/src/Parsers/ParserDropQuery.cpp +++ b/dbms/src/Parsers/ParserDropQuery.cpp @@ -8,49 +8,10 @@ namespace DB { -namespace ErrorCodes +namespace { -} -bool ParserDropQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) -{ - ParserKeyword s_drop("DROP"); - ParserKeyword s_detach("DETACH"); - ParserKeyword s_truncate("TRUNCATE"); - - if (s_drop.ignore(pos, expected)) - return parseDropQuery(pos, node, expected); - else if (s_detach.ignore(pos, expected)) - return parseDetachQuery(pos, node, expected); - else if (s_truncate.ignore(pos, expected)) - return parseTruncateQuery(pos, node, expected); - else - return false; -} - -bool ParserDropQuery::parseDetachQuery(Pos & pos, ASTPtr & node, Expected & expected) -{ - if (parseDropQuery(pos, node, expected)) - { - auto * drop_query = node->as(); - drop_query->kind = ASTDropQuery::Kind::Detach; - return true; - } - return false; -} - -bool ParserDropQuery::parseTruncateQuery(Pos & pos, ASTPtr & node, Expected & expected) -{ - if (parseDropQuery(pos, node, expected)) - { - auto * drop_query = node->as(); - drop_query->kind = ASTDropQuery::Kind::Truncate; - return true; - } - return false; -} - -bool ParserDropQuery::parseDropQuery(Pos & pos, ASTPtr & node, Expected & expected) +bool parseDropQuery(IParser::Pos & pos, ASTPtr & node, Expected & expected) { ParserKeyword s_temporary("TEMPORARY"); ParserKeyword s_table("TABLE"); @@ -129,4 +90,44 @@ bool ParserDropQuery::parseDropQuery(Pos & pos, ASTPtr & node, Expected & expect return true; } +bool parseDetachQuery(IParser::Pos & pos, ASTPtr & node, Expected & expected) +{ + if (parseDropQuery(pos, node, expected)) + { + auto * drop_query = node->as(); + drop_query->kind = ASTDropQuery::Kind::Detach; + return true; + } + return false; +} + +bool parseTruncateQuery(IParser::Pos & pos, ASTPtr & node, Expected & expected) +{ + if (parseDropQuery(pos, node, expected)) + { + auto * drop_query = node->as(); + drop_query->kind = ASTDropQuery::Kind::Truncate; + return true; + } + return false; +} + +} + +bool ParserDropQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +{ + ParserKeyword s_drop("DROP"); + ParserKeyword s_detach("DETACH"); + ParserKeyword s_truncate("TRUNCATE"); + + if (s_drop.ignore(pos, expected)) + return parseDropQuery(pos, node, expected); + else if (s_detach.ignore(pos, expected)) + return parseDetachQuery(pos, node, expected); + else if (s_truncate.ignore(pos, expected)) + return 
parseTruncateQuery(pos, node, expected); + else + return false; +} + } diff --git a/dbms/src/Parsers/ParserDropQuery.h b/dbms/src/Parsers/ParserDropQuery.h index 704f93de1cf..069b9c34ddd 100644 --- a/dbms/src/Parsers/ParserDropQuery.h +++ b/dbms/src/Parsers/ParserDropQuery.h @@ -21,10 +21,6 @@ class ParserDropQuery : public IParserBase protected: const char * getName() const override{ return "DROP query"; } bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; - - bool parseDropQuery(Pos & pos, ASTPtr & node, Expected & expected); - bool parseDetachQuery(Pos & pos, ASTPtr & node, Expected & expected); - bool parseTruncateQuery(Pos & pos, ASTPtr & node, Expected & expected); }; } diff --git a/dbms/src/Processors/Executors/TreeExecutorBlockInputStream.cpp b/dbms/src/Processors/Executors/TreeExecutorBlockInputStream.cpp index d08edeaea80..e0242533518 100644 --- a/dbms/src/Processors/Executors/TreeExecutorBlockInputStream.cpp +++ b/dbms/src/Processors/Executors/TreeExecutorBlockInputStream.cpp @@ -120,6 +120,8 @@ void TreeExecutorBlockInputStream::init() connect(*totals_port, *input_totals_port); input_totals_port->setNeeded(); } + + initRowsBeforeLimit(); } void TreeExecutorBlockInputStream::execute(bool on_totals) @@ -206,42 +208,44 @@ void TreeExecutorBlockInputStream::execute(bool on_totals) } } -void TreeExecutorBlockInputStream::calcRowsBeforeLimit() +void TreeExecutorBlockInputStream::initRowsBeforeLimit() { - std::stack stack; - stack.push(root); + std::vector limit_transforms; + std::vector sources; - size_t rows_before_limit = 0; - bool has_limit = false; + struct StackEntry + { + IProcessor * processor; + bool visited_limit; + }; + + std::stack stack; + stack.push({root, false}); while (!stack.empty()) { - auto processor = stack.top(); + auto processor = stack.top().processor; + bool visited_limit = stack.top().visited_limit; stack.pop(); - if (auto * limit = typeid_cast(processor)) + if (!visited_limit) { - has_limit = true; - rows_before_limit += limit->getRowsBeforeLimitAtLeast(); - } - if (auto * source = typeid_cast(processor)) - { - if (auto & stream = source->getStream()) + if (auto * limit = typeid_cast(processor)) { - auto & profile_info = stream->getProfileInfo(); - if (profile_info.hasAppliedLimit()) - { - has_limit = true; - rows_before_limit += profile_info.getRowsBeforeLimit(); - } + visited_limit = true; + limit_transforms.emplace_back(limit); } - } - if (auto * sorting = typeid_cast(processor)) + if (auto * source = typeid_cast(processor)) + sources.emplace_back(source); + } + else if (auto * sorting = typeid_cast(processor)) { - rows_before_limit += sorting->getNumReadRows(); - has_limit = true; + if (!rows_before_limit_at_least) + rows_before_limit_at_least = std::make_shared(); + + sorting->setRowsBeforeLimitCounter(rows_before_limit_at_least); /// Don't go to children. Take rows_before_limit from last PartialSortingTransform. 
continue; @@ -250,12 +254,25 @@ void TreeExecutorBlockInputStream::calcRowsBeforeLimit() for (auto & child_port : processor->getInputs()) { auto * child_processor = &child_port.getOutputPort().getProcessor(); - stack.push(child_processor); + stack.push({child_processor, visited_limit}); } } - if (has_limit) - info.setRowsBeforeLimit(rows_before_limit); + if (!rows_before_limit_at_least && (!limit_transforms.empty() || !sources.empty())) + { + rows_before_limit_at_least = std::make_shared(); + + for (auto & limit : limit_transforms) + limit->setRowsBeforeLimitCounter(rows_before_limit_at_least); + + for (auto & source : sources) + source->setRowsBeforeLimitCounter(rows_before_limit_at_least); + } + + /// If there is a limit, then enable rows_before_limit_at_least + /// It is needed when zero rows is read, but we still want rows_before_limit_at_least in result. + if (!limit_transforms.empty()) + rows_before_limit_at_least->add(0); } Block TreeExecutorBlockInputStream::readImpl() @@ -271,7 +288,8 @@ Block TreeExecutorBlockInputStream::readImpl() totals = getHeader().cloneWithColumns(input_totals_port->pull().detachColumns()); } - calcRowsBeforeLimit(); + if (rows_before_limit_at_least && rows_before_limit_at_least->hasAppliedLimit()) + info.setRowsBeforeLimit(rows_before_limit_at_least->get()); return {}; } diff --git a/dbms/src/Processors/Executors/TreeExecutorBlockInputStream.h b/dbms/src/Processors/Executors/TreeExecutorBlockInputStream.h index 8170d8fdb50..3ab8dde6948 100644 --- a/dbms/src/Processors/Executors/TreeExecutorBlockInputStream.h +++ b/dbms/src/Processors/Executors/TreeExecutorBlockInputStream.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include namespace DB { @@ -55,6 +56,7 @@ private: IProcessor * root = nullptr; std::unique_ptr input_port; std::unique_ptr input_totals_port; + RowsBeforeLimitCounterPtr rows_before_limit_at_least; /// Remember sources that support progress. std::vector sources_with_progress; @@ -65,7 +67,7 @@ private: /// Execute tree step-by-step until root returns next chunk or execution is finished. void execute(bool on_totals); - void calcRowsBeforeLimit(); + void initRowsBeforeLimit(); /// Moved from pipe. std::vector> interpreter_context; diff --git a/dbms/src/Processors/ForkProcessor.cpp b/dbms/src/Processors/ForkProcessor.cpp index 913e7c2d1c7..7fa21c4236d 100644 --- a/dbms/src/Processors/ForkProcessor.cpp +++ b/dbms/src/Processors/ForkProcessor.cpp @@ -65,7 +65,7 @@ ForkProcessor::Status ForkProcessor::prepare() { ++num_processed_outputs; if (num_processed_outputs == num_active_outputs) - output.push(std::move(data)); /// Can push because no full or unneeded outputs. + output.push(std::move(data)); // NOLINT Can push because no full or unneeded outputs. 
else output.push(data.clone()); } diff --git a/dbms/src/Processors/Formats/IOutputFormat.cpp b/dbms/src/Processors/Formats/IOutputFormat.cpp index 971ad95d946..ff4ac393471 100644 --- a/dbms/src/Processors/Formats/IOutputFormat.cpp +++ b/dbms/src/Processors/Formats/IOutputFormat.cpp @@ -48,6 +48,9 @@ void IOutputFormat::work() { if (finished && !finalized) { + if (rows_before_limit_counter && rows_before_limit_counter->hasAppliedLimit()) + setRowsBeforeLimit(rows_before_limit_counter->get()); + finalize(); finalized = true; return; diff --git a/dbms/src/Processors/Formats/IOutputFormat.h b/dbms/src/Processors/Formats/IOutputFormat.h index 5200b897643..1137dd78446 100644 --- a/dbms/src/Processors/Formats/IOutputFormat.h +++ b/dbms/src/Processors/Formats/IOutputFormat.h @@ -2,6 +2,7 @@ #include #include +#include #include @@ -33,6 +34,8 @@ protected: bool finished = false; bool finalized = false; + RowsBeforeLimitCounterPtr rows_before_limit_counter; + virtual void consume(Chunk) = 0; virtual void consumeTotals(Chunk) {} virtual void consumeExtremes(Chunk) {} @@ -50,6 +53,9 @@ public: /// Value for rows_before_limit_at_least field. virtual void setRowsBeforeLimit(size_t /*rows_before_limit*/) {} + /// Counter to calculate rows_before_limit_at_least in processors pipeline. + void setRowsBeforeLimitCounter(RowsBeforeLimitCounterPtr counter) { rows_before_limit_counter.swap(counter); } + /// Notify about progress. Method could be called from different threads. /// Passed value are delta, that must be summarized. virtual void onProgress(const Progress & /*progress*/) {} diff --git a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index a6eed90d57a..52ceaf063b7 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -75,7 +75,7 @@ namespace ErrorCodes class InputStreamReadBufferAdapter : public avro::InputStream { public: - InputStreamReadBufferAdapter(ReadBuffer & in_) : in(in_) {} + explicit InputStreamReadBufferAdapter(ReadBuffer & in_) : in(in_) {} bool next(const uint8_t ** data, size_t * len) override { @@ -444,7 +444,7 @@ AvroDeserializer::SkipFn AvroDeserializer::createSkipFn(avro::NodePtr root_node) AvroDeserializer::AvroDeserializer(const ColumnsWithTypeAndName & columns, avro::ValidSchema schema) { - auto schema_root = schema.root(); + const auto & schema_root = schema.root(); if (schema_root->type() != avro::AVRO_RECORD) { throw Exception("Root schema must be a record", ErrorCodes::TYPE_MISMATCH); @@ -519,7 +519,7 @@ bool AvroRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &) class AvroConfluentRowInputFormat::SchemaRegistry { public: - SchemaRegistry(const std::string & base_url_, size_t schema_cache_max_size = 1000) + explicit SchemaRegistry(const std::string & base_url_, size_t schema_cache_max_size = 1000) : base_url(base_url_), schema_cache(schema_cache_max_size) { if (base_url.empty()) diff --git a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp index 65fac6d87f7..97106d5e297 100644 --- a/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp @@ -52,7 +52,7 @@ namespace ErrorCodes class OutputStreamWriteBufferAdapter : public avro::OutputStream { public: - OutputStreamWriteBufferAdapter(WriteBuffer & out_) : out(out_) {} + explicit OutputStreamWriteBufferAdapter(WriteBuffer & out_) : 
out(out_) {} virtual bool next(uint8_t ** data, size_t * len) override { diff --git a/dbms/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp index bc00fec7611..597410f7c01 100644 --- a/dbms/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp @@ -9,9 +9,8 @@ #include #include #include -#include -#include #include +#include namespace DB @@ -33,7 +32,7 @@ static CapnProtoRowInputFormat::NestedField split(const Block & header, size_t i if (!name.empty() && name[0] == '.') name.erase(0, 1); - boost::split(field.tokens, name, boost::is_any_of("._")); + splitInto<'.', '_'>(field.tokens, name); return field; } diff --git a/dbms/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp b/dbms/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp index 83e449ee368..42f3eb3f94e 100644 --- a/dbms/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp +++ b/dbms/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp @@ -128,7 +128,7 @@ private: return true; } - void setDataType(LiteralInfo & info) + static void setDataType(LiteralInfo & info) { /// Type (Field::Types:Which) of literal in AST can be: String, UInt64, Int64, Float64, Null or Array of simple literals (not of Arrays). /// Null and empty Array literals are considered as tokens, because template with Nullable(Nothing) or Array(Nothing) is useless. diff --git a/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp b/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp index ae16bc47e11..369c0740264 100644 --- a/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp +++ b/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp @@ -12,12 +12,19 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -DB::RowInputFormatWithDiagnosticInfo::RowInputFormatWithDiagnosticInfo(const Block & header_, ReadBuffer & in_, const Params & params_) +static String alignedName(const String & name, size_t max_length) +{ + size_t spaces_count = max_length >= name.size() ? max_length - name.size() : 0; + return name + ", " + std::string(spaces_count, ' '); +} + + +RowInputFormatWithDiagnosticInfo::RowInputFormatWithDiagnosticInfo(const Block & header_, ReadBuffer & in_, const Params & params_) : IRowInputFormat(header_, in_, params_) { } -void DB::RowInputFormatWithDiagnosticInfo::updateDiagnosticInfo() +void RowInputFormatWithDiagnosticInfo::updateDiagnosticInfo() { ++row_num; @@ -28,7 +35,7 @@ void DB::RowInputFormatWithDiagnosticInfo::updateDiagnosticInfo() offset_of_current_row = in.offset(); } -String DB::RowInputFormatWithDiagnosticInfo::getDiagnosticInfo() +String RowInputFormatWithDiagnosticInfo::getDiagnosticInfo() { if (in.eof()) /// Buffer has gone, cannot extract information about what has been parsed. return {}; @@ -158,12 +165,6 @@ bool RowInputFormatWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(co return true; } -String RowInputFormatWithDiagnosticInfo::alignedName(const String & name, size_t max_length) const -{ - size_t spaces_count = max_length >= name.size() ? 
max_length - name.size() : 0; - return name + ", " + std::string(spaces_count, ' '); -} - void RowInputFormatWithDiagnosticInfo::resetParser() { IRowInputFormat::resetParser(); diff --git a/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h b/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h index 91f52f93c78..1d502ddc281 100644 --- a/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h +++ b/dbms/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h @@ -22,7 +22,6 @@ protected: void updateDiagnosticInfo(); bool deserializeFieldAndPrintDiagnosticInfo(const String & col_name, const DataTypePtr & type, IColumn & column, WriteBuffer & out, size_t file_column); - String alignedName(const String & name, size_t max_length) const; virtual bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) = 0; virtual void tryDeserializeFiled(const DataTypePtr & type, IColumn & column, size_t file_column, diff --git a/dbms/src/Processors/LimitTransform.cpp b/dbms/src/Processors/LimitTransform.cpp index 219a663198f..fe8990f7b0f 100644 --- a/dbms/src/Processors/LimitTransform.cpp +++ b/dbms/src/Processors/LimitTransform.cpp @@ -185,7 +185,9 @@ LimitTransform::Status LimitTransform::preparePair(PortsData & data) data.current_chunk = input.pull(true); auto rows = data.current_chunk.getNumRows(); - rows_before_limit_at_least += rows; + + if (rows_before_limit_at_least) + rows_before_limit_at_least->add(rows); /// Skip block (for 'always_read_till_end' case). if (is_limit_reached || output_finished) diff --git a/dbms/src/Processors/LimitTransform.h b/dbms/src/Processors/LimitTransform.h index 1e7ec3bf322..a6989483c00 100644 --- a/dbms/src/Processors/LimitTransform.h +++ b/dbms/src/Processors/LimitTransform.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include namespace DB @@ -29,7 +30,7 @@ private: std::vector sort_column_positions; size_t rows_read = 0; /// including the last read block - size_t rows_before_limit_at_least = 0; + RowsBeforeLimitCounterPtr rows_before_limit_at_least; /// State of port's pair. /// Chunks from different port pairs are not mixed for better cache locality.
@@ -65,7 +66,7 @@ public: InputPort & getInputPort() { return inputs.front(); } OutputPort & getOutputPort() { return outputs.front(); } - size_t getRowsBeforeLimitAtLeast() const { return rows_before_limit_at_least; } + void setRowsBeforeLimitCounter(RowsBeforeLimitCounterPtr counter) { rows_before_limit_at_least.swap(counter); } }; } diff --git a/dbms/src/Processors/Port.h b/dbms/src/Processors/Port.h index c5b3c5979d1..e200b8c1ecb 100644 --- a/dbms/src/Processors/Port.h +++ b/dbms/src/Processors/Port.h @@ -9,7 +9,6 @@ #include #include #include -#include namespace DB { diff --git a/dbms/src/Processors/QueryPipeline.cpp b/dbms/src/Processors/QueryPipeline.cpp index 8398c2359ac..fe7d466cbe6 100644 --- a/dbms/src/Processors/QueryPipeline.cpp +++ b/dbms/src/Processors/QueryPipeline.cpp @@ -19,6 +19,7 @@ #include #include #include +#include namespace DB { @@ -438,6 +439,8 @@ void QueryPipeline::setOutput(ProcessorPtr output) connect(*streams.front(), main); connect(*totals_having_port, totals); connect(*extremes_port, extremes); + + initRowsBeforeLimit(); } void QueryPipeline::unitePipelines( @@ -552,25 +555,12 @@ void QueryPipeline::setProcessListElement(QueryStatus * elem) } } -void QueryPipeline::finalize() +void QueryPipeline::initRowsBeforeLimit() { - checkInitialized(); + RowsBeforeLimitCounterPtr rows_before_limit_at_least; - if (!output_format) - throw Exception("Cannot finalize pipeline because it doesn't have output.", ErrorCodes::LOGICAL_ERROR); - - calcRowsBeforeLimit(); -} - -void QueryPipeline::calcRowsBeforeLimit() -{ - /// TODO get from Remote - - UInt64 rows_before_limit_at_least = 0; - UInt64 rows_before_limit = 0; - - bool has_limit = false; - bool has_partial_sorting = false; + std::vector limits; + std::vector sources; std::unordered_set visited; @@ -593,33 +583,24 @@ void QueryPipeline::calcRowsBeforeLimit() if (!visited_limit) { - if (auto * limit = typeid_cast(processor)) + if (auto * limit = typeid_cast(processor)) { - has_limit = visited_limit = true; - rows_before_limit_at_least += limit->getRowsBeforeLimitAtLeast(); + visited_limit = true; + limits.emplace_back(limit); } if (auto * source = typeid_cast(processor)) - { - if (auto & stream = source->getStream()) - { - auto & info = stream->getProfileInfo(); - if (info.hasAppliedLimit()) - { - has_limit = visited_limit = true; - rows_before_limit_at_least += info.getRowsBeforeLimit(); - } - } - } + sources.emplace_back(source); } - - if (auto * sorting = typeid_cast(processor)) + else if (auto * sorting = typeid_cast(processor)) { - has_partial_sorting = true; - rows_before_limit += sorting->getNumReadRows(); + if (!rows_before_limit_at_least) + rows_before_limit_at_least = std::make_shared(); + + sorting->setRowsBeforeLimitCounter(rows_before_limit_at_least); /// Don't go to children. Take rows_before_limit from last PartialSortingTransform. - /// continue; + continue; } /// Skip totals and extremes port for output format. @@ -640,9 +621,24 @@ void QueryPipeline::calcRowsBeforeLimit() } } - /// Get num read rows from PartialSortingTransform if have it. - if (has_limit) - output_format->setRowsBeforeLimit(has_partial_sorting ? 
rows_before_limit : rows_before_limit_at_least); + if (!rows_before_limit_at_least && (!limits.empty() || !sources.empty())) + { + rows_before_limit_at_least = std::make_shared(); + + for (auto & limit : limits) + limit->setRowsBeforeLimitCounter(rows_before_limit_at_least); + + for (auto & source : sources) + source->setRowsBeforeLimitCounter(rows_before_limit_at_least); + } + + /// If there is a limit, then enable rows_before_limit_at_least + /// It is needed when zero rows is read, but we still want rows_before_limit_at_least in result. + if (!limits.empty()) + rows_before_limit_at_least->add(0); + + if (rows_before_limit_at_least) + output_format->setRowsBeforeLimitCounter(rows_before_limit_at_least); } Pipe QueryPipeline::getPipe() && diff --git a/dbms/src/Processors/QueryPipeline.h b/dbms/src/Processors/QueryPipeline.h index 12ac80b9feb..e01087b717a 100644 --- a/dbms/src/Processors/QueryPipeline.h +++ b/dbms/src/Processors/QueryPipeline.h @@ -140,9 +140,6 @@ public: void setProgressCallback(const ProgressCallback & callback); void setProcessListElement(QueryStatus * elem); - /// Call after execution. - void finalize(); - /// Recommend number of threads for pipeline execution. size_t getNumThreads() const { @@ -195,12 +192,12 @@ private: QueryStatus * process_list_element = nullptr; void checkInitialized(); - void checkSource(const ProcessorPtr & source, bool can_have_totals); + static void checkSource(const ProcessorPtr & source, bool can_have_totals); template void addSimpleTransformImpl(const TProcessorGetter & getter); - void calcRowsBeforeLimit(); + void initRowsBeforeLimit(); }; } diff --git a/dbms/src/Processors/ResizeProcessor.cpp b/dbms/src/Processors/ResizeProcessor.cpp index 25184b1ac5c..fd3d6510aa8 100644 --- a/dbms/src/Processors/ResizeProcessor.cpp +++ b/dbms/src/Processors/ResizeProcessor.cpp @@ -350,7 +350,7 @@ IProcessor::Status StrictResizeProcessor::prepare(const PortNumbers & updated_in if (waiting_output.status != OutputStatus::NeedData) throw Exception("Invalid status for associated output.", ErrorCodes::LOGICAL_ERROR); - waiting_output.port->pushData(input_with_data.port->pullData(/* set_not_deeded = */ true)); + waiting_output.port->pushData(input_with_data.port->pullData(/* set_not_needed = */ true)); waiting_output.status = OutputStatus::NotActive; if (input_with_data.port->isFinished()) diff --git a/dbms/src/Processors/RowsBeforeLimitCounter.h b/dbms/src/Processors/RowsBeforeLimitCounter.h new file mode 100644 index 00000000000..abee5a09405 --- /dev/null +++ b/dbms/src/Processors/RowsBeforeLimitCounter.h @@ -0,0 +1,30 @@ +#pragma once +#include +#include + +namespace DB +{ + +/// This class helps to calculate rows_before_limit_at_least. 
+class RowsBeforeLimitCounter +{ +public: + void add(uint64_t rows) + { + setAppliedLimit(); + rows_before_limit.fetch_add(rows, std::memory_order_release); + } + + uint64_t get() const { return rows_before_limit.load(std::memory_order_acquire); } + + void setAppliedLimit() { has_applied_limit.store(true, std::memory_order::release); } + bool hasAppliedLimit() const { return has_applied_limit.load(std::memory_order_acquire); } + +private: + std::atomic rows_before_limit = 0; + std::atomic_bool has_applied_limit = false; +}; + +using RowsBeforeLimitCounterPtr = std::shared_ptr; + +} diff --git a/dbms/src/Processors/Sources/SourceFromInputStream.cpp b/dbms/src/Processors/Sources/SourceFromInputStream.cpp index 3cc050aaf95..6f2a7eeb28a 100644 --- a/dbms/src/Processors/Sources/SourceFromInputStream.cpp +++ b/dbms/src/Processors/Sources/SourceFromInputStream.cpp @@ -95,6 +95,13 @@ void SourceFromInputStream::work() if (!typeid_cast(stream.get())) stream->cancel(false); + if (rows_before_limit) + { + auto & info = stream->getProfileInfo(); + if (info.hasAppliedLimit()) + rows_before_limit->add(info.getRowsBeforeLimit()); + } + stream->readSuffix(); if (auto totals_block = stream->getTotals()) @@ -120,6 +127,13 @@ Chunk SourceFromInputStream::generate() auto block = stream->read(); if (!block && !isCancelled()) { + if (rows_before_limit) + { + auto & info = stream->getProfileInfo(); + if (info.hasAppliedLimit()) + rows_before_limit->add(info.getRowsBeforeLimit()); + } + stream->readSuffix(); if (auto totals_block = stream->getTotals()) diff --git a/dbms/src/Processors/Sources/SourceFromInputStream.h b/dbms/src/Processors/Sources/SourceFromInputStream.h index 00b03220cec..83e7f9929c9 100644 --- a/dbms/src/Processors/Sources/SourceFromInputStream.h +++ b/dbms/src/Processors/Sources/SourceFromInputStream.h @@ -1,5 +1,6 @@ #pragma once #include +#include namespace DB { @@ -23,6 +24,8 @@ public: void addTotalsPort(); + void setRowsBeforeLimitCounter(RowsBeforeLimitCounterPtr counter) { rows_before_limit.swap(counter); } + /// Implementation for methods from ISourceWithProgress. 
void setLimits(const LocalLimits & limits_) final { stream->setLimits(limits_); } void setQuota(const QuotaContextPtr & quota_) final { stream->setQuota(quota_); } @@ -38,6 +41,8 @@ private: bool force_add_aggregating_info = false; BlockInputStreamPtr stream; + RowsBeforeLimitCounterPtr rows_before_limit; + Chunk totals; bool has_totals_port = false; bool has_totals = false; diff --git a/dbms/src/Processors/Transforms/FilterTransform.cpp b/dbms/src/Processors/Transforms/FilterTransform.cpp index 9cad9f85f92..27d77ddc3d6 100644 --- a/dbms/src/Processors/Transforms/FilterTransform.cpp +++ b/dbms/src/Processors/Transforms/FilterTransform.cpp @@ -93,7 +93,7 @@ void FilterTransform::removeFilterIfNeed(Chunk & chunk) void FilterTransform::transform(Chunk & chunk) { - size_t num_rows_before_filtration = chunk.getNumRows(); + size_t num_rows_before_filtration; auto columns = chunk.detachColumns(); { diff --git a/dbms/src/Processors/Transforms/PartialSortingTransform.cpp b/dbms/src/Processors/Transforms/PartialSortingTransform.cpp index 7e217ea0a07..018614f0165 100644 --- a/dbms/src/Processors/Transforms/PartialSortingTransform.cpp +++ b/dbms/src/Processors/Transforms/PartialSortingTransform.cpp @@ -5,16 +5,16 @@ namespace DB { PartialSortingTransform::PartialSortingTransform( - const Block & header_, SortDescription & description_, UInt64 limit_, bool do_count_rows_) + const Block & header_, SortDescription & description_, UInt64 limit_) : ISimpleTransform(header_, header_, false) - , description(description_), limit(limit_), do_count_rows(do_count_rows_) + , description(description_), limit(limit_) { } void PartialSortingTransform::transform(Chunk & chunk) { - if (do_count_rows) - read_rows += chunk.getNumRows(); + if (read_rows) + read_rows->add(chunk.getNumRows()); auto block = getInputPort().getHeader().cloneWithColumns(chunk.detachColumns()); chunk.clear(); diff --git a/dbms/src/Processors/Transforms/PartialSortingTransform.h b/dbms/src/Processors/Transforms/PartialSortingTransform.h index 905b294c0be..47ac90c6904 100644 --- a/dbms/src/Processors/Transforms/PartialSortingTransform.h +++ b/dbms/src/Processors/Transforms/PartialSortingTransform.h @@ -1,5 +1,6 @@ #pragma once #include +#include #include namespace DB @@ -12,17 +13,14 @@ class PartialSortingTransform : public ISimpleTransform { public: /// limit - if not 0, then you can sort each block not completely, but only `limit` first rows by order. - /// When count_rows is false, getNumReadRows() will always return 0. PartialSortingTransform( const Block & header_, SortDescription & description_, - UInt64 limit_ = 0, - bool do_count_rows_ = true); + UInt64 limit_ = 0); String getName() const override { return "PartialSortingTransform"; } - /// Total num rows passed to transform. - UInt64 getNumReadRows() const { return read_rows; } + void setRowsBeforeLimitCounter(RowsBeforeLimitCounterPtr counter) { read_rows.swap(counter); } protected: void transform(Chunk & chunk) override; @@ -30,11 +28,7 @@ protected: private: SortDescription description; UInt64 limit; - UInt64 read_rows = 0; - - /// Do we need calculate read_rows value? - /// Used to skip total row when count rows_before_limit_at_least. 
- bool do_count_rows; + RowsBeforeLimitCounterPtr read_rows; }; } diff --git a/dbms/src/Processors/tests/processors_test.cpp b/dbms/src/Processors/tests/processors_test.cpp index 3e2e6abd1da..b0270932234 100644 --- a/dbms/src/Processors/tests/processors_test.cpp +++ b/dbms/src/Processors/tests/processors_test.cpp @@ -122,7 +122,7 @@ class PrintSink : public ISink public: String getName() const override { return "Print"; } - PrintSink(String prefix_) + explicit PrintSink(String prefix_) : ISink(Block({ColumnWithTypeAndName{ ColumnUInt64::create(), std::make_shared(), "number" }})), prefix(std::move(prefix_)) { diff --git a/dbms/src/Processors/tests/processors_test_chain.cpp b/dbms/src/Processors/tests/processors_test_chain.cpp index b6a4f0ad653..d6d10c56285 100644 --- a/dbms/src/Processors/tests/processors_test_chain.cpp +++ b/dbms/src/Processors/tests/processors_test_chain.cpp @@ -76,7 +76,7 @@ class PrintSink : public ISink public: String getName() const override { return "Print"; } - PrintSink(String prefix_) + explicit PrintSink(String prefix_) : ISink(Block({ColumnWithTypeAndName{ ColumnUInt64::create(), std::make_shared(), "number" }})), prefix(std::move(prefix_)) { diff --git a/dbms/src/Processors/tests/processors_test_expand_pipeline.cpp b/dbms/src/Processors/tests/processors_test_expand_pipeline.cpp index 1d03d75c55d..78fa0bee7dd 100644 --- a/dbms/src/Processors/tests/processors_test_expand_pipeline.cpp +++ b/dbms/src/Processors/tests/processors_test_expand_pipeline.cpp @@ -26,7 +26,7 @@ class PrintSink : public ISink public: String getName() const override { return "Print"; } - PrintSink(String prefix_) + explicit PrintSink(String prefix_) : ISink(Block({ColumnWithTypeAndName{ ColumnUInt64::create(), std::make_shared(), "number" }})), prefix(std::move(prefix_)) { @@ -64,7 +64,7 @@ class OneNumberSource : public ISource public: String getName() const override { return "OneNumber"; } - OneNumberSource(UInt64 number_) + explicit OneNumberSource(UInt64 number_) : ISource(Block({ColumnWithTypeAndName{ ColumnUInt64::create(), std::make_shared(), "number" }})), number(number_) { diff --git a/dbms/src/Processors/tests/processors_test_merge.cpp b/dbms/src/Processors/tests/processors_test_merge.cpp index 3842286bc59..226b08ece09 100644 --- a/dbms/src/Processors/tests/processors_test_merge.cpp +++ b/dbms/src/Processors/tests/processors_test_merge.cpp @@ -211,7 +211,7 @@ class PrintSink : public ISink public: String getName() const override { return "Print"; } - PrintSink(String prefix_) + explicit PrintSink(String prefix_) : ISink(Block({ColumnWithTypeAndName{ ColumnUInt64::create(), std::make_shared(), "number" }})), prefix(std::move(prefix_)) { diff --git a/dbms/src/Processors/tests/processors_test_merging_sorted_transform.cpp b/dbms/src/Processors/tests/processors_test_merging_sorted_transform.cpp index af27973e3fd..f148d46bd19 100644 --- a/dbms/src/Processors/tests/processors_test_merging_sorted_transform.cpp +++ b/dbms/src/Processors/tests/processors_test_merging_sorted_transform.cpp @@ -83,7 +83,7 @@ class PrintSink : public ISink public: String getName() const override { return "Print"; } - PrintSink(String prefix_) + explicit PrintSink(String prefix_) : ISink(Block({ColumnWithTypeAndName{ ColumnUInt64::create(), std::make_shared(), "number" }})), prefix(std::move(prefix_)) { diff --git a/dbms/src/Storages/Kafka/StorageKafka.cpp b/dbms/src/Storages/Kafka/StorageKafka.cpp index 1c988840abf..002f072f004 100644 --- a/dbms/src/Storages/Kafka/StorageKafka.cpp +++ 
b/dbms/src/Storages/Kafka/StorageKafka.cpp @@ -431,7 +431,7 @@ void registerStorageKafka(StorageFactory & factory) // Check arguments and settings #define CHECK_KAFKA_STORAGE_ARGUMENT(ARG_NUM, PAR_NAME) \ /* One of the four required arguments is not specified */ \ - if (args_count < ARG_NUM && ARG_NUM <= 4 && \ + if (args_count < (ARG_NUM) && (ARG_NUM) <= 4 && \ !kafka_settings.PAR_NAME.changed) \ { \ throw Exception( \ @@ -442,7 +442,7 @@ void registerStorageKafka(StorageFactory & factory) /* The same argument is given in two places */ \ if (has_settings && \ kafka_settings.PAR_NAME.changed && \ - args_count >= ARG_NUM) \ + args_count >= (ARG_NUM)) \ { \ throw Exception( \ "The argument №" #ARG_NUM " of storage Kafka " \ diff --git a/dbms/src/Storages/Kafka/StorageKafka.h b/dbms/src/Storages/Kafka/StorageKafka.h index ab6910a681a..ea876a2e712 100644 --- a/dbms/src/Storages/Kafka/StorageKafka.h +++ b/dbms/src/Storages/Kafka/StorageKafka.h @@ -24,7 +24,7 @@ namespace DB /** Implements a Kafka queue table engine that can be used as a persistent queue / buffer, * or as a basic building block for creating pipelines with a continuous insertion / ETL. */ -class StorageKafka : public ext::shared_ptr_helper, public IStorage +class StorageKafka final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/LiveView/StorageLiveView.cpp b/dbms/src/Storages/LiveView/StorageLiveView.cpp index 1faa5e04dc9..29eb896bcee 100644 --- a/dbms/src/Storages/LiveView/StorageLiveView.cpp +++ b/dbms/src/Storages/LiveView/StorageLiveView.cpp @@ -323,7 +323,7 @@ ASTPtr StorageLiveView::getInnerBlocksQuery() /// Rewrite inner query with right aliases for JOIN. /// It cannot be done in constructor or startup() because InterpreterSelectQuery may access table, /// which is not loaded yet during server startup, so we do it lazily - InterpreterSelectQuery(inner_blocks_query, *live_view_context, SelectQueryOptions().modify().analyze()); + InterpreterSelectQuery(inner_blocks_query, *live_view_context, SelectQueryOptions().modify().analyze()); // NOLINT auto table_id = getStorageID(); extractDependentTable(inner_blocks_query, global_context, table_id.table_name, inner_subquery); } diff --git a/dbms/src/Storages/LiveView/StorageLiveView.h b/dbms/src/Storages/LiveView/StorageLiveView.h index 8402d447279..9186132f99d 100644 --- a/dbms/src/Storages/LiveView/StorageLiveView.h +++ b/dbms/src/Storages/LiveView/StorageLiveView.h @@ -38,7 +38,7 @@ using ASTPtr = std::shared_ptr; using BlocksMetadataPtr = std::shared_ptr; using MergeableBlocksPtr = std::shared_ptr; -class StorageLiveView : public ext::shared_ptr_helper, public IStorage +class StorageLiveView final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; friend class LiveViewBlockInputStream; diff --git a/dbms/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/dbms/src/Storages/MergeTree/IMergeTreeDataPart.cpp index b5c8f16b7e5..dedda5b5159 100644 --- a/dbms/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/dbms/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -3,16 +3,13 @@ #include #include #include -#include #include #include #include #include -#include -#include +#include #include #include -#include #include #include @@ -30,7 +27,6 @@ namespace ErrorCodes extern const int NOT_FOUND_EXPECTED_DATA_PART; extern const int BAD_SIZE_OF_FILE_IN_DATA_PART; extern const int BAD_TTL_FILE; - extern const int CANNOT_UNLINK; extern const int NOT_IMPLEMENTED; } @@ -251,7 +247,7 
@@ void IMergeTreeDataPart::removeIfNeeded() if (is_temp) { - String file_name = Poco::Path(relative_path).getFileName(); + String file_name = fileName(relative_path); if (file_name.empty()) throw Exception("relative_path " + relative_path + " of part " + name + " is invalid or not set", ErrorCodes::LOGICAL_ERROR); @@ -699,33 +695,33 @@ void IMergeTreeDataPart::remove() const * And a race condition can happen that will lead to "File not found" error here. */ - String from_ = storage.relative_data_path + relative_path; - String to_ = storage.relative_data_path + "delete_tmp_" + name; + String from = storage.relative_data_path + relative_path; + String to = storage.relative_data_path + "delete_tmp_" + name; // TODO directory delete_tmp_ is never removed if server crashes before returning from this function - if (disk->exists(to_)) + if (disk->exists(to)) { - LOG_WARNING(storage.log, "Directory " << fullPath(disk, to_) << " (to which part must be renamed before removing) already exists." + LOG_WARNING(storage.log, "Directory " << fullPath(disk, to) << " (to which part must be renamed before removing) already exists." " Most likely this is due to unclean restart. Removing it."); try { - disk->removeRecursive(to_); + disk->removeRecursive(to); } catch (...) { - LOG_ERROR(storage.log, "Cannot recursively remove directory " << fullPath(disk, to_) << ". Exception: " << getCurrentExceptionMessage(false)); + LOG_ERROR(storage.log, "Cannot recursively remove directory " << fullPath(disk, to) << ". Exception: " << getCurrentExceptionMessage(false)); throw; } } try { - disk->moveFile(from_, to_); + disk->moveFile(from, to); } catch (const Poco::FileNotFoundException &) { - LOG_ERROR(storage.log, "Directory " << fullPath(disk, to_) << " (part to remove) doesn't exist or one of nested files has gone." + LOG_ERROR(storage.log, "Directory " << fullPath(disk, to) << " (part to remove) doesn't exist or one of nested files has gone." " Most likely this is due to manual removing. This should be discouraged. Ignoring."); return; @@ -741,37 +737,25 @@ void IMergeTreeDataPart::remove() const #endif std::shared_lock lock(columns_lock); - /// TODO: IDisk doesn't support `unlink()` and `rmdir()` functionality. - auto to = fullPath(disk, to_); - for (const auto & [file, _] : checksums.files) - { - String path_to_remove = to + "/" + file; - if (0 != unlink(path_to_remove.c_str())) - throwFromErrnoWithPath("Cannot unlink file " + path_to_remove, path_to_remove, ErrorCodes::CANNOT_UNLINK); - } + disk->remove(to + "/" + file); #if !__clang__ # pragma GCC diagnostic pop #endif for (const auto & file : {"checksums.txt", "columns.txt"}) - { - String path_to_remove = to + "/" + file; - if (0 != unlink(path_to_remove.c_str())) - throwFromErrnoWithPath("Cannot unlink file " + path_to_remove, path_to_remove, ErrorCodes::CANNOT_UNLINK); - } + disk->remove(to + "/" + file); - if (0 != rmdir(to.c_str())) - throwFromErrnoWithPath("Cannot rmdir file " + to, to, ErrorCodes::CANNOT_UNLINK); + disk->remove(to); } catch (...) { /// Recursive directory removal does many excessive "stat" syscalls under the hood. - LOG_ERROR(storage.log, "Cannot quickly remove directory " << fullPath(disk, to_) << " by removing files; fallback to recursive removal. Reason: " + LOG_ERROR(storage.log, "Cannot quickly remove directory " << fullPath(disk, to) << " by removing files; fallback to recursive removal. 
Reason: " << getCurrentExceptionMessage(false)); - disk->removeRecursive(to_ + "/"); + disk->removeRecursive(to + "/"); } } @@ -791,7 +775,7 @@ String IMergeTreeDataPart::getRelativePathForDetachedPart(const String & prefix) { res = "detached/" + (prefix.empty() ? "" : prefix + "_") + name + (try_no ? "_try" + DB::toString(try_no) : ""); - if (!Poco::File(storage.getFullPathOnDisk(disk) + res).exists()) + if (!disk->exists(getFullRelativePath() + res)) return res; LOG_WARNING(storage.log, "Directory " << res << " (to detach to) already exists." @@ -812,10 +796,8 @@ void IMergeTreeDataPart::makeCloneInDetached(const String & prefix) const assertOnDisk(); LOG_INFO(storage.log, "Detaching " << relative_path); - Poco::Path src(getFullPath()); - Poco::Path dst(storage.getFullPathOnDisk(disk) + getRelativePathForDetachedPart(prefix)); /// Backup is not recursive (max_level is 0), so do not copy inner directories - localBackup(src, dst, 0); + localBackup(disk, getFullRelativePath(), storage.relative_data_path + getRelativePathForDetachedPart(prefix), 0); } void IMergeTreeDataPart::makeCloneOnDiskDetached(const ReservationPtr & reservation) const @@ -825,14 +807,13 @@ void IMergeTreeDataPart::makeCloneOnDiskDetached(const ReservationPtr & reservat if (reserved_disk->getName() == disk->getName()) throw Exception("Can not clone data part " + name + " to same disk " + disk->getName(), ErrorCodes::LOGICAL_ERROR); - String path_to_clone = storage.getFullPathOnDisk(reserved_disk) + "detached/"; + String path_to_clone = storage.relative_data_path + "detached/"; - if (Poco::File(path_to_clone + relative_path).exists()) - throw Exception("Path " + path_to_clone + relative_path + " already exists. Can not clone ", ErrorCodes::DIRECTORY_ALREADY_EXISTS); - Poco::File(path_to_clone).createDirectory(); + if (reserved_disk->exists(path_to_clone + relative_path)) + throw Exception("Path " + fullPath(reserved_disk, path_to_clone + relative_path) + " already exists. Can not clone ", ErrorCodes::DIRECTORY_ALREADY_EXISTS); + reserved_disk->createDirectory(path_to_clone); - Poco::File cloning_directory(getFullPath()); - cloning_directory.copyTo(path_to_clone); + disk->copy(getFullRelativePath(), reserved_disk, path_to_clone); } void IMergeTreeDataPart::checkConsistencyBase() const diff --git a/dbms/src/Storages/MergeTree/KeyCondition.cpp b/dbms/src/Storages/MergeTree/KeyCondition.cpp index c74ca41f054..e994d254958 100644 --- a/dbms/src/Storages/MergeTree/KeyCondition.cpp +++ b/dbms/src/Storages/MergeTree/KeyCondition.cpp @@ -516,7 +516,7 @@ void KeyCondition::traverseAST(const ASTPtr & node, const Context & context, Blo * - in this case `n - 1` elements are added (where `n` is the number of arguments). 
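The `IMergeTreeDataPart::remove()` hunk above keeps its original strategy — delete the files listed in the checksums plus the couple of known metadata files, then the directory itself, and fall back to recursive removal only if something unexpected happens — but routes every call through the disk abstraction instead of raw `unlink()`/`rmdir()`. A condensed sketch of that shape, with a hypothetical `Disk` type standing in for the real interface:

```cpp
#include <string>
#include <vector>

/// Hypothetical stand-in exposing the two calls used in the diff.
struct Disk
{
    void remove(const std::string &) { /* delete one file or an empty directory */ }
    void removeRecursive(const std::string &) { /* delete a whole directory tree */ }
};

void removePartDirectory(Disk & disk, const std::string & to, const std::vector<std::string> & known_files)
{
    try
    {
        /// Fast path: remove only the files we expect to be present, then the
        /// directory. This avoids the extra stat() calls of a recursive removal.
        for (const auto & file : known_files)
            disk.remove(to + "/" + file);
        disk.remove(to);
    }
    catch (...)
    {
        /// Unexpected or missing contents break the fast path; recursive removal
        /// is slower but copes with whatever is actually in the directory.
        disk.removeRecursive(to + "/");
    }
}
```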
*/ if (i != 0 || element.function == RPNElement::FUNCTION_NOT) - rpn.emplace_back(std::move(element)); + rpn.emplace_back(element); } return; diff --git a/dbms/src/Storages/MergeTree/KeyCondition.h b/dbms/src/Storages/MergeTree/KeyCondition.h index 3344d22c749..8667e0aea27 100644 --- a/dbms/src/Storages/MergeTree/KeyCondition.h +++ b/dbms/src/Storages/MergeTree/KeyCondition.h @@ -382,7 +382,7 @@ private: void traverseAST(const ASTPtr & node, const Context & context, Block & block_with_constants); bool tryParseAtomFromAST(const ASTPtr & node, const Context & context, Block & block_with_constants, RPNElement & out); - bool tryParseLogicalOperatorFromAST(const ASTFunction * func, RPNElement & out); + static bool tryParseLogicalOperatorFromAST(const ASTFunction * func, RPNElement & out); /** Is node the key column * or expression in which column of key is wrapped by chain of functions, diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.cpp b/dbms/src/Storages/MergeTree/MergeTreeData.cpp index 930eac69033..fd88a588ec3 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeData.cpp @@ -1,50 +1,49 @@ -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include #include +#include +#include #include #include #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include +#include +#include #include #include -#include -#include #include -#include -#include #include @@ -859,7 +858,6 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) const auto settings = getSettings(); std::vector> part_names_with_disks; Strings part_file_names; - Poco::DirectoryIterator end; auto disks = getStoragePolicy()->getDisks(); @@ -1318,7 +1316,7 @@ void MergeTreeData::rename( for (const auto & disk : disks) { - auto new_table_path_parent = Poco::Path(new_table_path).makeParent().toString(); + auto new_table_path_parent = parentPath(new_table_path); disk->createDirectory(new_table_path_parent); disk->moveDirectory(relative_data_path, new_table_path); } @@ -1713,8 +1711,8 @@ void MergeTreeData::alterDataPart( size_t num_files_to_modify = transaction->rename_map.size(); size_t num_files_to_remove = 0; - for (const auto & from_to : transaction->rename_map) - if (from_to.second.empty()) + for (const auto & [from, to] : transaction->rename_map) + if (to.empty()) ++num_files_to_remove; if (!skip_sanity_checks @@ -1732,18 +1730,18 @@ void MergeTreeData::alterDataPart( << ") files ("; bool first = true; - for (const auto & from_to : transaction->rename_map) + for (const auto & [from, to] : transaction->rename_map) { if (!first) exception_message << ", "; if (forbidden_because_of_modify) { - exception_message << "from " << backQuote(from_to.first) << " to " << backQuote(from_to.second); + exception_message << "from " << backQuote(from) << " to " << backQuote(to); first = false; } - else if (from_to.second.empty()) + else if (to.empty()) { - exception_message << backQuote(from_to.first); + exception_message << backQuote(from); first = false; } 
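The `rename_map` loops in the MergeTreeData hunks above are rewritten with structured bindings, so the key and value get readable names instead of `.first`/`.second`. A minimal illustration, assuming `rename_map` behaves like a string-to-string map:

```cpp
#include <cstddef>
#include <map>
#include <string>

/// Counting pending removals, as in alterDataPart(). An empty destination name
/// means "remove the source file", matching the convention in the diff.
std::size_t countFilesToRemove(const std::map<std::string, std::string> & rename_map)
{
    std::size_t num_files_to_remove = 0;

    /// Before: for (const auto & from_to : rename_map) if (from_to.second.empty()) ...
    for (const auto & [from, to] : rename_map)
    {
        (void)from;          /// only the destination matters for this count
        if (to.empty())
            ++num_files_to_remove;
    }
    return num_files_to_remove;
}
```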
} @@ -1813,28 +1811,28 @@ void MergeTreeData::alterDataPart( /// Update the checksums. DataPart::Checksums new_checksums = part->checksums; - for (const auto & it : transaction->rename_map) + for (const auto & [from, to] : transaction->rename_map) { - if (it.second.empty()) - new_checksums.files.erase(it.first); + if (to.empty()) + new_checksums.files.erase(from); else - new_checksums.files[it.second] = add_checksums.files[it.first]; + new_checksums.files[to] = add_checksums.files[from]; } /// Write the checksums to the temporary file. if (!part->checksums.empty()) { transaction->new_checksums = new_checksums; - WriteBufferFromFile checksums_file(part->getFullPath() + "checksums.txt.tmp", 4096); - new_checksums.write(checksums_file); + auto checksums_file = part->disk->writeFile(part->getFullRelativePath() + "checksums.txt.tmp", 4096); + new_checksums.write(*checksums_file); transaction->rename_map["checksums.txt.tmp"] = "checksums.txt"; } /// Write the new column list to the temporary file. { transaction->new_columns = new_columns.filter(part->getColumns().getNames()); - WriteBufferFromFile columns_file(part->getFullPath() + "columns.txt.tmp", 4096); - transaction->new_columns.writeText(columns_file); + auto columns_file = part->disk->writeFile(part->getFullRelativePath() + "columns.txt.tmp", 4096); + transaction->new_columns.writeText(*columns_file); transaction->rename_map["columns.txt.tmp"] = "columns.txt"; } } @@ -1863,16 +1861,16 @@ void MergeTreeData::changeSettings( for (const String & disk_name : all_diff_disk_names) { - const auto & path = getFullPathOnDisk(new_storage_policy->getDiskByName(disk_name)); - if (Poco::File(path).exists()) + auto disk = new_storage_policy->getDiskByName(disk_name); + if (disk->exists(relative_data_path)) throw Exception("New storage policy contain disks which already contain data of a table with the same name", ErrorCodes::LOGICAL_ERROR); } for (const String & disk_name : all_diff_disk_names) { - const auto & path = getFullPathOnDisk(new_storage_policy->getDiskByName(disk_name)); - Poco::File(path).createDirectories(); - Poco::File(path + "detached").createDirectory(); + auto disk = new_storage_policy->getDiskByName(disk_name); + disk->createDirectories(relative_data_path); + disk->createDirectories(relative_data_path + "detached"); } /// FIXME how would that be done while reloading configuration??? } @@ -1939,7 +1937,8 @@ void MergeTreeData::AlterDataPartTransaction::commit() { std::unique_lock lock(data_part->columns_lock); - String path = data_part->getFullPath(); + auto disk = data_part->disk; + String path = data_part->getFullRelativePath(); /// NOTE: checking that a file exists before renaming or deleting it /// is justified by the fact that, when converting an ordinary column @@ -1947,19 +1946,18 @@ void MergeTreeData::AlterDataPartTransaction::commit() /// before, i.e. they do not have older versions. /// 1) Rename the old files. - for (const auto & from_to : rename_map) + for (const auto & [from, to] : rename_map) { - String name = from_to.second.empty() ? from_to.first : from_to.second; - Poco::File file{path + name}; - if (file.exists()) - file.renameTo(path + name + ".tmp2"); + String name = to.empty() ? from : to; + if (disk->exists(path + name)) + disk->moveFile(path + name, path + name + ".tmp2"); } /// 2) Move new files in the place of old and update the metadata in memory. 
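In the same hunks, `WriteBufferFromFile` on an absolute path becomes `disk->writeFile()` on a table-relative path, but the publishing scheme stays the same: new metadata is written to a `*.txt.tmp` file and only renamed over the real file once it is complete. A sketch of that step, assuming a `Disk` with the `writeFile`/`moveFile` calls from the diff and a write buffer that flushes when destroyed:

```cpp
#include <cstddef>
#include <memory>
#include <string>

/// Hypothetical stand-ins for the buffer/disk interfaces used in the diff.
struct WriteBuffer
{
    virtual ~WriteBuffer() = default;                    /// assumed to flush on destruction
    virtual void write(const std::string & data) = 0;
};

struct Disk
{
    virtual ~Disk() = default;
    virtual std::unique_ptr<WriteBuffer> writeFile(const std::string & path, std::size_t buf_size) = 0;
    virtual void moveFile(const std::string & from, const std::string & to) = 0;
};

/// Write the new checksums next to the old file, then rename them into place,
/// so a crash mid-write never leaves a truncated checksums.txt behind
/// (assuming the disk's moveFile replaces the destination, as a local rename does).
void publishChecksums(Disk & disk, const std::string & part_path, const std::string & serialized_checksums)
{
    {
        auto out = disk.writeFile(part_path + "checksums.txt.tmp", 4096);
        out->write(serialized_checksums);
    }   /// buffer destroyed -> contents flushed
    disk.moveFile(part_path + "checksums.txt.tmp", part_path + "checksums.txt");
}
```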
- for (const auto & from_to : rename_map) + for (const auto & [from, to] : rename_map) { - if (!from_to.second.empty()) - Poco::File{path + from_to.first}.renameTo(path + from_to.second); + if (!to.empty()) + disk->moveFile(path + from, path + to); } auto & mutable_part = const_cast(*data_part); @@ -1967,12 +1965,10 @@ void MergeTreeData::AlterDataPartTransaction::commit() mutable_part.setColumns(new_columns); /// 3) Delete the old files and drop required columns (DROP COLUMN) - for (const auto & from_to : rename_map) + for (const auto & [from, to] : rename_map) { - String name = from_to.second.empty() ? from_to.first : from_to.second; - Poco::File file{path + name + ".tmp2"}; - if (file.exists()) - file.remove(); + String name = to.empty() ? from : to; + disk->removeIfExists(path + name + ".tmp2"); } mutable_part.bytes_on_disk = new_checksums.getTotalSizeOnDisk(); @@ -2002,20 +1998,18 @@ MergeTreeData::AlterDataPartTransaction::~AlterDataPartTransaction() { LOG_WARNING(data_part->storage.log, "Aborting ALTER of part " << data_part->relative_path); - String path = data_part->getFullPath(); - for (const auto & from_to : rename_map) + String path = data_part->getFullRelativePath(); + for (const auto & [from, to] : rename_map) { - if (!from_to.second.empty()) + if (!to.empty()) { try { - Poco::File file(path + from_to.first); - if (file.exists()) - file.remove(); + data_part->disk->removeIfExists(path + from); } catch (Poco::Exception & e) { - LOG_WARNING(data_part->storage.log, "Can't remove " << path + from_to.first << ": " << e.displayText()); + LOG_WARNING(data_part->storage.log, "Can't remove " << fullPath(data_part->disk, path + from) << ": " << e.displayText()); } } } @@ -2029,15 +2023,13 @@ MergeTreeData::AlterDataPartTransaction::~AlterDataPartTransaction() void MergeTreeData::PartsTemporaryRename::addPart(const String & old_name, const String & new_name) { old_and_new_names.push_back({old_name, new_name}); - const auto paths = storage.getDataPaths(); - for (const auto & full_path : paths) + for (const auto & [path, disk] : storage.getRelativeDataPathsWithDisks()) { - for (Poco::DirectoryIterator it = Poco::DirectoryIterator(full_path + source_dir); it != Poco::DirectoryIterator(); ++it) + for (auto it = disk->iterateDirectory(path + source_dir); it->isValid(); it->next()) { - String name = it.name(); - if (name == old_name) + if (it->name() == old_name) { - old_part_name_to_full_path[old_name] = full_path; + old_part_name_to_path_and_disk[old_name] = {path, disk}; break; } } @@ -2051,11 +2043,12 @@ void MergeTreeData::PartsTemporaryRename::tryRenameAll() { try { - const auto & names = old_and_new_names[i]; - if (names.first.empty() || names.second.empty()) + const auto & [old_name, new_name] = old_and_new_names[i]; + if (old_name.empty() || new_name.empty()) throw DB::Exception("Empty part name. Most likely it's a bug.", ErrorCodes::INCORRECT_FILE_NAME); - const auto full_path = old_part_name_to_full_path[names.first] + source_dir; /// old_name - Poco::File(full_path + names.first).renameTo(full_path + names.second); + const auto & [path, disk] = old_part_name_to_path_and_disk[old_name]; + const auto full_path = path + source_dir; /// for old_name + disk->moveFile(full_path + old_name, full_path + new_name); } catch (...) { @@ -2071,15 +2064,16 @@ MergeTreeData::PartsTemporaryRename::~PartsTemporaryRename() // TODO what if server had crashed before this destructor was called? 
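`AlterDataPartTransaction::commit()` keeps its three-step file swap, now expressed through disk calls: park the files about to be replaced under a `.tmp2` suffix, move the freshly written files into their place, then drop the parked copies. A condensed sketch with hypothetical stubs for the `exists`/`moveFile`/`removeIfExists` calls seen in the diff:

```cpp
#include <map>
#include <string>

/// Hypothetical Disk with the calls used by the transaction in the diff.
struct Disk
{
    bool exists(const std::string &) { return true; }
    void moveFile(const std::string &, const std::string &) {}
    void removeIfExists(const std::string &) {}
};

/// rename_map: "new_file.tmp" -> "final_name"; an empty destination means "drop the source".
void commitRenames(Disk & disk, const std::string & path, const std::map<std::string, std::string> & rename_map)
{
    /// 1) Park the files that are about to be replaced (or dropped).
    for (const auto & [from, to] : rename_map)
    {
        const std::string name = to.empty() ? from : to;
        if (disk.exists(path + name))
            disk.moveFile(path + name, path + name + ".tmp2");
    }

    /// 2) Move the new files into place.
    for (const auto & [from, to] : rename_map)
        if (!to.empty())
            disk.moveFile(path + from, path + to);

    /// 3) Drop the parked copies (this is also where DROP COLUMN files disappear).
    for (const auto & [from, to] : rename_map)
    {
        const std::string name = to.empty() ? from : to;
        disk.removeIfExists(path + name + ".tmp2");
    }
}
```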
if (!renamed) return; - for (const auto & names : old_and_new_names) + for (const auto & [old_name, new_name] : old_and_new_names) { - if (names.first.empty()) + if (old_name.empty()) continue; try { - const auto full_path = old_part_name_to_full_path[names.first] + source_dir; /// old_name - Poco::File(full_path + names.second).renameTo(full_path + names.first); + const auto & [path, disk] = old_part_name_to_path_and_disk[old_name]; + const auto full_path = path + source_dir; /// for old_name + disk->moveFile(full_path + new_name, full_path + old_name); } catch (...) { @@ -2691,14 +2685,15 @@ void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy) auto part_it = data_parts_indexes.insert(part_copy).first; modifyPartState(part_it, DataPartState::Committed); - Poco::Path marker_path(Poco::Path(original_active_part->getFullPath()), DELETE_ON_DESTROY_MARKER_PATH); + auto disk = original_active_part->disk; + String marker_path = original_active_part->getFullRelativePath() + DELETE_ON_DESTROY_MARKER_PATH; try { - Poco::File(marker_path).createFile(); + disk->createFile(marker_path); } catch (Poco::Exception & e) { - LOG_ERROR(log, e.what() << " (while creating DeleteOnDestroy marker: " + backQuote(marker_path.toString()) + ")"); + LOG_ERROR(log, e.what() << " (while creating DeleteOnDestroy marker: " + backQuote(fullPath(disk, marker_path)) + ")"); } return; } @@ -2753,24 +2748,18 @@ MergeTreeData::DataPartPtr MergeTreeData::getPartIfExists(const String & part_na } -MergeTreeData::MutableDataPartPtr MergeTreeData::loadPartAndFixMetadata(const DiskPtr & disk, const String & relative_path) +static void loadPartAndFixMetadataImpl(MergeTreeData::MutableDataPartPtr part) { - MutableDataPartPtr part = createPart(Poco::Path(relative_path).getFileName(), disk, relative_path); - loadPartAndFixMetadata(part); - return part; -} - -void MergeTreeData::loadPartAndFixMetadata(MutableDataPartPtr part) -{ - String full_part_path = part->getFullPath(); + auto disk = part->disk; + String full_part_path = part->getFullRelativePath(); /// Earlier the list of columns was written incorrectly. Delete it and re-create. /// But in compact parts we can't get list of columns without this file. - if (isWidePart(part) && Poco::File(full_part_path + "columns.txt").exists()) - Poco::File(full_part_path + "columns.txt").remove(); + if (isWidePart(part)) + disk->removeIfExists(full_part_path + "columns.txt"); part->loadColumnsChecksumsIndexes(false, true); - part->modification_time = Poco::File(full_part_path).getLastModified().epochTime(); + part->modification_time = disk->getLastModified(full_part_path).epochTime(); /// If the checksums file is not present, calculate the checksums and write them to disk. /// Check the data while we are at it. 
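`PartsTemporaryRename` and `getDiskForPart()` now remember a (relative path, disk) pair and locate parts by walking each disk's copy of the table directory through the disk's own iterator, rather than building absolute paths for `Poco::DirectoryIterator`. A sketch of that lookup, assuming a directory iterator with the `isValid()`/`next()`/`name()` interface seen in the diff:

```cpp
#include <memory>
#include <string>
#include <vector>

/// Hypothetical iterator/disk interfaces matching the calls in the diff.
struct DirectoryIterator
{
    virtual ~DirectoryIterator() = default;
    virtual bool isValid() const = 0;
    virtual void next() = 0;
    virtual std::string name() const = 0;
};

struct Disk
{
    virtual ~Disk() = default;
    virtual std::unique_ptr<DirectoryIterator> iterateDirectory(const std::string & path) = 0;
};

using DiskPtr = std::shared_ptr<Disk>;

/// Return the first disk whose table directory contains `part_name`, or nullptr.
/// Detached parts are not in the in-memory part set, so scanning the directories
/// is the only way to find out where such a part lives.
DiskPtr getDiskForPart(const std::vector<DiskPtr> & disks, const std::string & relative_data_path,
                       const std::string & part_name, const std::string & additional_path)
{
    for (const auto & disk : disks)
        for (auto it = disk->iterateDirectory(relative_data_path + additional_path); it->isValid(); it->next())
            if (it->name() == part_name)
                return disk;
    return nullptr;
}
```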
@@ -2778,14 +2767,21 @@ void MergeTreeData::loadPartAndFixMetadata(MutableDataPartPtr part) { part->checksums = checkDataPart(part, false); { - WriteBufferFromFile out(full_part_path + "checksums.txt.tmp", 4096); - part->checksums.write(out); + auto out = disk->writeFile(full_part_path + "checksums.txt.tmp", 4096); + part->checksums.write(*out); } - Poco::File(full_part_path + "checksums.txt.tmp").renameTo(full_part_path + "checksums.txt"); + disk->moveFile(full_part_path + "checksums.txt.tmp", full_part_path + "checksums.txt"); } } +MergeTreeData::MutableDataPartPtr MergeTreeData::loadPartAndFixMetadata(const DiskPtr & disk, const String & relative_path) +{ + MutableDataPartPtr part = createPart(Poco::Path(relative_path).getFileName(), disk, relative_path); + loadPartAndFixMetadataImpl(part); + return part; +} + void MergeTreeData::calculateColumnSizesImpl() { @@ -3098,17 +3094,14 @@ MergeTreeData::getDetachedParts() const { std::vector res; - for (const auto & [path, disk] : getDataPathsWithDisks()) + for (const auto & [path, disk] : getRelativeDataPathsWithDisks()) { - for (Poco::DirectoryIterator it(path + "detached"); - it != Poco::DirectoryIterator(); ++it) + for (auto it = disk->iterateDirectory(path + "detached"); it->isValid(); it->next()) { - auto dir_name = it.name(); - res.emplace_back(); auto & part = res.back(); - DetachedPartInfo::tryParseDetachedPartName(dir_name, part, format_version); + DetachedPartInfo::tryParseDetachedPartName(it->name(), part, format_version); part.disk = disk->getName(); } } @@ -3120,9 +3113,9 @@ void MergeTreeData::validateDetachedPartName(const String & name) const if (name.find('/') != std::string::npos || name == "." || name == "..") throw DB::Exception("Invalid part name '" + name + "'", ErrorCodes::INCORRECT_FILE_NAME); - String full_path = getFullPathForPart(name, "detached/"); + auto full_path = getFullRelativePathForPart(name, "detached/"); - if (full_path.empty() || !Poco::File(full_path + name).exists()) + if (!full_path) throw DB::Exception("Detached part \"" + name + "\" not found" , ErrorCodes::BAD_DATA_PART_NAME); if (startsWith(name, "attaching_") || startsWith(name, "deleting_")) @@ -3157,7 +3150,8 @@ void MergeTreeData::dropDetached(const ASTPtr & partition, bool part, const Cont for (auto & [old_name, new_name] : renamed_parts.old_and_new_names) { - Poco::File(renamed_parts.old_part_name_to_full_path[old_name] + "detached/" + new_name).remove(true); + const auto & [path, disk] = renamed_parts.old_part_name_to_path_and_disk[old_name]; + disk->removeRecursive(path + "detached/" + new_name + "/"); LOG_DEBUG(log, "Dropped detached part " << old_name); old_name.clear(); } @@ -3185,12 +3179,11 @@ MergeTreeData::MutableDataPartsVector MergeTreeData::tryLoadPartsToAttach(const ActiveDataPartSet active_parts(format_version); const auto disks = getStoragePolicy()->getDisks(); - for (const DiskPtr & disk : disks) + for (auto & disk : disks) { - const auto full_path = getFullPathOnDisk(disk); - for (Poco::DirectoryIterator it = Poco::DirectoryIterator(full_path + source_dir); it != Poco::DirectoryIterator(); ++it) + for (auto it = disk->iterateDirectory(relative_data_path + source_dir); it->isValid(); it->next()) { - const String & name = it.name(); + const String & name = it->name(); MergeTreePartInfo part_info; // TODO what if name contains "_tryN" suffix? /// Parts with prefix in name (e.g. 
attaching_1_3_3_0, deleting_1_3_3_0) will be ignored @@ -3211,10 +3204,8 @@ MergeTreeData::MutableDataPartsVector MergeTreeData::tryLoadPartsToAttach(const String containing_part = active_parts.getContainingPart(name); if (!containing_part.empty() && containing_part != name) { - auto full_path = getFullPathOnDisk(disk); // TODO maybe use PartsTemporaryRename here? - Poco::File(full_path + source_dir + name) - .renameTo(full_path + source_dir + "inactive_" + name); + disk->moveDirectory(relative_data_path + source_dir + name, relative_data_path + source_dir + "inactive_" + name); } else renamed_parts.addPart(name, "attaching_" + name); @@ -3233,7 +3224,7 @@ MergeTreeData::MutableDataPartsVector MergeTreeData::tryLoadPartsToAttach(const { LOG_DEBUG(log, "Checking part " << part_names.second); MutableDataPartPtr part = createPart(part_names.first, name_to_disk[part_names.first], source_dir + part_names.second); - loadPartAndFixMetadata(part); + loadPartAndFixMetadataImpl(part); loaded_parts.push_back(part); } @@ -3257,25 +3248,20 @@ inline ReservationPtr checkAndReturnReservation(UInt64 expected_size, Reservatio ReservationPtr MergeTreeData::reserveSpace(UInt64 expected_size) const { expected_size = std::max(RESERVATION_MIN_ESTIMATION_SIZE, expected_size); - auto reservation = getStoragePolicy()->reserve(expected_size); - return checkAndReturnReservation(expected_size, std::move(reservation)); } -ReservationPtr MergeTreeData::reserveSpace(UInt64 expected_size, SpacePtr space) const +ReservationPtr MergeTreeData::reserveSpace(UInt64 expected_size, SpacePtr space) { expected_size = std::max(RESERVATION_MIN_ESTIMATION_SIZE, expected_size); - auto reservation = tryReserveSpace(expected_size, space); - return checkAndReturnReservation(expected_size, std::move(reservation)); } -ReservationPtr MergeTreeData::tryReserveSpace(UInt64 expected_size, SpacePtr space) const +ReservationPtr MergeTreeData::tryReserveSpace(UInt64 expected_size, SpacePtr space) { expected_size = std::max(RESERVATION_MIN_ESTIMATION_SIZE, expected_size); - return space->reserve(expected_size); } @@ -3529,13 +3515,13 @@ bool MergeTreeData::mayBenefitFromIndexForIn(const ASTPtr & left_in_operand, con } } -MergeTreeData & MergeTreeData::checkStructureAndGetMergeTreeData(IStorage * source_table) const +MergeTreeData & MergeTreeData::checkStructureAndGetMergeTreeData(IStorage & source_table) const { - MergeTreeData * src_data = dynamic_cast(source_table); + MergeTreeData * src_data = dynamic_cast(&source_table); if (!src_data) - throw Exception("Table " + source_table->getStorageID().getNameForLogs() + + throw Exception("Table " + source_table.getStorageID().getNameForLogs() + " supports attachPartitionFrom only for MergeTree family of table engines." 
- " Got " + source_table->getName(), ErrorCodes::NOT_IMPLEMENTED); + " Got " + source_table.getName(), ErrorCodes::NOT_IMPLEMENTED); if (getColumns().getAllPhysical().sizeOfDifference(src_data->getColumns().getAllPhysical())) throw Exception("Tables have different structure", ErrorCodes::INCOMPATIBLE_COLUMNS); @@ -3559,7 +3545,7 @@ MergeTreeData & MergeTreeData::checkStructureAndGetMergeTreeData(IStorage * sour MergeTreeData & MergeTreeData::checkStructureAndGetMergeTreeData(const StoragePtr & source_table) const { - return checkStructureAndGetMergeTreeData(source_table.get()); + return checkStructureAndGetMergeTreeData(*source_table); } MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk(const MergeTreeData::DataPartPtr & src_part, @@ -3584,22 +3570,22 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( String tmp_dst_part_name = tmp_part_prefix + dst_part_name; auto reservation = reserveSpace(src_part->bytes_on_disk, src_part->disk); - String dst_part_path = getFullPathOnDisk(reservation->getDisk()); - Poco::Path dst_part_absolute_path = Poco::Path(dst_part_path + tmp_dst_part_name).absolute(); - Poco::Path src_part_absolute_path = Poco::Path(src_part->getFullPath()).absolute(); + auto disk = reservation->getDisk(); + String src_part_path = src_part->getFullRelativePath(); + String dst_part_path = relative_data_path + tmp_dst_part_name; - if (Poco::File(dst_part_absolute_path).exists()) - throw Exception("Part in " + dst_part_absolute_path.toString() + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS); + if (disk->exists(dst_part_path)) + throw Exception("Part in " + fullPath(disk, dst_part_path) + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS); - LOG_DEBUG(log, "Cloning part " << src_part_absolute_path.toString() << " to " << dst_part_absolute_path.toString()); - localBackup(src_part_absolute_path, dst_part_absolute_path); + LOG_DEBUG(log, "Cloning part " << fullPath(disk, src_part_path) << " to " << fullPath(disk, dst_part_path)); + localBackup(disk, src_part_path, dst_part_path); auto dst_data_part = createPart(dst_part_name, dst_part_info, reservation->getDisk(), tmp_dst_part_name); dst_data_part->is_temp = true; dst_data_part->loadColumnsChecksumsIndexes(require_part_metadata, true); - dst_data_part->modification_time = Poco::File(dst_part_absolute_path).getLastModified().epochTime(); + dst_data_part->modification_time = disk->getLastModified(dst_part_path).epochTime(); return dst_data_part; } @@ -3609,26 +3595,25 @@ String MergeTreeData::getFullPathOnDisk(const DiskPtr & disk) const } -DiskPtr MergeTreeData::getDiskForPart(const String & part_name, const String & relative_path) const +DiskPtr MergeTreeData::getDiskForPart(const String & part_name, const String & additional_path) const { const auto disks = getStoragePolicy()->getDisks(); + for (const DiskPtr & disk : disks) - { - const auto disk_path = getFullPathOnDisk(disk); - for (Poco::DirectoryIterator it = Poco::DirectoryIterator(disk_path + relative_path); it != Poco::DirectoryIterator(); ++it) - if (it.name() == part_name) + for (auto it = disk->iterateDirectory(relative_data_path + additional_path); it->isValid(); it->next()) + if (it->name() == part_name) return disk; - } + return nullptr; } -String MergeTreeData::getFullPathForPart(const String & part_name, const String & relative_path) const +std::optional MergeTreeData::getFullRelativePathForPart(const String & part_name, const String & additional_path) const { - auto disk = 
getDiskForPart(part_name, relative_path); + auto disk = getDiskForPart(part_name, additional_path); if (disk) - return getFullPathOnDisk(disk) + relative_path; - return ""; + return relative_data_path + additional_path; + return {}; } Strings MergeTreeData::getDataPaths() const @@ -3640,15 +3625,6 @@ Strings MergeTreeData::getDataPaths() const return res; } -MergeTreeData::PathsWithDisks MergeTreeData::getDataPathsWithDisks() const -{ - PathsWithDisks res; - auto disks = getStoragePolicy()->getDisks(); - for (const auto & disk : disks) - res.emplace_back(getFullPathOnDisk(disk), disk); - return res; -} - MergeTreeData::PathsWithDisks MergeTreeData::getRelativeDataPathsWithDisks() const { PathsWithDisks res; @@ -3665,6 +3641,8 @@ void MergeTreeData::freezePartitionsByMatcher(MatcherFn matcher, const String & Poco::File(default_shadow_path).createDirectories(); auto increment = Increment(default_shadow_path + "increment.txt").get(true); + const String shadow_path = "shadow/"; + /// Acquire a snapshot of active data parts to prevent removing while doing backup. const auto data_parts = getDataParts(); @@ -3674,9 +3652,8 @@ void MergeTreeData::freezePartitionsByMatcher(MatcherFn matcher, const String & if (!matcher(part)) continue; - String shadow_path = part->disk->getPath() + "shadow/"; + part->disk->createDirectories(shadow_path); - Poco::File(shadow_path).createDirectories(); String backup_path = shadow_path + (!with_name.empty() ? escapeForFileName(with_name) @@ -3685,11 +3662,8 @@ void MergeTreeData::freezePartitionsByMatcher(MatcherFn matcher, const String & LOG_DEBUG(log, "Freezing part " << part->name << " snapshot will be placed at " + backup_path); - String part_absolute_path = Poco::Path(part->getFullPath()).absolute().toString(); - String backup_part_absolute_path = backup_path - + relative_data_path - + part->relative_path; - localBackup(part_absolute_path, backup_part_absolute_path); + String backup_part_path = backup_path + relative_data_path + part->relative_path; + localBackup(part->disk, part->getFullRelativePath(), backup_part_path); part->is_frozen.store(true, std::memory_order_relaxed); ++parts_processed; } @@ -3760,6 +3734,7 @@ try part_log_elem.rows = (*merge_entry)->rows_written; part_log_elem.bytes_uncompressed = (*merge_entry)->bytes_written_uncompressed; + part_log_elem.peak_memory_usage = (*merge_entry)->memory_tracker.getPeak(); } part_log->add(part_log_elem); @@ -3861,11 +3836,10 @@ MergeTreeData::CurrentlyMovingPartsTagger MergeTreeData::checkPartsForMove(const throw Exception("Move is not possible. 
Not enough space on '" + space->getName() + "'", ErrorCodes::NOT_ENOUGH_SPACE); auto reserved_disk = reservation->getDisk(); - String path_to_clone = getFullPathOnDisk(reserved_disk); - if (Poco::File(path_to_clone + part->name).exists()) + if (reserved_disk->exists(relative_data_path + part->name)) throw Exception( - "Move is not possible: " + path_to_clone + part->name + " already exists", + "Move is not possible: " + fullPath(reserved_disk, relative_data_path + part->name) + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS); if (currently_moving_parts.count(part) || partIsAssignedToBackgroundOperation(part)) diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.h b/dbms/src/Storages/MergeTree/MergeTreeData.h index 87a42a4d624..188354179dd 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.h +++ b/dbms/src/Storages/MergeTree/MergeTreeData.h @@ -263,6 +263,7 @@ public: }; using AlterDataPartTransactionPtr = std::unique_ptr; + using PathWithDisk = std::pair; struct PartsTemporaryRename : private boost::noncopyable { @@ -285,7 +286,7 @@ public: const MergeTreeData & storage; const String source_dir; std::vector> old_and_new_names; - std::unordered_map old_part_name_to_full_path; + std::unordered_map old_part_name_to_path_and_disk; bool renamed = false; }; @@ -601,7 +602,6 @@ public: /// Check that the part is not broken and calculate the checksums for it if they are not present. MutableDataPartPtr loadPartAndFixMetadata(const DiskPtr & disk, const String & relative_path); - void loadPartAndFixMetadata(MutableDataPartPtr part); /** Create local backup (snapshot) for parts with specified prefix. * Backup is created in directory clickhouse_dir/shadow/i/, where i - incremental number, @@ -644,7 +644,7 @@ public: /// and checks that their structure suitable for ALTER TABLE ATTACH PARTITION FROM /// Tables structure should be locked. MergeTreeData & checkStructureAndGetMergeTreeData(const StoragePtr & source_table) const; - MergeTreeData & checkStructureAndGetMergeTreeData(IStorage * source_table) const; + MergeTreeData & checkStructureAndGetMergeTreeData(IStorage & source_table) const; MergeTreeData::MutableDataPartPtr cloneAndLoadDataPartOnSameDisk( const MergeTreeData::DataPartPtr & src_part, const String & tmp_part_prefix, const MergeTreePartInfo & dst_part_info); @@ -671,26 +671,25 @@ public: /// Get table path on disk String getFullPathOnDisk(const DiskPtr & disk) const; - /// Get disk for part. Looping through directories on FS because some parts maybe not in - /// active dataparts set (detached) - DiskPtr getDiskForPart(const String & part_name, const String & relative_path = "") const; + /// Get disk where part is located. + /// `additional_path` can be set if part is not located directly in table data path (e.g. 'detached/') + DiskPtr getDiskForPart(const String & part_name, const String & additional_path = "") const; - /// Get full path for part. Uses getDiskForPart and returns the full path - String getFullPathForPart(const String & part_name, const String & relative_path = "") const; + /// Get full path for part. Uses getDiskForPart and returns the full relative path. + /// `additional_path` can be set if part is not located directly in table data path (e.g. 
'detached/') + std::optional getFullRelativePathForPart(const String & part_name, const String & additional_path = "") const; Strings getDataPaths() const override; - using PathWithDisk = std::pair; using PathsWithDisks = std::vector; - PathsWithDisks getDataPathsWithDisks() const; PathsWithDisks getRelativeDataPathsWithDisks() const; /// Reserves space at least 1MB. ReservationPtr reserveSpace(UInt64 expected_size) const; /// Reserves space at least 1MB on specific disk or volume. - ReservationPtr reserveSpace(UInt64 expected_size, SpacePtr space) const; - ReservationPtr tryReserveSpace(UInt64 expected_size, SpacePtr space) const; + static ReservationPtr reserveSpace(UInt64 expected_size, SpacePtr space); + static ReservationPtr tryReserveSpace(UInt64 expected_size, SpacePtr space); /// Reserves space at least 1MB preferring best destination according to `ttl_infos`. ReservationPtr reserveSpacePreferringTTLRules(UInt64 expected_size, diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index 3de470abc20..9888f7003f1 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -26,9 +26,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -494,7 +491,7 @@ public: /// Vertical merge) or a mutation of a single part. During a single stage all rows are read. struct MergeStageProgress { - MergeStageProgress(Float64 weight_) + explicit MergeStageProgress(Float64 weight_) : is_first(true) , weight(weight_) { } @@ -576,10 +573,11 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTempor << parts.front()->name << " to " << parts.back()->name << " into " << TMP_PREFIX + future_part.name + " with type " + future_part.type.toString()); - String part_path = data.getFullPathOnDisk(space_reservation->getDisk()); + auto disk = space_reservation->getDisk(); + String part_path = data.relative_data_path; String new_part_tmp_path = part_path + TMP_PREFIX + future_part.name + "/"; - if (Poco::File(new_part_tmp_path).exists()) - throw Exception("Directory " + new_part_tmp_path + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS); + if (disk->exists(new_part_tmp_path)) + throw Exception("Directory " + fullPath(disk, new_part_tmp_path) + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS); MergeTreeData::DataPart::ColumnToSize merged_column_to_size; @@ -598,7 +596,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTempor future_part.name, future_part.type, future_part.part_info, - space_reservation->getDisk(), + disk, TMP_PREFIX + future_part.name); new_data_part->setColumns(all_columns); @@ -633,16 +631,17 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTempor merge_entry->total_size_bytes_compressed, static_cast (merge_entry->total_size_bytes_compressed) / data.getTotalActiveSizeInBytes()); + /// TODO: Should it go through IDisk interface? 
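The header change just above also switches `getFullPathForPart()` from an empty-string "not found" sentinel to an `std::optional` returned by the renamed `getFullRelativePathForPart()`, so callers such as `validateDetachedPartName()` can test `if (!full_path)`. A minimal illustration of the new calling convention (names simplified, paths invented):

```cpp
#include <iostream>
#include <optional>
#include <string>

/// "Not found" is now a distinct state rather than a magic empty string.
std::optional<std::string> findPartPath(const std::string & relative_data_path, bool part_exists_on_some_disk)
{
    if (part_exists_on_some_disk)
        return relative_data_path + "detached/";
    return std::nullopt;
}

int main()
{
    auto path = findPartPath("store/table/", false);
    if (!path)
        std::cout << "Detached part not found\n";        /// previously: if (path.empty())
    else
        std::cout << "Part lives under " << *path << '\n';
    return 0;
}
```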
String rows_sources_file_path; - std::unique_ptr rows_sources_uncompressed_write_buf; + std::unique_ptr rows_sources_uncompressed_write_buf; std::unique_ptr rows_sources_write_buf; std::optional column_sizes; if (merge_alg == MergeAlgorithm::Vertical) { - Poco::File(new_part_tmp_path).createDirectories(); + disk->createDirectories(new_part_tmp_path); rows_sources_file_path = new_part_tmp_path + "rows_sources"; - rows_sources_uncompressed_write_buf = std::make_unique(rows_sources_file_path); + rows_sources_uncompressed_write_buf = disk->writeFile(rows_sources_file_path); rows_sources_write_buf = std::make_unique(*rows_sources_uncompressed_write_buf); for (const MergeTreeData::DataPartPtr & part : parts) @@ -832,6 +831,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTempor rows_sources_write_buf->next(); rows_sources_uncompressed_write_buf->next(); + /// Ensure data has written to disk. + rows_sources_uncompressed_write_buf->finalize(); size_t rows_sources_count = rows_sources_write_buf->count(); /// In special case, when there is only one source part, and no rows were skipped, we may have @@ -842,7 +843,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTempor + ") differs from number of bytes written to rows_sources file (" + toString(rows_sources_count) + "). It is a bug.", ErrorCodes::LOGICAL_ERROR); - CompressedReadBufferFromFile rows_sources_read_buf(rows_sources_file_path, 0, 0, 0); + CompressedReadBufferFromFile rows_sources_read_buf(disk->readFile(rows_sources_file_path)); IMergedBlockOutputStream::WrittenOffsetColumns written_offset_columns; for (size_t column_num = 0, gathering_column_names_size = gathering_column_names.size(); @@ -909,7 +910,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTempor merge_entry->progress.store(progress_before + column_sizes->columnWeight(column_name), std::memory_order_relaxed); } - Poco::File(rows_sources_file_path).remove(); + disk->remove(rows_sources_file_path); } for (const auto & part : parts) @@ -1018,7 +1019,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mutatePartToTempor new_data_part->index_granularity_info = source_part->index_granularity_info; new_data_part->setColumns(getColumnsForNewDataPart(source_part, updated_header, all_columns)); - String new_part_tmp_path = new_data_part->getFullPath(); + auto disk = new_data_part->disk; + String new_part_tmp_path = new_data_part->getFullRelativePath(); /// Note: this is done before creating input streams, because otherwise data.data_parts_mutex /// (which is locked in data.getTotalActiveSizeInBytes()) is locked after part->columns_lock @@ -1029,7 +1031,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mutatePartToTempor source_part->bytes_on_disk, static_cast(source_part->bytes_on_disk) / data.getTotalActiveSizeInBytes()); - Poco::File(new_part_tmp_path).createDirectories(); + disk->createDirectories(new_part_tmp_path); /// Don't change granularity type while mutating subset of columns auto mrk_extension = source_part->index_granularity_info.is_adaptive ? 
getAdaptiveMrkExtension(new_data_part->getType()) @@ -1041,7 +1043,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mutatePartToTempor need_remove_expired_values = true; /// All columns from part are changed and may be some more that were missing before in part - if (source_part->getColumns().isSubsetOf(updated_header.getNamesAndTypesList())) + if (isCompactPart(source_part) || source_part->getColumns().isSubsetOf(updated_header.getNamesAndTypesList())) { /// All columns are modified, proceed to write a new part from scratch. if (data.hasPrimaryKey() || data.hasSkipIndices()) @@ -1125,17 +1127,15 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mutatePartToTempor if (need_remove_expired_values) files_to_skip.insert("ttl.txt"); - Poco::DirectoryIterator dir_end; /// Create hardlinks for unchanged files - for (Poco::DirectoryIterator dir_it(source_part->getFullPath()); dir_it != dir_end; ++dir_it) + for (auto it = disk->iterateDirectory(source_part->getFullRelativePath()); it->isValid(); it->next()) { - if (files_to_skip.count(dir_it.name()) || files_to_remove.count(dir_it.name())) + if (files_to_skip.count(it->name()) || files_to_remove.count(it->name())) continue; - Poco::Path destination(new_part_tmp_path); - destination.append(dir_it.name()); + String destination = new_part_tmp_path + "/" + it->name(); - createHardLink(dir_it.path().toString(), destination.toString()); + disk->createHardLink(it->path(), destination); } merge_entry->columns_written = all_columns.size() - updated_header.columns(); @@ -1181,8 +1181,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mutatePartToTempor if (need_remove_expired_values) { /// Write a file with ttl infos in json format. - WriteBufferFromFile out_ttl(new_part_tmp_path + "ttl.txt", 4096); - HashingWriteBuffer out_hashing(out_ttl); + auto out_ttl = disk->writeFile(new_part_tmp_path + "ttl.txt", 4096); + HashingWriteBuffer out_hashing(*out_ttl); new_data_part->ttl_infos.write(out_hashing); new_data_part->checksums.files["ttl.txt"].file_size = out_hashing.count(); new_data_part->checksums.files["ttl.txt"].file_hash = out_hashing.getHash(); @@ -1193,15 +1193,15 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mutatePartToTempor new_data_part->checksums.files.erase(removed_file); { /// Write file with checksums. - WriteBufferFromFile out_checksums(new_part_tmp_path + "checksums.txt", 4096); - new_data_part->checksums.write(out_checksums); + auto out_checksums = disk->writeFile(new_part_tmp_path + "checksums.txt", 4096); + new_data_part->checksums.write(*out_checksums); } /// close fd { /// Write a file with a description of columns. 
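The mutation path above replaces `Poco::DirectoryIterator` plus `createHardLink()` on absolute paths with the disk's own iterator and `disk->createHardLink()`. The step itself is unchanged: every file of the source part that the mutation will not rewrite or drop is hardlinked into the new temporary part. A sketch with hypothetical interfaces mirroring the calls in the diff:

```cpp
#include <memory>
#include <string>
#include <unordered_set>

/// Hypothetical interfaces with the calls used by the mutation code in the diff.
struct DirectoryIterator
{
    virtual ~DirectoryIterator() = default;
    virtual bool isValid() const = 0;
    virtual void next() = 0;
    virtual std::string name() const = 0;   /// file name of the current entry
    virtual std::string path() const = 0;   /// full path of the current entry
};

struct Disk
{
    virtual ~Disk() = default;
    virtual std::unique_ptr<DirectoryIterator> iterateDirectory(const std::string & path) = 0;
    virtual void createHardLink(const std::string & src, const std::string & dst) = 0;
};

/// Hardlink every unchanged file so the new part shares its data with the old
/// one instead of copying it; rewritten/dropped files are skipped.
void hardlinkUnchangedFiles(Disk & disk,
                            const std::string & source_part_path,
                            const std::string & new_part_tmp_path,
                            const std::unordered_set<std::string> & files_to_skip,
                            const std::unordered_set<std::string> & files_to_remove)
{
    for (auto it = disk.iterateDirectory(source_part_path); it->isValid(); it->next())
    {
        if (files_to_skip.count(it->name()) || files_to_remove.count(it->name()))
            continue;
        disk.createHardLink(it->path(), new_part_tmp_path + "/" + it->name());
    }
}
```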
- WriteBufferFromFile out_columns(new_part_tmp_path + "columns.txt", 4096); - new_data_part->getColumns().writeText(out_columns); + auto out_columns = disk->writeFile(new_part_tmp_path + "columns.txt", 4096); + new_data_part->getColumns().writeText(*out_columns); } /// close new_data_part->rows_count = source_part->rows_count; @@ -1318,7 +1318,7 @@ void MergeTreeDataMergerMutator::splitMutationCommands( MergeTreeData::DataPartPtr part, const MutationCommands & commands, MutationCommands & for_interpreter, - MutationCommands & for_file_renames) const + MutationCommands & for_file_renames) { NameSet removed_columns_from_compact_part; NameSet already_changed_columns; @@ -1379,7 +1379,7 @@ void MergeTreeDataMergerMutator::splitMutationCommands( NameSet MergeTreeDataMergerMutator::collectFilesToRemove( - MergeTreeData::DataPartPtr source_part, const MutationCommands & commands_for_removes, const String & mrk_extension) const + MergeTreeData::DataPartPtr source_part, const MutationCommands & commands_for_removes, const String & mrk_extension) { /// Collect counts for shared streams of different columns. As an example, Nested columns have shared stream with array sizes. std::map stream_counts; @@ -1393,7 +1393,6 @@ NameSet MergeTreeDataMergerMutator::collectFilesToRemove( {}); } - NameSet remove_files; /// Remove old indices for (const auto & command : commands_for_removes) @@ -1422,11 +1421,12 @@ NameSet MergeTreeDataMergerMutator::collectFilesToRemove( column->type->enumerateStreams(callback, stream_path); } } + return remove_files; } NameSet MergeTreeDataMergerMutator::collectFilesToSkip( - const Block & updated_header, const std::set & indices_to_recalc, const String & mrk_extension) const + const Block & updated_header, const std::set & indices_to_recalc, const String & mrk_extension) { NameSet files_to_skip = {"checksums.txt", "columns.txt"}; @@ -1454,7 +1454,7 @@ NameSet MergeTreeDataMergerMutator::collectFilesToSkip( NamesAndTypesList MergeTreeDataMergerMutator::getColumnsForNewDataPart( - MergeTreeData::DataPartPtr source_part, const Block & updated_header, NamesAndTypesList all_columns) const + MergeTreeData::DataPartPtr source_part, const Block & updated_header, NamesAndTypesList all_columns) { Names source_column_names = source_part->getColumns().getNames(); NameSet source_columns_name_set(source_column_names.begin(), source_column_names.end()); diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.h b/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.h index d55298bb944..8552df8cc04 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.h +++ b/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.h @@ -138,23 +138,23 @@ private: * First part should be executed by mutations interpreter. * Other is just simple drop/renames, so they can be executed without interpreter. */ - void splitMutationCommands( + static void splitMutationCommands( MergeTreeData::DataPartPtr part, const MutationCommands & commands, MutationCommands & for_interpreter, - MutationCommands & for_file_renames) const; + MutationCommands & for_file_renames); /// Apply commands to source_part i.e. 
remove some columns in source_part /// and return set of files, that have to be removed from filesystem and checksums - NameSet collectFilesToRemove(MergeTreeData::DataPartPtr source_part, const MutationCommands & commands_for_removes, const String & mrk_extension) const; + static NameSet collectFilesToRemove(MergeTreeData::DataPartPtr source_part, const MutationCommands & commands_for_removes, const String & mrk_extension); /// Files, that we don't need to remove and don't need to hardlink, for example columns.txt and checksums.txt. /// Because we will generate new versions of them after we perform mutation. - NameSet collectFilesToSkip(const Block & updated_header, const std::set & indices_to_recalc, const String & mrk_extension) const; + static NameSet collectFilesToSkip(const Block & updated_header, const std::set & indices_to_recalc, const String & mrk_extension); /// Get the columns list of the resulting part in the same order as all_columns. - NamesAndTypesList getColumnsForNewDataPart(MergeTreeData::DataPartPtr source_part, const Block & updated_header, NamesAndTypesList all_columns) const; + static NamesAndTypesList getColumnsForNewDataPart(MergeTreeData::DataPartPtr source_part, const Block & updated_header, NamesAndTypesList all_columns); bool shouldExecuteTTL(const Names & columns, const MutationCommands & commands) const; diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataPartCompact.h b/dbms/src/Storages/MergeTree/MergeTreeDataPartCompact.h index d75e5befe36..a8d31795df6 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataPartCompact.h +++ b/dbms/src/Storages/MergeTree/MergeTreeDataPartCompact.h @@ -42,15 +42,15 @@ public: UncompressedCache * uncompressed_cache, MarkCache * mark_cache, const MergeTreeReaderSettings & reader_settings_, - const ValueSizeMap & avg_value_size_hints = ValueSizeMap{}, - const ReadBufferFromFileBase::ProfileCallback & profile_callback = ReadBufferFromFileBase::ProfileCallback{}) const override; + const ValueSizeMap & avg_value_size_hints, + const ReadBufferFromFileBase::ProfileCallback & profile_callback) const override; MergeTreeWriterPtr getWriter( const NamesAndTypesList & columns_list, const std::vector & indices_to_recalc, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, - const MergeTreeIndexGranularity & computed_index_granularity = {}) const override; + const MergeTreeIndexGranularity & computed_index_granularity) const override; bool isStoredOnDisk() const override { return true; } diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataPartWide.h b/dbms/src/Storages/MergeTree/MergeTreeDataPartWide.h index c0c7e45b7ef..bba70aa8c5f 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataPartWide.h +++ b/dbms/src/Storages/MergeTree/MergeTreeDataPartWide.h @@ -34,15 +34,15 @@ public: UncompressedCache * uncompressed_cache, MarkCache * mark_cache, const MergeTreeReaderSettings & reader_settings_, - const ValueSizeMap & avg_value_size_hints = ValueSizeMap{}, - const ReadBufferFromFileBase::ProfileCallback & profile_callback = ReadBufferFromFileBase::ProfileCallback{}) const override; + const ValueSizeMap & avg_value_size_hints, + const ReadBufferFromFileBase::ProfileCallback & profile_callback) const override; MergeTreeWriterPtr getWriter( const NamesAndTypesList & columns_list, const std::vector & indices_to_recalc, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, - const MergeTreeIndexGranularity & computed_index_granularity = {}) const override; + const 
MergeTreeIndexGranularity & computed_index_granularity) const override; bool isStoredOnDisk() const override { return true; } diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h b/dbms/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h index 3b405bbf1c5..598a4dd47fb 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h +++ b/dbms/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h @@ -18,10 +18,10 @@ public: const MergeTreeWriterSettings & settings, const MergeTreeIndexGranularity & index_granularity); - void write(const Block & block, const IColumn::Permutation * permutation = nullptr, - const Block & primary_key_block = {}, const Block & skip_indexes_block = {}) override; + void write(const Block & block, const IColumn::Permutation * permutation, + const Block & primary_key_block, const Block & skip_indexes_block) override; - void finishDataSerialization(IMergeTreeDataPart::Checksums & checksums, bool sync = false) override; + void finishDataSerialization(IMergeTreeDataPart::Checksums & checksums, bool sync) override; private: /// Write single granule of one column (rows between 2 marks) diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h b/dbms/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h index d6b01f9c45c..95e43cd31af 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h +++ b/dbms/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h @@ -21,10 +21,10 @@ public: const MergeTreeWriterSettings & settings, const MergeTreeIndexGranularity & index_granularity); - void write(const Block & block, const IColumn::Permutation * permutation = nullptr, - const Block & primary_key_block = {}, const Block & skip_indexes_block = {}) override; + void write(const Block & block, const IColumn::Permutation * permutation, + const Block & primary_key_block, const Block & skip_indexes_block) override; - void finishDataSerialization(IMergeTreeDataPart::Checksums & checksums, bool sync = false) override; + void finishDataSerialization(IMergeTreeDataPart::Checksums & checksums, bool sync) override; IDataType::OutputStreamGetter createStreamGetter(const String & name, WrittenOffsetColumns & offset_columns); diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 2b02aad8970..0b87b241d85 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1091,8 +1091,10 @@ Pipes MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsFinal( { auto merged_processor = std::make_shared(header, pipes.size(), sort_description, max_block_size); - pipes.emplace_back(std::move(pipes), std::move(merged_processor)); - break; + Pipe pipe(std::move(pipes), std::move(merged_processor)); + pipes = Pipes(); + pipes.emplace_back(std::move(pipe)); + return pipes; } case MergeTreeData::MergingParams::Collapsing: diff --git a/dbms/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.cpp b/dbms/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.cpp index e66accb2f92..6e76333da9d 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.cpp @@ -17,6 +17,21 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +static void assertGranuleBlocksStructure(const Blocks & granule_index_blocks) +{ + Block prev_block; + for (size_t index = 0; index < granule_index_blocks.size(); ++index) + { + 
Block granule_index_block = granule_index_blocks[index]; + + if (index != 0) + assertBlocksHaveEqualStructure(prev_block, granule_index_block, "Granule blocks of bloom filter has difference structure."); + + prev_block = granule_index_block; + } +} + + MergeTreeIndexGranuleBloomFilter::MergeTreeIndexGranuleBloomFilter(size_t bits_per_row_, size_t hash_functions_, size_t index_columns_) : bits_per_row(bits_per_row_), hash_functions(hash_functions_) { @@ -96,20 +111,6 @@ void MergeTreeIndexGranuleBloomFilter::serializeBinary(WriteBuffer & ostr) const ostr.write(reinterpret_cast(bloom_filter->getFilter().data()), bytes_size); } -void MergeTreeIndexGranuleBloomFilter::assertGranuleBlocksStructure(const Blocks & granule_index_blocks) const -{ - Block prev_block; - for (size_t index = 0; index < granule_index_blocks.size(); ++index) - { - Block granule_index_block = granule_index_blocks[index]; - - if (index != 0) - assertBlocksHaveEqualStructure(prev_block, granule_index_block, "Granule blocks of bloom filter has difference structure."); - - prev_block = granule_index_block; - } -} - void MergeTreeIndexGranuleBloomFilter::fillingBloomFilter(BloomFilterPtr & bf, const Block & granule_index_block, size_t index_hash_column) { const auto & column = granule_index_block.getByPosition(index_hash_column); diff --git a/dbms/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.h b/dbms/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.h index 782f84170c7..5524c904309 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.h +++ b/dbms/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.h @@ -27,8 +27,6 @@ private: size_t hash_functions; std::vector bloom_filters; - void assertGranuleBlocksStructure(const Blocks & granule_index_blocks) const; - void fillingBloomFilter(BloomFilterPtr & bf, const Block & granule_index_block, size_t index_hash_column); }; diff --git a/dbms/src/Storages/MergeTree/MergeTreeIndexSet.cpp b/dbms/src/Storages/MergeTree/MergeTreeIndexSet.cpp index 843b1e6fc98..f68184e2691 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeIndexSet.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeIndexSet.cpp @@ -342,7 +342,7 @@ bool MergeTreeIndexConditionSet::atomFromAST(ASTPtr & node) const return false; } -bool MergeTreeIndexConditionSet::operatorFromAST(ASTPtr & node) const +bool MergeTreeIndexConditionSet::operatorFromAST(ASTPtr & node) { /// Functions AND, OR, NOT. Replace with bit*. 
auto * func = node->as(); diff --git a/dbms/src/Storages/MergeTree/MergeTreeIndexSet.h b/dbms/src/Storages/MergeTree/MergeTreeIndexSet.h index fc480e4024a..5b0448d13be 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeIndexSet.h +++ b/dbms/src/Storages/MergeTree/MergeTreeIndexSet.h @@ -78,7 +78,7 @@ public: private: void traverseAST(ASTPtr & node) const; bool atomFromAST(ASTPtr & node) const; - bool operatorFromAST(ASTPtr & node) const; + static bool operatorFromAST(ASTPtr & node); bool checkASTUseless(const ASTPtr & node, bool atomic = false) const; diff --git a/dbms/src/Storages/MergeTree/MergeTreePartsMover.cpp b/dbms/src/Storages/MergeTree/MergeTreePartsMover.cpp index 5e3999ffbec..160b15e0f6c 100644 --- a/dbms/src/Storages/MergeTree/MergeTreePartsMover.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreePartsMover.cpp @@ -33,7 +33,7 @@ class LargestPartsWithRequiredSize UInt64 current_size_sum = 0; public: - LargestPartsWithRequiredSize(UInt64 required_sum_size_) : required_size_sum(required_sum_size_) {} + explicit LargestPartsWithRequiredSize(UInt64 required_sum_size_) : required_size_sum(required_sum_size_) {} void add(MergeTreeData::DataPartPtr part) { diff --git a/dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 7718ed642d3..b74fb5d1f5d 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -15,6 +15,25 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } + +static void filterColumns(Columns & columns, const IColumn::Filter & filter) +{ + for (auto & column : columns) + { + if (column) + { + column = column->filter(filter, -1); + + if (column->empty()) + { + columns.clear(); + return; + } + } + } +} + + MergeTreeRangeReader::DelayedStream::DelayedStream( size_t from_mark, IMergeTreeReader * merge_tree_reader_) : current_mark(from_mark), current_offset(0), num_delayed_rows(0) @@ -682,22 +701,6 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::read(size_t max_rows, Mar return read_result; } -void MergeTreeRangeReader::filterColumns(Columns & columns, const IColumn::Filter & filter) const -{ - for (auto & column : columns) - { - if (column) - { - column = column->filter(filter, -1); - - if (column->empty()) - { - columns.clear(); - return; - } - } - } -} MergeTreeRangeReader::ReadResult MergeTreeRangeReader::startReadingChain(size_t max_rows, MarkRanges & ranges) { diff --git a/dbms/src/Storages/MergeTree/MergeTreeRangeReader.h b/dbms/src/Storages/MergeTree/MergeTreeRangeReader.h index affdb00147a..0af19e70c2b 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeRangeReader.h +++ b/dbms/src/Storages/MergeTree/MergeTreeRangeReader.h @@ -213,7 +213,6 @@ private: ReadResult startReadingChain(size_t max_rows, MarkRanges & ranges); Columns continueReadingChain(ReadResult & result, size_t & num_rows); void executePrewhereActionsAndFilterColumns(ReadResult & result); - void filterColumns(Columns & columns, const IColumn::Filter & filter) const; IMergeTreeReader * merge_tree_reader = nullptr; const MergeTreeIndexGranularity * index_granularity = nullptr; diff --git a/dbms/src/Storages/MergeTree/MergeTreeSettings.cpp b/dbms/src/Storages/MergeTree/MergeTreeSettings.cpp index 93f5ff20045..5c4113c1565 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeSettings.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeSettings.cpp @@ -70,7 +70,7 @@ void MergeTreeSettings::loadFromQuery(ASTStorage & storage_def) if (std::find_if(changes.begin(), changes.end(), \ 
[](const SettingChange & c) { return c.name == #NAME; }) \ == changes.end()) \ - changes.push_back(SettingChange{#NAME, NAME.value}); + changes.push_back(SettingChange{#NAME, (NAME).value}); APPLY_FOR_IMMUTABLE_MERGE_TREE_SETTINGS(ADD_IF_ABSENT) #undef ADD_IF_ABSENT diff --git a/dbms/src/Storages/MergeTree/MergeTreeSettings.h b/dbms/src/Storages/MergeTree/MergeTreeSettings.h index 7d53f161620..bbd1fd6cbeb 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeSettings.h +++ b/dbms/src/Storages/MergeTree/MergeTreeSettings.h @@ -42,7 +42,6 @@ struct MergeTreeSettings : public SettingsCollection M(SettingUInt64, number_of_free_entries_in_pool_to_execute_mutation, 10, "When there is less than specified number of free entries in pool, do not execute part mutations. This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \ M(SettingSeconds, old_parts_lifetime, 8 * 60, "How many seconds to keep obsolete parts.", 0) \ M(SettingSeconds, temporary_directories_lifetime, 86400, "How many seconds to keep tmp_-directories.", 0) \ - M(SettingBool, disable_background_merges, false, "Disable background merges.", 0) \ \ /** Inserts settings. */ \ M(SettingUInt64, parts_to_delay_insert, 150, "If table contains at least that many active parts in single partition, artificially slow down insert into table.", 0) \ diff --git a/dbms/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/dbms/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index b3660674362..8f8b5111f48 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -71,6 +71,55 @@ static void collectIdentifiersNoSubqueries(const ASTPtr & ast, NameSet & set) collectIdentifiersNoSubqueries(child, set); } +static bool isConditionGood(const ASTPtr & condition) +{ + const auto * function = condition->as(); + if (!function) + return false; + + /** we are only considering conditions of form `equals(one, another)` or `one = another`, + * especially if either `one` or `another` is ASTIdentifier */ + if (function->name != "equals") + return false; + + auto left_arg = function->arguments->children.front().get(); + auto right_arg = function->arguments->children.back().get(); + + /// try to ensure left_arg points to ASTIdentifier + if (!left_arg->as() && right_arg->as()) + std::swap(left_arg, right_arg); + + if (left_arg->as()) + { + /// condition may be "good" if only right_arg is a constant and its value is outside the threshold + if (const auto * literal = right_arg->as()) + { + const auto & field = literal->value; + const auto type = field.getType(); + + /// check the value with respect to threshold + if (type == Field::Types::UInt64) + { + const auto value = field.get(); + return value > threshold; + } + else if (type == Field::Types::Int64) + { + const auto value = field.get(); + return value < -threshold || threshold < value; + } + else if (type == Field::Types::Float64) + { + const auto value = field.get(); + return value < threshold || threshold < value; + } + } + } + + return false; +} + + void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node) const { if (const auto * func_and = node->as(); func_and && func_and->name == "and") @@ -116,7 +165,7 @@ MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPt } /// Transform Conditions list to WHERE or PREWHERE expression. 
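A standalone illustration of why the ADD_IF_ABSENT hunk above writes (NAME).value instead of NAME.value. Setting and PRINT_VALUE are hypothetical stand-ins, not ClickHouse types; the point is only that an unparenthesised macro argument can regroup around the member access.

#include <iostream>

struct Setting { int value = 0; };

// Without parentheses the macro would be
//     #define PRINT_VALUE(NAME) std::cout << NAME.value << '\n'
// and PRINT_VALUE(*ptr) would expand to *ptr.value, i.e. *(ptr.value), which does not compile.
#define PRINT_VALUE(NAME) std::cout << (NAME).value << '\n'

int main()
{
    Setting setting{42};
    Setting * ptr = &setting;

    PRINT_VALUE(setting); // prints 42
    PRINT_VALUE(*ptr);    // prints 42 as well, thanks to the added parentheses
}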
-ASTPtr MergeTreeWhereOptimizer::reconstruct(const Conditions & conditions) const +ASTPtr MergeTreeWhereOptimizer::reconstruct(const Conditions & conditions) { if (conditions.empty()) return {}; @@ -205,55 +254,6 @@ UInt64 MergeTreeWhereOptimizer::getIdentifiersColumnSize(const NameSet & identif } -bool MergeTreeWhereOptimizer::isConditionGood(const ASTPtr & condition) const -{ - const auto * function = condition->as(); - if (!function) - return false; - - /** we are only considering conditions of form `equals(one, another)` or `one = another`, - * especially if either `one` or `another` is ASTIdentifier */ - if (function->name != "equals") - return false; - - auto left_arg = function->arguments->children.front().get(); - auto right_arg = function->arguments->children.back().get(); - - /// try to ensure left_arg points to ASTIdentifier - if (!left_arg->as() && right_arg->as()) - std::swap(left_arg, right_arg); - - if (left_arg->as()) - { - /// condition may be "good" if only right_arg is a constant and its value is outside the threshold - if (const auto * literal = right_arg->as()) - { - const auto & field = literal->value; - const auto type = field.getType(); - - /// check the value with respect to threshold - if (type == Field::Types::UInt64) - { - const auto value = field.get(); - return value > threshold; - } - else if (type == Field::Types::Int64) - { - const auto value = field.get(); - return value < -threshold || threshold < value; - } - else if (type == Field::Types::Float64) - { - const auto value = field.get(); - return value < threshold || threshold < value; - } - } - } - - return false; -} - - bool MergeTreeWhereOptimizer::hasPrimaryKeyAtoms(const ASTPtr & ast) const { if (const auto * func = ast->as()) diff --git a/dbms/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/dbms/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index 6ec81592ee1..f9fdc01812b 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/dbms/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -65,7 +65,7 @@ private: Conditions analyze(const ASTPtr & expression) const; /// Transform Conditions list to WHERE or PREWHERE expression. - ASTPtr reconstruct(const Conditions & conditions) const; + static ASTPtr reconstruct(const Conditions & conditions); void calculateColumnSizes(const MergeTreeData & data, const Names & column_names); @@ -75,8 +75,6 @@ private: UInt64 getIdentifiersColumnSize(const NameSet & identifiers) const; - bool isConditionGood(const ASTPtr & condition) const; - bool hasPrimaryKeyAtoms(const ASTPtr & ast) const; bool isPrimaryKeyAtom(const ASTPtr & ast) const; diff --git a/dbms/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h b/dbms/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h index c44c744efaf..0e93301e124 100644 --- a/dbms/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h +++ b/dbms/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h @@ -13,7 +13,7 @@ namespace DB { /// A Storage that allows reading from a single MergeTree data part. 
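The reconstruct() and isConditionGood() hunks above show the pattern that recurs throughout this diff: a const member function that never touches the object is either made a static member or moved into the .cpp as a file-local free function and dropped from the header. A minimal sketch of the shape, using a hypothetical Condition struct rather than the real AST types:

#include <string>

// Before (header): the helper was part of the class although it never used `this`.
//     bool isConditionGood(const Condition & condition) const;
// After (.cpp only): a file-local function, no longer mentioned in the header at all.
struct Condition { std::string function_name; };

static bool isConditionGood(const Condition & condition)
{
    return condition.function_name == "equals";
}

int main()
{
    return isConditionGood(Condition{"equals"}) ? 0 : 1;
}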
-class StorageFromMergeTreeDataPart : public ext::shared_ptr_helper, public IStorage +class StorageFromMergeTreeDataPart final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: @@ -53,6 +53,11 @@ public: } + bool hasSortingKey() const { return part->storage.hasSortingKey(); } + + Names getSortingKeyColumns() const override { return part->storage.getSortingKeyColumns(); } + + protected: StorageFromMergeTreeDataPart(const MergeTreeData::DataPartPtr & part_) : IStorage(getIDFromPart(part_), part_->storage.getVirtuals()) diff --git a/dbms/src/Storages/MergeTree/checkDataPart.cpp b/dbms/src/Storages/MergeTree/checkDataPart.cpp index 6195facc914..6da051d04ac 100644 --- a/dbms/src/Storages/MergeTree/checkDataPart.cpp +++ b/dbms/src/Storages/MergeTree/checkDataPart.cpp @@ -99,7 +99,6 @@ IMergeTreeDataPart::Checksums checkDataPart( throw Exception("Unknown type in part " + path, ErrorCodes::UNKNOWN_PART_TYPE); } - Poco::DirectoryIterator dir_end; for (auto it = disk->iterateDirectory(path); it->isValid(); it->next()) { const String & file_name = it->name(); diff --git a/dbms/src/Common/localBackup.cpp b/dbms/src/Storages/MergeTree/localBackup.cpp similarity index 51% rename from dbms/src/Common/localBackup.cpp rename to dbms/src/Storages/MergeTree/localBackup.cpp index 2e042351a90..7d7dacaeaf1 100644 --- a/dbms/src/Common/localBackup.cpp +++ b/dbms/src/Storages/MergeTree/localBackup.cpp @@ -1,13 +1,8 @@ #include "localBackup.h" -#include #include -#include -#include -#include #include -#include -#include +#include namespace DB @@ -20,7 +15,7 @@ namespace ErrorCodes } -static void localBackupImpl(const Poco::Path & source_path, const Poco::Path & destination_path, size_t level, +static void localBackupImpl(const DiskPtr & disk, const String & source_path, const String & destination_path, size_t level, std::optional max_level) { if (max_level && level > *max_level) @@ -29,34 +24,30 @@ static void localBackupImpl(const Poco::Path & source_path, const Poco::Path & d if (level >= 1000) throw DB::Exception("Too deep recursion", DB::ErrorCodes::TOO_DEEP_RECURSION); - Poco::File(destination_path).createDirectories(); + disk->createDirectories(destination_path); - Poco::DirectoryIterator dir_end; - for (Poco::DirectoryIterator dir_it(source_path); dir_it != dir_end; ++dir_it) + for (auto it = disk->iterateDirectory(source_path); it->isValid(); it->next()) { - Poco::Path source = dir_it.path(); - Poco::Path destination = destination_path; - destination.append(dir_it.name()); + auto source = it->path(); + auto destination = destination_path + "/" + it->name(); - if (!dir_it->isDirectory()) + if (!disk->isDirectory(source)) { - dir_it->setReadOnly(); - - createHardLink(source.toString(), destination.toString()); + disk->setReadOnly(source); + disk->createHardLink(source, destination); } else { - localBackupImpl(source, destination, level + 1, max_level); + localBackupImpl(disk, source, destination, level + 1, max_level); } } } -void localBackup(const Poco::Path & source_path, const Poco::Path & destination_path, std::optional max_level) +void localBackup(const DiskPtr & disk, const String & source_path, const String & destination_path, std::optional max_level) { - if (Poco::File(destination_path).exists() - && Poco::DirectoryIterator(destination_path) != Poco::DirectoryIterator()) + if (disk->exists(destination_path) && !disk->isDirectoryEmpty(destination_path)) { - throw DB::Exception("Directory " + destination_path.toString() + " already exists and is not 
empty.", DB::ErrorCodes::DIRECTORY_ALREADY_EXISTS); + throw DB::Exception("Directory " + fullPath(disk, destination_path) + " already exists and is not empty.", DB::ErrorCodes::DIRECTORY_ALREADY_EXISTS); } size_t try_no = 0; @@ -70,7 +61,7 @@ void localBackup(const Poco::Path & source_path, const Poco::Path & destination_ { try { - localBackupImpl(source_path, destination_path, 0, max_level); + localBackupImpl(disk, source_path, destination_path, 0, max_level); } catch (const DB::ErrnoException & e) { diff --git a/dbms/src/Common/localBackup.h b/dbms/src/Storages/MergeTree/localBackup.h similarity index 80% rename from dbms/src/Common/localBackup.h rename to dbms/src/Storages/MergeTree/localBackup.h index e3ea32614ee..3c9d92fa9da 100644 --- a/dbms/src/Common/localBackup.h +++ b/dbms/src/Storages/MergeTree/localBackup.h @@ -1,8 +1,8 @@ #pragma once #include - -namespace Poco { class Path; } +#include +#include namespace DB { @@ -20,6 +20,6 @@ namespace DB * If max_level is specified, than only files which depth relative source_path less or equal max_level will be copied. * So, if max_level=0 than only direct file child are copied. */ -void localBackup(const Poco::Path & source_path, const Poco::Path & destination_path, std::optional max_level = {}); +void localBackup(const DiskPtr & disk, const String & source_path, const String & destination_path, std::optional max_level = {}); } diff --git a/dbms/src/Storages/ReadInOrderOptimizer.cpp b/dbms/src/Storages/ReadInOrderOptimizer.cpp index 667ce095932..c05acfa71ab 100644 --- a/dbms/src/Storages/ReadInOrderOptimizer.cpp +++ b/dbms/src/Storages/ReadInOrderOptimizer.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -31,14 +32,28 @@ ReadInOrderOptimizer::ReadInOrderOptimizer( InputSortingInfoPtr ReadInOrderOptimizer::getInputOrder(const StoragePtr & storage) const { - const MergeTreeData * merge_tree = dynamic_cast(storage.get()); - if (!merge_tree || !merge_tree->hasSortingKey()) + Names sorting_key_columns; + if (const auto * merge_tree = dynamic_cast(storage.get())) + { + if (!merge_tree->hasSortingKey()) + return {}; + sorting_key_columns = merge_tree->getSortingKeyColumns(); + } + else if (const auto * part = dynamic_cast(storage.get())) + { + if (!part->hasSortingKey()) + return {}; + sorting_key_columns = part->getSortingKeyColumns(); + } + else /// Inapplicable storage type + { return {}; + } + SortDescription order_key_prefix_descr; int read_direction = required_sort_description.at(0).direction; - const auto & sorting_key_columns = merge_tree->getSortingKeyColumns(); size_t prefix_size = std::min(required_sort_description.size(), sorting_key_columns.size()); for (size_t i = 0; i < prefix_size; ++i) diff --git a/dbms/src/Storages/StorageBuffer.h b/dbms/src/Storages/StorageBuffer.h index 087ff5df07e..d4c6f3e6078 100644 --- a/dbms/src/Storages/StorageBuffer.h +++ b/dbms/src/Storages/StorageBuffer.h @@ -37,7 +37,7 @@ class Context; * When you destroy a Buffer table, all remaining data is flushed to the subordinate table. * The data in the buffer is not replicated, not logged to disk, not indexed. With a rough restart of the server, the data is lost. 
*/ -class StorageBuffer : public ext::shared_ptr_helper, public IStorage +class StorageBuffer final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; friend class BufferSource; diff --git a/dbms/src/Storages/StorageDictionary.h b/dbms/src/Storages/StorageDictionary.h index 67e5ddecaf1..fd6cb1902dc 100644 --- a/dbms/src/Storages/StorageDictionary.h +++ b/dbms/src/Storages/StorageDictionary.h @@ -19,7 +19,7 @@ struct DictionaryStructure; struct IDictionaryBase; class ExternalDictionaries; -class StorageDictionary : public ext::shared_ptr_helper, public IStorage +class StorageDictionary final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/StorageDistributed.cpp b/dbms/src/Storages/StorageDistributed.cpp index fda10029fe4..4ef8f39d1f7 100644 --- a/dbms/src/Storages/StorageDistributed.cpp +++ b/dbms/src/Storages/StorageDistributed.cpp @@ -318,13 +318,8 @@ StoragePtr StorageDistributed::createWithOwnCluster( return res; } -QueryProcessingStage::Enum StorageDistributed::getQueryProcessingStage(const Context & context) const -{ - auto cluster = getCluster(); - return getQueryProcessingStage(context, cluster); -} -QueryProcessingStage::Enum StorageDistributed::getQueryProcessingStage(const Context & context, const ClusterPtr & cluster) const +static QueryProcessingStage::Enum getQueryProcessingStageImpl(const Context & context, const ClusterPtr & cluster) { const Settings & settings = context.getSettingsRef(); @@ -339,6 +334,12 @@ QueryProcessingStage::Enum StorageDistributed::getQueryProcessingStage(const Con : QueryProcessingStage::WithMergeableState; } +QueryProcessingStage::Enum StorageDistributed::getQueryProcessingStage(const Context & context) const +{ + auto cluster = getCluster(); + return getQueryProcessingStageImpl(context, cluster); +} + Pipes StorageDistributed::read( const Names & column_names, const SelectQueryInfo & query_info, @@ -641,7 +642,7 @@ ClusterPtr StorageDistributed::skipUnusedShards(ClusterPtr cluster, const Select if (!block.has(sharding_key_column_name)) throw Exception("sharding_key_expr should evaluate as a single row", ErrorCodes::TOO_MANY_ROWS); - const auto result = block.getByName(sharding_key_column_name); + const ColumnWithTypeAndName & result = block.getByName(sharding_key_column_name); const auto selector = createSelector(cluster, result); shards.insert(selector.begin(), selector.end()); diff --git a/dbms/src/Storages/StorageDistributed.h b/dbms/src/Storages/StorageDistributed.h index 27374729db5..086842c5888 100644 --- a/dbms/src/Storages/StorageDistributed.h +++ b/dbms/src/Storages/StorageDistributed.h @@ -31,7 +31,7 @@ using ExpressionActionsPtr = std::shared_ptr; * You can pass one address, not several. * In this case, the table can be considered remote, rather than distributed. 
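Most of the header hunks from here on make the same one-word change: the concrete storage classes become final. They are constructed through a shared-pointer helper base and are not designed for further derivation, so final states that explicitly and allows devirtualization. A minimal sketch of the pattern, with a hypothetical helper in place of ext::shared_ptr_helper:

#include <memory>
#include <utility>

// Hypothetical stand-in for the create() helper: the constructor stays protected,
// and instances are only obtainable as shared_ptr via create().
template <typename T>
struct shared_ptr_helper
{
    template <typename... Args>
    static std::shared_ptr<T> create(Args &&... args)
    {
        return std::shared_ptr<T>(new T(std::forward<Args>(args)...));
    }
};

// `final`, exactly as the hunks above and below add it.
class StorageExample final : public shared_ptr_helper<StorageExample>
{
    friend struct shared_ptr_helper<StorageExample>;

public:
    int rows() const { return rows_count; }

protected:
    explicit StorageExample(int rows_count_) : rows_count(rows_count_) {}

private:
    int rows_count;
};

int main()
{
    auto storage = StorageExample::create(10);
    return storage->rows() == 10 ? 0 : 1;
}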
*/ -class StorageDistributed : public ext::shared_ptr_helper, public IStorage +class StorageDistributed final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; friend class DistributedBlockOutputStream; @@ -67,7 +67,6 @@ public: bool isRemote() const override { return true; } QueryProcessingStage::Enum getQueryProcessingStage(const Context & context) const override; - QueryProcessingStage::Enum getQueryProcessingStage(const Context & context, const ClusterPtr & cluster) const; Pipes read( const Names & column_names, diff --git a/dbms/src/Storages/StorageFile.h b/dbms/src/Storages/StorageFile.h index 6134dff5db5..339c6b76eb9 100644 --- a/dbms/src/Storages/StorageFile.h +++ b/dbms/src/Storages/StorageFile.h @@ -18,7 +18,7 @@ namespace DB class StorageFileBlockInputStream; class StorageFileBlockOutputStream; -class StorageFile : public ext::shared_ptr_helper, public IStorage +class StorageFile final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/StorageGenerateRandom.h b/dbms/src/Storages/StorageGenerateRandom.h index f39ca1f18c1..955b8bd671d 100644 --- a/dbms/src/Storages/StorageGenerateRandom.h +++ b/dbms/src/Storages/StorageGenerateRandom.h @@ -9,7 +9,7 @@ namespace DB { /* Generates random data for given schema. */ -class StorageGenerateRandom : public ext::shared_ptr_helper, public IStorage +class StorageGenerateRandom final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/StorageHDFS.h b/dbms/src/Storages/StorageHDFS.h index 8cc9ce62567..b8e3be20b82 100644 --- a/dbms/src/Storages/StorageHDFS.h +++ b/dbms/src/Storages/StorageHDFS.h @@ -13,7 +13,7 @@ namespace DB * This class represents table engine for external hdfs files. * Read method is supported for now. */ -class StorageHDFS : public ext::shared_ptr_helper, public IStorage +class StorageHDFS final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/StorageInMemoryMetadata.cpp b/dbms/src/Storages/StorageInMemoryMetadata.cpp index 86c6551896a..a05872234de 100644 --- a/dbms/src/Storages/StorageInMemoryMetadata.cpp +++ b/dbms/src/Storages/StorageInMemoryMetadata.cpp @@ -35,11 +35,13 @@ StorageInMemoryMetadata::StorageInMemoryMetadata(const StorageInMemoryMetadata & StorageInMemoryMetadata & StorageInMemoryMetadata::operator=(const StorageInMemoryMetadata & other) { + if (this == &other) + return *this; + columns = other.columns; indices = other.indices; constraints = other.constraints; - if (other.partition_by_ast) partition_by_ast = other.partition_by_ast->clone(); else diff --git a/dbms/src/Storages/StorageInput.h b/dbms/src/Storages/StorageInput.h index 16d195682e9..60bda222c2e 100644 --- a/dbms/src/Storages/StorageInput.h +++ b/dbms/src/Storages/StorageInput.h @@ -8,7 +8,7 @@ namespace DB /** Internal temporary storage for table function input(...) 
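The StorageInMemoryMetadata hunk above adds the classic self-assignment guard to a hand-written copy assignment operator. The guard matters whenever operator= clears or reallocates its own members before reading from the right-hand side; a minimal sketch with a hypothetical Metadata type:

#include <memory>
#include <string>

struct Metadata
{
    std::unique_ptr<std::string> partition_by;

    Metadata & operator=(const Metadata & other)
    {
        if (this == &other)   // the guard added in the diff above
            return *this;

        // Without the guard, self-assignment would reset partition_by here and then
        // find other.partition_by (the same pointer) already empty: the data is lost.
        partition_by.reset();
        if (other.partition_by)
            partition_by = std::make_unique<std::string>(*other.partition_by);
        return *this;
    }
};

int main()
{
    Metadata m;
    m.partition_by = std::make_unique<std::string>("toYYYYMM(date)");

    Metadata & alias = m;
    m = alias;            // self-assignment through an alias: safe thanks to the guard

    return (m.partition_by && *m.partition_by == "toYYYYMM(date)") ? 0 : 1;
}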
*/ -class StorageInput : public ext::shared_ptr_helper, public IStorage +class StorageInput final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/StorageJoin.cpp b/dbms/src/Storages/StorageJoin.cpp index 4bec5a05205..f47cc9ae886 100644 --- a/dbms/src/Storages/StorageJoin.cpp +++ b/dbms/src/Storages/StorageJoin.cpp @@ -94,7 +94,7 @@ HashJoinPtr StorageJoin::getJoin(std::shared_ptr analyzed_join) co } -void StorageJoin::insertBlock(const Block & block) { join->addJoinedBlock(block); } +void StorageJoin::insertBlock(const Block & block) { join->addJoinedBlock(block, true); } size_t StorageJoin::getSize() const { return join->getTotalRowCount(); } diff --git a/dbms/src/Storages/StorageJoin.h b/dbms/src/Storages/StorageJoin.h index b2eb7cbf3f7..6ee941feaec 100644 --- a/dbms/src/Storages/StorageJoin.h +++ b/dbms/src/Storages/StorageJoin.h @@ -21,7 +21,7 @@ using HashJoinPtr = std::shared_ptr; * * When using, JOIN must be of the appropriate type (ANY|ALL LEFT|INNER ...). */ -class StorageJoin : public ext::shared_ptr_helper, public StorageSetOrJoinBase +class StorageJoin final : public ext::shared_ptr_helper, public StorageSetOrJoinBase { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/StorageLog.h b/dbms/src/Storages/StorageLog.h index 8bec52e0c64..79d7c960004 100644 --- a/dbms/src/Storages/StorageLog.h +++ b/dbms/src/Storages/StorageLog.h @@ -15,7 +15,7 @@ namespace DB /** Implements simple table engine without support of indices. * The data is stored in a compressed form. */ -class StorageLog : public ext::shared_ptr_helper, public IStorage +class StorageLog final : public ext::shared_ptr_helper, public IStorage { friend class LogSource; friend class LogBlockOutputStream; diff --git a/dbms/src/Storages/StorageMaterializedView.h b/dbms/src/Storages/StorageMaterializedView.h index aa312e184aa..6951f11038e 100644 --- a/dbms/src/Storages/StorageMaterializedView.h +++ b/dbms/src/Storages/StorageMaterializedView.h @@ -12,7 +12,7 @@ namespace DB { -class StorageMaterializedView : public ext::shared_ptr_helper, public IStorage +class StorageMaterializedView final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/StorageMemory.h b/dbms/src/Storages/StorageMemory.h index 2d237918b7c..2515f948b4c 100644 --- a/dbms/src/Storages/StorageMemory.h +++ b/dbms/src/Storages/StorageMemory.h @@ -17,7 +17,7 @@ namespace DB * It does not support keys. * Data is stored as a set of blocks and is not stored anywhere else. */ -class StorageMemory : public ext::shared_ptr_helper, public IStorage +class StorageMemory final : public ext::shared_ptr_helper, public IStorage { friend class MemoryBlockInputStream; friend class MemoryBlockOutputStream; diff --git a/dbms/src/Storages/StorageMerge.h b/dbms/src/Storages/StorageMerge.h index c09b7c108e7..1b7bb2fabe8 100644 --- a/dbms/src/Storages/StorageMerge.h +++ b/dbms/src/Storages/StorageMerge.h @@ -13,7 +13,7 @@ namespace DB /** A table that represents the union of an arbitrary number of other tables. * All tables must have the same structure. 
*/ -class StorageMerge : public ext::shared_ptr_helper, public IStorage +class StorageMerge final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/StorageMergeTree.cpp b/dbms/src/Storages/StorageMergeTree.cpp index aa4566fef37..5e67d48d1b5 100644 --- a/dbms/src/Storages/StorageMergeTree.cpp +++ b/dbms/src/Storages/StorageMergeTree.cpp @@ -21,8 +21,6 @@ #include #include #include -#include -#include #include #include #include @@ -95,8 +93,7 @@ void StorageMergeTree::startup() /// NOTE background task will also do the above cleanups periodically. time_after_previous_cleanup.restart(); - if (!getSettings()->disable_background_merges) - merging_mutating_task_handle = global_context.getBackgroundPool().addTask([this] { return mergeMutateTask(); }); + merging_mutating_task_handle = global_context.getBackgroundPool().addTask([this] { return mergeMutateTask(); }); if (areBackgroundMovesNeeded()) moving_task_handle = global_context.getBackgroundMovePool().addTask([this] { return movePartsTask(); }); } @@ -1241,7 +1238,7 @@ void StorageMergeTree::movePartitionToTable(const StoragePtr & dest_table, const dest_table_storage->getStorageID().getNameForLogs() + ": " + dest_table_storage->getStoragePolicy()->getName(), ErrorCodes::LOGICAL_ERROR); Stopwatch watch; - MergeTreeData & src_data = dest_table_storage->checkStructureAndGetMergeTreeData(this); + MergeTreeData & src_data = dest_table_storage->checkStructureAndGetMergeTreeData(*this); String partition_id = getPartitionIDFromQuery(partition, context); DataPartsVector src_parts = src_data.getDataPartsVectorInPartition(MergeTreeDataPartState::Committed, partition_id); diff --git a/dbms/src/Storages/StorageMergeTree.h b/dbms/src/Storages/StorageMergeTree.h index 33d48cf1a85..29004a17d32 100644 --- a/dbms/src/Storages/StorageMergeTree.h +++ b/dbms/src/Storages/StorageMergeTree.h @@ -23,7 +23,7 @@ namespace DB /** See the description of the data structure in MergeTreeData. */ -class StorageMergeTree : public ext::shared_ptr_helper, public MergeTreeData +class StorageMergeTree final : public ext::shared_ptr_helper, public MergeTreeData { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/StorageMySQL.cpp b/dbms/src/Storages/StorageMySQL.cpp index 1991215ad12..84b554115a6 100644 --- a/dbms/src/Storages/StorageMySQL.cpp +++ b/dbms/src/Storages/StorageMySQL.cpp @@ -175,7 +175,7 @@ public: return splitted_blocks; } - std::string dumpNamesWithBackQuote(const Block & block) const + static std::string dumpNamesWithBackQuote(const Block & block) { WriteBufferFromOwnString out; for (auto it = block.begin(); it != block.end(); ++it) @@ -187,7 +187,6 @@ public: return out.str(); } - private: const StorageMySQL & storage; std::string remote_database_name; diff --git a/dbms/src/Storages/StorageMySQL.h b/dbms/src/Storages/StorageMySQL.h index 41691316500..3bde1cab4c7 100644 --- a/dbms/src/Storages/StorageMySQL.h +++ b/dbms/src/Storages/StorageMySQL.h @@ -17,7 +17,7 @@ namespace DB * Use ENGINE = mysql(host_port, database_name, table_name, user_name, password) * Read only. 
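movePartitionToTable above now calls checkStructureAndGetMergeTreeData(*this) instead of passing this; the same change appears in the StorageReplicatedMergeTree hunk further down. Passing by reference encodes "never null" in the signature instead of leaving it as a convention around a raw pointer. A hedged sketch with a hypothetical MergeTreeLike type:

#include <stdexcept>

struct MergeTreeLike { int format_version = 1; };

// Before: MergeTreeLike & checkStructureAndGet(const MergeTreeLike * source);
//         every caller could, in principle, hand in nullptr.
// After: the parameter is a reference, so a null source is unrepresentable.
MergeTreeLike & checkStructureAndGet(MergeTreeLike & source, const MergeTreeLike & destination)
{
    if (source.format_version != destination.format_version)
        throw std::runtime_error("Tables have different format versions");
    return source;
}

int main()
{
    MergeTreeLike src, dst;
    checkStructureAndGet(src, dst);   // a call with nullptr is no longer expressible
}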
*/ -class StorageMySQL : public ext::shared_ptr_helper, public IStorage +class StorageMySQL final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/StorageNull.h b/dbms/src/Storages/StorageNull.h index e62a4152d70..f34b9479a03 100644 --- a/dbms/src/Storages/StorageNull.h +++ b/dbms/src/Storages/StorageNull.h @@ -16,7 +16,7 @@ namespace DB /** When writing, does nothing. * When reading, returns nothing. */ -class StorageNull : public ext::shared_ptr_helper, public IStorage +class StorageNull final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.cpp b/dbms/src/Storages/StorageReplicatedMergeTree.cpp index 1e476945650..c4c34c3b793 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp +++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp @@ -253,7 +253,6 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( metadata_version = metadata_stat.version; createReplica(); - } else { @@ -299,7 +298,6 @@ void StorageReplicatedMergeTree::waitMutationToFinishOnReplicas( zkutil::EventPtr wait_event = std::make_shared(); - std::set inactive_replicas; for (const String & replica : replicas) { @@ -4518,18 +4516,18 @@ void StorageReplicatedMergeTree::mutate(const MutationCommands & commands, const void StorageReplicatedMergeTree::waitMutation(const String & znode_name, size_t mutations_sync) const { - auto zookeeper = getZooKeeper(); - /// we have to wait - if (mutations_sync != 0) - { - Strings replicas; - if (mutations_sync == 2) /// wait for all replicas - replicas = zookeeper->getChildren(zookeeper_path + "/replicas"); - else if (mutations_sync == 1) /// just wait for ourself - replicas.push_back(replica_name); + if (!mutations_sync) + return; - waitMutationToFinishOnReplicas(replicas, znode_name); - } + /// we have to wait + auto zookeeper = getZooKeeper(); + Strings replicas; + if (mutations_sync == 2) /// wait for all replicas + replicas = zookeeper->getChildren(zookeeper_path + "/replicas"); + else if (mutations_sync == 1) /// just wait for ourself + replicas.push_back(replica_name); + + waitMutationToFinishOnReplicas(replicas, znode_name); } std::vector StorageReplicatedMergeTree::getMutationsStatus() const @@ -5051,7 +5049,7 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta getStorageID().getNameForLogs() + ": " + dest_table_storage->getStoragePolicy()->getName(), ErrorCodes::LOGICAL_ERROR); Stopwatch watch; - MergeTreeData & src_data = dest_table_storage->checkStructureAndGetMergeTreeData(this); + MergeTreeData & src_data = dest_table_storage->checkStructureAndGetMergeTreeData(*this); auto src_data_id = src_data.getStorageID(); String partition_id = getPartitionIDFromQuery(partition, context); diff --git a/dbms/src/Storages/StorageReplicatedMergeTree.h b/dbms/src/Storages/StorageReplicatedMergeTree.h index 928938ccfc6..5589ff1e2b1 100644 --- a/dbms/src/Storages/StorageReplicatedMergeTree.h +++ b/dbms/src/Storages/StorageReplicatedMergeTree.h @@ -73,7 +73,7 @@ namespace DB * as the time will take the time of creation the appropriate part on any of the replicas. 
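waitMutation above is reshaped into a guard clause: return immediately when mutations_sync is 0, and only then acquire the ZooKeeper session and build the replica list. The control-flow shape, sketched with hypothetical stand-ins for the ZooKeeper client and the wait call:

#include <string>
#include <vector>

struct FakeZooKeeper
{
    std::vector<std::string> getChildren(const std::string &) const { return {"replica_1", "replica_2"}; }
};

void waitOnReplicas(const std::vector<std::string> & /*replicas*/, const std::string & /*znode_name*/) {}

void waitMutation(const std::string & znode_name, size_t mutations_sync, const std::string & replica_name)
{
    if (!mutations_sync)           // guard clause: nothing to wait for, no session needed
        return;

    FakeZooKeeper zookeeper;       // acquired only on the path that actually waits
    std::vector<std::string> replicas;
    if (mutations_sync == 2)       // wait for all replicas
        replicas = zookeeper.getChildren("/replicas");
    else if (mutations_sync == 1)  // wait only for ourselves
        replicas.push_back(replica_name);

    waitOnReplicas(replicas, znode_name);
}

int main()
{
    waitMutation("0000000001", 1, "replica_1");
}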
*/ -class StorageReplicatedMergeTree : public ext::shared_ptr_helper, public MergeTreeData +class StorageReplicatedMergeTree final : public ext::shared_ptr_helper, public MergeTreeData { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/StorageS3.h b/dbms/src/Storages/StorageS3.h index 21b25f65607..a2acab7734b 100644 --- a/dbms/src/Storages/StorageS3.h +++ b/dbms/src/Storages/StorageS3.h @@ -22,7 +22,7 @@ namespace DB * It sends HTTP GET to server when select is called and * HTTP PUT when insert is called. */ -class StorageS3 : public ext::shared_ptr_helper, public IStorage +class StorageS3 final : public ext::shared_ptr_helper, public IStorage { public: StorageS3(const S3::URI & uri, diff --git a/dbms/src/Storages/StorageSet.h b/dbms/src/Storages/StorageSet.h index 41e2e9ccdd6..39c1e3b7952 100644 --- a/dbms/src/Storages/StorageSet.h +++ b/dbms/src/Storages/StorageSet.h @@ -57,7 +57,7 @@ private: * and also written to a file-backup, for recovery after a restart. * Reading from the table is not possible directly - it is possible to specify only the right part of the IN statement. */ -class StorageSet : public ext::shared_ptr_helper, public StorageSetOrJoinBase +class StorageSet final : public ext::shared_ptr_helper, public StorageSetOrJoinBase { friend struct ext::shared_ptr_helper; diff --git a/dbms/src/Storages/StorageStripeLog.h b/dbms/src/Storages/StorageStripeLog.h index ad43fa3a0d3..2ca4f69e742 100644 --- a/dbms/src/Storages/StorageStripeLog.h +++ b/dbms/src/Storages/StorageStripeLog.h @@ -16,7 +16,7 @@ namespace DB /** Implements a table engine that is suitable for small chunks of the log. * In doing so, stores all the columns in a single Native file, with a nearby index. */ -class StorageStripeLog : public ext::shared_ptr_helper, public IStorage +class StorageStripeLog final : public ext::shared_ptr_helper, public IStorage { friend class StripeLogSource; friend class StripeLogBlockOutputStream; diff --git a/dbms/src/Storages/StorageTinyLog.h b/dbms/src/Storages/StorageTinyLog.h index b1599a0ebc0..42531a4c75e 100644 --- a/dbms/src/Storages/StorageTinyLog.h +++ b/dbms/src/Storages/StorageTinyLog.h @@ -15,7 +15,7 @@ namespace DB /** Implements a table engine that is suitable for small chunks of the log. * It differs from StorageLog in the absence of mark files. 
*/ -class StorageTinyLog : public ext::shared_ptr_helper, public IStorage +class StorageTinyLog final : public ext::shared_ptr_helper, public IStorage { friend class TinyLogSource; friend class TinyLogBlockOutputStream; diff --git a/dbms/src/Storages/StorageURL.h b/dbms/src/Storages/StorageURL.h index 805cda3dbad..0ff84bc4d37 100644 --- a/dbms/src/Storages/StorageURL.h +++ b/dbms/src/Storages/StorageURL.h @@ -63,7 +63,7 @@ private: }; -class StorageURL : public ext::shared_ptr_helper, public IStorageURLBase +class StorageURL final : public ext::shared_ptr_helper, public IStorageURLBase { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/StorageValues.h b/dbms/src/Storages/StorageValues.h index f06a7d7388e..1932b2fa37e 100644 --- a/dbms/src/Storages/StorageValues.h +++ b/dbms/src/Storages/StorageValues.h @@ -9,7 +9,7 @@ namespace DB /* One block storage used for values table function * It's structure is similar to IStorageSystemOneBlock */ -class StorageValues : public ext::shared_ptr_helper, public IStorage +class StorageValues final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/StorageView.cpp b/dbms/src/Storages/StorageView.cpp index e99bbcb843a..05feeb7d786 100644 --- a/dbms/src/Storages/StorageView.cpp +++ b/dbms/src/Storages/StorageView.cpp @@ -91,6 +91,28 @@ ASTPtr StorageView::getRuntimeViewQuery(const ASTSelectQuery & outer_query, cons } +static void replaceTableNameWithSubquery(ASTSelectQuery * select_query, ASTPtr & subquery) +{ + auto * select_element = select_query->tables()->children[0]->as(); + + if (!select_element->table_expression) + throw Exception("Logical error: incorrect table expression", ErrorCodes::LOGICAL_ERROR); + + auto * table_expression = select_element->table_expression->as(); + + if (!table_expression->database_and_table_name) + throw Exception("Logical error: incorrect table expression", ErrorCodes::LOGICAL_ERROR); + + const auto alias = table_expression->database_and_table_name->tryGetAlias(); + table_expression->database_and_table_name = {}; + table_expression->subquery = std::make_shared(); + table_expression->subquery->children.push_back(subquery); + table_expression->children.push_back(table_expression->subquery); + if (!alias.empty()) + table_expression->subquery->setAlias(alias); +} + + ASTPtr StorageView::getRuntimeViewQuery(ASTSelectQuery * outer_query, const Context & context, bool normalize) { auto runtime_view_query = inner_query->clone(); @@ -117,28 +139,6 @@ ASTPtr StorageView::getRuntimeViewQuery(ASTSelectQuery * outer_query, const Cont return runtime_view_query; } -void StorageView::replaceTableNameWithSubquery(ASTSelectQuery * select_query, ASTPtr & subquery) -{ - auto * select_element = select_query->tables()->children[0]->as(); - - if (!select_element->table_expression) - throw Exception("Logical error: incorrect table expression", ErrorCodes::LOGICAL_ERROR); - - auto * table_expression = select_element->table_expression->as(); - - if (!table_expression->database_and_table_name) - throw Exception("Logical error: incorrect table expression", ErrorCodes::LOGICAL_ERROR); - - const auto alias = table_expression->database_and_table_name->tryGetAlias(); - table_expression->database_and_table_name = {}; - table_expression->subquery = std::make_shared(); - table_expression->subquery->children.push_back(subquery); - table_expression->children.push_back(table_expression->subquery); - if (!alias.empty()) - 
table_expression->subquery->setAlias(alias); -} - - void registerStorageView(StorageFactory & factory) { factory.registerStorage("View", [](const StorageFactory::Arguments & args) diff --git a/dbms/src/Storages/StorageView.h b/dbms/src/Storages/StorageView.h index 0e1e90ff612..86550db83ce 100644 --- a/dbms/src/Storages/StorageView.h +++ b/dbms/src/Storages/StorageView.h @@ -10,7 +10,7 @@ namespace DB { -class StorageView : public ext::shared_ptr_helper, public IStorage +class StorageView final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: @@ -36,8 +36,6 @@ public: private: ASTPtr inner_query; - void replaceTableNameWithSubquery(ASTSelectQuery * select_query, ASTPtr & subquery); - protected: StorageView( const StorageID & table_id_, diff --git a/dbms/src/Storages/System/StorageSystemAggregateFunctionCombinators.h b/dbms/src/Storages/System/StorageSystemAggregateFunctionCombinators.h index 86585f0ea73..8d204020160 100644 --- a/dbms/src/Storages/System/StorageSystemAggregateFunctionCombinators.h +++ b/dbms/src/Storages/System/StorageSystemAggregateFunctionCombinators.h @@ -7,7 +7,7 @@ namespace DB { -class StorageSystemAggregateFunctionCombinators : public ext::shared_ptr_helper, +class StorageSystemAggregateFunctionCombinators final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; diff --git a/dbms/src/Storages/System/StorageSystemAsynchronousMetrics.h b/dbms/src/Storages/System/StorageSystemAsynchronousMetrics.h index 77f247f89eb..f31450fe086 100644 --- a/dbms/src/Storages/System/StorageSystemAsynchronousMetrics.h +++ b/dbms/src/Storages/System/StorageSystemAsynchronousMetrics.h @@ -12,7 +12,7 @@ class Context; /** Implements system table asynchronous_metrics, which allows to get values of periodically (asynchronously) updated metrics. */ -class StorageSystemAsynchronousMetrics : public ext::shared_ptr_helper, +class StorageSystemAsynchronousMetrics final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; diff --git a/dbms/src/Storages/System/StorageSystemBuildOptions.h b/dbms/src/Storages/System/StorageSystemBuildOptions.h index d81682765a2..afd27f00bcc 100644 --- a/dbms/src/Storages/System/StorageSystemBuildOptions.h +++ b/dbms/src/Storages/System/StorageSystemBuildOptions.h @@ -12,7 +12,7 @@ class Context; /** System table "build_options" with many params used for clickhouse building */ -class StorageSystemBuildOptions : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemBuildOptions final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; protected: diff --git a/dbms/src/Storages/System/StorageSystemClusters.h b/dbms/src/Storages/System/StorageSystemClusters.h index 8efb148cdb9..4cda7c372b2 100644 --- a/dbms/src/Storages/System/StorageSystemClusters.h +++ b/dbms/src/Storages/System/StorageSystemClusters.h @@ -15,7 +15,7 @@ class Context; * that allows to obtain information about available clusters * (which may be specified in Distributed tables). 
*/ -class StorageSystemClusters : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemClusters final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemCollations.h b/dbms/src/Storages/System/StorageSystemCollations.h index eccce154047..133acd937a1 100644 --- a/dbms/src/Storages/System/StorageSystemCollations.h +++ b/dbms/src/Storages/System/StorageSystemCollations.h @@ -5,7 +5,7 @@ namespace DB { -class StorageSystemCollations : public ext::shared_ptr_helper, +class StorageSystemCollations final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; diff --git a/dbms/src/Storages/System/StorageSystemColumns.h b/dbms/src/Storages/System/StorageSystemColumns.h index 750a1cd93c8..66b423efb96 100644 --- a/dbms/src/Storages/System/StorageSystemColumns.h +++ b/dbms/src/Storages/System/StorageSystemColumns.h @@ -11,7 +11,7 @@ class Context; /** Implements system table 'columns', that allows to get information about columns for every table. */ -class StorageSystemColumns : public ext::shared_ptr_helper, public IStorage +class StorageSystemColumns final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemContributors.h b/dbms/src/Storages/System/StorageSystemContributors.h index 4fc91ce1fbd..0fd77f78655 100644 --- a/dbms/src/Storages/System/StorageSystemContributors.h +++ b/dbms/src/Storages/System/StorageSystemContributors.h @@ -11,7 +11,7 @@ class Context; /** System table "contributors" with list of clickhouse contributors */ -class StorageSystemContributors : public ext::shared_ptr_helper, +class StorageSystemContributors final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; diff --git a/dbms/src/Storages/System/StorageSystemDataTypeFamilies.h b/dbms/src/Storages/System/StorageSystemDataTypeFamilies.h index 44dd59d471f..2a89ae77647 100644 --- a/dbms/src/Storages/System/StorageSystemDataTypeFamilies.h +++ b/dbms/src/Storages/System/StorageSystemDataTypeFamilies.h @@ -6,7 +6,7 @@ namespace DB { -class StorageSystemDataTypeFamilies : public ext::shared_ptr_helper, +class StorageSystemDataTypeFamilies final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; diff --git a/dbms/src/Storages/System/StorageSystemDatabases.h b/dbms/src/Storages/System/StorageSystemDatabases.h index 957fcae2707..fe517c0f651 100644 --- a/dbms/src/Storages/System/StorageSystemDatabases.h +++ b/dbms/src/Storages/System/StorageSystemDatabases.h @@ -12,7 +12,7 @@ class Context; /** Implements `databases` system table, which allows you to get information about all databases. 
*/ -class StorageSystemDatabases : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemDatabases final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemDetachedParts.cpp b/dbms/src/Storages/System/StorageSystemDetachedParts.cpp index aa398a99bab..c64fe7b52b6 100644 --- a/dbms/src/Storages/System/StorageSystemDetachedParts.cpp +++ b/dbms/src/Storages/System/StorageSystemDetachedParts.cpp @@ -18,7 +18,7 @@ namespace DB * We don't use StorageSystemPartsBase, because it introduces virtual _state * column and column aliases which we don't need. */ -class StorageSystemDetachedParts : +class StorageSystemDetachedParts final : public ext::shared_ptr_helper, public IStorage { diff --git a/dbms/src/Storages/System/StorageSystemDictionaries.h b/dbms/src/Storages/System/StorageSystemDictionaries.h index 6b28f03d917..5139ce3c5f6 100644 --- a/dbms/src/Storages/System/StorageSystemDictionaries.h +++ b/dbms/src/Storages/System/StorageSystemDictionaries.h @@ -10,7 +10,7 @@ namespace DB class Context; -class StorageSystemDictionaries : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemDictionaries final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemDisks.h b/dbms/src/Storages/System/StorageSystemDisks.h index 2b910478967..b136a217508 100644 --- a/dbms/src/Storages/System/StorageSystemDisks.h +++ b/dbms/src/Storages/System/StorageSystemDisks.h @@ -14,7 +14,7 @@ class Context; /** Implements the system table `disks`, which allows you to get information about all disks. */ -class StorageSystemDisks : public ext::shared_ptr_helper, public IStorage +class StorageSystemDisks final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemEvents.h b/dbms/src/Storages/System/StorageSystemEvents.h index 88753d3ea4f..f1687e42233 100644 --- a/dbms/src/Storages/System/StorageSystemEvents.h +++ b/dbms/src/Storages/System/StorageSystemEvents.h @@ -11,7 +11,7 @@ class Context; /** Implements `events` system table, which allows you to obtain information for profiling. */ -class StorageSystemEvents : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemEvents final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemFormats.h b/dbms/src/Storages/System/StorageSystemFormats.h index 61bd9ebeb8f..f90839e44e9 100644 --- a/dbms/src/Storages/System/StorageSystemFormats.h +++ b/dbms/src/Storages/System/StorageSystemFormats.h @@ -5,7 +5,7 @@ namespace DB { -class StorageSystemFormats : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemFormats final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; protected: diff --git a/dbms/src/Storages/System/StorageSystemFunctions.h b/dbms/src/Storages/System/StorageSystemFunctions.h index 1ae4483583f..f62d731f288 100644 --- a/dbms/src/Storages/System/StorageSystemFunctions.h +++ b/dbms/src/Storages/System/StorageSystemFunctions.h @@ -13,7 +13,7 @@ class Context; /** Implements `functions`system table, which allows you to get a list * all normal and aggregate functions. 
*/ -class StorageSystemFunctions : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemFunctions final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemGraphite.cpp b/dbms/src/Storages/System/StorageSystemGraphite.cpp index b5023cff7c3..a6163f89182 100644 --- a/dbms/src/Storages/System/StorageSystemGraphite.cpp +++ b/dbms/src/Storages/System/StorageSystemGraphite.cpp @@ -25,10 +25,10 @@ NamesAndTypesList StorageSystemGraphite::getNamesAndTypes() /* * Looking for (Replicated)*GraphiteMergeTree and get all configuration parameters for them */ -StorageSystemGraphite::Configs StorageSystemGraphite::getConfigs(const Context & context) const +static StorageSystemGraphite::Configs getConfigs(const Context & context) { const Databases databases = DatabaseCatalog::instance().getDatabases(); - Configs graphite_configs; + StorageSystemGraphite::Configs graphite_configs; for (const auto & db : databases) { @@ -51,7 +51,7 @@ StorageSystemGraphite::Configs StorageSystemGraphite::getConfigs(const Context & auto table_id = table_data->getStorageID(); if (!graphite_configs.count(config_name)) { - Config new_config = + StorageSystemGraphite::Config new_config = { table_data->merging_params.graphite_params, { table_id.database_name }, @@ -73,7 +73,7 @@ StorageSystemGraphite::Configs StorageSystemGraphite::getConfigs(const Context & void StorageSystemGraphite::fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo &) const { - Configs graphite_configs = StorageSystemGraphite::getConfigs(context); + Configs graphite_configs = getConfigs(context); for (const auto & config : graphite_configs) { diff --git a/dbms/src/Storages/System/StorageSystemGraphite.h b/dbms/src/Storages/System/StorageSystemGraphite.h index 5ba7b7bda8c..703db41dc39 100644 --- a/dbms/src/Storages/System/StorageSystemGraphite.h +++ b/dbms/src/Storages/System/StorageSystemGraphite.h @@ -11,7 +11,7 @@ namespace DB { /// Provides information about Graphite configuration. -class StorageSystemGraphite : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemGraphite final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; public: @@ -33,7 +33,6 @@ protected: using IStorageSystemOneBlock::IStorageSystemOneBlock; void fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo & query_info) const override; - StorageSystemGraphite::Configs getConfigs(const Context & context) const; }; } diff --git a/dbms/src/Storages/System/StorageSystemMacros.h b/dbms/src/Storages/System/StorageSystemMacros.h index 09cc8e3a2d9..52336bd6f69 100644 --- a/dbms/src/Storages/System/StorageSystemMacros.h +++ b/dbms/src/Storages/System/StorageSystemMacros.h @@ -13,7 +13,7 @@ class Context; /** Information about macros for introspection. 
*/ -class StorageSystemMacros : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemMacros final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemMergeTreeSettings.h b/dbms/src/Storages/System/StorageSystemMergeTreeSettings.h index cb3668f13ec..ac4d9d27505 100644 --- a/dbms/src/Storages/System/StorageSystemMergeTreeSettings.h +++ b/dbms/src/Storages/System/StorageSystemMergeTreeSettings.h @@ -13,7 +13,7 @@ class Context; /** implements system table "merge_tree_settings", which allows to get information about the current MergeTree settings. */ -class SystemMergeTreeSettings : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class SystemMergeTreeSettings final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemMerges.h b/dbms/src/Storages/System/StorageSystemMerges.h index 6d970b8797f..81c03c4e397 100644 --- a/dbms/src/Storages/System/StorageSystemMerges.h +++ b/dbms/src/Storages/System/StorageSystemMerges.h @@ -13,7 +13,7 @@ namespace DB class Context; -class StorageSystemMerges : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemMerges final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemMetrics.h b/dbms/src/Storages/System/StorageSystemMetrics.h index 86d95800a31..c47bcea656f 100644 --- a/dbms/src/Storages/System/StorageSystemMetrics.h +++ b/dbms/src/Storages/System/StorageSystemMetrics.h @@ -12,7 +12,7 @@ class Context; /** Implements `metrics` system table, which provides information about the operation of the server. */ -class StorageSystemMetrics : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemMetrics final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemModels.h b/dbms/src/Storages/System/StorageSystemModels.h index 8bcb70da35e..cee5200e7de 100644 --- a/dbms/src/Storages/System/StorageSystemModels.h +++ b/dbms/src/Storages/System/StorageSystemModels.h @@ -10,7 +10,7 @@ namespace DB class Context; -class StorageSystemModels : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemModels final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemMutations.h b/dbms/src/Storages/System/StorageSystemMutations.h index 270cef13ef6..f7bc5f6f33c 100644 --- a/dbms/src/Storages/System/StorageSystemMutations.h +++ b/dbms/src/Storages/System/StorageSystemMutations.h @@ -12,7 +12,7 @@ class Context; /// Implements the `mutations` system table, which provides information about the status of mutations /// in the MergeTree tables. 
-class StorageSystemMutations : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemMutations final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemNumbers.cpp b/dbms/src/Storages/System/StorageSystemNumbers.cpp index 19ae1f1e21f..20dcc58f652 100644 --- a/dbms/src/Storages/System/StorageSystemNumbers.cpp +++ b/dbms/src/Storages/System/StorageSystemNumbers.cpp @@ -106,7 +106,7 @@ private: UInt64 block_size; UInt64 max_counter; - Block createHeader() const + static Block createHeader() { return { ColumnWithTypeAndName(ColumnUInt64::create(), std::make_shared(), "number") }; } diff --git a/dbms/src/Storages/System/StorageSystemNumbers.h b/dbms/src/Storages/System/StorageSystemNumbers.h index 8482e5eeca9..4d205728496 100644 --- a/dbms/src/Storages/System/StorageSystemNumbers.h +++ b/dbms/src/Storages/System/StorageSystemNumbers.h @@ -23,7 +23,7 @@ class Context; * In multithreaded case, if even_distributed is False, implementation with atomic is used, * and result is always in [0 ... limit - 1] range. */ -class StorageSystemNumbers : public ext::shared_ptr_helper, public IStorage +class StorageSystemNumbers final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemOne.h b/dbms/src/Storages/System/StorageSystemOne.h index 8ff0ff2e6b1..9fb3a670949 100644 --- a/dbms/src/Storages/System/StorageSystemOne.h +++ b/dbms/src/Storages/System/StorageSystemOne.h @@ -15,7 +15,7 @@ class Context; * Used when the table is not specified in the query. * Analog of the DUAL table in Oracle and MySQL. */ -class StorageSystemOne : public ext::shared_ptr_helper, public IStorage +class StorageSystemOne final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemParts.h b/dbms/src/Storages/System/StorageSystemParts.h index f7f58daad2e..06f4f414c88 100644 --- a/dbms/src/Storages/System/StorageSystemParts.h +++ b/dbms/src/Storages/System/StorageSystemParts.h @@ -12,7 +12,7 @@ class Context; /** Implements system table 'parts' which allows to get information about data parts for tables of MergeTree family. */ -class StorageSystemParts : public ext::shared_ptr_helper, public StorageSystemPartsBase +class StorageSystemParts final : public ext::shared_ptr_helper, public StorageSystemPartsBase { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemPartsColumns.h b/dbms/src/Storages/System/StorageSystemPartsColumns.h index 1ee90c7cde3..596fd7dae45 100644 --- a/dbms/src/Storages/System/StorageSystemPartsColumns.h +++ b/dbms/src/Storages/System/StorageSystemPartsColumns.h @@ -13,7 +13,7 @@ class Context; /** Implements system table 'parts_columns' which allows to get information about * columns in data parts for tables of MergeTree family. 
*/ -class StorageSystemPartsColumns +class StorageSystemPartsColumns final : public ext::shared_ptr_helper, public StorageSystemPartsBase { friend struct ext::shared_ptr_helper; diff --git a/dbms/src/Storages/System/StorageSystemProcesses.h b/dbms/src/Storages/System/StorageSystemProcesses.h index 735315115c4..62c568970e7 100644 --- a/dbms/src/Storages/System/StorageSystemProcesses.h +++ b/dbms/src/Storages/System/StorageSystemProcesses.h @@ -12,7 +12,7 @@ class Context; /** Implements `processes` system table, which allows you to get information about the queries that are currently executing. */ -class StorageSystemProcesses : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemProcesses final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemQuotaUsage.h b/dbms/src/Storages/System/StorageSystemQuotaUsage.h index f2151b27612..d410decf394 100644 --- a/dbms/src/Storages/System/StorageSystemQuotaUsage.h +++ b/dbms/src/Storages/System/StorageSystemQuotaUsage.h @@ -13,7 +13,7 @@ class Context; /** Implements the `quota_usage` system tables, which allows you to get information about * how the quotas are used by all users. */ -class StorageSystemQuotaUsage : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemQuotaUsage final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { public: std::string getName() const override { return "SystemQuotaUsage"; } diff --git a/dbms/src/Storages/System/StorageSystemQuotas.h b/dbms/src/Storages/System/StorageSystemQuotas.h index 0f54f193654..dda188c7ab7 100644 --- a/dbms/src/Storages/System/StorageSystemQuotas.h +++ b/dbms/src/Storages/System/StorageSystemQuotas.h @@ -12,7 +12,7 @@ class Context; /** Implements the `quotas` system tables, which allows you to get information about quotas. */ -class StorageSystemQuotas : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemQuotas final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { public: std::string getName() const override { return "SystemQuotas"; } diff --git a/dbms/src/Storages/System/StorageSystemReplicas.h b/dbms/src/Storages/System/StorageSystemReplicas.h index 7e75bd4c676..94b0d6c9d06 100644 --- a/dbms/src/Storages/System/StorageSystemReplicas.h +++ b/dbms/src/Storages/System/StorageSystemReplicas.h @@ -12,7 +12,7 @@ class Context; /** Implements `replicas` system table, which provides information about the status of the replicated tables. */ -class StorageSystemReplicas : public ext::shared_ptr_helper, public IStorage +class StorageSystemReplicas final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemReplicationQueue.h b/dbms/src/Storages/System/StorageSystemReplicationQueue.h index 79e015250da..36841fb9be9 100644 --- a/dbms/src/Storages/System/StorageSystemReplicationQueue.h +++ b/dbms/src/Storages/System/StorageSystemReplicationQueue.h @@ -12,7 +12,7 @@ class Context; /** Implements the `replication_queue` system table, which allows you to view the replication queues for the replicated tables. 
*/ -class StorageSystemReplicationQueue : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemReplicationQueue final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemRowPolicies.h b/dbms/src/Storages/System/StorageSystemRowPolicies.h index c28342eb18c..b81020b421c 100644 --- a/dbms/src/Storages/System/StorageSystemRowPolicies.h +++ b/dbms/src/Storages/System/StorageSystemRowPolicies.h @@ -11,7 +11,7 @@ class Context; /// Implements `row_policies` system table, which allows you to get information about row policies. -class StorageSystemRowPolicies : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemRowPolicies final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { public: std::string getName() const override { return "SystemRowPolicies"; } diff --git a/dbms/src/Storages/System/StorageSystemSettings.h b/dbms/src/Storages/System/StorageSystemSettings.h index f745fe32dcf..6cb5e18e1d7 100644 --- a/dbms/src/Storages/System/StorageSystemSettings.h +++ b/dbms/src/Storages/System/StorageSystemSettings.h @@ -12,7 +12,7 @@ class Context; /** implements system table "settings", which allows to get information about the current settings. */ -class StorageSystemSettings : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemSettings final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemStackTrace.cpp b/dbms/src/Storages/System/StorageSystemStackTrace.cpp index b5dfe6560ff..9362a3566cd 100644 --- a/dbms/src/Storages/System/StorageSystemStackTrace.cpp +++ b/dbms/src/Storages/System/StorageSystemStackTrace.cpp @@ -64,7 +64,8 @@ namespace StringRef query_id = CurrentThread::getQueryId(); query_id_size = std::min(query_id.size, max_query_id_size); - memcpy(query_id_data, query_id.data, query_id_size); + if (query_id.data && query_id.size) + memcpy(query_id_data, query_id.data, query_id_size); int notification_num = info->si_value.sival_int; ssize_t res = ::write(notification_pipe.fds_rw[1], ¬ification_num, sizeof(notification_num)); diff --git a/dbms/src/Storages/System/StorageSystemStackTrace.h b/dbms/src/Storages/System/StorageSystemStackTrace.h index 4961d786f59..3f20b1e973c 100644 --- a/dbms/src/Storages/System/StorageSystemStackTrace.h +++ b/dbms/src/Storages/System/StorageSystemStackTrace.h @@ -16,7 +16,7 @@ class Context; /// Allows to introspect stack trace of all server threads. /// It acts like an embedded debugger. /// More than one instance of this table cannot be used. -class StorageSystemStackTrace : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemStackTrace final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemStoragePolicies.h b/dbms/src/Storages/System/StorageSystemStoragePolicies.h index 5f5474280d5..79e89863bf0 100644 --- a/dbms/src/Storages/System/StorageSystemStoragePolicies.h +++ b/dbms/src/Storages/System/StorageSystemStoragePolicies.h @@ -14,7 +14,7 @@ class Context; /** Implements the system table `storage`, which allows you to get information about all disks. 
*/ -class StorageSystemStoragePolicies : public ext::shared_ptr_helper, public IStorage +class StorageSystemStoragePolicies final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemTableEngines.h b/dbms/src/Storages/System/StorageSystemTableEngines.h index 0bbb03d3898..1c080c3040b 100644 --- a/dbms/src/Storages/System/StorageSystemTableEngines.h +++ b/dbms/src/Storages/System/StorageSystemTableEngines.h @@ -7,7 +7,7 @@ namespace DB { -class StorageSystemTableEngines : public ext::shared_ptr_helper, +class StorageSystemTableEngines final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; diff --git a/dbms/src/Storages/System/StorageSystemTableFunctions.h b/dbms/src/Storages/System/StorageSystemTableFunctions.h index 504ecda4afb..95e025b9881 100644 --- a/dbms/src/Storages/System/StorageSystemTableFunctions.h +++ b/dbms/src/Storages/System/StorageSystemTableFunctions.h @@ -6,7 +6,7 @@ namespace DB { -class StorageSystemTableFunctions : public ext::shared_ptr_helper, +class StorageSystemTableFunctions final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; diff --git a/dbms/src/Storages/System/StorageSystemTables.h b/dbms/src/Storages/System/StorageSystemTables.h index 9d65f471c61..bab3aef6e15 100644 --- a/dbms/src/Storages/System/StorageSystemTables.h +++ b/dbms/src/Storages/System/StorageSystemTables.h @@ -12,7 +12,7 @@ class Context; /** Implements the system table `tables`, which allows you to get information about all tables. */ -class StorageSystemTables : public ext::shared_ptr_helper, public IStorage +class StorageSystemTables final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemZeros.h b/dbms/src/Storages/System/StorageSystemZeros.h index 609bd16edd3..3768885d03d 100644 --- a/dbms/src/Storages/System/StorageSystemZeros.h +++ b/dbms/src/Storages/System/StorageSystemZeros.h @@ -14,7 +14,7 @@ namespace DB * You could also specify a limit (how many zeros to give). * If multithreaded is specified, zeros will be generated in several streams. */ -class StorageSystemZeros : public ext::shared_ptr_helper, public IStorage +class StorageSystemZeros final : public ext::shared_ptr_helper, public IStorage { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/Storages/System/StorageSystemZooKeeper.h b/dbms/src/Storages/System/StorageSystemZooKeeper.h index ac0ddd29d1f..06611f61dae 100644 --- a/dbms/src/Storages/System/StorageSystemZooKeeper.h +++ b/dbms/src/Storages/System/StorageSystemZooKeeper.h @@ -12,7 +12,7 @@ class Context; /** Implements `zookeeper` system table, which allows you to view the data in ZooKeeper for debugging purposes. 
*/ -class StorageSystemZooKeeper : public ext::shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemZooKeeper final : public ext::shared_ptr_helper, public IStorageSystemOneBlock { friend struct ext::shared_ptr_helper; public: diff --git a/dbms/src/TableFunctions/TableFunctionRemote.cpp b/dbms/src/TableFunctions/TableFunctionRemote.cpp index 802c3a66aa5..aff9d8c7dce 100644 --- a/dbms/src/TableFunctions/TableFunctionRemote.cpp +++ b/dbms/src/TableFunctions/TableFunctionRemote.cpp @@ -155,6 +155,7 @@ StoragePtr TableFunctionRemote::executeImpl(const ASTPtr & ast_function, const C std::vector shards = parseRemoteDescription(cluster_description, 0, cluster_description.size(), ',', max_addresses); std::vector> names; + names.reserve(shards.size()); for (const auto & shard : shards) names.push_back(parseRemoteDescription(shard, 0, shard.size(), '|', max_addresses)); diff --git a/dbms/src/TableFunctions/TableFunctionS3.cpp b/dbms/src/TableFunctions/TableFunctionS3.cpp index 04ca48df1df..73121d342e2 100644 --- a/dbms/src/TableFunctions/TableFunctionS3.cpp +++ b/dbms/src/TableFunctions/TableFunctionS3.cpp @@ -84,13 +84,23 @@ StoragePtr TableFunctionS3::getStorage( const ColumnsDescription & columns, Context & global_context, const std::string & table_name, - const String & compression_method) const + const String & compression_method) { Poco::URI uri (source); S3::URI s3_uri (uri); UInt64 min_upload_part_size = global_context.getSettingsRef().s3_min_upload_part_size; - return StorageS3::create(s3_uri, access_key_id, secret_access_key, StorageID(getDatabaseName(), table_name), format, min_upload_part_size, columns, ConstraintsDescription{}, global_context, compression_method); + return StorageS3::create( + s3_uri, + access_key_id, + secret_access_key, + StorageID(getDatabaseName(), table_name), + format, + min_upload_part_size, + columns, + ConstraintsDescription{}, + global_context, + compression_method); } void registerTableFunctionS3(TableFunctionFactory & factory) diff --git a/dbms/src/TableFunctions/TableFunctionS3.h b/dbms/src/TableFunctions/TableFunctionS3.h index 76d96e56c74..a49033da1b4 100644 --- a/dbms/src/TableFunctions/TableFunctionS3.h +++ b/dbms/src/TableFunctions/TableFunctionS3.h @@ -29,7 +29,7 @@ private: const Context & context, const std::string & table_name) const override; - StoragePtr getStorage( + static StoragePtr getStorage( const String & source, const String & access_key_id, const String & secret_access_key, @@ -37,7 +37,7 @@ private: const ColumnsDescription & columns, Context & global_context, const std::string & table_name, - const String & compression_method) const; + const String & compression_method); }; } diff --git a/dbms/tests/clickhouse-test b/dbms/tests/clickhouse-test index 5f987890aaa..df92b84f10c 100755 --- a/dbms/tests/clickhouse-test +++ b/dbms/tests/clickhouse-test @@ -523,7 +523,7 @@ if __name__ == '__main__': parser.add_argument('test', nargs='*', help='Optional test case name regex') parser.add_argument('-d', '--disabled', action='store_true', default=False, help='Also run disabled tests') parser.add_argument('--stop', action='store_true', default=None, dest='stop', help='Stop on network errors') - parser.add_argument('--order', default='desc', help='Run order (asc, desc, random)') + parser.add_argument('--order', default='desc', choices=['asc', 'desc', 'random'], help='Run order') parser.add_argument('--testname', action='store_true', default=None, dest='testname', help='Make query with test name before test run') 
parser.add_argument('--hung-check', action='store_true', default=False) parser.add_argument('--force-color', action='store_true', default=False) diff --git a/dbms/tests/integration/README.md b/dbms/tests/integration/README.md index 95001e3cfa7..64d8b29e35a 100644 --- a/dbms/tests/integration/README.md +++ b/dbms/tests/integration/README.md @@ -86,13 +86,23 @@ test_odbc_interaction/test.py ...... [100%] You can just open shell inside a container by overwritting the command: ./runner --command=bash +### Rebuilding the docker containers + +The main container used for integration tests lives in `docker/test/integration/Dockerfile`. Rebuild it with +``` +cd docker/test/integration +docker build -t yandex/clickhouse-integration-test . +``` + +The helper container used by the `runner` script is in `dbms/tests/integration/image/Dockerfile`. + ### Adding new tests To add new test named `foo`, create a directory `test_foo` with an empty `__init__.py` and a file named `test.py` containing tests in it. All functions with names starting with `test` will become test cases. `helpers` directory contains utilities for: -* Launching a ClickHouse cluster with or without ZooKeeper indocker containers. +* Launching a ClickHouse cluster with or without ZooKeeper in docker containers. * Sending queries to launched instances. * Introducing network failures such as severing network link between two instances. diff --git a/dbms/tests/integration/helpers/cluster.py b/dbms/tests/integration/helpers/cluster.py index 3537475ce96..51925083d56 100644 --- a/dbms/tests/integration/helpers/cluster.py +++ b/dbms/tests/integration/helpers/cluster.py @@ -13,6 +13,7 @@ import urllib import xml.dom.minidom import logging import docker +import pprint import psycopg2 import pymongo import pymysql @@ -302,6 +303,7 @@ class ClickHouseCluster: print "Can't connect to MySQL " + str(ex) time.sleep(0.5) + subprocess_call(['docker-compose', 'ps', '--services', '--all']) raise Exception("Cannot wait MySQL container") def wait_postgres_to_start(self, timeout=60): @@ -705,7 +707,16 @@ class ClickHouseInstance: output = output.decode('utf8') exit_code = self.docker_client.api.exec_inspect(exec_id)['ExitCode'] if exit_code: - raise Exception('Cmd "{}" failed! Return code {}. Output: {}'.format(' '.join(cmd), exit_code, output)) + container_info = self.docker_client.api.inspect_container(container.id) + image_id = container_info.get('Image') + image_info = self.docker_client.api.inspect_image(image_id) + print("Command failed in container {}: ".format(container.id)) + pprint.pprint(container_info) + print("") + print("Container {} uses image {}: ".format(container.id, image_id)) + pprint.pprint(image_info) + print("") + raise Exception('Cmd "{}" failed in container {}. Return code {}. 
Output: {}'.format(' '.join(cmd), container.id, exit_code, output)) return output def contains_in_log(self, substring): diff --git a/dbms/tests/integration/runner b/dbms/tests/integration/runner index 071df8b1fd0..cd148d1fe72 100755 --- a/dbms/tests/integration/runner +++ b/dbms/tests/integration/runner @@ -20,6 +20,11 @@ def check_args_and_update_paths(args): if not os.path.isabs(args.binary): args.binary = os.path.abspath(os.path.join(CURRENT_WORK_DIR, args.binary)) + if not args.bridge_binary: + args.bridge_binary = os.path.join(os.path.dirname(args.binary), 'clickhouse-odbc-bridge') + elif not os.path.isabs(args.bridge_binary): + args.bridge_binary = os.path.abspath(os.path.join(CURRENT_WORK_DIR, args.bridge_binary)) + if not os.path.isabs(args.configs_dir): args.configs_dir = os.path.abspath(os.path.join(CURRENT_WORK_DIR, args.configs_dir)) @@ -47,8 +52,8 @@ if __name__ == "__main__": parser.add_argument( "--bridge-binary", - default=os.environ.get("CLICKHOUSE_TESTS_ODBC_BRIDGE_BIN_PATH", "/usr/bin/clickhouse-odbc-bridge"), - help="Path to clickhouse-odbc-bridge binary") + default=os.environ.get("CLICKHOUSE_TESTS_ODBC_BRIDGE_BIN_PATH", ""), + help="Path to clickhouse-odbc-bridge binary. Defaults to clickhouse-odbc-bridge in the same dir as clickhouse.") parser.add_argument( "--configs-dir", @@ -106,5 +111,5 @@ if __name__ == "__main__": command=args.command ) - #print(cmd) + print("Running pytest container as: '" + cmd + "'.") subprocess.check_call(cmd, shell=True) diff --git a/dbms/tests/integration/test_dictionaries_ddl/test.py b/dbms/tests/integration/test_dictionaries_ddl/test.py index ad50a8c7daf..c5df6e68dd2 100644 --- a/dbms/tests/integration/test_dictionaries_ddl/test.py +++ b/dbms/tests/integration/test_dictionaries_ddl/test.py @@ -214,3 +214,32 @@ def test_file_dictionary_restrictions(started_cluster): node3.query("SELECT dictGetString('test.restricted_file_dictionary', 'value', toUInt64(1))") except QueryRuntimeException as ex: assert 'is not inside' in str(ex) + + +def test_dictionary_with_where(started_cluster): + mysql_conn = create_mysql_conn("root", "clickhouse", "localhost", 3308) + execute_mysql_query(mysql_conn, "CREATE DATABASE IF NOT EXISTS clickhouse") + execute_mysql_query(mysql_conn, "CREATE TABLE clickhouse.special_table (key_field1 int, value1 text, PRIMARY KEY (key_field1))") + execute_mysql_query(mysql_conn, "INSERT INTO clickhouse.special_table VALUES (1, 'abcabc'), (2, 'qweqwe')") + + node1.query(""" + CREATE DICTIONARY default.special_dict ( + key_field1 Int32, + value1 String DEFAULT 'xxx' + ) + PRIMARY KEY key_field1 + SOURCE(MYSQL( + USER 'root' + PASSWORD 'clickhouse' + DB 'clickhouse' + TABLE 'special_table' + REPLICA(PRIORITY 1 HOST 'mysql1' PORT 3306) + WHERE 'value1 = \\'qweqwe\\' OR value1 = \\'\\\\u3232\\'' + )) + LAYOUT(FLAT()) + LIFETIME(MIN 1 MAX 3) + """) + + node1.query("SYSTEM RELOAD DICTIONARY default.special_dict") + + assert node1.query("SELECT dictGetString('default.special_dict', 'value1', toUInt64(2))") == 'qweqwe\n' diff --git a/dbms/tests/integration/test_dictionaries_update_and_reload/test.py b/dbms/tests/integration/test_dictionaries_update_and_reload/test.py index 3fd7f2ac0c2..6f3100b5078 100644 --- a/dbms/tests/integration/test_dictionaries_update_and_reload/test.py +++ b/dbms/tests/integration/test_dictionaries_update_and_reload/test.py @@ -111,6 +111,10 @@ def test_reload_after_loading(started_cluster): assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "10\n" # Change the dictionaries' data. 
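The integration-test README changed above spells out the layout for a new test: a `test_foo` directory holding an empty `__init__.py` and a `test.py` whose `test*` functions become test cases. As a rough sketch only (directory name, instance name and query are placeholders, not part of this patch), such a file typically follows the `ClickHouseCluster` pattern used by the other tests in this diff:

```python
# dbms/tests/integration/test_foo/test.py
# (an empty __init__.py sits next to it; names here are illustrative)
import pytest

from helpers.cluster import ClickHouseCluster

cluster = ClickHouseCluster(__file__)
node = cluster.add_instance('node')


@pytest.fixture(scope="module")
def started_cluster():
    try:
        cluster.start()
        yield cluster
    finally:
        cluster.shutdown()


# Any function whose name starts with `test` is collected as a test case.
def test_foo(started_cluster):
    assert node.query("SELECT 1") == "1\n"
```

Such a suite is then launched through the `runner` script like the existing ones.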
+ # FIXME we sleep before this, because Poco 1.x has one-second granularity + # for mtime, and clickhouse will miss the update if we change the file too + # soon. Should probably be fixed by switching to use std::filesystem. + time.sleep(1) replace_in_file_in_container('/etc/clickhouse-server/config.d/executable.xml', '8', '81') replace_in_file_in_container('/etc/clickhouse-server/config.d/file.txt', '10', '101') @@ -124,6 +128,7 @@ def test_reload_after_loading(started_cluster): assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "101\n" # SYSTEM RELOAD DICTIONARIES reloads all loaded dictionaries. + time.sleep(1) # see the comment above replace_in_file_in_container('/etc/clickhouse-server/config.d/executable.xml', '81', '82') replace_in_file_in_container('/etc/clickhouse-server/config.d/file.txt', '101', '102') query("SYSTEM RELOAD DICTIONARIES") @@ -131,9 +136,11 @@ def test_reload_after_loading(started_cluster): assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "102\n" # Configuration files are reloaded and lifetimes are checked automatically once in 5 seconds. + # Wait slightly more, to be sure it did reload. + time.sleep(1) # see the comment above replace_in_file_in_container('/etc/clickhouse-server/config.d/executable.xml', '82', '83') replace_in_file_in_container('/etc/clickhouse-server/config.d/file.txt', '102', '103') - time.sleep(5) + time.sleep(7) assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "103\n" assert query("SELECT dictGetInt32('executable', 'a', toUInt64(7))") == "83\n" diff --git a/dbms/tests/integration/test_merge_tree_s3/configs/config.d/bg_processing_pool_conf.xml b/dbms/tests/integration/test_merge_tree_s3/configs/config.d/bg_processing_pool_conf.xml new file mode 100644 index 00000000000..a756c4434ea --- /dev/null +++ b/dbms/tests/integration/test_merge_tree_s3/configs/config.d/bg_processing_pool_conf.xml @@ -0,0 +1,5 @@ + + 0.5 + 0.5 + 0.5 + diff --git a/dbms/tests/integration/test_merge_tree_s3/configs/config.d/storage_conf.xml b/dbms/tests/integration/test_merge_tree_s3/configs/config.d/storage_conf.xml new file mode 100644 index 00000000000..5b292446c6b --- /dev/null +++ b/dbms/tests/integration/test_merge_tree_s3/configs/config.d/storage_conf.xml @@ -0,0 +1,28 @@ + + + + + s3 + http://minio1:9001/root/data/ + minio + minio123 + + + local + / + + + + + +
+ s3 +
+ + hdd + +
+
+
+
+
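The new `storage_conf.xml` added above is shown with its XML element names stripped, so only the values survive. Assuming the usual ClickHouse storage-configuration schema, with the two disks named `s3` and `hdd` (the names the tests below move partitions between) and a single overriding `default` policy, the file plausibly reads:

```xml
<yandex>
    <storage_configuration>
        <disks>
            <s3>
                <type>s3</type>
                <endpoint>http://minio1:9001/root/data/</endpoint>
                <access_key_id>minio</access_key_id>
                <secret_access_key>minio123</secret_access_key>
            </s3>
            <hdd>
                <type>local</type>
                <path>/</path>
            </hdd>
        </disks>
        <policies>
            <!-- policy and volume names are inferred, not taken from the patch -->
            <default>
                <volumes>
                    <main>
                        <disk>s3</disk>
                    </main>
                    <external>
                        <disk>hdd</disk>
                    </external>
                </volumes>
            </default>
        </policies>
    </storage_configuration>
</yandex>
```

With a layout along these lines the test table lands on the `s3` disk by default, which is consistent with `MOVE PARTITION ... TO DISK 'hdd'` being exercised later in `test_move_partition`.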
diff --git a/dbms/tests/integration/test_merge_tree_s3/configs/config.d/users.xml b/dbms/tests/integration/test_merge_tree_s3/configs/config.d/users.xml new file mode 100644 index 00000000000..a13b24b278d --- /dev/null +++ b/dbms/tests/integration/test_merge_tree_s3/configs/config.d/users.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/dbms/tests/integration/test_merge_tree_s3/configs/config.xml b/dbms/tests/integration/test_merge_tree_s3/configs/config.xml index 63b4d951eb7..24b7344df3a 100644 --- a/dbms/tests/integration/test_merge_tree_s3/configs/config.xml +++ b/dbms/tests/integration/test_merge_tree_s3/configs/config.xml @@ -1,25 +1,5 @@ - - trace - /var/log/clickhouse-server/clickhouse-server.log - /var/log/clickhouse-server/clickhouse-server.err.log - 1000M - 10 - - - - - - s3 - http://minio1:9001/root/data/ - minio - minio123 - - - - - 9000 127.0.0.1 diff --git a/dbms/tests/integration/test_merge_tree_s3/configs/users.xml b/dbms/tests/integration/test_merge_tree_s3/configs/users.xml deleted file mode 100644 index 6061af8e33d..00000000000 --- a/dbms/tests/integration/test_merge_tree_s3/configs/users.xml +++ /dev/null @@ -1,23 +0,0 @@ - - - - - - - - - - - - ::/0 - - default - default - - - - - - - - diff --git a/dbms/tests/integration/test_merge_tree_s3/test.py b/dbms/tests/integration/test_merge_tree_s3/test.py index c79745642a0..631d69911ff 100644 --- a/dbms/tests/integration/test_merge_tree_s3/test.py +++ b/dbms/tests/integration/test_merge_tree_s3/test.py @@ -1,6 +1,7 @@ import logging import random import string +import time import pytest from helpers.cluster import ClickHouseCluster @@ -36,56 +37,212 @@ def cluster(): cluster.shutdown() +FILES_OVERHEAD = 1 +FILES_OVERHEAD_PER_COLUMN = 2 # Data and mark files +FILES_OVERHEAD_PER_PART_WIDE = FILES_OVERHEAD_PER_COLUMN * 3 + 2 + 6 +FILES_OVERHEAD_PER_PART_COMPACT = 10 + + def random_string(length): letters = string.ascii_letters return ''.join(random.choice(letters) for i in range(length)) -def generate_values(date_str, count): - data = [[date_str, i, random_string(10)] for i in range(count)] +def generate_values(date_str, count, sign=1): + data = [[date_str, sign*(i + 1), random_string(10)] for i in range(count)] data.sort(key=lambda tup: tup[1]) return ",".join(["('{}',{},'{}')".format(x, y, z) for x, y, z in data]) -@pytest.mark.parametrize( - "min_rows_for_wide_part,files_overhead,files_per_part", - [ - (0, 1, 14), - (8192, 1, 10) - ] -) -def test_log_family_s3(cluster, min_rows_for_wide_part, files_overhead, files_per_part): +def create_table(cluster, additional_settings=None): node = cluster.instances["node"] - minio = cluster.minio_client - node.query( - """ + create_table_statement = """ CREATE TABLE s3_test( dt Date, - id UInt64, + id Int64, data String, INDEX min_max (id) TYPE minmax GRANULARITY 3 ) ENGINE=MergeTree() PARTITION BY dt ORDER BY (dt, id) - SETTINGS disable_background_merges='true', index_granularity=512, min_rows_for_wide_part={} + SETTINGS + old_parts_lifetime=0, index_granularity=512 """ - .format(min_rows_for_wide_part) - ) - assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == 1 - values1 = generate_values('2020-01-03', 4096) - node.query("INSERT INTO s3_test VALUES {}".format(values1)) - assert node.query("SELECT * FROM s3_test order by dt, id FORMAT Values") == values1 - assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == files_overhead + files_per_part + if additional_settings: + create_table_statement += "," + create_table_statement += additional_settings - values2 
= generate_values('2020-01-04', 4096) - node.query("INSERT INTO s3_test VALUES {}".format(values2)) - assert node.query("SELECT * FROM s3_test ORDER BY dt, id FORMAT Values") == values1 + "," + values2 - assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == files_overhead + 2 * files_per_part + node.query(create_table_statement) - assert node.query("SELECT count(*) FROM s3_test where id = 0 FORMAT Values") == "(2)" + +@pytest.fixture(autouse=True) +def drop_table(cluster): + yield + node = cluster.instances["node"] + minio = cluster.minio_client node.query("DROP TABLE s3_test") assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == 0 + +@pytest.mark.parametrize( + "min_rows_for_wide_part,files_per_part", + [ + (0, FILES_OVERHEAD_PER_PART_WIDE), + (8192, FILES_OVERHEAD_PER_PART_COMPACT) + ] +) +def test_simple_insert_select(cluster, min_rows_for_wide_part, files_per_part): + create_table(cluster, "min_rows_for_wide_part={}".format(min_rows_for_wide_part)) + + node = cluster.instances["node"] + minio = cluster.minio_client + + values1 = generate_values('2020-01-03', 4096) + node.query("INSERT INTO s3_test VALUES {}".format(values1)) + assert node.query("SELECT * FROM s3_test order by dt, id FORMAT Values") == values1 + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == FILES_OVERHEAD + files_per_part + + values2 = generate_values('2020-01-04', 4096) + node.query("INSERT INTO s3_test VALUES {}".format(values2)) + assert node.query("SELECT * FROM s3_test ORDER BY dt, id FORMAT Values") == values1 + "," + values2 + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == FILES_OVERHEAD + files_per_part*2 + + assert node.query("SELECT count(*) FROM s3_test where id = 1 FORMAT Values") == "(2)" + + +@pytest.mark.parametrize( + "merge_vertical", [False, True] +) +def test_insert_same_partition_and_merge(cluster, merge_vertical): + settings = None + if merge_vertical: + settings = """ + vertical_merge_algorithm_min_rows_to_activate=0, + vertical_merge_algorithm_min_columns_to_activate=0 + """ + create_table(cluster, settings) + + node = cluster.instances["node"] + minio = cluster.minio_client + + node.query("SYSTEM STOP MERGES s3_test") + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-03', 1024))) + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-03', 2048))) + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-03', 4096))) + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-03', 1024, -1))) + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-03', 2048, -1))) + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-03', 4096, -1))) + assert node.query("SELECT sum(id) FROM s3_test FORMAT Values") == "(0)" + assert node.query("SELECT count(distinct(id)) FROM s3_test FORMAT Values") == "(8192)" + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == FILES_OVERHEAD_PER_PART_WIDE*6 + FILES_OVERHEAD + + node.query("SYSTEM START MERGES s3_test") + # Wait for merges and old parts deletion + time.sleep(3) + + assert node.query("SELECT sum(id) FROM s3_test FORMAT Values") == "(0)" + assert node.query("SELECT count(distinct(id)) FROM s3_test FORMAT Values") == "(8192)" + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == FILES_OVERHEAD_PER_PART_WIDE + FILES_OVERHEAD + + +def test_alter_table_columns(cluster): + create_table(cluster) + + node = 
cluster.instances["node"] + minio = cluster.minio_client + + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-03', 4096))) + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-03', 4096, -1))) + + node.query("ALTER TABLE s3_test ADD COLUMN col1 UInt64 DEFAULT 1") + # To ensure parts have merged + node.query("OPTIMIZE TABLE s3_test") + + # Wait for merges, mutations and old parts deletion + time.sleep(3) + + assert node.query("SELECT sum(col1) FROM s3_test FORMAT Values") == "(8192)" + assert node.query("SELECT sum(col1) FROM s3_test WHERE id > 0 FORMAT Values") == "(4096)" + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE + FILES_OVERHEAD_PER_COLUMN + + node.query("ALTER TABLE s3_test MODIFY COLUMN col1 String") + assert node.query("SELECT distinct(col1) FROM s3_test FORMAT Values") == "('1')" + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE + FILES_OVERHEAD_PER_COLUMN + + node.query("ALTER TABLE s3_test DROP COLUMN col1") + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE + + +def test_attach_detach_partition(cluster): + create_table(cluster) + + node = cluster.instances["node"] + minio = cluster.minio_client + + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-03', 4096))) + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-04', 4096))) + assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE*2 + + node.query("ALTER TABLE s3_test DETACH PARTITION '2020-01-03'") + assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(4096)" + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE*2 + + node.query("ALTER TABLE s3_test ATTACH PARTITION '2020-01-03'") + assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE*2 + + node.query("ALTER TABLE s3_test DROP PARTITION '2020-01-03'") + assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(4096)" + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE + + node.query("ALTER TABLE s3_test DETACH PARTITION '2020-01-04'") + node.query("SET allow_drop_detached=1; ALTER TABLE s3_test DROP DETACHED PARTITION '2020-01-04'") + assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(0)" + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == FILES_OVERHEAD + + +def test_move_partition(cluster): + create_table(cluster) + + node = cluster.instances["node"] + minio = cluster.minio_client + + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-03', 4096))) + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-04', 4096))) + assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE*2 + + node.query("ALTER TABLE s3_test MOVE PARTITION '2020-01-04' TO DISK 'hdd'") + assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == 
"(8192)" + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE + + +def test_table_manipulations(cluster): + create_table(cluster) + + node = cluster.instances["node"] + minio = cluster.minio_client + + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-03', 4096))) + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-04', 4096))) + + node.query("RENAME TABLE s3_test TO s3_renamed") + assert node.query("SELECT count(*) FROM s3_renamed FORMAT Values") == "(8192)" + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE*2 + node.query("RENAME TABLE s3_renamed TO s3_test") + + # TODO: Doesn't work with min_max index. + #assert node.query("SET check_query_single_value_result='false'; CHECK TABLE s3_test FORMAT Values") == "(1)" + + node.query("DETACH TABLE s3_test") + node.query("ATTACH TABLE s3_test") + assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE*2 + + node.query("TRUNCATE TABLE s3_test") + assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(0)" + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == FILES_OVERHEAD diff --git a/dbms/tests/integration/test_odbc_interaction/test.py b/dbms/tests/integration/test_odbc_interaction/test.py index f4af04b5fbc..1e876e507f4 100644 --- a/dbms/tests/integration/test_odbc_interaction/test.py +++ b/dbms/tests/integration/test_odbc_interaction/test.py @@ -10,7 +10,7 @@ from helpers.cluster import ClickHouseCluster SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs')) -node1 = cluster.add_instance('node1', with_odbc_drivers=True, with_mysql=True, image='alesapin/ubuntu_with_odbc', main_configs=['configs/dictionaries/sqlite3_odbc_hashed_dictionary.xml', 'configs/dictionaries/sqlite3_odbc_cached_dictionary.xml', 'configs/dictionaries/postgres_odbc_hashed_dictionary.xml'], stay_alive=True) +node1 = cluster.add_instance('node1', with_odbc_drivers=True, with_mysql=True, image='yandex/clickhouse-integration-test', main_configs=['configs/dictionaries/sqlite3_odbc_hashed_dictionary.xml', 'configs/dictionaries/sqlite3_odbc_cached_dictionary.xml', 'configs/dictionaries/postgres_odbc_hashed_dictionary.xml'], stay_alive=True) create_table_sql_template = """ CREATE TABLE `clickhouse`.`{}` ( @@ -219,5 +219,10 @@ def test_bridge_dies_with_parent(started_cluster): time.sleep(1) # just for sure, that odbc-bridge caught signal bridge_pid = node1.get_process_pid("odbc-bridge") + if bridge_pid: + out = node1.exec_in_container(["gdb", "-p", str(bridge_pid), "--ex", "thread apply all bt", "--ex", "q"], privileged=True, user='root') + print("Bridge is running, gdb output:") + print(out) + assert clickhouse_pid is None assert bridge_pid is None diff --git a/dbms/tests/performance/arithmetic.xml b/dbms/tests/performance/arithmetic.xml index a2e7c654fc8..c62d9010882 100644 --- a/dbms/tests/performance/arithmetic.xml +++ b/dbms/tests/performance/arithmetic.xml @@ -62,7 +62,16 @@ ; - SELECT count() FROM nums WHERE NOT ignore({op}({arg}, {arg})) + SELECT count() FROM nums WHERE NOT ignore( + {op}({arg}, {arg}), + {op}({arg}, {arg} + 1), + {op}({arg}, {arg} + 2), + {op}({arg}, {arg} + 3), + {op}({arg}, {arg} + 4), + {op}({arg}, {arg} + 5), + 
{op}({arg}, {arg} + 6), + {op}({arg}, {arg} + 7) + ) DROP TABLE nums diff --git a/dbms/tests/performance/general_purpose_hashes.xml b/dbms/tests/performance/general_purpose_hashes.xml index 4746271d664..458e646f3a7 100644 --- a/dbms/tests/performance/general_purpose_hashes.xml +++ b/dbms/tests/performance/general_purpose_hashes.xml @@ -47,15 +47,15 @@ table_slow - zeros(100000) - zeros_mt(1000000) + zeros(1000000) + zeros_mt(10000000) table - numbers(10000000) - numbers_mt(100000000) + numbers(100000000) + numbers_mt(1000000000) diff --git a/dbms/tests/performance/synthetic_hardware_benchmark.xml b/dbms/tests/performance/synthetic_hardware_benchmark.xml index 22cb79dcc2b..fc910077c9f 100644 --- a/dbms/tests/performance/synthetic_hardware_benchmark.xml +++ b/dbms/tests/performance/synthetic_hardware_benchmark.xml @@ -19,47 +19,49 @@ Мы запускаем этот запрос и наблюдаем, с какой скоростью он выполняется. Через несколько секунд, когда скорость стабилизируется, прерываем выполнение. В качестве скорости выполнения запроса указывается количество обработанных исходных (прочитанных из таблицы) данных в единицу времени. Например, в таблице numbers читаемые нами данные - это числа типа UInt64 (8 байт). Если мы обрабатываем миллиард таких чисел в секунду, то отобразится скорость - 8 GB/sec. --> -SELECT count() FROM zeros(100000000) WHERE NOT ignore(rand()) -SELECT count() FROM zeros_mt(1600000000) WHERE NOT ignore(rand()) +SELECT count() FROM zeros( 100000000) WHERE NOT ignore(rand()) +SELECT count() FROM zeros_mt(1600000000) WHERE NOT ignore(rand()) -SELECT count() FROM numbers(100000000) WHERE NOT ignore(intHash64(number)) +SELECT count() FROM numbers( 100000000) WHERE NOT ignore(intHash64(number)) SELECT count() FROM numbers_mt(1600000000) WHERE NOT ignore(intHash64(number)) -SELECT count() FROM numbers(100000000) WHERE NOT ignore(intHash32(number)) +SELECT count() FROM numbers( 100000000) WHERE NOT ignore(intHash32(number)) SELECT count() FROM numbers_mt(1600000000) WHERE NOT ignore(intHash32(number)) -SELECT count() FROM numbers(100000000) WHERE NOT ignore(toString(number)) -SELECT count() FROM numbers_mt(1600000000) WHERE NOT ignore(toString(number)) +SELECT count() FROM numbers( 10000000) WHERE NOT ignore(toString(number)) +SELECT count() FROM numbers_mt(160000000) WHERE NOT ignore(toString(number)) -SELECT count() FROM numbers(100000000) WHERE NOT ignore(reinterpretAsString(number)) +SELECT count() FROM numbers( 100000000) WHERE NOT ignore(reinterpretAsString(number)) SELECT count() FROM numbers_mt(1600000000) WHERE NOT ignore(reinterpretAsString(number)) -SELECT count() FROM numbers(100000000) WHERE NOT ignore(number / 7) +SELECT count() FROM numbers( 100000000) WHERE NOT ignore(number / 7) SELECT count() FROM numbers_mt(1600000000) WHERE NOT ignore(number / 7) -SELECT count() FROM numbers(100000000) WHERE NOT ignore(number % 7) +SELECT count() FROM numbers( 100000000) WHERE NOT ignore(number % 7) SELECT count() FROM numbers_mt(1600000000) WHERE NOT ignore(number % 7) -SELECT count() FROM numbers(100000000) WHERE NOT ignore(number % 34908756) +SELECT count() FROM numbers( 100000000) WHERE NOT ignore(number % 34908756) SELECT count() FROM numbers_mt(1600000000) WHERE NOT ignore(number % 34908756) -SELECT number % 1000 AS k, count() FROM numbers(100000000) GROUP BY k -SELECT number % 1000 AS k, count() FROM numbers_mt(1600000000) GROUP BY k +SELECT number % 1000 AS k, count() FROM numbers( 100000000) GROUP BY k FORMAT Null +SELECT number % 1000 AS k, count() FROM numbers_mt(1600000000) GROUP 
BY k FORMAT Null -SELECT number % 100000 AS k, count() FROM numbers(100000000) GROUP BY k -SELECT number % 100000 AS k, count() FROM numbers_mt(1600000000) GROUP BY k +SELECT number % 100000 AS k, count() FROM numbers( 10000000) GROUP BY k FORMAT Null +SELECT number % 100000 AS k, count() FROM numbers_mt(160000000) GROUP BY k FORMAT Null -SELECT number % 1000000 AS k, count() FROM numbers(100000000) GROUP BY k -SELECT number % 1000000 AS k, count() FROM numbers_mt(1600000000) GROUP BY k +SELECT number % 1000000 AS k, count() FROM numbers( 10000000) GROUP BY k FORMAT Null +SELECT number % 1000000 AS k, count() FROM numbers_mt(160000000) GROUP BY k FORMAT Null -SELECT number % 10000000 AS k, count() FROM numbers(100000000) GROUP BY k -SELECT number % 10000000 AS k, count() FROM numbers_mt(1600000000) GROUP BY k +SELECT number % 10000000 AS k, count() FROM numbers( 10000000) GROUP BY k FORMAT Null +SELECT number % 10000000 AS k, count() FROM numbers_mt(80000000) GROUP BY k FORMAT Null -SELECT number % 500000000 AS k, count() FROM numbers(100000000) GROUP BY k -SELECT number % 500000000 AS k, count() FROM numbers_mt(1600000000) GROUP BY k + +SELECT number % toUInt32(1e8) AS k, count() FROM numbers_mt(toUInt32(1e8)) GROUP BY k FORMAT Null -SELECT count() FROM zeros(10000000) WHERE NOT ignore(materialize('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx') AS s, concat(s,s,s,s,s,s,s,s,s,s) AS t, concat(t,t,t,t,t,t,t,t,t,t) AS u) SETTINGS max_block_size = 1000 +SELECT count() FROM zeros(1000000) WHERE NOT ignore(materialize('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx') AS s, concat(s,s,s,s,s,s,s,s,s,s) AS t, concat(t,t,t,t,t,t,t,t,t,t) AS u) SETTINGS max_block_size = 1000 diff --git a/dbms/tests/queries/0_stateless/01045_zookeeper_system_mutations_with_parts_names.sh b/dbms/tests/queries/0_stateless/01045_zookeeper_system_mutations_with_parts_names.sh index 28d4d61f811..3d6c916fffc 100755 --- a/dbms/tests/queries/0_stateless/01045_zookeeper_system_mutations_with_parts_names.sh +++ b/dbms/tests/queries/0_stateless/01045_zookeeper_system_mutations_with_parts_names.sh @@ -5,6 +5,17 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . 
$CURDIR/mergetree_mutations.lib +function wait_mutation_to_start() +{ + query_wait=`$CLICKHOUSE_CLIENT --query="SELECT length(parts_to_do_names) FROM system.mutations where table = '$1'" 2>&1` + + while [ "$query_wait" == "0" ] || [ -z "$query_wait" ] + do + query_wait=`$CLICKHOUSE_CLIENT --query="SELECT length(parts_to_do_names) FROM system.mutations where table = '$1'" 2>&1` + sleep 0.5 + done +} + ${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS table_for_mutations" ${CLICKHOUSE_CLIENT} --query="CREATE TABLE table_for_mutations(k UInt32, v1 UInt64) ENGINE MergeTree ORDER BY k PARTITION BY modulo(k, 2)" @@ -19,6 +30,8 @@ ${CLICKHOUSE_CLIENT} --query="ALTER TABLE table_for_mutations UPDATE v1 = v1 + 1 ${CLICKHOUSE_CLIENT} --query="SELECT is_done, parts_to_do_names, parts_to_do FROM system.mutations where table = 'table_for_mutations'" +wait_mutation_to_start "table_for_mutations" + ${CLICKHOUSE_CLIENT} --query="SYSTEM START MERGES" wait_for_mutation "table_for_mutations" "mutation_3.txt" @@ -42,6 +55,8 @@ ${CLICKHOUSE_CLIENT} --query="SELECT sum(v1) FROM replicated_table_for_mutations ${CLICKHOUSE_CLIENT} --query="ALTER TABLE replicated_table_for_mutations UPDATE v1 = v1 + 1 WHERE 1" +wait_mutation_to_start "replicated_table_for_mutations" + ${CLICKHOUSE_CLIENT} --query="SELECT is_done, parts_to_do_names, parts_to_do FROM system.mutations where table = 'replicated_table_for_mutations'" ${CLICKHOUSE_CLIENT} --query="SYSTEM START MERGES" diff --git a/dbms/tests/queries/0_stateless/01050_group_array_sample.sql b/dbms/tests/queries/0_stateless/01050_group_array_sample.sql index 395ab9d41b6..8c7c6a9648c 100644 --- a/dbms/tests/queries/0_stateless/01050_group_array_sample.sql +++ b/dbms/tests/queries/0_stateless/01050_group_array_sample.sql @@ -1,4 +1,4 @@ -select k, groupArraySample(10)(v) from (select number % 4 as k, number as v from numbers(1024)) group by k; +select k, groupArraySample(10, 123456)(v) from (select number % 4 as k, number as v from numbers(1024)) group by k; -- different seed select k, groupArraySample(10, 1)(v) from (select number % 4 as k, number as v from numbers(1024)) group by k; diff --git a/dbms/tests/queries/0_stateless/01055_compact_parts_1.sql b/dbms/tests/queries/0_stateless/01055_compact_parts_1.sql index 91941b90860..c2c2ab041e7 100644 --- a/dbms/tests/queries/0_stateless/01055_compact_parts_1.sql +++ b/dbms/tests/queries/0_stateless/01055_compact_parts_1.sql @@ -1,4 +1,5 @@ drop table if exists mt_compact; +drop table if exists mt_compact_2; create table mt_compact (a Int, s String) engine = MergeTree order by a partition by a settings index_granularity_bytes = 0; @@ -11,6 +12,7 @@ insert into mt_compact_2 values (1, 'a'); alter table mt_compact attach partition 1 from mt_compact_2; -- { serverError 36 } drop table mt_compact; +drop table mt_compact_2; set send_logs_level = 'error'; create table mt_compact (a Int, s String) engine = MergeTree order by a partition by a diff --git a/dbms/tests/queries/0_stateless/01100_split_by_string.reference b/dbms/tests/queries/0_stateless/01100_split_by_string.reference new file mode 100644 index 00000000000..7a0947ebd3f --- /dev/null +++ b/dbms/tests/queries/0_stateless/01100_split_by_string.reference @@ -0,0 +1,9 @@ +['cde','cde'] +['','cde','cde',''] +['','','',''] +['',''] +['a','b','c','d','e'] +['hello','world'] +['gbye','bug'] +[''] +[] diff --git a/dbms/tests/queries/0_stateless/01100_split_by_string.sql b/dbms/tests/queries/0_stateless/01100_split_by_string.sql new file mode 100644 index 00000000000..5dfe392f829 
--- /dev/null +++ b/dbms/tests/queries/0_stateless/01100_split_by_string.sql @@ -0,0 +1,8 @@ +select splitByString('ab', 'cdeabcde'); +select splitByString('ab', 'abcdeabcdeab'); +select splitByString('ab', 'ababab'); +select splitByString('ababab', 'ababab'); +select splitByString('', 'abcde'); +select splitByString(', ', x) from (select arrayJoin(['hello, world', 'gbye, bug']) x); +select splitByString('ab', ''); +select splitByString('', ''); diff --git a/dbms/tests/queries/0_stateless/01200_mutations_memory_consumption.reference b/dbms/tests/queries/0_stateless/01200_mutations_memory_consumption.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01200_mutations_memory_consumption.reference @@ -0,0 +1 @@ +1 diff --git a/dbms/tests/queries/0_stateless/01200_mutations_memory_consumption.sql b/dbms/tests/queries/0_stateless/01200_mutations_memory_consumption.sql new file mode 100644 index 00000000000..3c9d14b58cb --- /dev/null +++ b/dbms/tests/queries/0_stateless/01200_mutations_memory_consumption.sql @@ -0,0 +1,25 @@ +DROP TABLE IF EXISTS table_with_pk; + +CREATE TABLE table_with_pk +( + key UInt8, + value String +) +ENGINE = MergeTree +ORDER BY key; + +INSERT INTO table_with_pk SELECT number, toString(number % 10) FROM numbers(10000000); + +ALTER TABLE table_with_pk DELETE WHERE key % 77 = 0 SETTINGS mutations_sync = 1; + +SYSTEM FLUSH LOGS; + +-- Memory usage for all mutations must be almost constant and less than +-- read_bytes. +SELECT + DISTINCT read_bytes >= peak_memory_usage +FROM + system.part_log +WHERE event_type = 'MutatePart' AND table = 'table_with_pk' AND database = currentDatabase(); + +DROP TABLE IF EXISTS table_with_pk; diff --git a/dbms/tests/queries/0_stateless/01201_drop_column_compact_part_replicated.reference b/dbms/tests/queries/0_stateless/01201_drop_column_compact_part_replicated.reference new file mode 100644 index 00000000000..2b02239775d --- /dev/null +++ b/dbms/tests/queries/0_stateless/01201_drop_column_compact_part_replicated.reference @@ -0,0 +1,22 @@ +0 0 1167657 [0,0,0] ['qwqw'] baz +0 0 645645 [1,2] ['qwqw'] 0 +0 0 804292 [1,2] ['qwqw'] 3 +1 1 1409675 [1,2] ['qwqw'] 1 +1 1 1568322 [1,2] ['qwqw'] 4 +1 1 2072334 [0,0,0] ['qwqw'] bar +2 4 40262 [1,2] ['qwqw'] 2 +2 4 843568 [0,0,0] ['qwqw'] baz +3 9 1748245 [0,0,0] ['qwqw'] bar +4 16 519479 [0,0,0] ['qwqw'] baz +===================== +2 42 40262 [1,2] ['qwqw'] 2 +2 42 843568 [0,0,0] ['qwqw'] baz +3 42 1748245 [0,0,0] ['qwqw'] bar +4 42 519479 [0,0,0] ['qwqw'] baz +5 42 1424156 [0,0,0] ['qwqw'] bar +6 42 195390 [0,0,0] ['qwqw'] baz +7 42 1100067 [0,0,0] ['qwqw'] bar +8 42 2004744 [0,0,0] ['qwqw'] baz +9 42 775978 [0,0,0] ['qwqw'] bar +10 42 1680655 [0,0,0] ['qwqw'] baz +===================== diff --git a/dbms/tests/queries/0_stateless/01201_drop_column_compact_part_replicated.sql b/dbms/tests/queries/0_stateless/01201_drop_column_compact_part_replicated.sql new file mode 100644 index 00000000000..53cf940de6d --- /dev/null +++ b/dbms/tests/queries/0_stateless/01201_drop_column_compact_part_replicated.sql @@ -0,0 +1,25 @@ +-- Testing basic functionality with compact parts +set replication_alter_partitions_sync = 2; +drop table if exists mt_compact; + +create table mt_compact(a UInt64, b UInt64 DEFAULT a * a, s String, n Nested(x UInt32, y String), lc LowCardinality(String)) +engine = ReplicatedMergeTree('/clickhouse/test/mt_compact_replicated', '1') +order by a partition by a % 10 +settings index_granularity = 8, +min_rows_for_wide_part = 10; + 
+insert into mt_compact (a, s, n.y, lc) select number, toString((number * 2132214234 + 5434543) % 2133443), ['a', 'b', 'c'], number % 2 ? 'bar' : 'baz' from numbers(90); + +insert into mt_compact (a, s, n.x, lc) select number % 3, toString((number * 75434535 + 645645) % 2133443), [1, 2], toString(number) from numbers(5); + +alter table mt_compact drop column n.y; +alter table mt_compact add column n.y Array(String) DEFAULT ['qwqw'] after n.x; +select * from mt_compact order by a, s limit 10; +select '====================='; + +alter table mt_compact update b = 42 where 1 SETTINGS mutations_sync = 2; + +select * from mt_compact where a > 1 order by a, s limit 10; +select '====================='; + +drop table if exists mt_compact; diff --git a/dbms/tests/queries/1_stateful/00152_insert_different_granularity.sql b/dbms/tests/queries/1_stateful/00152_insert_different_granularity.sql index e690daadb6e..5ca34bbe48e 100644 --- a/dbms/tests/queries/1_stateful/00152_insert_different_granularity.sql +++ b/dbms/tests/queries/1_stateful/00152_insert_different_granularity.sql @@ -8,7 +8,8 @@ INSERT INTO fixed_granularity_table SELECT * FROM test.hits LIMIT 10; -- should INSERT INTO fixed_granularity_table SELECT * FROM test.hits LIMIT 10; -OPTIMIZE TABLE fixed_granularity_table FINAL; -- and even after optimize +-- We have removed testing of OPTIMIZE because it's too heavy on very slow builds (debug + coverage + thread fuzzer with sleeps) +-- OPTIMIZE TABLE fixed_granularity_table FINAL; -- and even after optimize DETACH TABLE fixed_granularity_table; @@ -39,7 +40,7 @@ ALTER TABLE test.hits DETACH PARTITION 201403; ALTER TABLE test.hits ATTACH PARTITION 201403; -OPTIMIZE TABLE test.hits; +-- OPTIMIZE TABLE test.hits; SELECT count() FROM test.hits; diff --git a/docker/test/performance-comparison/report.py b/docker/test/performance-comparison/report.py index df28251f015..f48adbb841d 100755 --- a/docker/test/performance-comparison/report.py +++ b/docker/test/performance-comparison/report.py @@ -274,6 +274,7 @@ if unstable_queries: error_tests += slow_average_tests if error_tests: + status = 'failure' message_array.append(str(error_tests) + ' errors') if message_array: diff --git a/docker/test/stateful/Dockerfile b/docker/test/stateful/Dockerfile index 59e3c037265..6a5a1ded478 100644 --- a/docker/test/stateful/Dockerfile +++ b/docker/test/stateful/Dockerfile @@ -47,4 +47,4 @@ CMD dpkg -i package_folder/clickhouse-common-static_*.deb; \ && clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits" \ && clickhouse-client --query "RENAME TABLE datasets.visits_v1 TO test.visits" \ && clickhouse-client --query "SHOW TABLES FROM test" \ - && clickhouse-test --testname --shard --zookeeper --no-stateless $SKIP_TESTS_OPTION 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt + && clickhouse-test --testname --shard --zookeeper --no-stateless $ADDITIONAL_OPTIONS $SKIP_TESTS_OPTION 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt diff --git a/docker/test/stateful_with_coverage/run.sh b/docker/test/stateful_with_coverage/run.sh index cf5eb0ee598..1e89350b3c9 100755 --- a/docker/test/stateful_with_coverage/run.sh +++ b/docker/test/stateful_with_coverage/run.sh @@ -98,7 +98,7 @@ LLVM_PROFILE_FILE='client_%h_%p_%m.profraw' clickhouse-client --query "SHOW TABL LLVM_PROFILE_FILE='client_%h_%p_%m.profraw' clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits" LLVM_PROFILE_FILE='client_%h_%p_%m.profraw' clickhouse-client --query "RENAME TABLE datasets.visits_v1 TO 
test.visits" LLVM_PROFILE_FILE='client_%h_%p_%m.profraw' clickhouse-client --query "SHOW TABLES FROM test" -LLVM_PROFILE_FILE='client_%h_%p_%m.profraw' clickhouse-test --testname --shard --zookeeper --no-stateless $SKIP_TESTS_OPTION 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt +LLVM_PROFILE_FILE='client_%h_%p_%m.profraw' clickhouse-test --testname --shard --zookeeper --no-stateless $ADDITIONAL_OPTIONS $SKIP_TESTS_OPTION 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt kill_clickhouse diff --git a/docker/test/stateless/Dockerfile b/docker/test/stateless/Dockerfile index 017b53036cf..6e5870a3560 100644 --- a/docker/test/stateless/Dockerfile +++ b/docker/test/stateless/Dockerfile @@ -1,4 +1,4 @@ -# docker build -t yandex/clickhouse-stateless-test . +# docker build -t yandex/clickhouse-stateless-test . FROM yandex/clickhouse-deb-builder ARG odbc_driver_url="https://github.com/ClickHouse/clickhouse-odbc/releases/download/v1.1.4.20200302/clickhouse-odbc-1.1.4-Linux.tar.gz" diff --git a/docs/dscr-templates/template-function.md b/docs/dscr-templates/template-function.md index c4f4973e9ac..1acf92cb501 100644 --- a/docs/dscr-templates/template-function.md +++ b/docs/dscr-templates/template-function.md @@ -4,7 +4,7 @@ Short description. **Syntax** (without SELECT) -```sql +``` sql ``` @@ -14,12 +14,12 @@ More text (Optional). **Parameters** (Optional) -- `x` — Description. [Type name](relative/path/to/type/dscr.md#type). -- `y` — Description. [Type name](relative/path/to/type/dscr.md#type). +- `x` — Description. [Type name](relative/path/to/type/dscr.md#type). +- `y` — Description. [Type name](relative/path/to/type/dscr.md#type). **Returned value(s)** -- Returned values list. +- Returned values list. Type: [Type](relative/path/to/type/dscr.md#type). @@ -29,20 +29,19 @@ The example must show usage and/or a use cases. The following text contains reco Input table (Optional): -```text +``` text ``` Query: -```sql +``` sql ``` Result: -```text +``` text ``` **See Also** (Optional) -- [link](#) - +- [link](#) diff --git a/docs/dscr-templates/template-setting.md b/docs/dscr-templates/template-setting.md index c0c1e05d3f1..b7a7674177e 100644 --- a/docs/dscr-templates/template-setting.md +++ b/docs/dscr-templates/template-setting.md @@ -2,19 +2,19 @@ Description. -For switcher setting, use the typical phrase: "Enables or disables something ...". +For switcher setting, use the typical phrase: “Enables or disables something …”. Possible values: *For switcher setting:* -- 0 — Disabled. -- 1 — Enabled. +- 0 — Disabled. +- 1 — Enabled. *For another setting (typical phrases):* -- Positive integer. -- 0 — Disabled or unlimited or smth. else. +- Positive integer. +- 0 — Disabled or unlimited or smth. else. Default value: `value`. @@ -24,4 +24,4 @@ The name of additional section can be any, for example **Usage**. **See Also** (Optional) -- [link](#) +- [link](#) diff --git a/docs/dscr-templates/template-system-table.md b/docs/dscr-templates/template-system-table.md index 8bcd1f27c1e..23b1176f598 100644 --- a/docs/dscr-templates/template-system-table.md +++ b/docs/dscr-templates/template-system-table.md @@ -1,21 +1,21 @@ -## system.table_name {#system_tables-table_name} +## system.table\_name {#system-tables-table-name} Description. Columns: -- `column_name` ([data_type_name](data_type_uri)) — Description. +- `column_name` ([data\_type\_name](data_type_uri)) — Description. **Example** -```sql +``` sql SELECT * FROM system.table_name ``` -```text +``` text Some output. 
It shouldn't be long. ``` **See Also** -- [Article name](uri) — Some words about referenced information. +- [Article name](uri) — Some words about referenced information. diff --git a/docs/dscr-templates/template-table-engine.md b/docs/dscr-templates/template-table-engine.md index 822944e9ecf..8ab627abfcb 100644 --- a/docs/dscr-templates/template-table-engine.md +++ b/docs/dscr-templates/template-table-engine.md @@ -1,51 +1,49 @@ -# EngineName +# EngineName {#enginename} -- What the engine does. -- Relations with other engines if they exist. +- What the engine does. +- Relations with other engines if they exist. -## Creating a Table +## Creating a Table {#creating-a-table} -``` -Syntax codeblock -``` + Syntax codeblock **Engine Parameters** **Query Clauses** - -## Virtual columns +## Virtual columns {#virtual-columns} If they exist. -## Specifics and recommendations +## Specifics and recommendations {#specifics-and-recommendations} + Algorithms Specifics of read/write processes Examples of tasks Recommendations for usage Specifics of data storage -## Usage Example +## Usage Example {#usage-example} The example must show usage and/or a use cases. The following text contains recommended parts of an example. Input table: -```text +``` text ``` Query: -```sql +``` sql ``` Result: -```text +``` text ``` Any text clarifying the example. -## See Also +## See Also {#see-also} -- [link](#) +- [link](#) diff --git a/docs/en/changelog/2017.md b/docs/en/changelog/2017.md index e6d6d6bec10..9bfdd83e381 100644 --- a/docs/en/changelog/2017.md +++ b/docs/en/changelog/2017.md @@ -1,262 +1,261 @@ - -### ClickHouse release 1.1.54327, 2017-12-21 +### ClickHouse release 1.1.54327, 2017-12-21 {#clickhouse-release-1-1-54327-2017-12-21} This release contains bug fixes for the previous release 1.1.54318: -* Fixed bug with possible race condition in replication that could lead to data loss. This issue affects versions 1.1.54310 and 1.1.54318. If you use one of these versions with Replicated tables, the update is strongly recommended. This issue shows in logs in Warning messages like ` Part ... from own log doesn't exist.` The issue is relevant even if you don't see these messages in logs. +- Fixed bug with possible race condition in replication that could lead to data loss. This issue affects versions 1.1.54310 and 1.1.54318. If you use one of these versions with Replicated tables, the update is strongly recommended. This issue shows in logs in Warning messages like `Part ... from own log doesn't exist.` The issue is relevant even if you don’t see these messages in logs. 
-### ClickHouse release 1.1.54318, 2017-11-30 +### ClickHouse release 1.1.54318, 2017-11-30 {#clickhouse-release-1-1-54318-2017-11-30} This release contains bug fixes for the previous release 1.1.54310: -* Fixed incorrect row deletions during merges in the SummingMergeTree engine -* Fixed a memory leak in unreplicated MergeTree engines -* Fixed performance degradation with frequent inserts in MergeTree engines -* Fixed an issue that was causing the replication queue to stop running -* Fixed rotation and archiving of server logs +- Fixed incorrect row deletions during merges in the SummingMergeTree engine +- Fixed a memory leak in unreplicated MergeTree engines +- Fixed performance degradation with frequent inserts in MergeTree engines +- Fixed an issue that was causing the replication queue to stop running +- Fixed rotation and archiving of server logs -### ClickHouse release 1.1.54310, 2017-11-01 +### ClickHouse release 1.1.54310, 2017-11-01 {#clickhouse-release-1-1-54310-2017-11-01} -#### New features: +#### New features: {#new-features} -* Custom partitioning key for the MergeTree family of table engines. -* [Kafka](https://clickhouse.yandex/docs/en/operations/table_engines/kafka/) table engine. -* Added support for loading [CatBoost](https://catboost.yandex/) models and applying them to data stored in ClickHouse. -* Added support for time zones with non-integer offsets from UTC. -* Added support for arithmetic operations with time intervals. -* The range of values for the Date and DateTime types is extended to the year 2105. -* Added the ` CREATE MATERIALIZED VIEW x TO y` query (specifies an existing table for storing the data of a materialized view). -* Added the `ATTACH TABLE` query without arguments. -* The processing logic for Nested columns with names ending in -Map in a SummingMergeTree table was extracted to the sumMap aggregate function. You can now specify such columns explicitly. -* Max size of the IP trie dictionary is increased to 128M entries. -* Added the getSizeOfEnumType function. -* Added the sumWithOverflow aggregate function. -* Added support for the Cap'n Proto input format. -* You can now customize compression level when using the zstd algorithm. +- Custom partitioning key for the MergeTree family of table engines. +- [Kafka](https://clickhouse.yandex/docs/en/operations/table_engines/kafka/) table engine. +- Added support for loading [CatBoost](https://catboost.yandex/) models and applying them to data stored in ClickHouse. +- Added support for time zones with non-integer offsets from UTC. +- Added support for arithmetic operations with time intervals. +- The range of values for the Date and DateTime types is extended to the year 2105. +- Added the `CREATE MATERIALIZED VIEW x TO y` query (specifies an existing table for storing the data of a materialized view). +- Added the `ATTACH TABLE` query without arguments. +- The processing logic for Nested columns with names ending in -Map in a SummingMergeTree table was extracted to the sumMap aggregate function. You can now specify such columns explicitly. +- Max size of the IP trie dictionary is increased to 128M entries. +- Added the getSizeOfEnumType function. +- Added the sumWithOverflow aggregate function. +- Added support for the Cap’n Proto input format. +- You can now customize compression level when using the zstd algorithm. -#### Backward incompatible changes: +#### Backward incompatible changes: {#backward-incompatible-changes} -* Creation of temporary tables with an engine other than Memory is not allowed. 
-* Explicit creation of tables with the View or MaterializedView engine is not allowed. -* During table creation, a new check verifies that the sampling key expression is included in the primary key. +- Creation of temporary tables with an engine other than Memory is not allowed. +- Explicit creation of tables with the View or MaterializedView engine is not allowed. +- During table creation, a new check verifies that the sampling key expression is included in the primary key. -#### Bug fixes: +#### Bug fixes: {#bug-fixes} -* Fixed hangups when synchronously inserting into a Distributed table. -* Fixed nonatomic adding and removing of parts in Replicated tables. -* Data inserted into a materialized view is not subjected to unnecessary deduplication. -* Executing a query to a Distributed table for which the local replica is lagging and remote replicas are unavailable does not result in an error anymore. -* Users don't need access permissions to the `default` database to create temporary tables anymore. -* Fixed crashing when specifying the Array type without arguments. -* Fixed hangups when the disk volume containing server logs is full. -* Fixed an overflow in the toRelativeWeekNum function for the first week of the Unix epoch. +- Fixed hangups when synchronously inserting into a Distributed table. +- Fixed nonatomic adding and removing of parts in Replicated tables. +- Data inserted into a materialized view is not subjected to unnecessary deduplication. +- Executing a query to a Distributed table for which the local replica is lagging and remote replicas are unavailable does not result in an error anymore. +- Users don’t need access permissions to the `default` database to create temporary tables anymore. +- Fixed crashing when specifying the Array type without arguments. +- Fixed hangups when the disk volume containing server logs is full. +- Fixed an overflow in the toRelativeWeekNum function for the first week of the Unix epoch. -#### Build improvements: +#### Build improvements: {#build-improvements} -* Several third-party libraries (notably Poco) were updated and converted to git submodules. +- Several third-party libraries (notably Poco) were updated and converted to git submodules. -### ClickHouse release 1.1.54304, 2017-10-19 +### ClickHouse release 1.1.54304, 2017-10-19 {#clickhouse-release-1-1-54304-2017-10-19} -#### New features: +#### New features: {#new-features-1} -* TLS support in the native protocol (to enable, set `tcp_ssl_port` in `config.xml` ). +- TLS support in the native protocol (to enable, set `tcp_ssl_port` in `config.xml` ). -#### Bug fixes: +#### Bug fixes: {#bug-fixes-1} -* `ALTER` for replicated tables now tries to start running as soon as possible. -* Fixed crashing when reading data with the setting `preferred_block_size_bytes=0.` -* Fixed crashes of `clickhouse-client` when pressing ` Page Down` -* Correct interpretation of certain complex queries with `GLOBAL IN` and `UNION ALL` -* `FREEZE PARTITION` always works atomically now. -* Empty POST requests now return a response with code 411. -* Fixed interpretation errors for expressions like `CAST(1 AS Nullable(UInt8)).` -* Fixed an error when reading `Array(Nullable(String))` columns from `MergeTree` tables. -* Fixed crashing when parsing queries like `SELECT dummy AS dummy, dummy AS b` -* Users are updated correctly with invalid `users.xml` -* Correct handling when an executable dictionary returns a non-zero response code. +- `ALTER` for replicated tables now tries to start running as soon as possible. 
+- Fixed crashing when reading data with the setting `preferred_block_size_bytes=0.` +- Fixed crashes of `clickhouse-client` when pressing `Page Down` +- Correct interpretation of certain complex queries with `GLOBAL IN` and `UNION ALL` +- `FREEZE PARTITION` always works atomically now. +- Empty POST requests now return a response with code 411. +- Fixed interpretation errors for expressions like `CAST(1 AS Nullable(UInt8)).` +- Fixed an error when reading `Array(Nullable(String))` columns from `MergeTree` tables. +- Fixed crashing when parsing queries like `SELECT dummy AS dummy, dummy AS b` +- Users are updated correctly with invalid `users.xml` +- Correct handling when an executable dictionary returns a non-zero response code. -### ClickHouse release 1.1.54292, 2017-09-20 +### ClickHouse release 1.1.54292, 2017-09-20 {#clickhouse-release-1-1-54292-2017-09-20} -#### New features: +#### New features: {#new-features-2} -* Added the `pointInPolygon` function for working with coordinates on a coordinate plane. -* Added the `sumMap` aggregate function for calculating the sum of arrays, similar to `SummingMergeTree`. -* Added the `trunc` function. Improved performance of the rounding functions (`round`, `floor`, `ceil`, `roundToExp2`) and corrected the logic of how they work. Changed the logic of the `roundToExp2` function for fractions and negative numbers. -* The ClickHouse executable file is now less dependent on the libc version. The same ClickHouse executable file can run on a wide variety of Linux systems. There is still a dependency when using compiled queries (with the setting ` compile = 1` , which is not used by default). -* Reduced the time needed for dynamic compilation of queries. +- Added the `pointInPolygon` function for working with coordinates on a coordinate plane. +- Added the `sumMap` aggregate function for calculating the sum of arrays, similar to `SummingMergeTree`. +- Added the `trunc` function. Improved performance of the rounding functions (`round`, `floor`, `ceil`, `roundToExp2`) and corrected the logic of how they work. Changed the logic of the `roundToExp2` function for fractions and negative numbers. +- The ClickHouse executable file is now less dependent on the libc version. The same ClickHouse executable file can run on a wide variety of Linux systems. There is still a dependency when using compiled queries (with the setting `compile = 1` , which is not used by default). +- Reduced the time needed for dynamic compilation of queries. -#### Bug fixes: +#### Bug fixes: {#bug-fixes-2} -* Fixed an error that sometimes produced ` part ... intersects previous part` messages and weakened replica consistency. -* Fixed an error that caused the server to lock up if ZooKeeper was unavailable during shutdown. -* Removed excessive logging when restoring replicas. -* Fixed an error in the UNION ALL implementation. -* Fixed an error in the concat function that occurred if the first column in a block has the Array type. -* Progress is now displayed correctly in the system.merges table. +- Fixed an error that sometimes produced `part ... intersects previous part` messages and weakened replica consistency. +- Fixed an error that caused the server to lock up if ZooKeeper was unavailable during shutdown. +- Removed excessive logging when restoring replicas. +- Fixed an error in the UNION ALL implementation. +- Fixed an error in the concat function that occurred if the first column in a block has the Array type. +- Progress is now displayed correctly in the system.merges table. 
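To make the 1.1.54292 additions listed above easier to picture, here is a minimal sketch of the new functions in use, assuming an interactive `clickhouse-client` session; the data and the commented results are illustrative only and are not taken from the release notes.

``` sql
-- sumMap sums the values array per key across rows.
SELECT sumMap(k, v)
FROM
(
    SELECT [1, 2] AS k, [10, 10] AS v
    UNION ALL
    SELECT [1, 3] AS k, [20, 5] AS v
);
-- expected: ([1,2,3], [30,10,5])

-- pointInPolygon tests whether a point lies inside a polygon on the plane.
SELECT pointInPolygon((1., 1.), [(0., 0.), (4., 0.), (4., 4.), (0., 4.)]) AS inside;
-- expected: inside = 1

-- trunc and the tuned rounding functions.
SELECT trunc(-1.9), round(3.14159, 2), floor(3.7), ceil(3.2), roundToExp2(100);
-- expected: -1, 3.14, 3, 4, 64
```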
-### ClickHouse release 1.1.54289, 2017-09-13 +### ClickHouse release 1.1.54289, 2017-09-13 {#clickhouse-release-1-1-54289-2017-09-13} -#### New features: +#### New features: {#new-features-3} -* `SYSTEM` queries for server administration: `SYSTEM RELOAD DICTIONARY`, `SYSTEM RELOAD DICTIONARIES`, `SYSTEM DROP DNS CACHE`, `SYSTEM SHUTDOWN`, `SYSTEM KILL`. -* Added functions for working with arrays: `concat`, `arraySlice`, `arrayPushBack`, `arrayPushFront`, `arrayPopBack`, `arrayPopFront`. -* Added `root` and `identity` parameters for the ZooKeeper configuration. This allows you to isolate individual users on the same ZooKeeper cluster. -* Added aggregate functions `groupBitAnd`, `groupBitOr`, and `groupBitXor` (for compatibility, they are also available under the names `BIT_AND`, `BIT_OR`, and `BIT_XOR`). -* External dictionaries can be loaded from MySQL by specifying a socket in the filesystem. -* External dictionaries can be loaded from MySQL over SSL (`ssl_cert`, `ssl_key`, `ssl_ca` parameters). -* Added the `max_network_bandwidth_for_user` setting to restrict the overall bandwidth use for queries per user. -* Support for `DROP TABLE` for temporary tables. -* Support for reading `DateTime` values in Unix timestamp format from the `CSV` and `JSONEachRow` formats. -* Lagging replicas in distributed queries are now excluded by default (the default threshold is 5 minutes). -* FIFO locking is used during ALTER: an ALTER query isn't blocked indefinitely for continuously running queries. -* Option to set `umask` in the config file. -* Improved performance for queries with `DISTINCT` . +- `SYSTEM` queries for server administration: `SYSTEM RELOAD DICTIONARY`, `SYSTEM RELOAD DICTIONARIES`, `SYSTEM DROP DNS CACHE`, `SYSTEM SHUTDOWN`, `SYSTEM KILL`. +- Added functions for working with arrays: `concat`, `arraySlice`, `arrayPushBack`, `arrayPushFront`, `arrayPopBack`, `arrayPopFront`. +- Added `root` and `identity` parameters for the ZooKeeper configuration. This allows you to isolate individual users on the same ZooKeeper cluster. +- Added aggregate functions `groupBitAnd`, `groupBitOr`, and `groupBitXor` (for compatibility, they are also available under the names `BIT_AND`, `BIT_OR`, and `BIT_XOR`). +- External dictionaries can be loaded from MySQL by specifying a socket in the filesystem. +- External dictionaries can be loaded from MySQL over SSL (`ssl_cert`, `ssl_key`, `ssl_ca` parameters). +- Added the `max_network_bandwidth_for_user` setting to restrict the overall bandwidth use for queries per user. +- Support for `DROP TABLE` for temporary tables. +- Support for reading `DateTime` values in Unix timestamp format from the `CSV` and `JSONEachRow` formats. +- Lagging replicas in distributed queries are now excluded by default (the default threshold is 5 minutes). +- FIFO locking is used during ALTER: an ALTER query isn’t blocked indefinitely for continuously running queries. +- Option to set `umask` in the config file. +- Improved performance for queries with `DISTINCT` . -#### Bug fixes: +#### Bug fixes: {#bug-fixes-3} -* Improved the process for deleting old nodes in ZooKeeper. Previously, old nodes sometimes didn't get deleted if there were very frequent inserts, which caused the server to be slow to shut down, among other things. -* Fixed randomization when choosing hosts for the connection to ZooKeeper. -* Fixed the exclusion of lagging replicas in distributed queries if the replica is localhost. 
-* Fixed an error where a data part in a `ReplicatedMergeTree` table could be broken after running ` ALTER MODIFY` on an element in a `Nested` structure. -* Fixed an error that could cause SELECT queries to "hang". -* Improvements to distributed DDL queries. -* Fixed the query `CREATE TABLE ... AS `. -* Resolved the deadlock in the ` ALTER ... CLEAR COLUMN IN PARTITION` query for `Buffer` tables. -* Fixed the invalid default value for `Enum` s (0 instead of the minimum) when using the `JSONEachRow` and `TSKV` formats. -* Resolved the appearance of zombie processes when using a dictionary with an `executable` source. -* Fixed segfault for the HEAD query. +- Improved the process for deleting old nodes in ZooKeeper. Previously, old nodes sometimes didn’t get deleted if there were very frequent inserts, which caused the server to be slow to shut down, among other things. +- Fixed randomization when choosing hosts for the connection to ZooKeeper. +- Fixed the exclusion of lagging replicas in distributed queries if the replica is localhost. +- Fixed an error where a data part in a `ReplicatedMergeTree` table could be broken after running `ALTER MODIFY` on an element in a `Nested` structure. +- Fixed an error that could cause SELECT queries to “hang”. +- Improvements to distributed DDL queries. +- Fixed the query `CREATE TABLE ... AS `. +- Resolved the deadlock in the `ALTER ... CLEAR COLUMN IN PARTITION` query for `Buffer` tables. +- Fixed the invalid default value for `Enum` s (0 instead of the minimum) when using the `JSONEachRow` and `TSKV` formats. +- Resolved the appearance of zombie processes when using a dictionary with an `executable` source. +- Fixed segfault for the HEAD query. -#### Improved workflow for developing and assembling ClickHouse: +#### Improved workflow for developing and assembling ClickHouse: {#improved-workflow-for-developing-and-assembling-clickhouse} -* You can use `pbuilder` to build ClickHouse. -* You can use `libc++` instead of `libstdc++` for builds on Linux. -* Added instructions for using static code analysis tools: `Coverage`, `clang-tidy`, `cppcheck`. +- You can use `pbuilder` to build ClickHouse. +- You can use `libc++` instead of `libstdc++` for builds on Linux. +- Added instructions for using static code analysis tools: `Coverage`, `clang-tidy`, `cppcheck`. -#### Please note when upgrading: +#### Please note when upgrading: {#please-note-when-upgrading} -* There is now a higher default value for the MergeTree setting `max_bytes_to_merge_at_max_space_in_pool` (the maximum total size of data parts to merge, in bytes): it has increased from 100 GiB to 150 GiB. This might result in large merges running after the server upgrade, which could cause an increased load on the disk subsystem. If the free space available on the server is less than twice the total amount of the merges that are running, this will cause all other merges to stop running, including merges of small data parts. As a result, INSERT queries will fail with the message "Merges are processing significantly slower than inserts." Use the ` SELECT * FROM system.merges` query to monitor the situation. You can also check the `DiskSpaceReservedForMerge` metric in the `system.metrics` table, or in Graphite. You don't need to do anything to fix this, since the issue will resolve itself once the large merges finish. If you find this unacceptable, you can restore the previous value for the `max_bytes_to_merge_at_max_space_in_pool` setting. 
To do this, go to the section in config.xml, set ```107374182400` and restart the server. +- There is now a higher default value for the MergeTree setting `max_bytes_to_merge_at_max_space_in_pool` (the maximum total size of data parts to merge, in bytes): it has increased from 100 GiB to 150 GiB. This might result in large merges running after the server upgrade, which could cause an increased load on the disk subsystem. If the free space available on the server is less than twice the total amount of the merges that are running, this will cause all other merges to stop running, including merges of small data parts. As a result, INSERT queries will fail with the message “Merges are processing significantly slower than inserts.” Use the `SELECT * FROM system.merges` query to monitor the situation. You can also check the `DiskSpaceReservedForMerge` metric in the `system.metrics` table, or in Graphite. You don’t need to do anything to fix this, since the issue will resolve itself once the large merges finish. If you find this unacceptable, you can restore the previous value for the `max_bytes_to_merge_at_max_space_in_pool` setting. To do this, go to the `<merge_tree>` section in config.xml, set `max_bytes_to_merge_at_max_space_in_pool` to `107374182400`, and restart the server. -### ClickHouse release 1.1.54284, 2017-08-29 +### ClickHouse release 1.1.54284, 2017-08-29 {#clickhouse-release-1-1-54284-2017-08-29} -* This is a bugfix release for the previous 1.1.54282 release. It fixes leaks in the parts directory in ZooKeeper. +- This is a bugfix release for the previous 1.1.54282 release. It fixes leaks in the parts directory in ZooKeeper. -### ClickHouse release 1.1.54282, 2017-08-23 +### ClickHouse release 1.1.54282, 2017-08-23 {#clickhouse-release-1-1-54282-2017-08-23} This release contains bug fixes for the previous release 1.1.54276: -* Fixed `DB::Exception: Assertion violation: !_path.empty()` when inserting into a Distributed table. -* Fixed parsing when inserting in RowBinary format if input data starts with';'. -* Errors during runtime compilation of certain aggregate functions (e.g. `groupArray()`). +- Fixed `DB::Exception: Assertion violation: !_path.empty()` when inserting into a Distributed table. +- Fixed parsing when inserting in RowBinary format if input data starts with `;`. +- Errors during runtime compilation of certain aggregate functions (e.g. `groupArray()`). -### Clickhouse Release 1.1.54276, 2017-08-16 +### ClickHouse Release 1.1.54276, 2017-08-16 {#clickhouse-release-1-1-54276-2017-08-16} -#### New features: +#### New features: {#new-features-4} -* Added an optional WITH section for a SELECT query.
Example query: `WITH 1+1 AS a SELECT a, a*a` +- INSERT can be performed synchronously in a Distributed table: OK is returned only after all the data is saved on all the shards. This is activated by the setting insert\_distributed\_sync=1. +- Added the UUID data type for working with 16-byte identifiers. +- Added aliases of CHAR, FLOAT and other types for compatibility with the Tableau. +- Added the functions toYYYYMM, toYYYYMMDD, and toYYYYMMDDhhmmss for converting time into numbers. +- You can use IP addresses (together with the hostname) to identify servers for clustered DDL queries. +- Added support for non-constant arguments and negative offsets in the function `substring(str, pos, len).` +- Added the max\_size parameter for the `groupArray(max_size)(column)` aggregate function, and optimized its performance. -#### Main changes: +#### Main changes: {#main-changes} -* Security improvements: all server files are created with 0640 permissions (can be changed via config parameter). -* Improved error messages for queries with invalid syntax. -* Significantly reduced memory consumption and improved performance when merging large sections of MergeTree data. -* Significantly increased the performance of data merges for the ReplacingMergeTree engine. -* Improved performance for asynchronous inserts from a Distributed table by combining multiple source inserts. To enable this functionality, use the setting distributed_directory_monitor_batch_inserts=1. +- Security improvements: all server files are created with 0640 permissions (can be changed via config parameter). +- Improved error messages for queries with invalid syntax. +- Significantly reduced memory consumption and improved performance when merging large sections of MergeTree data. +- Significantly increased the performance of data merges for the ReplacingMergeTree engine. +- Improved performance for asynchronous inserts from a Distributed table by combining multiple source inserts. To enable this functionality, use the setting distributed\_directory\_monitor\_batch\_inserts=1. -#### Backward incompatible changes: +#### Backward incompatible changes: {#backward-incompatible-changes-1} -* Changed the binary format of aggregate states of `groupArray(array_column)` functions for arrays. +- Changed the binary format of aggregate states of `groupArray(array_column)` functions for arrays. -#### Complete list of changes: +#### Complete list of changes: {#complete-list-of-changes} -* Added the `output_format_json_quote_denormals` setting, which enables outputting nan and inf values in JSON format. -* Optimized stream allocation when reading from a Distributed table. -* Settings can be configured in readonly mode if the value doesn't change. -* Added the ability to retrieve non-integer granules of the MergeTree engine in order to meet restrictions on the block size specified in the preferred_block_size_bytes setting. The purpose is to reduce the consumption of RAM and increase cache locality when processing queries from tables with large columns. -* Efficient use of indexes that contain expressions like `toStartOfHour(x)` for conditions like `toStartOfHour(x) op сonstexpr.` -* Added new settings for MergeTree engines (the merge_tree section in config.xml): - - replicated_deduplication_window_seconds sets the number of seconds allowed for deduplicating inserts in Replicated tables. - - cleanup_delay_period sets how often to start cleanup to remove outdated data. 
- - replicated_can_become_leader can prevent a replica from becoming the leader (and assigning merges). -* Accelerated cleanup to remove outdated data from ZooKeeper. -* Multiple improvements and fixes for clustered DDL queries. Of particular interest is the new setting distributed_ddl_task_timeout, which limits the time to wait for a response from the servers in the cluster. If a ddl request has not been performed on all hosts, a response will contain a timeout error and a request will be executed in an async mode. -* Improved display of stack traces in the server logs. -* Added the "none" value for the compression method. -* You can use multiple dictionaries_config sections in config.xml. -* It is possible to connect to MySQL through a socket in the file system. -* The system.parts table has a new column with information about the size of marks, in bytes. +- Added the `output_format_json_quote_denormals` setting, which enables outputting nan and inf values in JSON format. +- Optimized stream allocation when reading from a Distributed table. +- Settings can be configured in readonly mode if the value doesn’t change. +- Added the ability to retrieve non-integer granules of the MergeTree engine in order to meet restrictions on the block size specified in the preferred\_block\_size\_bytes setting. The purpose is to reduce the consumption of RAM and increase cache locality when processing queries from tables with large columns. +- Efficient use of indexes that contain expressions like `toStartOfHour(x)` for conditions like `toStartOfHour(x) op сonstexpr.` +- Added new settings for MergeTree engines (the merge\_tree section in config.xml): + - replicated\_deduplication\_window\_seconds sets the number of seconds allowed for deduplicating inserts in Replicated tables. + - cleanup\_delay\_period sets how often to start cleanup to remove outdated data. + - replicated\_can\_become\_leader can prevent a replica from becoming the leader (and assigning merges). +- Accelerated cleanup to remove outdated data from ZooKeeper. +- Multiple improvements and fixes for clustered DDL queries. Of particular interest is the new setting distributed\_ddl\_task\_timeout, which limits the time to wait for a response from the servers in the cluster. If a ddl request has not been performed on all hosts, a response will contain a timeout error and a request will be executed in an async mode. +- Improved display of stack traces in the server logs. +- Added the “none” value for the compression method. +- You can use multiple dictionaries\_config sections in config.xml. +- It is possible to connect to MySQL through a socket in the file system. +- The system.parts table has a new column with information about the size of marks, in bytes. -#### Bug fixes: +#### Bug fixes: {#bug-fixes-4} -* Distributed tables using a Merge table now work correctly for a SELECT query with a condition on the `_table` field. -* Fixed a rare race condition in ReplicatedMergeTree when checking data parts. -* Fixed possible freezing on "leader election" when starting a server. -* The max_replica_delay_for_distributed_queries setting was ignored when using a local replica of the data source. This has been fixed. -* Fixed incorrect behavior of `ALTER TABLE CLEAR COLUMN IN PARTITION` when attempting to clean a non-existing column. -* Fixed an exception in the multiIf function when using empty arrays or strings. -* Fixed excessive memory allocations when deserializing Native format. -* Fixed incorrect auto-update of Trie dictionaries. 
-* Fixed an exception when running queries with a GROUP BY clause from a Merge table when using SAMPLE. -* Fixed a crash of GROUP BY when using distributed_aggregation_memory_efficient=1. -* Now you can specify the database.table in the right side of IN and JOIN. -* Too many threads were used for parallel aggregation. This has been fixed. -* Fixed how the "if" function works with FixedString arguments. -* SELECT worked incorrectly from a Distributed table for shards with a weight of 0. This has been fixed. -* Running `CREATE VIEW IF EXISTS no longer causes crashes.` -* Fixed incorrect behavior when input_format_skip_unknown_fields=1 is set and there are negative numbers. -* Fixed an infinite loop in the `dictGetHierarchy()` function if there is some invalid data in the dictionary. -* Fixed `Syntax error: unexpected (...)` errors when running distributed queries with subqueries in an IN or JOIN clause and Merge tables. -* Fixed an incorrect interpretation of a SELECT query from Dictionary tables. -* Fixed the "Cannot mremap" error when using arrays in IN and JOIN clauses with more than 2 billion elements. -* Fixed the failover for dictionaries with MySQL as the source. +- Distributed tables using a Merge table now work correctly for a SELECT query with a condition on the `_table` field. +- Fixed a rare race condition in ReplicatedMergeTree when checking data parts. +- Fixed possible freezing on “leader election” when starting a server. +- The max\_replica\_delay\_for\_distributed\_queries setting was ignored when using a local replica of the data source. This has been fixed. +- Fixed incorrect behavior of `ALTER TABLE CLEAR COLUMN IN PARTITION` when attempting to clean a non-existing column. +- Fixed an exception in the multiIf function when using empty arrays or strings. +- Fixed excessive memory allocations when deserializing Native format. +- Fixed incorrect auto-update of Trie dictionaries. +- Fixed an exception when running queries with a GROUP BY clause from a Merge table when using SAMPLE. +- Fixed a crash of GROUP BY when using distributed\_aggregation\_memory\_efficient=1. +- Now you can specify the database.table in the right side of IN and JOIN. +- Too many threads were used for parallel aggregation. This has been fixed. +- Fixed how the “if” function works with FixedString arguments. +- SELECT worked incorrectly from a Distributed table for shards with a weight of 0. This has been fixed. +- Running `CREATE VIEW IF EXISTS no longer causes crashes.` +- Fixed incorrect behavior when input\_format\_skip\_unknown\_fields=1 is set and there are negative numbers. +- Fixed an infinite loop in the `dictGetHierarchy()` function if there is some invalid data in the dictionary. +- Fixed `Syntax error: unexpected (...)` errors when running distributed queries with subqueries in an IN or JOIN clause and Merge tables. +- Fixed an incorrect interpretation of a SELECT query from Dictionary tables. +- Fixed the “Cannot mremap” error when using arrays in IN and JOIN clauses with more than 2 billion elements. +- Fixed the failover for dictionaries with MySQL as the source. -#### Improved workflow for developing and assembling ClickHouse: +#### Improved workflow for developing and assembling ClickHouse: {#improved-workflow-for-developing-and-assembling-clickhouse-1} -* Builds can be assembled in Arcadia. -* You can use gcc 7 to compile ClickHouse. -* Parallel builds using ccache+distcc are faster now. +- Builds can be assembled in Arcadia. +- You can use gcc 7 to compile ClickHouse. 
+- Parallel builds using ccache+distcc are faster now. -### ClickHouse release 1.1.54245, 2017-07-04 +### ClickHouse release 1.1.54245, 2017-07-04 {#clickhouse-release-1-1-54245-2017-07-04} -#### New features: +#### New features: {#new-features-5} -* Distributed DDL (for example, `CREATE TABLE ON CLUSTER`) -* The replicated query `ALTER TABLE CLEAR COLUMN IN PARTITION.` -* The engine for Dictionary tables (access to dictionary data in the form of a table). -* Dictionary database engine (this type of database automatically has Dictionary tables available for all the connected external dictionaries). -* You can check for updates to the dictionary by sending a request to the source. -* Qualified column names -* Quoting identifiers using double quotation marks. -* Sessions in the HTTP interface. -* The OPTIMIZE query for a Replicated table can can run not only on the leader. +- Distributed DDL (for example, `CREATE TABLE ON CLUSTER`) +- The replicated query `ALTER TABLE CLEAR COLUMN IN PARTITION.` +- The engine for Dictionary tables (access to dictionary data in the form of a table). +- Dictionary database engine (this type of database automatically has Dictionary tables available for all the connected external dictionaries). +- You can check for updates to the dictionary by sending a request to the source. +- Qualified column names +- Quoting identifiers using double quotation marks. +- Sessions in the HTTP interface. +- The OPTIMIZE query for a Replicated table can run not only on the leader. -#### Backward incompatible changes: +#### Backward incompatible changes: {#backward-incompatible-changes-2} -* Removed SET GLOBAL. +- Removed SET GLOBAL. -#### Minor changes: +#### Minor changes: {#minor-changes} -* Now after an alert is triggered, the log prints the full stack trace. -* Relaxed the verification of the number of damaged/extra data parts at startup (there were too many false positives). +- Now after an alert is triggered, the log prints the full stack trace. +- Relaxed the verification of the number of damaged/extra data parts at startup (there were too many false positives). -#### Bug fixes: +#### Bug fixes: {#bug-fixes-5} -* Fixed a bad connection "sticking" when inserting into a Distributed table. -* GLOBAL IN now works for a query from a Merge table that looks at a Distributed table. -* The incorrect number of cores was detected on a Google Compute Engine virtual machine. This has been fixed. +- Fixed a bad connection “sticking” when inserting into a Distributed table. +- GLOBAL IN now works for a query from a Merge table that looks at a Distributed table. +- The incorrect number of cores was detected on a Google Compute Engine virtual machine. This has been fixed.
+- Changes in how an executable source of cached external dictionaries works. +- Fixed the comparison of strings containing null characters. +- Fixed the comparison of Float32 primary key fields with constants. +- Previously, an incorrect estimate of the size of a field could lead to overly large allocations. +- Fixed a crash when querying a Nullable column added to a table using ALTER. +- Fixed a crash when sorting by a Nullable column, if the number of rows is less than LIMIT. +- Fixed an ORDER BY subquery consisting of only constant values. +- Previously, a Replicated table could remain in the invalid state after a failed DROP TABLE. +- Aliases for scalar subqueries with empty results are no longer lost. +- Now a query that used compilation does not fail with an error if the .so file gets damaged. diff --git a/docs/en/changelog/2018.md b/docs/en/changelog/2018.md index 948c161b745..049a1c5bfc2 100644 --- a/docs/en/changelog/2018.md +++ b/docs/en/changelog/2018.md @@ -1,1046 +1,1054 @@ +## ClickHouse release 18.16 {#clickhouse-release-18-16} + +### ClickHouse release 18.16.1, 2018-12-21 {#clickhouse-release-18-16-1-2018-12-21} + +#### Bug fixes: {#bug-fixes} + +- Fixed an error that led to problems with updating dictionaries with the ODBC source. [\#3825](https://github.com/ClickHouse/ClickHouse/issues/3825), [\#3829](https://github.com/ClickHouse/ClickHouse/issues/3829) +- JIT compilation of aggregate functions now works with LowCardinality columns. [\#3838](https://github.com/ClickHouse/ClickHouse/issues/3838) + +#### Improvements: {#improvements} + +- Added the `low_cardinality_allow_in_native_format` setting (enabled by default). When disabled, LowCardinality columns will be converted to ordinary columns for SELECT queries and ordinary columns will be expected for INSERT queries. [\#3879](https://github.com/ClickHouse/ClickHouse/pull/3879) + +#### Build improvements: {#build-improvements} + +- Fixes for builds on macOS and ARM. + +### ClickHouse release 18.16.0, 2018-12-14 {#clickhouse-release-18-16-0-2018-12-14} + +#### New features: {#new-features} + +- `DEFAULT` expressions are evaluated for missing fields when loading data in semi-structured input formats (`JSONEachRow`, `TSKV`). The feature is enabled with the `insert_sample_with_metadata` setting. [\#3555](https://github.com/ClickHouse/ClickHouse/pull/3555) +- The `ALTER TABLE` query now has the `MODIFY ORDER BY` action for changing the sorting key when adding or removing a table column. This is useful for tables in the `MergeTree` family that perform additional tasks when merging based on this sorting key, such as `SummingMergeTree`, `AggregatingMergeTree`, and so on. [\#3581](https://github.com/ClickHouse/ClickHouse/pull/3581) [\#3755](https://github.com/ClickHouse/ClickHouse/pull/3755) +- For tables in the `MergeTree` family, now you can specify a different sorting key (`ORDER BY`) and index (`PRIMARY KEY`). The sorting key can be longer than the index. [\#3581](https://github.com/ClickHouse/ClickHouse/pull/3581) +- Added the `hdfs` table function and the `HDFS` table engine for importing and exporting data to HDFS. [chenxing-xc](https://github.com/ClickHouse/ClickHouse/pull/3617) +- Added functions for working with base64: `base64Encode`, `base64Decode`, `tryBase64Decode`. [Alexander Krasheninnikov](https://github.com/ClickHouse/ClickHouse/pull/3350) +- Now you can use a parameter to configure the precision of the `uniqCombined` aggregate function (select the number of HyperLogLog cells). 
[\#3406](https://github.com/ClickHouse/ClickHouse/pull/3406) +- Added the `system.contributors` table that contains the names of everyone who made commits in ClickHouse. [\#3452](https://github.com/ClickHouse/ClickHouse/pull/3452) +- Added the ability to omit the partition for the `ALTER TABLE ... FREEZE` query in order to back up all partitions at once. [\#3514](https://github.com/ClickHouse/ClickHouse/pull/3514) +- Added `dictGet` and `dictGetOrDefault` functions that don’t require specifying the type of return value. The type is determined automatically from the dictionary description. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3564) +- Now you can specify comments for a column in the table description and change it using `ALTER`. [\#3377](https://github.com/ClickHouse/ClickHouse/pull/3377) +- Reading is supported for `Join` type tables with simple keys. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3728) +- Now you can specify the options `join_use_nulls`, `max_rows_in_join`, `max_bytes_in_join`, and `join_overflow_mode` when creating a `Join` type table. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3728) +- Added the `joinGet` function that allows you to use a `Join` type table like a dictionary. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3728) +- Added the `partition_key`, `sorting_key`, `primary_key`, and `sampling_key` columns to the `system.tables` table in order to provide information about table keys. [\#3609](https://github.com/ClickHouse/ClickHouse/pull/3609) +- Added the `is_in_partition_key`, `is_in_sorting_key`, `is_in_primary_key`, and `is_in_sampling_key` columns to the `system.columns` table. [\#3609](https://github.com/ClickHouse/ClickHouse/pull/3609) +- Added the `min_time` and `max_time` columns to the `system.parts` table. These columns are populated when the partitioning key is an expression consisting of `DateTime` columns. [Emmanuel Donin de Rosière](https://github.com/ClickHouse/ClickHouse/pull/3800) + +#### Bug fixes: {#bug-fixes-1} + +- Fixes and performance improvements for the `LowCardinality` data type. `GROUP BY` using `LowCardinality(Nullable(...))`. Getting the values of `extremes`. Processing high-order functions. `LEFT ARRAY JOIN`. Distributed `GROUP BY`. Functions that return `Array`. Execution of `ORDER BY`. Writing to `Distributed` tables (nicelulu). Backward compatibility for `INSERT` queries from old clients that implement the `Native` protocol. Support for `LowCardinality` for `JOIN`. Improved performance when working in a single stream. [\#3823](https://github.com/ClickHouse/ClickHouse/pull/3823) [\#3803](https://github.com/ClickHouse/ClickHouse/pull/3803) [\#3799](https://github.com/ClickHouse/ClickHouse/pull/3799) [\#3769](https://github.com/ClickHouse/ClickHouse/pull/3769) [\#3744](https://github.com/ClickHouse/ClickHouse/pull/3744) [\#3681](https://github.com/ClickHouse/ClickHouse/pull/3681) [\#3651](https://github.com/ClickHouse/ClickHouse/pull/3651) [\#3649](https://github.com/ClickHouse/ClickHouse/pull/3649) [\#3641](https://github.com/ClickHouse/ClickHouse/pull/3641) [\#3632](https://github.com/ClickHouse/ClickHouse/pull/3632) [\#3568](https://github.com/ClickHouse/ClickHouse/pull/3568) [\#3523](https://github.com/ClickHouse/ClickHouse/pull/3523) [\#3518](https://github.com/ClickHouse/ClickHouse/pull/3518) +- Fixed how the `select_sequential_consistency` option works. 
Previously, when this setting was enabled, an incomplete result was sometimes returned after beginning to write to a new partition. [\#2863](https://github.com/ClickHouse/ClickHouse/pull/2863) +- Databases are correctly specified when executing DDL `ON CLUSTER` queries and `ALTER UPDATE/DELETE`. [\#3772](https://github.com/ClickHouse/ClickHouse/pull/3772) [\#3460](https://github.com/ClickHouse/ClickHouse/pull/3460) +- Databases are correctly specified for subqueries inside a VIEW. [\#3521](https://github.com/ClickHouse/ClickHouse/pull/3521) +- Fixed a bug in `PREWHERE` with `FINAL` for `VersionedCollapsingMergeTree`. [7167bfd7](https://github.com/ClickHouse/ClickHouse/commit/7167bfd7b365538f7a91c4307ad77e552ab4e8c1) +- Now you can use `KILL QUERY` to cancel queries that have not started yet because they are waiting for the table to be locked. [\#3517](https://github.com/ClickHouse/ClickHouse/pull/3517) +- Corrected date and time calculations if the clocks were moved back at midnight (this happens in Iran, and happened in Moscow from 1981 to 1983). Previously, this led to the time being reset a day earlier than necessary, and also caused incorrect formatting of the date and time in text format. [\#3819](https://github.com/ClickHouse/ClickHouse/pull/3819) +- Fixed bugs in some cases of `VIEW` and subqueries that omit the database. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3521) +- Fixed a race condition when simultaneously reading from a `MATERIALIZED VIEW` and deleting a `MATERIALIZED VIEW` due to not locking the internal `MATERIALIZED VIEW`. [\#3404](https://github.com/ClickHouse/ClickHouse/pull/3404) [\#3694](https://github.com/ClickHouse/ClickHouse/pull/3694) +- Fixed the error `Lock handler cannot be nullptr.` [\#3689](https://github.com/ClickHouse/ClickHouse/pull/3689) +- Fixed query processing when the `compile_expressions` option is enabled (it’s enabled by default). Nondeterministic constant expressions like the `now` function are no longer unfolded. [\#3457](https://github.com/ClickHouse/ClickHouse/pull/3457) +- Fixed a crash when specifying a non-constant scale argument in `toDecimal32/64/128` functions. +- Fixed an error when trying to insert an array with `NULL` elements in the `Values` format into a column of type `Array` without `Nullable` (if `input_format_values_interpret_expressions` = 1). [\#3487](https://github.com/ClickHouse/ClickHouse/pull/3487) [\#3503](https://github.com/ClickHouse/ClickHouse/pull/3503) +- Fixed continuous error logging in `DDLWorker` if ZooKeeper is not available. [8f50c620](https://github.com/ClickHouse/ClickHouse/commit/8f50c620334988b28018213ec0092fe6423847e2) +- Fixed the return type for `quantile*` functions from `Date` and `DateTime` types of arguments. [\#3580](https://github.com/ClickHouse/ClickHouse/pull/3580) +- Fixed the `WITH` clause if it specifies a simple alias without expressions. [\#3570](https://github.com/ClickHouse/ClickHouse/pull/3570) +- Fixed processing of queries with named sub-queries and qualified column names when `enable_optimize_predicate_expression` is enabled. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3588) +- Fixed the error `Attempt to attach to nullptr thread group` when working with materialized views. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3623) +- Fixed a crash when passing certain incorrect arguments to the `arrayReverse` function. 
[73e3a7b6](https://github.com/ClickHouse/ClickHouse/commit/73e3a7b662161d6005e7727d8a711b930386b871) +- Fixed the buffer overflow in the `extractURLParameter` function. Improved performance. Added correct processing of strings containing zero bytes. [141e9799](https://github.com/ClickHouse/ClickHouse/commit/141e9799e49201d84ea8e951d1bed4fb6d3dacb5) +- Fixed buffer overflow in the `lowerUTF8` and `upperUTF8` functions. Removed the ability to execute these functions over `FixedString` type arguments. [\#3662](https://github.com/ClickHouse/ClickHouse/pull/3662) +- Fixed a rare race condition when deleting `MergeTree` tables. [\#3680](https://github.com/ClickHouse/ClickHouse/pull/3680) +- Fixed a race condition when reading from `Buffer` tables and simultaneously performing `ALTER` or `DROP` on the target tables. [\#3719](https://github.com/ClickHouse/ClickHouse/pull/3719) +- Fixed a segfault if the `max_temporary_non_const_columns` limit was exceeded. [\#3788](https://github.com/ClickHouse/ClickHouse/pull/3788) +#### Improvements: {#improvements-1} -## ClickHouse release 18.16 -### ClickHouse release 18.16.1, 2018-12-21 - -#### Bug fixes: - -* Fixed an error that led to problems with updating dictionaries with the ODBC source. [#3825](https://github.com/ClickHouse/ClickHouse/issues/3825), [#3829](https://github.com/ClickHouse/ClickHouse/issues/3829) -* JIT compilation of aggregate functions now works with LowCardinality columns. [#3838](https://github.com/ClickHouse/ClickHouse/issues/3838) - -#### Improvements: - -* Added the `low_cardinality_allow_in_native_format` setting (enabled by default). When disabled, LowCardinality columns will be converted to ordinary columns for SELECT queries and ordinary columns will be expected for INSERT queries. [#3879](https://github.com/ClickHouse/ClickHouse/pull/3879) - -#### Build improvements: - -* Fixes for builds on macOS and ARM. - -### ClickHouse release 18.16.0, 2018-12-14 - -#### New features: - -* `DEFAULT` expressions are evaluated for missing fields when loading data in semi-structured input formats (`JSONEachRow`, `TSKV`). The feature is enabled with the `insert_sample_with_metadata` setting. [#3555](https://github.com/ClickHouse/ClickHouse/pull/3555) -* The `ALTER TABLE` query now has the `MODIFY ORDER BY` action for changing the sorting key when adding or removing a table column. This is useful for tables in the `MergeTree` family that perform additional tasks when merging based on this sorting key, such as `SummingMergeTree`, `AggregatingMergeTree`, and so on. [#3581](https://github.com/ClickHouse/ClickHouse/pull/3581) [#3755](https://github.com/ClickHouse/ClickHouse/pull/3755) -* For tables in the `MergeTree` family, now you can specify a different sorting key (`ORDER BY`) and index (`PRIMARY KEY`). The sorting key can be longer than the index. [#3581](https://github.com/ClickHouse/ClickHouse/pull/3581) -* Added the `hdfs` table function and the `HDFS` table engine for importing and exporting data to HDFS. [chenxing-xc](https://github.com/ClickHouse/ClickHouse/pull/3617) -* Added functions for working with base64: `base64Encode`, `base64Decode`, `tryBase64Decode`. [Alexander Krasheninnikov](https://github.com/ClickHouse/ClickHouse/pull/3350) -* Now you can use a parameter to configure the precision of the `uniqCombined` aggregate function (select the number of HyperLogLog cells). 
[#3406](https://github.com/ClickHouse/ClickHouse/pull/3406) -* Added the `system.contributors` table that contains the names of everyone who made commits in ClickHouse. [#3452](https://github.com/ClickHouse/ClickHouse/pull/3452) -* Added the ability to omit the partition for the `ALTER TABLE ... FREEZE` query in order to back up all partitions at once. [#3514](https://github.com/ClickHouse/ClickHouse/pull/3514) -* Added `dictGet` and `dictGetOrDefault` functions that don't require specifying the type of return value. The type is determined automatically from the dictionary description. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3564) -* Now you can specify comments for a column in the table description and change it using `ALTER`. [#3377](https://github.com/ClickHouse/ClickHouse/pull/3377) -* Reading is supported for `Join` type tables with simple keys. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3728) -* Now you can specify the options `join_use_nulls`, `max_rows_in_join`, `max_bytes_in_join`, and `join_overflow_mode` when creating a `Join` type table. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3728) -* Added the `joinGet` function that allows you to use a `Join` type table like a dictionary. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3728) -* Added the `partition_key`, `sorting_key`, `primary_key`, and `sampling_key` columns to the `system.tables` table in order to provide information about table keys. [#3609](https://github.com/ClickHouse/ClickHouse/pull/3609) -* Added the `is_in_partition_key`, `is_in_sorting_key`, `is_in_primary_key`, and `is_in_sampling_key` columns to the `system.columns` table. [#3609](https://github.com/ClickHouse/ClickHouse/pull/3609) -* Added the `min_time` and `max_time` columns to the `system.parts` table. These columns are populated when the partitioning key is an expression consisting of `DateTime` columns. [Emmanuel Donin de Rosière](https://github.com/ClickHouse/ClickHouse/pull/3800) - -#### Bug fixes: - -* Fixes and performance improvements for the `LowCardinality` data type. `GROUP BY` using `LowCardinality(Nullable(...))`. Getting the values of `extremes`. Processing high-order functions. `LEFT ARRAY JOIN`. Distributed `GROUP BY`. Functions that return `Array`. Execution of `ORDER BY`. Writing to `Distributed` tables (nicelulu). Backward compatibility for `INSERT` queries from old clients that implement the `Native` protocol. Support for `LowCardinality` for `JOIN`. Improved performance when working in a single stream. [#3823](https://github.com/ClickHouse/ClickHouse/pull/3823) [#3803](https://github.com/ClickHouse/ClickHouse/pull/3803) [#3799](https://github.com/ClickHouse/ClickHouse/pull/3799) [#3769](https://github.com/ClickHouse/ClickHouse/pull/3769) [#3744](https://github.com/ClickHouse/ClickHouse/pull/3744) [#3681](https://github.com/ClickHouse/ClickHouse/pull/3681) [#3651](https://github.com/ClickHouse/ClickHouse/pull/3651) [#3649](https://github.com/ClickHouse/ClickHouse/pull/3649) [#3641](https://github.com/ClickHouse/ClickHouse/pull/3641) [#3632](https://github.com/ClickHouse/ClickHouse/pull/3632) [#3568](https://github.com/ClickHouse/ClickHouse/pull/3568) [#3523](https://github.com/ClickHouse/ClickHouse/pull/3523) [#3518](https://github.com/ClickHouse/ClickHouse/pull/3518) -* Fixed how the `select_sequential_consistency` option works. Previously, when this setting was enabled, an incomplete result was sometimes returned after beginning to write to a new partition. 
[#2863](https://github.com/ClickHouse/ClickHouse/pull/2863) -* Databases are correctly specified when executing DDL `ON CLUSTER` queries and `ALTER UPDATE/DELETE`. [#3772](https://github.com/ClickHouse/ClickHouse/pull/3772) [#3460](https://github.com/ClickHouse/ClickHouse/pull/3460) -* Databases are correctly specified for subqueries inside a VIEW. [#3521](https://github.com/ClickHouse/ClickHouse/pull/3521) -* Fixed a bug in `PREWHERE` with `FINAL` for `VersionedCollapsingMergeTree`. [7167bfd7](https://github.com/ClickHouse/ClickHouse/commit/7167bfd7b365538f7a91c4307ad77e552ab4e8c1) -* Now you can use `KILL QUERY` to cancel queries that have not started yet because they are waiting for the table to be locked. [#3517](https://github.com/ClickHouse/ClickHouse/pull/3517) -* Corrected date and time calculations if the clocks were moved back at midnight (this happens in Iran, and happened in Moscow from 1981 to 1983). Previously, this led to the time being reset a day earlier than necessary, and also caused incorrect formatting of the date and time in text format. [#3819](https://github.com/ClickHouse/ClickHouse/pull/3819) -* Fixed bugs in some cases of `VIEW` and subqueries that omit the database. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3521) -* Fixed a race condition when simultaneously reading from a `MATERIALIZED VIEW` and deleting a `MATERIALIZED VIEW` due to not locking the internal `MATERIALIZED VIEW`. [#3404](https://github.com/ClickHouse/ClickHouse/pull/3404) [#3694](https://github.com/ClickHouse/ClickHouse/pull/3694) -* Fixed the error `Lock handler cannot be nullptr.` [#3689](https://github.com/ClickHouse/ClickHouse/pull/3689) -* Fixed query processing when the `compile_expressions` option is enabled (it's enabled by default). Nondeterministic constant expressions like the `now` function are no longer unfolded. [#3457](https://github.com/ClickHouse/ClickHouse/pull/3457) -* Fixed a crash when specifying a non-constant scale argument in `toDecimal32/64/128` functions. -* Fixed an error when trying to insert an array with `NULL` elements in the `Values` format into a column of type `Array` without `Nullable` (if `input_format_values_interpret_expressions` = 1). [#3487](https://github.com/ClickHouse/ClickHouse/pull/3487) [#3503](https://github.com/ClickHouse/ClickHouse/pull/3503) -* Fixed continuous error logging in `DDLWorker` if ZooKeeper is not available. [8f50c620](https://github.com/ClickHouse/ClickHouse/commit/8f50c620334988b28018213ec0092fe6423847e2) -* Fixed the return type for `quantile*` functions from `Date` and `DateTime` types of arguments. [#3580](https://github.com/ClickHouse/ClickHouse/pull/3580) -* Fixed the `WITH` clause if it specifies a simple alias without expressions. [#3570](https://github.com/ClickHouse/ClickHouse/pull/3570) -* Fixed processing of queries with named sub-queries and qualified column names when `enable_optimize_predicate_expression` is enabled. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3588) -* Fixed the error `Attempt to attach to nullptr thread group` when working with materialized views. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3623) -* Fixed a crash when passing certain incorrect arguments to the `arrayReverse` function. [73e3a7b6](https://github.com/ClickHouse/ClickHouse/commit/73e3a7b662161d6005e7727d8a711b930386b871) -* Fixed the buffer overflow in the `extractURLParameter` function. Improved performance. Added correct processing of strings containing zero bytes. 
[141e9799](https://github.com/ClickHouse/ClickHouse/commit/141e9799e49201d84ea8e951d1bed4fb6d3dacb5) -* Fixed buffer overflow in the `lowerUTF8` and `upperUTF8` functions. Removed the ability to execute these functions over `FixedString` type arguments. [#3662](https://github.com/ClickHouse/ClickHouse/pull/3662) -* Fixed a rare race condition when deleting `MergeTree` tables. [#3680](https://github.com/ClickHouse/ClickHouse/pull/3680) -* Fixed a race condition when reading from `Buffer` tables and simultaneously performing `ALTER` or `DROP` on the target tables. [#3719](https://github.com/ClickHouse/ClickHouse/pull/3719) -* Fixed a segfault if the `max_temporary_non_const_columns` limit was exceeded. [#3788](https://github.com/ClickHouse/ClickHouse/pull/3788) - -#### Improvements: - -* The server does not write the processed configuration files to the `/etc/clickhouse-server/` directory. Instead, it saves them in the `preprocessed_configs` directory inside `path`. This means that the `/etc/clickhouse-server/` directory doesn't have write access for the `clickhouse` user, which improves security. [#2443](https://github.com/ClickHouse/ClickHouse/pull/2443) -* The `min_merge_bytes_to_use_direct_io` option is set to 10 GiB by default. A merge that forms large parts of tables from the MergeTree family will be performed in `O_DIRECT` mode, which prevents excessive page cache eviction. [#3504](https://github.com/ClickHouse/ClickHouse/pull/3504) -* Accelerated server start when there is a very large number of tables. [#3398](https://github.com/ClickHouse/ClickHouse/pull/3398) -* Added a connection pool and HTTP `Keep-Alive` for connections between replicas. [#3594](https://github.com/ClickHouse/ClickHouse/pull/3594) -* If the query syntax is invalid, the `400 Bad Request` code is returned in the `HTTP` interface (500 was returned previously). [31bc680a](https://github.com/ClickHouse/ClickHouse/commit/31bc680ac5f4bb1d0360a8ba4696fa84bb47d6ab) -* The `join_default_strictness` option is set to `ALL` by default for compatibility. [120e2cbe](https://github.com/ClickHouse/ClickHouse/commit/120e2cbe2ff4fbad626c28042d9b28781c805afe) -* Removed logging to `stderr` from the `re2` library for invalid or complex regular expressions. [#3723](https://github.com/ClickHouse/ClickHouse/pull/3723) -* Added for the `Kafka` table engine: checks for subscriptions before beginning to read from Kafka; the kafka_max_block_size setting for the table. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3396) -* The `cityHash64`, `farmHash64`, `metroHash64`, `sipHash64`, `halfMD5`, `murmurHash2_32`, `murmurHash2_64`, `murmurHash3_32`, and `murmurHash3_64` functions now work for any number of arguments and for arguments in the form of tuples. [#3451](https://github.com/ClickHouse/ClickHouse/pull/3451) [#3519](https://github.com/ClickHouse/ClickHouse/pull/3519) -* The `arrayReverse` function now works with any types of arrays. [73e3a7b6](https://github.com/ClickHouse/ClickHouse/commit/73e3a7b662161d6005e7727d8a711b930386b871) -* Added an optional parameter: the slot size for the `timeSlots` function. [Kirill Shvakov](https://github.com/ClickHouse/ClickHouse/pull/3724) -* For `FULL` and `RIGHT JOIN`, the `max_block_size` setting is used for a stream of non-joined data from the right table. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3699) -* Added the `--secure` command line parameter in `clickhouse-benchmark` and `clickhouse-performance-test` to enable TLS. 
[#3688](https://github.com/ClickHouse/ClickHouse/pull/3688) [#3690](https://github.com/ClickHouse/ClickHouse/pull/3690) -* Type conversion when the structure of a `Buffer` type table does not match the structure of the destination table. [Vitaly Baranov](https://github.com/ClickHouse/ClickHouse/pull/3603) -* Added the `tcp_keep_alive_timeout` option to enable keep-alive packets after inactivity for the specified time interval. [#3441](https://github.com/ClickHouse/ClickHouse/pull/3441) -* Removed unnecessary quoting of values for the partition key in the `system.parts` table if it consists of a single column. [#3652](https://github.com/ClickHouse/ClickHouse/pull/3652) -* The modulo function works for `Date` and `DateTime` data types. [#3385](https://github.com/ClickHouse/ClickHouse/pull/3385) -* Added synonyms for the `POWER`, `LN`, `LCASE`, `UCASE`, `REPLACE`, `LOCATE`, `SUBSTR`, and `MID` functions. [#3774](https://github.com/ClickHouse/ClickHouse/pull/3774) [#3763](https://github.com/ClickHouse/ClickHouse/pull/3763) Some function names are case-insensitive for compatibility with the SQL standard. Added syntactic sugar `SUBSTRING(expr FROM start FOR length)` for compatibility with SQL. [#3804](https://github.com/ClickHouse/ClickHouse/pull/3804) -* Added the ability to `mlock` memory pages corresponding to `clickhouse-server` executable code to prevent it from being forced out of memory. This feature is disabled by default. [#3553](https://github.com/ClickHouse/ClickHouse/pull/3553) -* Improved performance when reading from `O_DIRECT` (with the `min_bytes_to_use_direct_io` option enabled). [#3405](https://github.com/ClickHouse/ClickHouse/pull/3405) -* Improved performance of the `dictGet...OrDefault` function for a constant key argument and a non-constant default argument. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3563) -* The `firstSignificantSubdomain` function now processes the domains `gov`, `mil`, and `edu`. [Igor Hatarist](https://github.com/ClickHouse/ClickHouse/pull/3601) Improved performance. [#3628](https://github.com/ClickHouse/ClickHouse/pull/3628) -* Ability to specify custom environment variables for starting `clickhouse-server` using the `SYS-V init.d` script by defining `CLICKHOUSE_PROGRAM_ENV` in `/etc/default/clickhouse`. -[Pavlo Bashynskyi](https://github.com/ClickHouse/ClickHouse/pull/3612) -* Correct return code for the clickhouse-server init script. [#3516](https://github.com/ClickHouse/ClickHouse/pull/3516) -* The `system.metrics` table now has the `VersionInteger` metric, and `system.build_options` has the added line `VERSION_INTEGER`, which contains the numeric form of the ClickHouse version, such as `18016000`. [#3644](https://github.com/ClickHouse/ClickHouse/pull/3644) -* Removed the ability to compare the `Date` type with a number to avoid potential errors like `date = 2018-12-17`, where quotes around the date are omitted by mistake. [#3687](https://github.com/ClickHouse/ClickHouse/pull/3687) -* Fixed the behavior of stateful functions like `rowNumberInAllBlocks`. They previously output a result that was one number larger due to starting during query analysis. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3729) -* If the `force_restore_data` file can't be deleted, an error message is displayed. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3794) - -#### Build improvements: - -* Updated the `jemalloc` library, which fixes a potential memory leak. 
[Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3557) -* Profiling with `jemalloc` is enabled by default for debug builds. [2cc82f5c](https://github.com/ClickHouse/ClickHouse/commit/2cc82f5cbe266421cd4c1165286c2c47e5ffcb15) -* Added the ability to run integration tests when only `Docker` is installed on the system. [#3650](https://github.com/ClickHouse/ClickHouse/pull/3650) -* Added the fuzz expression test in SELECT queries. [#3442](https://github.com/ClickHouse/ClickHouse/pull/3442) -* Added a stress test for commits, which performs functional tests in parallel and in random order to detect more race conditions. [#3438](https://github.com/ClickHouse/ClickHouse/pull/3438) -* Improved the method for starting clickhouse-server in a Docker image. [Elghazal Ahmed](https://github.com/ClickHouse/ClickHouse/pull/3663) -* For a Docker image, added support for initializing databases using files in the `/docker-entrypoint-initdb.d` directory. [Konstantin Lebedev](https://github.com/ClickHouse/ClickHouse/pull/3695) -* Fixes for builds on ARM. [#3709](https://github.com/ClickHouse/ClickHouse/pull/3709) - -#### Backward incompatible changes: - -* Removed the ability to compare the `Date` type with a number. Instead of `toDate('2018-12-18') = 17883`, you must use the explicit type conversion `= toDate(17883)`. [#3687](https://github.com/ClickHouse/ClickHouse/pull/3687) - -## ClickHouse release 18.14 -### ClickHouse release 18.14.19, 2018-12-19 - -#### Bug fixes: - -* Fixed an error that led to problems with updating dictionaries with the ODBC source. [#3825](https://github.com/ClickHouse/ClickHouse/issues/3825), [#3829](https://github.com/ClickHouse/ClickHouse/issues/3829) -* Databases are correctly specified when executing DDL `ON CLUSTER` queries. [#3460](https://github.com/ClickHouse/ClickHouse/pull/3460) -* Fixed a segfault if the `max_temporary_non_const_columns` limit was exceeded. [#3788](https://github.com/ClickHouse/ClickHouse/pull/3788) - -#### Build improvements: - -* Fixes for builds on ARM. - -### ClickHouse release 18.14.18, 2018-12-04 - -#### Bug fixes: -* Fixed an error in the `dictGet...` function for dictionaries of type `range`, if one of the arguments is constant and the other is not. [#3751](https://github.com/ClickHouse/ClickHouse/pull/3751) -* Fixed an error that caused the messages `netlink: '...': attribute type 1 has an invalid length` to be printed in the Linux kernel log, which happened only on sufficiently recent versions of the Linux kernel. [#3749](https://github.com/ClickHouse/ClickHouse/pull/3749) -* Fixed a segfault in the `empty` function for arguments of the `FixedString` type. [Daniel, Dao Quang Minh](https://github.com/ClickHouse/ClickHouse/pull/3703) -* Fixed excessive memory allocation when using a large value of the `max_query_size` setting (a memory chunk of `max_query_size` bytes was preallocated at once). [#3720](https://github.com/ClickHouse/ClickHouse/pull/3720) - -#### Build changes: -* Fixed the build with LLVM/Clang libraries of version 7 from the OS packages (these libraries are used for runtime query compilation). [#3582](https://github.com/ClickHouse/ClickHouse/pull/3582) - -### ClickHouse release 18.14.17, 2018-11-30 - -#### Bug fixes: -* Fixed cases when the ODBC bridge process did not terminate with the main server process. [#3642](https://github.com/ClickHouse/ClickHouse/pull/3642) -* Fixed synchronous insertion into the `Distributed` table with a column list that differs from the column list of the remote table.
[#3673](https://github.com/ClickHouse/ClickHouse/pull/3673) -* Fixed a rare race condition that can lead to a crash when dropping a MergeTree table. [#3643](https://github.com/ClickHouse/ClickHouse/pull/3643) -* Fixed a query deadlock in the case when query thread creation fails with the `Resource temporarily unavailable` error. [#3643](https://github.com/ClickHouse/ClickHouse/pull/3643) -* Fixed parsing of the `ENGINE` clause when the `CREATE AS table` syntax was used and the `ENGINE` clause was specified before the `AS table` (the error resulted in ignoring the specified engine). [#3692](https://github.com/ClickHouse/ClickHouse/pull/3692) - -### ClickHouse release 18.14.15, 2018-11-21 - -#### Bug fixes: -* The size of a memory chunk was overestimated while deserializing a column of type `Array(String)`, which led to "Memory limit exceeded" errors. The issue appeared in version 18.12.13. [#3589](https://github.com/ClickHouse/ClickHouse/issues/3589) - -### ClickHouse release 18.14.14, 2018-11-20 - -#### Bug fixes: -* Fixed `ON CLUSTER` queries when the cluster is configured as secure (the `<secure>` flag). [#3599](https://github.com/ClickHouse/ClickHouse/pull/3599) - -#### Build changes: -* Fixed build problems (llvm-7 from the OS packages, macOS). [#3582](https://github.com/ClickHouse/ClickHouse/pull/3582) - -### ClickHouse release 18.14.13, 2018-11-08 - -#### Bug fixes: -* Fixed the `Block structure mismatch in MergingSorted stream` error. [#3162](https://github.com/ClickHouse/ClickHouse/issues/3162) -* Fixed `ON CLUSTER` queries in the case when secure connections were turned on in the cluster config (the `<secure>` flag). [#3465](https://github.com/ClickHouse/ClickHouse/pull/3465) -* Fixed an error in queries that used `SAMPLE`, `PREWHERE` and alias columns. [#3543](https://github.com/ClickHouse/ClickHouse/pull/3543) -* Fixed a rare `unknown compression method` error when the `min_bytes_to_use_direct_io` setting was enabled. [#3544](https://github.com/ClickHouse/ClickHouse/pull/3544) - -#### Performance improvements: -* Fixed a performance regression of queries with `GROUP BY` of columns of UInt16 or Date type when executing on AMD EPYC processors. [Igor Lapko](https://github.com/ClickHouse/ClickHouse/pull/3512) -* Fixed a performance regression of queries that process long strings. [#3530](https://github.com/ClickHouse/ClickHouse/pull/3530) - -#### Build improvements: -* Improvements for simplifying the Arcadia build. [#3475](https://github.com/ClickHouse/ClickHouse/pull/3475), [#3535](https://github.com/ClickHouse/ClickHouse/pull/3535) - -### ClickHouse release 18.14.12, 2018-11-02 - -#### Bug fixes: - -* Fixed a crash on joining two unnamed subqueries. [#3505](https://github.com/ClickHouse/ClickHouse/pull/3505) -* Fixed generating incorrect queries (with an empty `WHERE` clause) when querying external databases. [hotid](https://github.com/ClickHouse/ClickHouse/pull/3477) -* Fixed using an incorrect timeout value in ODBC dictionaries. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3511) - -### ClickHouse release 18.14.11, 2018-10-29 - -#### Bug fixes: - -* Fixed the error `Block structure mismatch in UNION stream: different number of columns` in LIMIT queries. [#2156](https://github.com/ClickHouse/ClickHouse/issues/2156) -* Fixed errors when merging data in tables containing arrays inside Nested structures. [#3397](https://github.com/ClickHouse/ClickHouse/pull/3397) -* Fixed incorrect query results if the `merge_tree_uniform_read_distribution` setting is disabled (it is enabled by default).
[#3429](https://github.com/ClickHouse/ClickHouse/pull/3429) -* Fixed an error on inserts to a Distributed table in Native format. [#3411](https://github.com/ClickHouse/ClickHouse/issues/3411) - -### ClickHouse release 18.14.10, 2018-10-23 - -* The `compile_expressions` setting (JIT compilation of expressions) is disabled by default. [#3410](https://github.com/ClickHouse/ClickHouse/pull/3410) -* The `enable_optimize_predicate_expression` setting is disabled by default. - -### ClickHouse release 18.14.9, 2018-10-16 - -#### New features: - -* The `WITH CUBE` modifier for `GROUP BY` (the alternative syntax `GROUP BY CUBE(...)` is also available). [#3172](https://github.com/ClickHouse/ClickHouse/pull/3172) -* Added the `formatDateTime` function. [Alexandr Krasheninnikov](https://github.com/ClickHouse/ClickHouse/pull/2770) -* Added the `JDBC` table engine and `jdbc` table function (requires installing clickhouse-jdbc-bridge). [Alexandr Krasheninnikov](https://github.com/ClickHouse/ClickHouse/pull/3210) -* Added functions for working with the ISO week number: `toISOWeek`, `toISOYear`, `toStartOfISOYear`, and `toDayOfYear`. [#3146](https://github.com/ClickHouse/ClickHouse/pull/3146) -* Now you can use `Nullable` columns for `MySQL` and `ODBC` tables. [#3362](https://github.com/ClickHouse/ClickHouse/pull/3362) -* Nested data structures can be read as nested objects in `JSONEachRow` format. Added the `input_format_import_nested_json` setting. [Veloman Yunkan](https://github.com/ClickHouse/ClickHouse/pull/3144) -* Parallel processing is available for many `MATERIALIZED VIEW`s when inserting data. See the `parallel_view_processing` setting. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3208) -* Added the `SYSTEM FLUSH LOGS` query (forced log flushes to system tables such as `query_log`) [#3321](https://github.com/ClickHouse/ClickHouse/pull/3321) -* Now you can use pre-defined `database` and `table` macros when declaring `Replicated` tables. [#3251](https://github.com/ClickHouse/ClickHouse/pull/3251) -* Added the ability to read `Decimal` type values in engineering notation (indicating powers of ten). [#3153](https://github.com/ClickHouse/ClickHouse/pull/3153) - -#### Experimental features: - -* Optimization of the GROUP BY clause for `LowCardinality data types.` [#3138](https://github.com/ClickHouse/ClickHouse/pull/3138) -* Optimized calculation of expressions for `LowCardinality data types.` [#3200](https://github.com/ClickHouse/ClickHouse/pull/3200) - -#### Improvements: - -* Significantly reduced memory consumption for queries with `ORDER BY` and `LIMIT`. See the `max_bytes_before_remerge_sort` setting. [#3205](https://github.com/ClickHouse/ClickHouse/pull/3205) -* In the absence of `JOIN` (`LEFT`, `INNER`, ...), `INNER JOIN` is assumed. [#3147](https://github.com/ClickHouse/ClickHouse/pull/3147) -* Qualified asterisks work correctly in queries with `JOIN`. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3202) -* The `ODBC` table engine correctly chooses the method for quoting identifiers in the SQL dialect of a remote database. [Alexandr Krasheninnikov](https://github.com/ClickHouse/ClickHouse/pull/3210) -* The `compile_expressions` setting (JIT compilation of expressions) is enabled by default. -* Fixed behavior for simultaneous DROP DATABASE/TABLE IF EXISTS and CREATE DATABASE/TABLE IF NOT EXISTS. Previously, a `CREATE DATABASE ... IF NOT EXISTS` query could return the error message "File ... already exists", and the `CREATE TABLE ... 
IF NOT EXISTS` and `DROP TABLE IF EXISTS` queries could return `Table ... is creating or attaching right now`. [#3101](https://github.com/ClickHouse/ClickHouse/pull/3101) -* LIKE and IN expressions with a constant right half are passed to the remote server when querying from MySQL or ODBC tables. [#3182](https://github.com/ClickHouse/ClickHouse/pull/3182) -* Comparisons with constant expressions in a WHERE clause are passed to the remote server when querying from MySQL and ODBC tables. Previously, only comparisons with constants were passed. [#3182](https://github.com/ClickHouse/ClickHouse/pull/3182) -* Correct calculation of row width in the terminal for `Pretty` formats, including strings with hieroglyphs. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3257). -* `ON CLUSTER` can be specified for `ALTER UPDATE` queries. -* Improved performance for reading data in `JSONEachRow` format. [#3332](https://github.com/ClickHouse/ClickHouse/pull/3332) -* Added synonyms for the `LENGTH` and `CHARACTER_LENGTH` functions for compatibility. The `CONCAT` function is no longer case-sensitive. [#3306](https://github.com/ClickHouse/ClickHouse/pull/3306) -* Added the `TIMESTAMP` synonym for the `DateTime` type. [#3390](https://github.com/ClickHouse/ClickHouse/pull/3390) -* There is always space reserved for query_id in the server logs, even if the log line is not related to a query. This makes it easier to parse server text logs with third-party tools. -* Memory consumption by a query is logged when it exceeds the next level of an integer number of gigabytes. [#3205](https://github.com/ClickHouse/ClickHouse/pull/3205) -* Added compatibility mode for the case when the client library that uses the Native protocol sends fewer columns by mistake than the server expects for the INSERT query. This scenario was possible when using the clickhouse-cpp library. Previously, this scenario caused the server to crash. [#3171](https://github.com/ClickHouse/ClickHouse/pull/3171) -* In a user-defined WHERE expression in `clickhouse-copier`, you can now use a `partition_key` alias (for additional filtering by source table partition). This is useful if the partitioning scheme changes during copying, but only changes slightly. [#3166](https://github.com/ClickHouse/ClickHouse/pull/3166) -* The workflow of the `Kafka` engine has been moved to a background thread pool in order to automatically reduce the speed of data reading at high loads. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3215). -* Support for reading `Tuple` and `Nested` values of structures like `struct` in the `Cap'n'Proto format`. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3216) -* The list of top-level domains for the `firstSignificantSubdomain` function now includes the domain `biz`. [decaseal](https://github.com/ClickHouse/ClickHouse/pull/3219) -* In the configuration of external dictionaries, `null_value` is interpreted as the value of the default data type. [#3330](https://github.com/ClickHouse/ClickHouse/pull/3330) -* Support for the `intDiv` and `intDivOrZero` functions for `Decimal`. [b48402e8](https://github.com/ClickHouse/ClickHouse/commit/b48402e8712e2b9b151e0eef8193811d433a1264) -* Support for the `Date`, `DateTime`, `UUID`, and `Decimal` types as a key for the `sumMap` aggregate function. [#3281](https://github.com/ClickHouse/ClickHouse/pull/3281) -* Support for the `Decimal` data type in external dictionaries. 
[#3324](https://github.com/ClickHouse/ClickHouse/pull/3324) -* Support for the `Decimal` data type in `SummingMergeTree` tables. [#3348](https://github.com/ClickHouse/ClickHouse/pull/3348) -* Added specializations for `UUID` in `if`. [#3366](https://github.com/ClickHouse/ClickHouse/pull/3366) -* Reduced the number of `open` and `close` system calls when reading from a `MergeTree table`. [#3283](https://github.com/ClickHouse/ClickHouse/pull/3283) -* A `TRUNCATE TABLE` query can be executed on any replica (the query is passed to the leader replica). [Kirill Shvakov](https://github.com/ClickHouse/ClickHouse/pull/3375) - -#### Bug fixes: - -* Fixed an issue with `Dictionary` tables for `range_hashed` dictionaries. This error occurred in version 18.12.17. [#1702](https://github.com/ClickHouse/ClickHouse/pull/1702) -* Fixed an error when loading `range_hashed` dictionaries (the message `Unsupported type Nullable (...)`). This error occurred in version 18.12.17. [#3362](https://github.com/ClickHouse/ClickHouse/pull/3362) -* Fixed errors in the `pointInPolygon` function due to the accumulation of inaccurate calculations for polygons with a large number of vertices located close to each other. [#3331](https://github.com/ClickHouse/ClickHouse/pull/3331) [#3341](https://github.com/ClickHouse/ClickHouse/pull/3341) -* If after merging data parts, the checksum for the resulting part differs from the result of the same merge in another replica, the result of the merge is deleted and the data part is downloaded from the other replica (this is the correct behavior). But after downloading the data part, it couldn't be added to the working set because of an error that the part already exists (because the data part was deleted with some delay after the merge). This led to cyclical attempts to download the same data. [#3194](https://github.com/ClickHouse/ClickHouse/pull/3194) -* Fixed incorrect calculation of total memory consumption by queries (because of incorrect calculation, the `max_memory_usage_for_all_queries` setting worked incorrectly and the `MemoryTracking` metric had an incorrect value). This error occurred in version 18.12.13. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3344) -* Fixed the functionality of `CREATE TABLE ... ON CLUSTER ... AS SELECT ...` This error occurred in version 18.12.13. [#3247](https://github.com/ClickHouse/ClickHouse/pull/3247) -* Fixed unnecessary preparation of data structures for `JOIN`s on the server that initiates the query if the `JOIN` is only performed on remote servers. [#3340](https://github.com/ClickHouse/ClickHouse/pull/3340) -* Fixed bugs in the `Kafka` engine: deadlocks after exceptions when starting to read data, and locks upon completion [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3215). -* For `Kafka` tables, the optional `schema` parameter was not passed (the schema of the `Cap'n'Proto` format). [Vojtech Splichal](https://github.com/ClickHouse/ClickHouse/pull/3150) -* If the ensemble of ZooKeeper servers has servers that accept the connection but then immediately close it instead of responding to the handshake, ClickHouse chooses to connect another server. Previously, this produced the error `Cannot read all data. Bytes read: 0. Bytes expected: 4.` and the server couldn't start. [8218cf3a](https://github.com/ClickHouse/ClickHouse/commit/8218cf3a5f39a43401953769d6d12a0bb8d29da9) -* If the ensemble of ZooKeeper servers contains servers for which the DNS query returns an error, these servers are ignored. 
[17b8e209](https://github.com/ClickHouse/ClickHouse/commit/17b8e209221061325ad7ba0539f03c6e65f87f29) -* Fixed type conversion between `Date` and `DateTime` when inserting data in the `VALUES` format (if `input_format_values_interpret_expressions = 1`). Previously, the conversion was performed between the numerical value of the number of days in Unix Epoch time and the Unix timestamp, which led to unexpected results. [#3229](https://github.com/ClickHouse/ClickHouse/pull/3229) -* Corrected type conversion between `Decimal` and integer numbers. [#3211](https://github.com/ClickHouse/ClickHouse/pull/3211) -* Fixed errors in the `enable_optimize_predicate_expression` setting. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3231) -* Fixed a parsing error in CSV format with floating-point numbers if a non-default CSV separator is used, such as `;` [#3155](https://github.com/ClickHouse/ClickHouse/pull/3155) -* Fixed the `arrayCumSumNonNegative` function (it does not accumulate negative values if the accumulator is less than zero). [Aleksey Studnev](https://github.com/ClickHouse/ClickHouse/pull/3163) -* Fixed how `Merge` tables work on top of `Distributed` tables when using `PREWHERE`. [#3165](https://github.com/ClickHouse/ClickHouse/pull/3165) -* Bug fixes in the `ALTER UPDATE` query. -* Fixed bugs in the `odbc` table function that appeared in version 18.12. [#3197](https://github.com/ClickHouse/ClickHouse/pull/3197) -* Fixed the operation of aggregate functions with `StateArray` combinators. [#3188](https://github.com/ClickHouse/ClickHouse/pull/3188) -* Fixed a crash when dividing a `Decimal` value by zero. [69dd6609](https://github.com/ClickHouse/ClickHouse/commit/69dd6609193beb4e7acd3e6ad216eca0ccfb8179) -* Fixed output of types for operations using `Decimal` and integer arguments. [#3224](https://github.com/ClickHouse/ClickHouse/pull/3224) -* Fixed the segfault during `GROUP BY` on `Decimal128`. [3359ba06](https://github.com/ClickHouse/ClickHouse/commit/3359ba06c39fcd05bfdb87d6c64154819621e13a) -* The `log_query_threads` setting (logging information about each thread of query execution) now takes effect only if the `log_queries` option (logging information about queries) is set to 1. Since the `log_query_threads` option is enabled by default, information about threads was previously logged even if query logging was disabled. [#3241](https://github.com/ClickHouse/ClickHouse/pull/3241) -* Fixed an error in the distributed operation of the quantiles aggregate function (the error message `Not found column quantile...`). [292a8855](https://github.com/ClickHouse/ClickHouse/commit/292a885533b8e3b41ce8993867069d14cbd5a664) -* Fixed the compatibility problem when working on a cluster of version 18.12.17 servers and older servers at the same time. For distributed queries with GROUP BY keys of both fixed and non-fixed length, if there was a large amount of data to aggregate, the returned data was not always fully aggregated (two different rows contained the same aggregation keys). [#3254](https://github.com/ClickHouse/ClickHouse/pull/3254) -* Fixed handling of substitutions in `clickhouse-performance-test`, if the query contains only part of the substitutions declared in the test. [#3263](https://github.com/ClickHouse/ClickHouse/pull/3263) -* Fixed an error when using `FINAL` with `PREWHERE`. [#3298](https://github.com/ClickHouse/ClickHouse/pull/3298) -* Fixed an error when using `PREWHERE` over columns that were added during `ALTER`. 
[#3298](https://github.com/ClickHouse/ClickHouse/pull/3298) -* Added a check for the absence of `arrayJoin` for `DEFAULT` and `MATERIALIZED` expressions. Previously, `arrayJoin` led to an error when inserting data. [#3337](https://github.com/ClickHouse/ClickHouse/pull/3337) -* Added a check for the absence of `arrayJoin` in a `PREWHERE` clause. Previously, this led to messages like `Size ... doesn't match` or `Unknown compression method` when executing queries. [#3357](https://github.com/ClickHouse/ClickHouse/pull/3357) -* Fixed segfault that could occur in rare cases after optimization that replaced AND chains from equality evaluations with the corresponding IN expression. [liuyimin-bytedance](https://github.com/ClickHouse/ClickHouse/pull/3339) -* Minor corrections to `clickhouse-benchmark`: previously, client information was not sent to the server; now the number of queries executed is calculated more accurately when shutting down and for limiting the number of iterations. [#3351](https://github.com/ClickHouse/ClickHouse/pull/3351) [#3352](https://github.com/ClickHouse/ClickHouse/pull/3352) - -#### Backward incompatible changes: - -* Removed the `allow_experimental_decimal_type` option. The `Decimal` data type is available for default use. [#3329](https://github.com/ClickHouse/ClickHouse/pull/3329) - -## ClickHouse release 18.12 - -### ClickHouse release 18.12.17, 2018-09-16 - -#### New features: - -* `invalidate_query` (the ability to specify a query to check whether an external dictionary needs to be updated) is implemented for the `clickhouse` source. [#3126](https://github.com/ClickHouse/ClickHouse/pull/3126) -* Added the ability to use `UInt*`, `Int*`, and `DateTime` data types (along with the `Date` type) as a `range_hashed` external dictionary key that defines the boundaries of ranges. Now `NULL` can be used to designate an open range. [Vasily Nemkov](https://github.com/ClickHouse/ClickHouse/pull/3123) -* The `Decimal` type now supports `var*` and `stddev*` aggregate functions. [#3129](https://github.com/ClickHouse/ClickHouse/pull/3129) -* The `Decimal` type now supports mathematical functions (`exp`, `sin` and so on.) [#3129](https://github.com/ClickHouse/ClickHouse/pull/3129) -* The `system.part_log` table now has the `partition_id` column. [#3089](https://github.com/ClickHouse/ClickHouse/pull/3089) - -#### Bug fixes: - -* `Merge` now works correctly on `Distributed` tables. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3159) -* Fixed incompatibility (unnecessary dependency on the `glibc` version) that made it impossible to run ClickHouse on `Ubuntu Precise` and older versions. The incompatibility arose in version 18.12.13. [#3130](https://github.com/ClickHouse/ClickHouse/pull/3130) -* Fixed errors in the `enable_optimize_predicate_expression` setting. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3107) -* Fixed a minor issue with backwards compatibility that appeared when working with a cluster of replicas on versions earlier than 18.12.13 and simultaneously creating a new replica of a table on a server with a newer version (shown in the message `Can not clone replica, because the ... updated to new ClickHouse version`, which is logical, but shouldn't happen). [#3122](https://github.com/ClickHouse/ClickHouse/pull/3122) - -#### Backward incompatible changes: - -* The `enable_optimize_predicate_expression` option is enabled by default (which is rather optimistic). 
If query analysis errors occur that are related to searching for the column names, set `enable_optimize_predicate_expression` to 0. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3107) - -### ClickHouse release 18.12.14, 2018-09-13 - -#### New features: - -* Added support for `ALTER UPDATE` queries. [#3035](https://github.com/ClickHouse/ClickHouse/pull/3035) -* Added the `allow_ddl` option, which restricts the user's access to DDL queries. [#3104](https://github.com/ClickHouse/ClickHouse/pull/3104) -* Added the `min_merge_bytes_to_use_direct_io` option for `MergeTree` engines, which allows you to set a threshold for the total size of the merge (when above the threshold, data part files will be handled using O_DIRECT). [#3117](https://github.com/ClickHouse/ClickHouse/pull/3117) -* The `system.merges` system table now contains the `partition_id` column. [#3099](https://github.com/ClickHouse/ClickHouse/pull/3099) - -#### Improvements - -* If a data part remains unchanged during mutation, it isn't downloaded by replicas. [#3103](https://github.com/ClickHouse/ClickHouse/pull/3103) -* Autocomplete is available for names of settings when working with `clickhouse-client`. [#3106](https://github.com/ClickHouse/ClickHouse/pull/3106) - -#### Bug fixes: - -* Added a check for the sizes of arrays that are elements of `Nested` type fields when inserting. [#3118](https://github.com/ClickHouse/ClickHouse/pull/3118) -* Fixed an error updating external dictionaries with the `ODBC` source and `hashed` storage. This error occurred in version 18.12.13. -* Fixed a crash when creating a temporary table from a query with an `IN` condition. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3098) -* Fixed an error in aggregate functions for arrays that can have `NULL` elements. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3097) - - -### ClickHouse release 18.12.13, 2018-09-10 - -#### New features: - -* Added the `DECIMAL(digits, scale)` data type (`Decimal32(scale)`, `Decimal64(scale)`, `Decimal128(scale)`). To enable it, use the setting `allow_experimental_decimal_type`. [#2846](https://github.com/ClickHouse/ClickHouse/pull/2846) [#2970](https://github.com/ClickHouse/ClickHouse/pull/2970) [#3008](https://github.com/ClickHouse/ClickHouse/pull/3008) [#3047](https://github.com/ClickHouse/ClickHouse/pull/3047) -* New `WITH ROLLUP` modifier for `GROUP BY` (alternative syntax: `GROUP BY ROLLUP(...)`). [#2948](https://github.com/ClickHouse/ClickHouse/pull/2948) -* In queries with JOIN, the star character expands to a list of columns in all tables, in compliance with the SQL standard. You can restore the old behavior by setting `asterisk_left_columns_only` to 1 on the user configuration level. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2787) -* Added support for JOIN with table functions. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2907) -* Autocomplete by pressing Tab in clickhouse-client. [Sergey Shcherbin](https://github.com/ClickHouse/ClickHouse/pull/2447) -* Ctrl+C in clickhouse-client clears a query that was entered. [#2877](https://github.com/ClickHouse/ClickHouse/pull/2877) -* Added the `join_default_strictness` setting (values: `"`, `'any'`, `'all'`). This allows you to not specify `ANY` or `ALL` for `JOIN`. [#2982](https://github.com/ClickHouse/ClickHouse/pull/2982) -* Each line of the server log related to query processing shows the query ID. 
[#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) -* Now you can get query execution logs in clickhouse-client (use the `send_logs_level` setting). With distributed query processing, logs are cascaded from all the servers. [#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) -* The `system.query_log` and `system.processes` (`SHOW PROCESSLIST`) tables now have information about all changed settings when you run a query (the nested structure of the `Settings` data). Added the `log_query_settings` setting. [#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) -* The `system.query_log` and `system.processes` tables now show information about the number of threads that are participating in query execution (see the `thread_numbers` column). [#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) -* Added `ProfileEvents` counters that measure the time spent on reading and writing over the network and reading and writing to disk, the number of network errors, and the time spent waiting when network bandwidth is limited. [#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) -* Added `ProfileEvents`counters that contain the system metrics from rusage (you can use them to get information about CPU usage in userspace and the kernel, page faults, and context switches), as well as taskstats metrics (use these to obtain information about I/O wait time, CPU wait time, and the amount of data read and recorded, both with and without page cache). [#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) -* The `ProfileEvents` counters are applied globally and for each query, as well as for each query execution thread, which allows you to profile resource consumption by query in detail. [#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) -* Added the `system.query_thread_log` table, which contains information about each query execution thread. Added the `log_query_threads` setting. [#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) -* The `system.metrics` and `system.events` tables now have built-in documentation. [#3016](https://github.com/ClickHouse/ClickHouse/pull/3016) -* Added the `arrayEnumerateDense` function. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2975) -* Added the `arrayCumSumNonNegative` and `arrayDifference` functions. [Aleksey Studnev](https://github.com/ClickHouse/ClickHouse/pull/2942) -* Added the `retention` aggregate function. [Sundy Li](https://github.com/ClickHouse/ClickHouse/pull/2887) -* Now you can add (merge) states of aggregate functions by using the plus operator, and multiply the states of aggregate functions by a nonnegative constant. [#3062](https://github.com/ClickHouse/ClickHouse/pull/3062) [#3034](https://github.com/ClickHouse/ClickHouse/pull/3034) -* Tables in the MergeTree family now have the virtual column `_partition_id`. [#3089](https://github.com/ClickHouse/ClickHouse/pull/3089) - -#### Experimental features: - -* Added the `LowCardinality(T)` data type. This data type automatically creates a local dictionary of values and allows data processing without unpacking the dictionary. [#2830](https://github.com/ClickHouse/ClickHouse/pull/2830) -* Added a cache of JIT-compiled functions and a counter for the number of uses before compiling. To JIT compile expressions, enable the `compile_expressions` setting. 
[#2990](https://github.com/ClickHouse/ClickHouse/pull/2990) [#3077](https://github.com/ClickHouse/ClickHouse/pull/3077) - -#### Improvements: - -* Fixed the problem with unlimited accumulation of the replication log when there are abandoned replicas. Added an effective recovery mode for replicas with a long lag. -* Improved performance of `GROUP BY` with multiple aggregation fields when one of them is string and the others are fixed length. -* Improved performance when using `PREWHERE` and with implicit transfer of expressions in `PREWHERE`. -* Improved parsing performance for text formats (`CSV`, `TSV`). [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2977) [#2980](https://github.com/ClickHouse/ClickHouse/pull/2980) -* Improved performance of reading strings and arrays in binary formats. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2955) -* Increased performance and reduced memory consumption for queries to `system.tables` and `system.columns` when there is a very large number of tables on a single server. [#2953](https://github.com/ClickHouse/ClickHouse/pull/2953) -* Fixed a performance problem in the case of a large stream of queries that result in an error (the ` _dl_addr` function is visible in `perf top`, but the server isn't using much CPU). [#2938](https://github.com/ClickHouse/ClickHouse/pull/2938) -* Conditions are cast into the View (when `enable_optimize_predicate_expression` is enabled). [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2907) -* Improvements to the functionality for the `UUID` data type. [#3074](https://github.com/ClickHouse/ClickHouse/pull/3074) [#2985](https://github.com/ClickHouse/ClickHouse/pull/2985) -* The `UUID` data type is supported in The-Alchemist dictionaries. [#2822](https://github.com/ClickHouse/ClickHouse/pull/2822) -* The `visitParamExtractRaw` function works correctly with nested structures. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2974) -* When the `input_format_skip_unknown_fields` setting is enabled, object fields in `JSONEachRow` format are skipped correctly. [BlahGeek](https://github.com/ClickHouse/ClickHouse/pull/2958) -* For a `CASE` expression with conditions, you can now omit `ELSE`, which is equivalent to `ELSE NULL`. [#2920](https://github.com/ClickHouse/ClickHouse/pull/2920) -* The operation timeout can now be configured when working with ZooKeeper. [urykhy](https://github.com/ClickHouse/ClickHouse/pull/2971) -* You can specify an offset for `LIMIT n, m` as `LIMIT n OFFSET m`. [#2840](https://github.com/ClickHouse/ClickHouse/pull/2840) -* You can use the `SELECT TOP n` syntax as an alternative for `LIMIT`. [#2840](https://github.com/ClickHouse/ClickHouse/pull/2840) -* Increased the size of the queue to write to system tables, so the `SystemLog parameter queue is full` error doesn't happen as often. -* The `windowFunnel` aggregate function now supports events that meet multiple conditions. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2801) -* Duplicate columns can be used in a `USING` clause for `JOIN`. [#3006](https://github.com/ClickHouse/ClickHouse/pull/3006) -* `Pretty` formats now have a limit on column alignment by width. Use the `output_format_pretty_max_column_pad_width` setting. If a value is wider, it will still be displayed in its entirety, but the other cells in the table will not be too wide. [#3003](https://github.com/ClickHouse/ClickHouse/pull/3003) -* The `odbc` table function now allows you to specify the database/schema name. 
[Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2885) -* Added the ability to use a username specified in the `clickhouse-client` config file. [Vladimir Kozbin](https://github.com/ClickHouse/ClickHouse/pull/2909) -* The `ZooKeeperExceptions` counter has been split into three counters: `ZooKeeperUserExceptions`, `ZooKeeperHardwareExceptions`, and `ZooKeeperOtherExceptions`. -* `ALTER DELETE` queries work for materialized views. -* Added randomization when running the cleanup thread periodically for `ReplicatedMergeTree` tables in order to avoid periodic load spikes when there are a very large number of `ReplicatedMergeTree` tables. -* Support for `ATTACH TABLE ... ON CLUSTER` queries. [#3025](https://github.com/ClickHouse/ClickHouse/pull/3025) - -#### Bug fixes: - -* Fixed an issue with `Dictionary` tables (throws the `Size of offsets doesn't match size of column` or `Unknown compression method` exception). This bug appeared in version 18.10.3. [#2913](https://github.com/ClickHouse/ClickHouse/issues/2913) -* Fixed a bug when merging `CollapsingMergeTree` tables if one of the data parts is empty (these parts are formed during merge or `ALTER DELETE` if all data was deleted), and the `vertical` algorithm was used for the merge. [#3049](https://github.com/ClickHouse/ClickHouse/pull/3049) -* Fixed a race condition during `DROP` or `TRUNCATE` for `Memory` tables with a simultaneous `SELECT`, which could lead to server crashes. This bug appeared in version 1.1.54388. [#3038](https://github.com/ClickHouse/ClickHouse/pull/3038) -* Fixed the possibility of data loss when inserting in `Replicated` tables if the `Session is expired` error is returned (data loss can be detected by the `ReplicatedDataLoss` metric). This error occurred in version 1.1.54378. [#2939](https://github.com/ClickHouse/ClickHouse/pull/2939) [#2949](https://github.com/ClickHouse/ClickHouse/pull/2949) [#2964](https://github.com/ClickHouse/ClickHouse/pull/2964) -* Fixed a segfault during `JOIN ... ON`. [#3000](https://github.com/ClickHouse/ClickHouse/pull/3000) -* Fixed the error searching column names when the `WHERE` expression consists entirely of a qualified column name, such as `WHERE table.column`. [#2994](https://github.com/ClickHouse/ClickHouse/pull/2994) -* Fixed the "Not found column" error that occurred when executing distributed queries if a single column consisting of an IN expression with a subquery is requested from a remote server. [#3087](https://github.com/ClickHouse/ClickHouse/pull/3087) -* Fixed the `Block structure mismatch in UNION stream: different number of columns` error that occurred for distributed queries if one of the shards is local and the other is not, and optimization of the move to `PREWHERE` is triggered. [#2226](https://github.com/ClickHouse/ClickHouse/pull/2226) [#3037](https://github.com/ClickHouse/ClickHouse/pull/3037) [#3055](https://github.com/ClickHouse/ClickHouse/pull/3055) [#3065](https://github.com/ClickHouse/ClickHouse/pull/3065) [#3073](https://github.com/ClickHouse/ClickHouse/pull/3073) [#3090](https://github.com/ClickHouse/ClickHouse/pull/3090) [#3093](https://github.com/ClickHouse/ClickHouse/pull/3093) -* Fixed the `pointInPolygon` function for certain cases of non-convex polygons. [#2910](https://github.com/ClickHouse/ClickHouse/pull/2910) -* Fixed the incorrect result when comparing `nan` with integers. [#3024](https://github.com/ClickHouse/ClickHouse/pull/3024) -* Fixed an error in the `zlib-ng` library that could lead to segfault in rare cases. 
[#2854](https://github.com/ClickHouse/ClickHouse/pull/2854) -* Fixed a memory leak when inserting into a table with `AggregateFunction` columns, if the state of the aggregate function is not simple (allocates memory separately), and if a single insertion request results in multiple small blocks. [#3084](https://github.com/ClickHouse/ClickHouse/pull/3084) -* Fixed a race condition when creating and deleting the same `Buffer` or `MergeTree` table simultaneously. -* Fixed the possibility of a segfault when comparing tuples made up of certain non-trivial element types, such as other tuples. [#2989](https://github.com/ClickHouse/ClickHouse/pull/2989) -* Fixed the possibility of a segfault when running certain `ON CLUSTER` queries. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2960) -* Fixed an error in the `arrayDistinct` function for `Nullable` array elements. [#2845](https://github.com/ClickHouse/ClickHouse/pull/2845) [#2937](https://github.com/ClickHouse/ClickHouse/pull/2937) -* The `enable_optimize_predicate_expression` option now correctly supports cases with `SELECT *`. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2929) -* Fixed a segfault when re-initializing the ZooKeeper session. [#2917](https://github.com/ClickHouse/ClickHouse/pull/2917) -* Fixed potential blocking when working with ZooKeeper. -* Fixed incorrect code for adding nested data structures in a `SummingMergeTree`. -* When allocating memory for states of aggregate functions, alignment is correctly taken into account, which makes it possible to use operations that require alignment when implementing states of aggregate functions. [chenxing-xc](https://github.com/ClickHouse/ClickHouse/pull/2808) - -#### Security fix: - -* Safe use of ODBC data sources. Interaction with ODBC drivers uses a separate `clickhouse-odbc-bridge` process. Errors in third-party ODBC drivers no longer cause problems with server stability or vulnerabilities. [#2828](https://github.com/ClickHouse/ClickHouse/pull/2828) [#2879](https://github.com/ClickHouse/ClickHouse/pull/2879) [#2886](https://github.com/ClickHouse/ClickHouse/pull/2886) [#2893](https://github.com/ClickHouse/ClickHouse/pull/2893) [#2921](https://github.com/ClickHouse/ClickHouse/pull/2921) -* Fixed incorrect validation of the file path in the `catBoostPool` table function. [#2894](https://github.com/ClickHouse/ClickHouse/pull/2894) -* The contents of system tables (`tables`, `databases`, `parts`, `columns`, `parts_columns`, `merges`, `mutations`, `replicas`, and `replication_queue`) are filtered according to the user's configured access to databases (`allow_databases`). [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2856) - -#### Backward incompatible changes: - -* In queries with JOIN, the star character expands to a list of columns in all tables, in compliance with the SQL standard. You can restore the old behavior by setting `asterisk_left_columns_only` to 1 on the user configuration level (see the example below). - -#### Build changes: - -* Most integration tests can now be run by commit. -* Code style checks can also be run by commit. -* The `memcpy` implementation is chosen correctly when building on CentOS7/Fedora. [Etienne Champetier](https://github.com/ClickHouse/ClickHouse/pull/2912) -* When using clang to build, some warnings from `-Weverything` have been added, in addition to the regular `-Wall -Wextra -Werror`. [#2957](https://github.com/ClickHouse/ClickHouse/pull/2957) -* Debug builds use the `jemalloc` debug option.
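A minimal sketch of the asterisk-expansion change noted under "Backward incompatible changes" above; the table and column names are hypothetical and only illustrate the behavior described in the entry:

```sql
-- Hypothetical tables: t1(a, b) and t2(a, c).
SELECT * FROM t1 ANY LEFT JOIN t2 ON t1.a = t2.a;
-- Since 18.12.13, * expands to the columns of both tables: t1.a, t1.b, t2.a, t2.c.
-- With asterisk_left_columns_only = 1 (the user-level compatibility setting from the
-- entry above), * expands only to the columns of the left table: t1.a, t1.b.
```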
-* The interface of the library for interacting with ZooKeeper is declared abstract. [#2950](https://github.com/ClickHouse/ClickHouse/pull/2950) - -## ClickHouse release 18.10 - -### ClickHouse release 18.10.3, 2018-08-13 - -#### New features: - -* HTTPS can be used for replication. [#2760](https://github.com/ClickHouse/ClickHouse/pull/2760) -* Added the functions `murmurHash2_64`, `murmurHash3_32`, `murmurHash3_64`, and `murmurHash3_128` in addition to the existing `murmurHash2_32`. [#2791](https://github.com/ClickHouse/ClickHouse/pull/2791) -* Support for Nullable types in the ClickHouse ODBC driver (`ODBCDriver2` output format). [#2834](https://github.com/ClickHouse/ClickHouse/pull/2834) -* Support for `UUID` in the key columns. - -#### Improvements: +- The server does not write the processed configuration files to the `/etc/clickhouse-server/` directory. Instead, it saves them in the `preprocessed_configs` directory inside `path`. This means that the `/etc/clickhouse-server/` directory doesn’t have write access for the `clickhouse` user, which improves security. [\#2443](https://github.com/ClickHouse/ClickHouse/pull/2443) +- The `min_merge_bytes_to_use_direct_io` option is set to 10 GiB by default. A merge that forms large parts of tables from the MergeTree family will be performed in `O_DIRECT` mode, which prevents excessive page cache eviction. [\#3504](https://github.com/ClickHouse/ClickHouse/pull/3504) +- Accelerated server start when there is a very large number of tables. [\#3398](https://github.com/ClickHouse/ClickHouse/pull/3398) +- Added a connection pool and HTTP `Keep-Alive` for connections between replicas. [\#3594](https://github.com/ClickHouse/ClickHouse/pull/3594) +- If the query syntax is invalid, the `400 Bad Request` code is returned in the `HTTP` interface (500 was returned previously). [31bc680a](https://github.com/ClickHouse/ClickHouse/commit/31bc680ac5f4bb1d0360a8ba4696fa84bb47d6ab) +- The `join_default_strictness` option is set to `ALL` by default for compatibility. [120e2cbe](https://github.com/ClickHouse/ClickHouse/commit/120e2cbe2ff4fbad626c28042d9b28781c805afe) +- Removed logging to `stderr` from the `re2` library for invalid or complex regular expressions. [\#3723](https://github.com/ClickHouse/ClickHouse/pull/3723) +- Added for the `Kafka` table engine: checks for subscriptions before beginning to read from Kafka; the kafka\_max\_block\_size setting for the table. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3396) +- The `cityHash64`, `farmHash64`, `metroHash64`, `sipHash64`, `halfMD5`, `murmurHash2_32`, `murmurHash2_64`, `murmurHash3_32`, and `murmurHash3_64` functions now work for any number of arguments and for arguments in the form of tuples. [\#3451](https://github.com/ClickHouse/ClickHouse/pull/3451) [\#3519](https://github.com/ClickHouse/ClickHouse/pull/3519) +- The `arrayReverse` function now works with any types of arrays. [73e3a7b6](https://github.com/ClickHouse/ClickHouse/commit/73e3a7b662161d6005e7727d8a711b930386b871) +- Added an optional parameter: the slot size for the `timeSlots` function. [Kirill Shvakov](https://github.com/ClickHouse/ClickHouse/pull/3724) +- For `FULL` and `RIGHT JOIN`, the `max_block_size` setting is used for a stream of non-joined data from the right table. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3699) +- Added the `--secure` command line parameter in `clickhouse-benchmark` and `clickhouse-performance-test` to enable TLS. 
[\#3688](https://github.com/ClickHouse/ClickHouse/pull/3688) [\#3690](https://github.com/ClickHouse/ClickHouse/pull/3690) +- Type conversion when the structure of a `Buffer` type table does not match the structure of the destination table. [Vitaly Baranov](https://github.com/ClickHouse/ClickHouse/pull/3603) +- Added the `tcp_keep_alive_timeout` option to enable keep-alive packets after inactivity for the specified time interval. [\#3441](https://github.com/ClickHouse/ClickHouse/pull/3441) +- Removed unnecessary quoting of values for the partition key in the `system.parts` table if it consists of a single column. [\#3652](https://github.com/ClickHouse/ClickHouse/pull/3652) +- The modulo function works for `Date` and `DateTime` data types. [\#3385](https://github.com/ClickHouse/ClickHouse/pull/3385) +- Added synonyms for the `POWER`, `LN`, `LCASE`, `UCASE`, `REPLACE`, `LOCATE`, `SUBSTR`, and `MID` functions. [\#3774](https://github.com/ClickHouse/ClickHouse/pull/3774) [\#3763](https://github.com/ClickHouse/ClickHouse/pull/3763) Some function names are case-insensitive for compatibility with the SQL standard. Added syntactic sugar `SUBSTRING(expr FROM start FOR length)` for compatibility with SQL. [\#3804](https://github.com/ClickHouse/ClickHouse/pull/3804) +- Added the ability to `mlock` memory pages corresponding to `clickhouse-server` executable code to prevent it from being forced out of memory. This feature is disabled by default. [\#3553](https://github.com/ClickHouse/ClickHouse/pull/3553) +- Improved performance when reading from `O_DIRECT` (with the `min_bytes_to_use_direct_io` option enabled). [\#3405](https://github.com/ClickHouse/ClickHouse/pull/3405) +- Improved performance of the `dictGet...OrDefault` function for a constant key argument and a non-constant default argument. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3563) +- The `firstSignificantSubdomain` function now processes the domains `gov`, `mil`, and `edu`. [Igor Hatarist](https://github.com/ClickHouse/ClickHouse/pull/3601) Improved performance. [\#3628](https://github.com/ClickHouse/ClickHouse/pull/3628) +- Ability to specify custom environment variables for starting `clickhouse-server` using the `SYS-V init.d` script by defining `CLICKHOUSE_PROGRAM_ENV` in `/etc/default/clickhouse`. + [Pavlo Bashynskyi](https://github.com/ClickHouse/ClickHouse/pull/3612) +- Correct return code for the clickhouse-server init script. [\#3516](https://github.com/ClickHouse/ClickHouse/pull/3516) +- The `system.metrics` table now has the `VersionInteger` metric, and `system.build_options` has the added line `VERSION_INTEGER`, which contains the numeric form of the ClickHouse version, such as `18016000`. [\#3644](https://github.com/ClickHouse/ClickHouse/pull/3644) +- Removed the ability to compare the `Date` type with a number to avoid potential errors like `date = 2018-12-17`, where quotes around the date are omitted by mistake. [\#3687](https://github.com/ClickHouse/ClickHouse/pull/3687) +- Fixed the behavior of stateful functions like `rowNumberInAllBlocks`. They previously output a result that was one number larger due to starting during query analysis. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3729) +- If the `force_restore_data` file can’t be deleted, an error message is displayed. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3794) -* Clusters can be removed without restarting the server when they are deleted from the config files. 
[#2777](https://github.com/ClickHouse/ClickHouse/pull/2777) -* External dictionaries can be removed without restarting the server when they are removed from config files. [#2779](https://github.com/ClickHouse/ClickHouse/pull/2779) -* Added `SETTINGS` support for the `Kafka` table engine. [Alexander Marshalov](https://github.com/ClickHouse/ClickHouse/pull/2781) -* Improvements for the `UUID` data type (not yet complete). [#2618](https://github.com/ClickHouse/ClickHouse/pull/2618) -* Support for empty parts after merges in the `SummingMergeTree`, `CollapsingMergeTree` and `VersionedCollapsingMergeTree` engines. [#2815](https://github.com/ClickHouse/ClickHouse/pull/2815) -* Old records of completed mutations are deleted (`ALTER DELETE`). [#2784](https://github.com/ClickHouse/ClickHouse/pull/2784) -* Added the `system.merge_tree_settings` table. [Kirill Shvakov](https://github.com/ClickHouse/ClickHouse/pull/2841) -* The `system.tables` table now has dependency columns: `dependencies_database` and `dependencies_table`. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2851) -* Added the `max_partition_size_to_drop` config option. [#2782](https://github.com/ClickHouse/ClickHouse/pull/2782) -* Added the `output_format_json_escape_forward_slashes` option. [Alexander Bocharov](https://github.com/ClickHouse/ClickHouse/pull/2812) -* Added the `max_fetch_partition_retries_count` setting. [#2831](https://github.com/ClickHouse/ClickHouse/pull/2831) -* Added the `prefer_localhost_replica` setting for disabling the preference for a local replica and going to a local replica without inter-process interaction. [#2832](https://github.com/ClickHouse/ClickHouse/pull/2832) -* The `quantileExact` aggregate function returns `nan` in the case of aggregation on an empty `Float32` or `Float64` set. [Sundy Li](https://github.com/ClickHouse/ClickHouse/pull/2855) +#### Build improvements: {#build-improvements-1} -#### Bug fixes: +- Updated the `jemalloc` library, which fixes a potential memory leak. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3557) +- Profiling with `jemalloc` is enabled by default in order to debug builds. [2cc82f5c](https://github.com/ClickHouse/ClickHouse/commit/2cc82f5cbe266421cd4c1165286c2c47e5ffcb15) +- Added the ability to run integration tests when only `Docker` is installed on the system. [\#3650](https://github.com/ClickHouse/ClickHouse/pull/3650) +- Added the fuzz expression test in SELECT queries. [\#3442](https://github.com/ClickHouse/ClickHouse/pull/3442) +- Added a stress test for commits, which performs functional tests in parallel and in random order to detect more race conditions. [\#3438](https://github.com/ClickHouse/ClickHouse/pull/3438) +- Improved the method for starting clickhouse-server in a Docker image. [Elghazal Ahmed](https://github.com/ClickHouse/ClickHouse/pull/3663) +- For a Docker image, added support for initializing databases using files in the `/docker-entrypoint-initdb.d` directory. [Konstantin Lebedev](https://github.com/ClickHouse/ClickHouse/pull/3695) +- Fixes for builds on ARM. [\#3709](https://github.com/ClickHouse/ClickHouse/pull/3709) -* Removed unnecessary escaping of the connection string parameters for ODBC, which made it impossible to establish a connection. This error occurred in version 18.6.0. -* Fixed the logic for processing `REPLACE PARTITION` commands in the replication queue. 
If there are two `REPLACE` commands for the same partition, the incorrect logic could cause one of them to remain in the replication queue and not be executed. [#2814](https://github.com/ClickHouse/ClickHouse/pull/2814) -* Fixed a merge bug when all data parts were empty (parts that were formed from a merge or from `ALTER DELETE` if all data was deleted). This bug appeared in version 18.1.0. [#2930](https://github.com/ClickHouse/ClickHouse/pull/2930) -* Fixed an error for concurrent `Set` or `Join`. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2823) -* Fixed the `Block structure mismatch in UNION stream: different number of columns` error that occurred for `UNION ALL` queries inside a sub-query if one of the `SELECT` queries contains duplicate column names. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2094) -* Fixed a memory leak if an exception occurred when connecting to a MySQL server. -* Fixed incorrect clickhouse-client response code in case of a query error. -* Fixed incorrect behavior of materialized views containing DISTINCT. [#2795](https://github.com/ClickHouse/ClickHouse/issues/2795) +#### Backward incompatible changes: {#backward-incompatible-changes} -#### Backward incompatible changes +- Removed the ability to compare the `Date` type with a number. Instead of `toDate('2018-12-18') = 17883`, you must use explicit type conversion `= toDate(17883)` [\#3687](https://github.com/ClickHouse/ClickHouse/pull/3687) -* Removed support for CHECK TABLE queries for Distributed tables. +## ClickHouse release 18.14 {#clickhouse-release-18-14} -#### Build changes: +### ClickHouse release 18.14.19, 2018-12-19 {#clickhouse-release-18-14-19-2018-12-19} -* The allocator has been replaced: `jemalloc` is now used instead of `tcmalloc`. In some scenarios, this increases speed up to 20%. However, there are queries that have slowed by up to 20%. Memory consumption has been reduced by approximately 10% in some scenarios, with improved stability. With highly competitive loads, CPU usage in userspace and in system shows just a slight increase. [#2773](https://github.com/ClickHouse/ClickHouse/pull/2773) -* Use of libressl from a submodule. [#1983](https://github.com/ClickHouse/ClickHouse/pull/1983) [#2807](https://github.com/ClickHouse/ClickHouse/pull/2807) -* Use of unixodbc from a submodule. [#2789](https://github.com/ClickHouse/ClickHouse/pull/2789) -* Use of mariadb-connector-c from a submodule. [#2785](https://github.com/ClickHouse/ClickHouse/pull/2785) -* Added functional test files to the repository that depend on the availability of test data (for the time being, without the test data itself). +#### Bug fixes: {#bug-fixes-2} -## ClickHouse release 18.6 +- Fixed an error that led to problems with updating dictionaries with the ODBC source. [\#3825](https://github.com/ClickHouse/ClickHouse/issues/3825), [\#3829](https://github.com/ClickHouse/ClickHouse/issues/3829) +- Databases are correctly specified when executing DDL `ON CLUSTER` queries. [\#3460](https://github.com/ClickHouse/ClickHouse/pull/3460) +- Fixed a segfault if the `max_temporary_non_const_columns` limit was exceeded. [\#3788](https://github.com/ClickHouse/ClickHouse/pull/3788) -### ClickHouse release 18.6.0, 2018-08-02 +#### Build improvements: {#build-improvements-2} -#### New features: +- Fixes for builds on ARM. -* Added support for ON expressions for the JOIN ON syntax: -`JOIN ON Expr([table.]column ...) = Expr([table.]column, ...) [AND Expr([table.]column, ...) = Expr([table.]column, ...) 
...]` -The expression must be a chain of equalities joined by the AND operator. Each side of the equality can be an arbitrary expression over the columns of one of the tables. The use of fully qualified column names is supported (`table.name`, `database.table.name`, `table_alias.name`, `subquery_alias.name`) for the right table. [#2742](https://github.com/ClickHouse/ClickHouse/pull/2742) -* HTTPS can be enabled for replication. [#2760](https://github.com/ClickHouse/ClickHouse/pull/2760) +### ClickHouse release 18.14.18, 2018-12-04 {#clickhouse-release-18-14-18-2018-12-04} -#### Improvements: +#### Bug fixes: {#bug-fixes-3} -* The server passes the patch component of its version to the client. Data about the patch version component is in `system.processes` and `query_log`. [#2646](https://github.com/ClickHouse/ClickHouse/pull/2646) +- Fixed error in `dictGet...` function for dictionaries of type `range`, if one of the arguments is constant and other is not. [\#3751](https://github.com/ClickHouse/ClickHouse/pull/3751) +- Fixed error that caused messages `netlink: '...': attribute type 1 has an invalid length` to be printed in Linux kernel log, that was happening only on fresh enough versions of Linux kernel. [\#3749](https://github.com/ClickHouse/ClickHouse/pull/3749) +- Fixed segfault in function `empty` for argument of `FixedString` type. [Daniel, Dao Quang Minh](https://github.com/ClickHouse/ClickHouse/pull/3703) +- Fixed excessive memory allocation when using large value of `max_query_size` setting (a memory chunk of `max_query_size` bytes was preallocated at once). [\#3720](https://github.com/ClickHouse/ClickHouse/pull/3720) -## ClickHouse release 18.5 +#### Build changes: {#build-changes} -### ClickHouse release 18.5.1, 2018-07-31 +- Fixed build with LLVM/Clang libraries of version 7 from the OS packages (these libraries are used for runtime query compilation). [\#3582](https://github.com/ClickHouse/ClickHouse/pull/3582) -#### New features: +### ClickHouse release 18.14.17, 2018-11-30 {#clickhouse-release-18-14-17-2018-11-30} -* Added the hash function `murmurHash2_32` [#2756](https://github.com/ClickHouse/ClickHouse/pull/2756). +#### Bug fixes: {#bug-fixes-4} -#### Improvements: +- Fixed cases when the ODBC bridge process did not terminate with the main server process. [\#3642](https://github.com/ClickHouse/ClickHouse/pull/3642) +- Fixed synchronous insertion into the `Distributed` table with a columns list that differs from the column list of the remote table. [\#3673](https://github.com/ClickHouse/ClickHouse/pull/3673) +- Fixed a rare race condition that can lead to a crash when dropping a MergeTree table. [\#3643](https://github.com/ClickHouse/ClickHouse/pull/3643) +- Fixed a query deadlock in case when query thread creation fails with the `Resource temporarily unavailable` error. [\#3643](https://github.com/ClickHouse/ClickHouse/pull/3643) +- Fixed parsing of the `ENGINE` clause when the `CREATE AS table` syntax was used and the `ENGINE` clause was specified before the `AS table` (the error resulted in ignoring the specified engine). [\#3692](https://github.com/ClickHouse/ClickHouse/pull/3692) -* Now you can use the `from_env` [#2741](https://github.com/ClickHouse/ClickHouse/pull/2741) attribute to set values in config files from environment variables. -* Added case-insensitive versions of the `coalesce`, `ifNull`, and `nullIf functions` [#2752](https://github.com/ClickHouse/ClickHouse/pull/2752). 
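As a minimal sketch of the `CREATE ... AS table` statement shape affected by the `ENGINE`-clause parsing fix above; the table names here are hypothetical:

```sql
-- Copy the structure of `hits` into a new table; before the fix, an ENGINE
-- clause written before `AS hits` was parsed but silently ignored.
CREATE TABLE hits_copy ENGINE = TinyLog AS hits;

-- Equivalent form with the ENGINE clause after `AS hits`.
CREATE TABLE hits_copy2 AS hits ENGINE = TinyLog;
```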
+### ClickHouse release 18.14.15, 2018-11-21 {#clickhouse-release-18-14-15-2018-11-21}
-#### Bug fixes:
+#### Bug fixes: {#bug-fixes-5}
-* Fixed a possible bug when starting a replica [#2759](https://github.com/ClickHouse/ClickHouse/pull/2759).
+- The size of the memory chunk was overestimated while deserializing a column of type `Array(String)`, which led to “Memory limit exceeded” errors. The issue appeared in version 18.12.13. [\#3589](https://github.com/ClickHouse/ClickHouse/issues/3589)
-## ClickHouse release 18.4
+### ClickHouse release 18.14.14, 2018-11-20 {#clickhouse-release-18-14-14-2018-11-20}
-### ClickHouse release 18.4.0, 2018-07-28
+#### Bug fixes: {#bug-fixes-6}
-#### New features:
+- Fixed `ON CLUSTER` queries when the cluster is configured as secure (the `<secure>` flag). [\#3599](https://github.com/ClickHouse/ClickHouse/pull/3599)
-* Added system tables: `formats`, `data_type_families`, `aggregate_function_combinators`, `table_functions`, `table_engines`, `collations` [#2721](https://github.com/ClickHouse/ClickHouse/pull/2721).
-* Added the ability to use a table function instead of a table as an argument of a `remote` or `cluster` table function [#2708](https://github.com/ClickHouse/ClickHouse/pull/2708).
-* Support for `HTTP Basic` authentication in the replication protocol [#2727](https://github.com/ClickHouse/ClickHouse/pull/2727).
-* The `has` function now allows searching for a numeric value in an array of `Enum` values [Maxim Khrisanfov](https://github.com/ClickHouse/ClickHouse/pull/2699).
-* Support for adding arbitrary message separators when reading from `Kafka` [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2701).
+#### Build changes: {#build-changes-1}
-#### Improvements:
+- Fixed build problems (llvm-7 from the OS packages, macOS). [\#3582](https://github.com/ClickHouse/ClickHouse/pull/3582)
-* The `ALTER TABLE t DELETE WHERE` query does not rewrite data parts that were not affected by the WHERE condition [#2694](https://github.com/ClickHouse/ClickHouse/pull/2694).
-* The `use_minimalistic_checksums_in_zookeeper` option for `ReplicatedMergeTree` tables is enabled by default. This setting was added in version 1.1.54378, 2018-04-16. Versions that are older than 1.1.54378 can no longer be installed.
-* Support for running `KILL` and `OPTIMIZE` queries that specify `ON CLUSTER` [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2689).
+### ClickHouse release 18.14.13, 2018-11-08 {#clickhouse-release-18-14-13-2018-11-08}
-#### Bug fixes:
+#### Bug fixes: {#bug-fixes-7}
-* Fixed the error `Column ... is not under an aggregate function and not in GROUP BY` for aggregation with an IN expression. This bug appeared in version 18.1.0. ([bbdd780b](https://github.com/ClickHouse/ClickHouse/commit/bbdd780be0be06a0f336775941cdd536878dd2c2))
-* Fixed a bug in the `windowFunnel` aggregate function [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2735).
-* Fixed a bug in the `anyHeavy` aggregate function ([a2101df2](https://github.com/ClickHouse/ClickHouse/commit/a2101df25a6a0fba99aa71f8793d762af2b801ee))
-* Fixed server crash when using the `countArray()` aggregate function.
+- Fixed the `Block structure mismatch in MergingSorted stream` error. [\#3162](https://github.com/ClickHouse/ClickHouse/issues/3162)
+- Fixed `ON CLUSTER` queries in the case when secure connections were turned on in the cluster config (the `<secure>` flag). [\#3465](https://github.com/ClickHouse/ClickHouse/pull/3465)
+- Fixed an error in queries that used `SAMPLE`, `PREWHERE` and alias columns.
[\#3543](https://github.com/ClickHouse/ClickHouse/pull/3543) +- Fixed a rare `unknown compression method` error when the `min_bytes_to_use_direct_io` setting was enabled. [3544](https://github.com/ClickHouse/ClickHouse/pull/3544) -#### Backward incompatible changes: +#### Performance improvements: {#performance-improvements} -* Parameters for `Kafka` engine was changed from `Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format[, kafka_schema, kafka_num_consumers])` to `Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format[, kafka_row_delimiter, kafka_schema, kafka_num_consumers])`. If your tables use `kafka_schema` or `kafka_num_consumers` parameters, you have to manually edit the metadata files `path/metadata/database/table.sql` and add `kafka_row_delimiter` parameter with `''` value. +- Fixed performance regression of queries with `GROUP BY` of columns of UInt16 or Date type when executing on AMD EPYC processors. [Igor Lapko](https://github.com/ClickHouse/ClickHouse/pull/3512) +- Fixed performance regression of queries that process long strings. [\#3530](https://github.com/ClickHouse/ClickHouse/pull/3530) -## ClickHouse release 18.1 +#### Build improvements: {#build-improvements-3} -### ClickHouse release 18.1.0, 2018-07-23 +- Improvements for simplifying the Arcadia build. [\#3475](https://github.com/ClickHouse/ClickHouse/pull/3475), [\#3535](https://github.com/ClickHouse/ClickHouse/pull/3535) -#### New features: +### ClickHouse release 18.14.12, 2018-11-02 {#clickhouse-release-18-14-12-2018-11-02} -* Support for the `ALTER TABLE t DELETE WHERE` query for non-replicated MergeTree tables ([#2634](https://github.com/ClickHouse/ClickHouse/pull/2634)). -* Support for arbitrary types for the `uniq*` family of aggregate functions ([#2010](https://github.com/ClickHouse/ClickHouse/issues/2010)). -* Support for arbitrary types in comparison operators ([#2026](https://github.com/ClickHouse/ClickHouse/issues/2026)). -* The `users.xml` file allows setting a subnet mask in the format `10.0.0.1/255.255.255.0`. This is necessary for using masks for IPv6 networks with zeros in the middle ([#2637](https://github.com/ClickHouse/ClickHouse/pull/2637)). -* Added the `arrayDistinct` function ([#2670](https://github.com/ClickHouse/ClickHouse/pull/2670)). -* The SummingMergeTree engine can now work with AggregateFunction type columns ([Constantin S. Pan](https://github.com/ClickHouse/ClickHouse/pull/2566)). +#### Bug fixes: {#bug-fixes-8} -#### Improvements: +- Fixed a crash on joining two unnamed subqueries. [\#3505](https://github.com/ClickHouse/ClickHouse/pull/3505) +- Fixed generating incorrect queries (with an empty `WHERE` clause) when querying external databases. [hotid](https://github.com/ClickHouse/ClickHouse/pull/3477) +- Fixed using an incorrect timeout value in ODBC dictionaries. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3511) -* Changed the numbering scheme for release versions. Now the first part contains the year of release (A.D., Moscow timezone, minus 2000), the second part contains the number for major changes (increases for most releases), and the third part is the patch version. Releases are still backward compatible, unless otherwise stated in the changelog. -* Faster conversions of floating-point numbers to a string ([Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2664)). 
-* If some rows were skipped during an insert due to parsing errors (this is possible with the `input_allow_errors_num` and `input_allow_errors_ratio` settings enabled), the number of skipped rows is now written to the server log ([Leonardo Cecchi](https://github.com/ClickHouse/ClickHouse/pull/2669)). +### ClickHouse release 18.14.11, 2018-10-29 {#clickhouse-release-18-14-11-2018-10-29} -#### Bug fixes: +#### Bug fixes: {#bug-fixes-9} -* Fixed the TRUNCATE command for temporary tables ([Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2624)). -* Fixed a rare deadlock in the ZooKeeper client library that occurred when there was a network error while reading the response ([c315200](https://github.com/ClickHouse/ClickHouse/commit/c315200e64b87e44bdf740707fc857d1fdf7e947)). -* Fixed an error during a CAST to Nullable types ([#1322](https://github.com/ClickHouse/ClickHouse/issues/1322)). -* Fixed the incorrect result of the `maxIntersection()` function when the boundaries of intervals coincided ([Michael Furmur](https://github.com/ClickHouse/ClickHouse/pull/2657)). -* Fixed incorrect transformation of the OR expression chain in a function argument ([chenxing-xc](https://github.com/ClickHouse/ClickHouse/pull/2663)). -* Fixed performance degradation for queries containing `IN (subquery)` expressions inside another subquery ([#2571](https://github.com/ClickHouse/ClickHouse/issues/2571)). -* Fixed incompatibility between servers with different versions in distributed queries that use a `CAST` function that isn't in uppercase letters ([fe8c4d6](https://github.com/ClickHouse/ClickHouse/commit/fe8c4d64e434cacd4ceef34faa9005129f2190a5)). -* Added missing quoting of identifiers for queries to an external DBMS ([#2635](https://github.com/ClickHouse/ClickHouse/issues/2635)). +- Fixed the error `Block structure mismatch in UNION stream: different number of columns` in LIMIT queries. [\#2156](https://github.com/ClickHouse/ClickHouse/issues/2156) +- Fixed errors when merging data in tables containing arrays inside Nested structures. [\#3397](https://github.com/ClickHouse/ClickHouse/pull/3397) +- Fixed incorrect query results if the `merge_tree_uniform_read_distribution` setting is disabled (it is enabled by default). [\#3429](https://github.com/ClickHouse/ClickHouse/pull/3429) +- Fixed an error on inserts to a Distributed table in Native format. [\#3411](https://github.com/ClickHouse/ClickHouse/issues/3411) -#### Backward incompatible changes: +### ClickHouse release 18.14.10, 2018-10-23 {#clickhouse-release-18-14-10-2018-10-23} -* Converting a string containing the number zero to DateTime does not work. Example: `SELECT toDateTime('0')`. This is also the reason that `DateTime DEFAULT '0'` does not work in tables, as well as `0` in dictionaries. Solution: replace `0` with `0000-00-00 00:00:00`. +- The `compile_expressions` setting (JIT compilation of expressions) is disabled by default. [\#3410](https://github.com/ClickHouse/ClickHouse/pull/3410) +- The `enable_optimize_predicate_expression` setting is disabled by default. -## ClickHouse release 1.1 +### ClickHouse release 18.14.9, 2018-10-16 {#clickhouse-release-18-14-9-2018-10-16} -### ClickHouse release 1.1.54394, 2018-07-12 +#### New features: {#new-features-1} -#### New features: +- The `WITH CUBE` modifier for `GROUP BY` (the alternative syntax `GROUP BY CUBE(...)` is also available). [\#3172](https://github.com/ClickHouse/ClickHouse/pull/3172) +- Added the `formatDateTime` function. 
[Alexandr Krasheninnikov](https://github.com/ClickHouse/ClickHouse/pull/2770) +- Added the `JDBC` table engine and `jdbc` table function (requires installing clickhouse-jdbc-bridge). [Alexandr Krasheninnikov](https://github.com/ClickHouse/ClickHouse/pull/3210) +- Added functions for working with the ISO week number: `toISOWeek`, `toISOYear`, `toStartOfISOYear`, and `toDayOfYear`. [\#3146](https://github.com/ClickHouse/ClickHouse/pull/3146) +- Now you can use `Nullable` columns for `MySQL` and `ODBC` tables. [\#3362](https://github.com/ClickHouse/ClickHouse/pull/3362) +- Nested data structures can be read as nested objects in `JSONEachRow` format. Added the `input_format_import_nested_json` setting. [Veloman Yunkan](https://github.com/ClickHouse/ClickHouse/pull/3144) +- Parallel processing is available for many `MATERIALIZED VIEW`s when inserting data. See the `parallel_view_processing` setting. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3208) +- Added the `SYSTEM FLUSH LOGS` query (forced log flushes to system tables such as `query_log`) [\#3321](https://github.com/ClickHouse/ClickHouse/pull/3321) +- Now you can use pre-defined `database` and `table` macros when declaring `Replicated` tables. [\#3251](https://github.com/ClickHouse/ClickHouse/pull/3251) +- Added the ability to read `Decimal` type values in engineering notation (indicating powers of ten). [\#3153](https://github.com/ClickHouse/ClickHouse/pull/3153) -* Added the `histogram` aggregate function ([Mikhail Surin](https://github.com/ClickHouse/ClickHouse/pull/2521)). -* Now `OPTIMIZE TABLE ... FINAL` can be used without specifying partitions for `ReplicatedMergeTree` ([Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2600)). +#### Experimental features: {#experimental-features} -#### Bug fixes: +- Optimization of the GROUP BY clause for `LowCardinality data types.` [\#3138](https://github.com/ClickHouse/ClickHouse/pull/3138) +- Optimized calculation of expressions for `LowCardinality data types.` [\#3200](https://github.com/ClickHouse/ClickHouse/pull/3200) -* Fixed a problem with a very small timeout for sockets (one second) for reading and writing when sending and downloading replicated data, which made it impossible to download larger parts if there is a load on the network or disk (it resulted in cyclical attempts to download parts). This error occurred in version 1.1.54388. -* Fixed issues when using chroot in ZooKeeper if you inserted duplicate data blocks in the table. -* The `has` function now works correctly for an array with Nullable elements ([#2115](https://github.com/ClickHouse/ClickHouse/issues/2115)). -* The `system.tables` table now works correctly when used in distributed queries. The `metadata_modification_time` and `engine_full` columns are now non-virtual. Fixed an error that occurred if only these columns were queried from the table. -* Fixed how an empty `TinyLog` table works after inserting an empty data block ([#2563](https://github.com/ClickHouse/ClickHouse/issues/2563)). -* The `system.zookeeper` table works if the value of the node in ZooKeeper is NULL. +#### Improvements: {#improvements-2} -### ClickHouse release 1.1.54390, 2018-07-06 +- Significantly reduced memory consumption for queries with `ORDER BY` and `LIMIT`. See the `max_bytes_before_remerge_sort` setting. [\#3205](https://github.com/ClickHouse/ClickHouse/pull/3205) +- In the absence of `JOIN` (`LEFT`, `INNER`, …), `INNER JOIN` is assumed. 
[\#3147](https://github.com/ClickHouse/ClickHouse/pull/3147) +- Qualified asterisks work correctly in queries with `JOIN`. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3202) +- The `ODBC` table engine correctly chooses the method for quoting identifiers in the SQL dialect of a remote database. [Alexandr Krasheninnikov](https://github.com/ClickHouse/ClickHouse/pull/3210) +- The `compile_expressions` setting (JIT compilation of expressions) is enabled by default. +- Fixed behavior for simultaneous DROP DATABASE/TABLE IF EXISTS and CREATE DATABASE/TABLE IF NOT EXISTS. Previously, a `CREATE DATABASE ... IF NOT EXISTS` query could return the error message “File … already exists”, and the `CREATE TABLE ... IF NOT EXISTS` and `DROP TABLE IF EXISTS` queries could return `Table ... is creating or attaching right now`. [\#3101](https://github.com/ClickHouse/ClickHouse/pull/3101) +- LIKE and IN expressions with a constant right half are passed to the remote server when querying from MySQL or ODBC tables. [\#3182](https://github.com/ClickHouse/ClickHouse/pull/3182) +- Comparisons with constant expressions in a WHERE clause are passed to the remote server when querying from MySQL and ODBC tables. Previously, only comparisons with constants were passed. [\#3182](https://github.com/ClickHouse/ClickHouse/pull/3182) +- Correct calculation of row width in the terminal for `Pretty` formats, including strings with hieroglyphs. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3257). +- `ON CLUSTER` can be specified for `ALTER UPDATE` queries. +- Improved performance for reading data in `JSONEachRow` format. [\#3332](https://github.com/ClickHouse/ClickHouse/pull/3332) +- Added synonyms for the `LENGTH` and `CHARACTER_LENGTH` functions for compatibility. The `CONCAT` function is no longer case-sensitive. [\#3306](https://github.com/ClickHouse/ClickHouse/pull/3306) +- Added the `TIMESTAMP` synonym for the `DateTime` type. [\#3390](https://github.com/ClickHouse/ClickHouse/pull/3390) +- There is always space reserved for query\_id in the server logs, even if the log line is not related to a query. This makes it easier to parse server text logs with third-party tools. +- Memory consumption by a query is logged when it exceeds the next level of an integer number of gigabytes. [\#3205](https://github.com/ClickHouse/ClickHouse/pull/3205) +- Added compatibility mode for the case when the client library that uses the Native protocol sends fewer columns by mistake than the server expects for the INSERT query. This scenario was possible when using the clickhouse-cpp library. Previously, this scenario caused the server to crash. [\#3171](https://github.com/ClickHouse/ClickHouse/pull/3171) +- In a user-defined WHERE expression in `clickhouse-copier`, you can now use a `partition_key` alias (for additional filtering by source table partition). This is useful if the partitioning scheme changes during copying, but only changes slightly. [\#3166](https://github.com/ClickHouse/ClickHouse/pull/3166) +- The workflow of the `Kafka` engine has been moved to a background thread pool in order to automatically reduce the speed of data reading at high loads. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3215). +- Support for reading `Tuple` and `Nested` values of structures like `struct` in the `Cap'n'Proto format`. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3216) +- The list of top-level domains for the `firstSignificantSubdomain` function now includes the domain `biz`. 
[decaseal](https://github.com/ClickHouse/ClickHouse/pull/3219) +- In the configuration of external dictionaries, `null_value` is interpreted as the value of the default data type. [\#3330](https://github.com/ClickHouse/ClickHouse/pull/3330) +- Support for the `intDiv` and `intDivOrZero` functions for `Decimal`. [b48402e8](https://github.com/ClickHouse/ClickHouse/commit/b48402e8712e2b9b151e0eef8193811d433a1264) +- Support for the `Date`, `DateTime`, `UUID`, and `Decimal` types as a key for the `sumMap` aggregate function. [\#3281](https://github.com/ClickHouse/ClickHouse/pull/3281) +- Support for the `Decimal` data type in external dictionaries. [\#3324](https://github.com/ClickHouse/ClickHouse/pull/3324) +- Support for the `Decimal` data type in `SummingMergeTree` tables. [\#3348](https://github.com/ClickHouse/ClickHouse/pull/3348) +- Added specializations for `UUID` in `if`. [\#3366](https://github.com/ClickHouse/ClickHouse/pull/3366) +- Reduced the number of `open` and `close` system calls when reading from a `MergeTree table`. [\#3283](https://github.com/ClickHouse/ClickHouse/pull/3283) +- A `TRUNCATE TABLE` query can be executed on any replica (the query is passed to the leader replica). [Kirill Shvakov](https://github.com/ClickHouse/ClickHouse/pull/3375) + +#### Bug fixes: {#bug-fixes-10} + +- Fixed an issue with `Dictionary` tables for `range_hashed` dictionaries. This error occurred in version 18.12.17. [\#1702](https://github.com/ClickHouse/ClickHouse/pull/1702) +- Fixed an error when loading `range_hashed` dictionaries (the message `Unsupported type Nullable (...)`). This error occurred in version 18.12.17. [\#3362](https://github.com/ClickHouse/ClickHouse/pull/3362) +- Fixed errors in the `pointInPolygon` function due to the accumulation of inaccurate calculations for polygons with a large number of vertices located close to each other. [\#3331](https://github.com/ClickHouse/ClickHouse/pull/3331) [\#3341](https://github.com/ClickHouse/ClickHouse/pull/3341) +- If after merging data parts, the checksum for the resulting part differs from the result of the same merge in another replica, the result of the merge is deleted and the data part is downloaded from the other replica (this is the correct behavior). But after downloading the data part, it couldn’t be added to the working set because of an error that the part already exists (because the data part was deleted with some delay after the merge). This led to cyclical attempts to download the same data. [\#3194](https://github.com/ClickHouse/ClickHouse/pull/3194) +- Fixed incorrect calculation of total memory consumption by queries (because of incorrect calculation, the `max_memory_usage_for_all_queries` setting worked incorrectly and the `MemoryTracking` metric had an incorrect value). This error occurred in version 18.12.13. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3344) +- Fixed the functionality of `CREATE TABLE ... ON CLUSTER ... AS SELECT ...` This error occurred in version 18.12.13. [\#3247](https://github.com/ClickHouse/ClickHouse/pull/3247) +- Fixed unnecessary preparation of data structures for `JOIN`s on the server that initiates the query if the `JOIN` is only performed on remote servers. [\#3340](https://github.com/ClickHouse/ClickHouse/pull/3340) +- Fixed bugs in the `Kafka` engine: deadlocks after exceptions when starting to read data, and locks upon completion [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3215). 
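A small, hedged illustration of the `intDiv`/`intDivOrZero` support for `Decimal` noted in the improvements above; the literal values are arbitrary:

```sql
-- Integer division with a Decimal argument; intDivOrZero returns 0 instead of
-- throwing when the divisor is zero.
SELECT
    intDiv(toDecimal64(10.5, 2), 3)       AS quotient,
    intDivOrZero(toDecimal64(10.5, 2), 0) AS safe_quotient;
```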
+- For `Kafka` tables, the optional `schema` parameter was not passed (the schema of the `Cap'n'Proto` format). [Vojtech Splichal](https://github.com/ClickHouse/ClickHouse/pull/3150) +- If the ensemble of ZooKeeper servers has servers that accept the connection but then immediately close it instead of responding to the handshake, ClickHouse chooses to connect another server. Previously, this produced the error `Cannot read all data. Bytes read: 0. Bytes expected: 4.` and the server couldn’t start. [8218cf3a](https://github.com/ClickHouse/ClickHouse/commit/8218cf3a5f39a43401953769d6d12a0bb8d29da9) +- If the ensemble of ZooKeeper servers contains servers for which the DNS query returns an error, these servers are ignored. [17b8e209](https://github.com/ClickHouse/ClickHouse/commit/17b8e209221061325ad7ba0539f03c6e65f87f29) +- Fixed type conversion between `Date` and `DateTime` when inserting data in the `VALUES` format (if `input_format_values_interpret_expressions = 1`). Previously, the conversion was performed between the numerical value of the number of days in Unix Epoch time and the Unix timestamp, which led to unexpected results. [\#3229](https://github.com/ClickHouse/ClickHouse/pull/3229) +- Corrected type conversion between `Decimal` and integer numbers. [\#3211](https://github.com/ClickHouse/ClickHouse/pull/3211) +- Fixed errors in the `enable_optimize_predicate_expression` setting. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3231) +- Fixed a parsing error in CSV format with floating-point numbers if a non-default CSV separator is used, such as `;` [\#3155](https://github.com/ClickHouse/ClickHouse/pull/3155) +- Fixed the `arrayCumSumNonNegative` function (it does not accumulate negative values if the accumulator is less than zero). [Aleksey Studnev](https://github.com/ClickHouse/ClickHouse/pull/3163) +- Fixed how `Merge` tables work on top of `Distributed` tables when using `PREWHERE`. [\#3165](https://github.com/ClickHouse/ClickHouse/pull/3165) +- Bug fixes in the `ALTER UPDATE` query. +- Fixed bugs in the `odbc` table function that appeared in version 18.12. [\#3197](https://github.com/ClickHouse/ClickHouse/pull/3197) +- Fixed the operation of aggregate functions with `StateArray` combinators. [\#3188](https://github.com/ClickHouse/ClickHouse/pull/3188) +- Fixed a crash when dividing a `Decimal` value by zero. [69dd6609](https://github.com/ClickHouse/ClickHouse/commit/69dd6609193beb4e7acd3e6ad216eca0ccfb8179) +- Fixed output of types for operations using `Decimal` and integer arguments. [\#3224](https://github.com/ClickHouse/ClickHouse/pull/3224) +- Fixed the segfault during `GROUP BY` on `Decimal128`. [3359ba06](https://github.com/ClickHouse/ClickHouse/commit/3359ba06c39fcd05bfdb87d6c64154819621e13a) +- The `log_query_threads` setting (logging information about each thread of query execution) now takes effect only if the `log_queries` option (logging information about queries) is set to 1. Since the `log_query_threads` option is enabled by default, information about threads was previously logged even if query logging was disabled. [\#3241](https://github.com/ClickHouse/ClickHouse/pull/3241) +- Fixed an error in the distributed operation of the quantiles aggregate function (the error message `Not found column quantile...`). [292a8855](https://github.com/ClickHouse/ClickHouse/commit/292a885533b8e3b41ce8993867069d14cbd5a664) +- Fixed the compatibility problem when working on a cluster of version 18.12.17 servers and older servers at the same time. 
For distributed queries with GROUP BY keys of both fixed and non-fixed length, if there was a large amount of data to aggregate, the returned data was not always fully aggregated (two different rows contained the same aggregation keys). [\#3254](https://github.com/ClickHouse/ClickHouse/pull/3254) +- Fixed handling of substitutions in `clickhouse-performance-test`, if the query contains only part of the substitutions declared in the test. [\#3263](https://github.com/ClickHouse/ClickHouse/pull/3263) +- Fixed an error when using `FINAL` with `PREWHERE`. [\#3298](https://github.com/ClickHouse/ClickHouse/pull/3298) +- Fixed an error when using `PREWHERE` over columns that were added during `ALTER`. [\#3298](https://github.com/ClickHouse/ClickHouse/pull/3298) +- Added a check for the absence of `arrayJoin` for `DEFAULT` and `MATERIALIZED` expressions. Previously, `arrayJoin` led to an error when inserting data. [\#3337](https://github.com/ClickHouse/ClickHouse/pull/3337) +- Added a check for the absence of `arrayJoin` in a `PREWHERE` clause. Previously, this led to messages like `Size ... doesn't match` or `Unknown compression method` when executing queries. [\#3357](https://github.com/ClickHouse/ClickHouse/pull/3357) +- Fixed segfault that could occur in rare cases after optimization that replaced AND chains from equality evaluations with the corresponding IN expression. [liuyimin-bytedance](https://github.com/ClickHouse/ClickHouse/pull/3339) +- Minor corrections to `clickhouse-benchmark`: previously, client information was not sent to the server; now the number of queries executed is calculated more accurately when shutting down and for limiting the number of iterations. [\#3351](https://github.com/ClickHouse/ClickHouse/pull/3351) [\#3352](https://github.com/ClickHouse/ClickHouse/pull/3352) + +#### Backward incompatible changes: {#backward-incompatible-changes-1} + +- Removed the `allow_experimental_decimal_type` option. The `Decimal` data type is available for default use. [\#3329](https://github.com/ClickHouse/ClickHouse/pull/3329) + +## ClickHouse release 18.12 {#clickhouse-release-18-12} + +### ClickHouse release 18.12.17, 2018-09-16 {#clickhouse-release-18-12-17-2018-09-16} + +#### New features: {#new-features-2} + +- `invalidate_query` (the ability to specify a query to check whether an external dictionary needs to be updated) is implemented for the `clickhouse` source. [\#3126](https://github.com/ClickHouse/ClickHouse/pull/3126) +- Added the ability to use `UInt*`, `Int*`, and `DateTime` data types (along with the `Date` type) as a `range_hashed` external dictionary key that defines the boundaries of ranges. Now `NULL` can be used to designate an open range. [Vasily Nemkov](https://github.com/ClickHouse/ClickHouse/pull/3123) +- The `Decimal` type now supports `var*` and `stddev*` aggregate functions. [\#3129](https://github.com/ClickHouse/ClickHouse/pull/3129) +- The `Decimal` type now supports mathematical functions (`exp`, `sin` and so on.) [\#3129](https://github.com/ClickHouse/ClickHouse/pull/3129) +- The `system.part_log` table now has the `partition_id` column. [\#3089](https://github.com/ClickHouse/ClickHouse/pull/3089) + +#### Bug fixes: {#bug-fixes-11} + +- `Merge` now works correctly on `Distributed` tables. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3159) +- Fixed incompatibility (unnecessary dependency on the `glibc` version) that made it impossible to run ClickHouse on `Ubuntu Precise` and older versions. The incompatibility arose in version 18.12.13. 
[\#3130](https://github.com/ClickHouse/ClickHouse/pull/3130) +- Fixed errors in the `enable_optimize_predicate_expression` setting. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3107) +- Fixed a minor issue with backwards compatibility that appeared when working with a cluster of replicas on versions earlier than 18.12.13 and simultaneously creating a new replica of a table on a server with a newer version (shown in the message `Can not clone replica, because the ... updated to new ClickHouse version`, which is logical, but shouldn’t happen). [\#3122](https://github.com/ClickHouse/ClickHouse/pull/3122) + +#### Backward incompatible changes: {#backward-incompatible-changes-2} + +- The `enable_optimize_predicate_expression` option is enabled by default (which is rather optimistic). If query analysis errors occur that are related to searching for the column names, set `enable_optimize_predicate_expression` to 0. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3107) + +### ClickHouse release 18.12.14, 2018-09-13 {#clickhouse-release-18-12-14-2018-09-13} + +#### New features: {#new-features-3} + +- Added support for `ALTER UPDATE` queries. [\#3035](https://github.com/ClickHouse/ClickHouse/pull/3035) +- Added the `allow_ddl` option, which restricts the user’s access to DDL queries. [\#3104](https://github.com/ClickHouse/ClickHouse/pull/3104) +- Added the `min_merge_bytes_to_use_direct_io` option for `MergeTree` engines, which allows you to set a threshold for the total size of the merge (when above the threshold, data part files will be handled using O\_DIRECT). [\#3117](https://github.com/ClickHouse/ClickHouse/pull/3117) +- The `system.merges` system table now contains the `partition_id` column. [\#3099](https://github.com/ClickHouse/ClickHouse/pull/3099) + +#### Improvements {#improvements-3} + +- If a data part remains unchanged during mutation, it isn’t downloaded by replicas. [\#3103](https://github.com/ClickHouse/ClickHouse/pull/3103) +- Autocomplete is available for names of settings when working with `clickhouse-client`. [\#3106](https://github.com/ClickHouse/ClickHouse/pull/3106) + +#### Bug fixes: {#bug-fixes-12} + +- Added a check for the sizes of arrays that are elements of `Nested` type fields when inserting. [\#3118](https://github.com/ClickHouse/ClickHouse/pull/3118) +- Fixed an error updating external dictionaries with the `ODBC` source and `hashed` storage. This error occurred in version 18.12.13. +- Fixed a crash when creating a temporary table from a query with an `IN` condition. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3098) +- Fixed an error in aggregate functions for arrays that can have `NULL` elements. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3097) + +### ClickHouse release 18.12.13, 2018-09-10 {#clickhouse-release-18-12-13-2018-09-10} + +#### New features: {#new-features-4} + +- Added the `DECIMAL(digits, scale)` data type (`Decimal32(scale)`, `Decimal64(scale)`, `Decimal128(scale)`). To enable it, use the setting `allow_experimental_decimal_type`. [\#2846](https://github.com/ClickHouse/ClickHouse/pull/2846) [\#2970](https://github.com/ClickHouse/ClickHouse/pull/2970) [\#3008](https://github.com/ClickHouse/ClickHouse/pull/3008) [\#3047](https://github.com/ClickHouse/ClickHouse/pull/3047) +- New `WITH ROLLUP` modifier for `GROUP BY` (alternative syntax: `GROUP BY ROLLUP(...)`). 
[\#2948](https://github.com/ClickHouse/ClickHouse/pull/2948) +- In queries with JOIN, the star character expands to a list of columns in all tables, in compliance with the SQL standard. You can restore the old behavior by setting `asterisk_left_columns_only` to 1 on the user configuration level. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2787) +- Added support for JOIN with table functions. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2907) +- Autocomplete by pressing Tab in clickhouse-client. [Sergey Shcherbin](https://github.com/ClickHouse/ClickHouse/pull/2447) +- Ctrl+C in clickhouse-client clears a query that was entered. [\#2877](https://github.com/ClickHouse/ClickHouse/pull/2877) +- Added the `join_default_strictness` setting (values: `"`, `'any'`, `'all'`). This allows you to not specify `ANY` or `ALL` for `JOIN`. [\#2982](https://github.com/ClickHouse/ClickHouse/pull/2982) +- Each line of the server log related to query processing shows the query ID. [\#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) +- Now you can get query execution logs in clickhouse-client (use the `send_logs_level` setting). With distributed query processing, logs are cascaded from all the servers. [\#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) +- The `system.query_log` and `system.processes` (`SHOW PROCESSLIST`) tables now have information about all changed settings when you run a query (the nested structure of the `Settings` data). Added the `log_query_settings` setting. [\#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) +- The `system.query_log` and `system.processes` tables now show information about the number of threads that are participating in query execution (see the `thread_numbers` column). [\#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) +- Added `ProfileEvents` counters that measure the time spent on reading and writing over the network and reading and writing to disk, the number of network errors, and the time spent waiting when network bandwidth is limited. [\#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) +- Added `ProfileEvents`counters that contain the system metrics from rusage (you can use them to get information about CPU usage in userspace and the kernel, page faults, and context switches), as well as taskstats metrics (use these to obtain information about I/O wait time, CPU wait time, and the amount of data read and recorded, both with and without page cache). [\#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) +- The `ProfileEvents` counters are applied globally and for each query, as well as for each query execution thread, which allows you to profile resource consumption by query in detail. [\#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) +- Added the `system.query_thread_log` table, which contains information about each query execution thread. Added the `log_query_threads` setting. [\#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) +- The `system.metrics` and `system.events` tables now have built-in documentation. [\#3016](https://github.com/ClickHouse/ClickHouse/pull/3016) +- Added the `arrayEnumerateDense` function. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2975) +- Added the `arrayCumSumNonNegative` and `arrayDifference` functions. [Aleksey Studnev](https://github.com/ClickHouse/ClickHouse/pull/2942) +- Added the `retention` aggregate function. 
[Sundy Li](https://github.com/ClickHouse/ClickHouse/pull/2887) +- Now you can add (merge) states of aggregate functions by using the plus operator, and multiply the states of aggregate functions by a nonnegative constant. [\#3062](https://github.com/ClickHouse/ClickHouse/pull/3062) [\#3034](https://github.com/ClickHouse/ClickHouse/pull/3034) +- Tables in the MergeTree family now have the virtual column `_partition_id`. [\#3089](https://github.com/ClickHouse/ClickHouse/pull/3089) + +#### Experimental features: {#experimental-features-1} + +- Added the `LowCardinality(T)` data type. This data type automatically creates a local dictionary of values and allows data processing without unpacking the dictionary. [\#2830](https://github.com/ClickHouse/ClickHouse/pull/2830) +- Added a cache of JIT-compiled functions and a counter for the number of uses before compiling. To JIT compile expressions, enable the `compile_expressions` setting. [\#2990](https://github.com/ClickHouse/ClickHouse/pull/2990) [\#3077](https://github.com/ClickHouse/ClickHouse/pull/3077) + +#### Improvements: {#improvements-4} + +- Fixed the problem with unlimited accumulation of the replication log when there are abandoned replicas. Added an effective recovery mode for replicas with a long lag. +- Improved performance of `GROUP BY` with multiple aggregation fields when one of them is string and the others are fixed length. +- Improved performance when using `PREWHERE` and with implicit transfer of expressions in `PREWHERE`. +- Improved parsing performance for text formats (`CSV`, `TSV`). [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2977) [\#2980](https://github.com/ClickHouse/ClickHouse/pull/2980) +- Improved performance of reading strings and arrays in binary formats. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2955) +- Increased performance and reduced memory consumption for queries to `system.tables` and `system.columns` when there is a very large number of tables on a single server. [\#2953](https://github.com/ClickHouse/ClickHouse/pull/2953) +- Fixed a performance problem in the case of a large stream of queries that result in an error (the `_dl_addr` function is visible in `perf top`, but the server isn’t using much CPU). [\#2938](https://github.com/ClickHouse/ClickHouse/pull/2938) +- Conditions are cast into the View (when `enable_optimize_predicate_expression` is enabled). [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2907) +- Improvements to the functionality for the `UUID` data type. [\#3074](https://github.com/ClickHouse/ClickHouse/pull/3074) [\#2985](https://github.com/ClickHouse/ClickHouse/pull/2985) +- The `UUID` data type is supported in The-Alchemist dictionaries. [\#2822](https://github.com/ClickHouse/ClickHouse/pull/2822) +- The `visitParamExtractRaw` function works correctly with nested structures. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2974) +- When the `input_format_skip_unknown_fields` setting is enabled, object fields in `JSONEachRow` format are skipped correctly. [BlahGeek](https://github.com/ClickHouse/ClickHouse/pull/2958) +- For a `CASE` expression with conditions, you can now omit `ELSE`, which is equivalent to `ELSE NULL`. [\#2920](https://github.com/ClickHouse/ClickHouse/pull/2920) +- The operation timeout can now be configured when working with ZooKeeper. [urykhy](https://github.com/ClickHouse/ClickHouse/pull/2971) +- You can specify an offset for `LIMIT n, m` as `LIMIT n OFFSET m`. 
[\#2840](https://github.com/ClickHouse/ClickHouse/pull/2840) +- You can use the `SELECT TOP n` syntax as an alternative for `LIMIT`. [\#2840](https://github.com/ClickHouse/ClickHouse/pull/2840) +- Increased the size of the queue to write to system tables, so the `SystemLog parameter queue is full` error doesn’t happen as often. +- The `windowFunnel` aggregate function now supports events that meet multiple conditions. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2801) +- Duplicate columns can be used in a `USING` clause for `JOIN`. [\#3006](https://github.com/ClickHouse/ClickHouse/pull/3006) +- `Pretty` formats now have a limit on column alignment by width. Use the `output_format_pretty_max_column_pad_width` setting. If a value is wider, it will still be displayed in its entirety, but the other cells in the table will not be too wide. [\#3003](https://github.com/ClickHouse/ClickHouse/pull/3003) +- The `odbc` table function now allows you to specify the database/schema name. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2885) +- Added the ability to use a username specified in the `clickhouse-client` config file. [Vladimir Kozbin](https://github.com/ClickHouse/ClickHouse/pull/2909) +- The `ZooKeeperExceptions` counter has been split into three counters: `ZooKeeperUserExceptions`, `ZooKeeperHardwareExceptions`, and `ZooKeeperOtherExceptions`. +- `ALTER DELETE` queries work for materialized views. +- Added randomization when running the cleanup thread periodically for `ReplicatedMergeTree` tables in order to avoid periodic load spikes when there are a very large number of `ReplicatedMergeTree` tables. +- Support for `ATTACH TABLE ... ON CLUSTER` queries. [\#3025](https://github.com/ClickHouse/ClickHouse/pull/3025) + +#### Bug fixes: {#bug-fixes-13} + +- Fixed an issue with `Dictionary` tables (throws the `Size of offsets doesn't match size of column` or `Unknown compression method` exception). This bug appeared in version 18.10.3. [\#2913](https://github.com/ClickHouse/ClickHouse/issues/2913) +- Fixed a bug when merging `CollapsingMergeTree` tables if one of the data parts is empty (these parts are formed during merge or `ALTER DELETE` if all data was deleted), and the `vertical` algorithm was used for the merge. [\#3049](https://github.com/ClickHouse/ClickHouse/pull/3049) +- Fixed a race condition during `DROP` or `TRUNCATE` for `Memory` tables with a simultaneous `SELECT`, which could lead to server crashes. This bug appeared in version 1.1.54388. [\#3038](https://github.com/ClickHouse/ClickHouse/pull/3038) +- Fixed the possibility of data loss when inserting in `Replicated` tables if the `Session is expired` error is returned (data loss can be detected by the `ReplicatedDataLoss` metric). This error occurred in version 1.1.54378. [\#2939](https://github.com/ClickHouse/ClickHouse/pull/2939) [\#2949](https://github.com/ClickHouse/ClickHouse/pull/2949) [\#2964](https://github.com/ClickHouse/ClickHouse/pull/2964) +- Fixed a segfault during `JOIN ... ON`. [\#3000](https://github.com/ClickHouse/ClickHouse/pull/3000) +- Fixed the error searching column names when the `WHERE` expression consists entirely of a qualified column name, such as `WHERE table.column`. [\#2994](https://github.com/ClickHouse/ClickHouse/pull/2994) +- Fixed the “Not found column” error that occurred when executing distributed queries if a single column consisting of an IN expression with a subquery is requested from a remote server. 
[\#3087](https://github.com/ClickHouse/ClickHouse/pull/3087) +- Fixed the `Block structure mismatch in UNION stream: different number of columns` error that occurred for distributed queries if one of the shards is local and the other is not, and optimization of the move to `PREWHERE` is triggered. [\#2226](https://github.com/ClickHouse/ClickHouse/pull/2226) [\#3037](https://github.com/ClickHouse/ClickHouse/pull/3037) [\#3055](https://github.com/ClickHouse/ClickHouse/pull/3055) [\#3065](https://github.com/ClickHouse/ClickHouse/pull/3065) [\#3073](https://github.com/ClickHouse/ClickHouse/pull/3073) [\#3090](https://github.com/ClickHouse/ClickHouse/pull/3090) [\#3093](https://github.com/ClickHouse/ClickHouse/pull/3093) +- Fixed the `pointInPolygon` function for certain cases of non-convex polygons. [\#2910](https://github.com/ClickHouse/ClickHouse/pull/2910) +- Fixed the incorrect result when comparing `nan` with integers. [\#3024](https://github.com/ClickHouse/ClickHouse/pull/3024) +- Fixed an error in the `zlib-ng` library that could lead to segfault in rare cases. [\#2854](https://github.com/ClickHouse/ClickHouse/pull/2854) +- Fixed a memory leak when inserting into a table with `AggregateFunction` columns, if the state of the aggregate function is not simple (allocates memory separately), and if a single insertion request results in multiple small blocks. [\#3084](https://github.com/ClickHouse/ClickHouse/pull/3084) +- Fixed a race condition when creating and deleting the same `Buffer` or `MergeTree` table simultaneously. +- Fixed the possibility of a segfault when comparing tuples made up of certain non-trivial types, such as tuples. [\#2989](https://github.com/ClickHouse/ClickHouse/pull/2989) +- Fixed the possibility of a segfault when running certain `ON CLUSTER` queries. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2960) +- Fixed an error in the `arrayDistinct` function for `Nullable` array elements. [\#2845](https://github.com/ClickHouse/ClickHouse/pull/2845) [\#2937](https://github.com/ClickHouse/ClickHouse/pull/2937) +- The `enable_optimize_predicate_expression` option now correctly supports cases with `SELECT *`. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2929) +- Fixed the segfault when re-initializing the ZooKeeper session. [\#2917](https://github.com/ClickHouse/ClickHouse/pull/2917) +- Fixed potential blocking when working with ZooKeeper. +- Fixed incorrect code for adding nested data structures in a `SummingMergeTree`. +- When allocating memory for states of aggregate functions, alignment is correctly taken into account, which makes it possible to use operations that require alignment when implementing states of aggregate functions. [chenxing-xc](https://github.com/ClickHouse/ClickHouse/pull/2808) + +#### Security fix: {#security-fix} + +- Safe use of ODBC data sources. Interaction with ODBC drivers uses a separate `clickhouse-odbc-bridge` process. Errors in third-party ODBC drivers no longer cause problems with server stability or vulnerabilities. [\#2828](https://github.com/ClickHouse/ClickHouse/pull/2828) [\#2879](https://github.com/ClickHouse/ClickHouse/pull/2879) [\#2886](https://github.com/ClickHouse/ClickHouse/pull/2886) [\#2893](https://github.com/ClickHouse/ClickHouse/pull/2893) [\#2921](https://github.com/ClickHouse/ClickHouse/pull/2921) +- Fixed incorrect validation of the file path in the `catBoostPool` table function. 
[\#2894](https://github.com/ClickHouse/ClickHouse/pull/2894) +- The contents of system tables (`tables`, `databases`, `parts`, `columns`, `parts_columns`, `merges`, `mutations`, `replicas`, and `replication_queue`) are filtered according to the user’s configured access to databases (`allow_databases`). [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2856) + +#### Backward incompatible changes: {#backward-incompatible-changes-3} + +- In queries with JOIN, the star character expands to a list of columns in all tables, in compliance with the SQL standard. You can restore the old behavior by setting `asterisk_left_columns_only` to 1 on the user configuration level. + +#### Build changes: {#build-changes-2} + +- Most integration tests can now be run by commit. +- Code style checks can also be run by commit. +- The `memcpy` implementation is chosen correctly when building on CentOS7/Fedora. [Etienne Champetier](https://github.com/ClickHouse/ClickHouse/pull/2912) +- When using clang to build, some warnings from `-Weverything` have been added, in addition to the regular `-Wall-Wextra -Werror`. [\#2957](https://github.com/ClickHouse/ClickHouse/pull/2957) +- Debugging the build uses the `jemalloc` debug option. +- The interface of the library for interacting with ZooKeeper is declared abstract. [\#2950](https://github.com/ClickHouse/ClickHouse/pull/2950) + +## ClickHouse release 18.10 {#clickhouse-release-18-10} + +### ClickHouse release 18.10.3, 2018-08-13 {#clickhouse-release-18-10-3-2018-08-13} + +#### New features: {#new-features-5} + +- HTTPS can be used for replication. [\#2760](https://github.com/ClickHouse/ClickHouse/pull/2760) +- Added the functions `murmurHash2_64`, `murmurHash3_32`, `murmurHash3_64`, and `murmurHash3_128` in addition to the existing `murmurHash2_32`. [\#2791](https://github.com/ClickHouse/ClickHouse/pull/2791) +- Support for Nullable types in the ClickHouse ODBC driver (`ODBCDriver2` output format). [\#2834](https://github.com/ClickHouse/ClickHouse/pull/2834) +- Support for `UUID` in the key columns. + +#### Improvements: {#improvements-5} -#### New features: +- Clusters can be removed without restarting the server when they are deleted from the config files. [\#2777](https://github.com/ClickHouse/ClickHouse/pull/2777) +- External dictionaries can be removed without restarting the server when they are removed from config files. [\#2779](https://github.com/ClickHouse/ClickHouse/pull/2779) +- Added `SETTINGS` support for the `Kafka` table engine. [Alexander Marshalov](https://github.com/ClickHouse/ClickHouse/pull/2781) +- Improvements for the `UUID` data type (not yet complete). [\#2618](https://github.com/ClickHouse/ClickHouse/pull/2618) +- Support for empty parts after merges in the `SummingMergeTree`, `CollapsingMergeTree` and `VersionedCollapsingMergeTree` engines. [\#2815](https://github.com/ClickHouse/ClickHouse/pull/2815) +- Old records of completed mutations are deleted (`ALTER DELETE`). [\#2784](https://github.com/ClickHouse/ClickHouse/pull/2784) +- Added the `system.merge_tree_settings` table. [Kirill Shvakov](https://github.com/ClickHouse/ClickHouse/pull/2841) +- The `system.tables` table now has dependency columns: `dependencies_database` and `dependencies_table`. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2851) +- Added the `max_partition_size_to_drop` config option. [\#2782](https://github.com/ClickHouse/ClickHouse/pull/2782) +- Added the `output_format_json_escape_forward_slashes` option. 
[Alexander Bocharov](https://github.com/ClickHouse/ClickHouse/pull/2812) +- Added the `max_fetch_partition_retries_count` setting. [\#2831](https://github.com/ClickHouse/ClickHouse/pull/2831) +- Added the `prefer_localhost_replica` setting for disabling the preference for a local replica and going to a local replica without inter-process interaction. [\#2832](https://github.com/ClickHouse/ClickHouse/pull/2832) +- The `quantileExact` aggregate function returns `nan` in the case of aggregation on an empty `Float32` or `Float64` set. [Sundy Li](https://github.com/ClickHouse/ClickHouse/pull/2855) -* Queries can be sent in `multipart/form-data` format (in the `query` field), which is useful if external data is also sent for query processing ([Olga Hvostikova](https://github.com/ClickHouse/ClickHouse/pull/2490)). -* Added the ability to enable or disable processing single or double quotes when reading data in CSV format. You can configure this in the `format_csv_allow_single_quotes` and `format_csv_allow_double_quotes` settings ([Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2574)). -* Now `OPTIMIZE TABLE ... FINAL` can be used without specifying the partition for non-replicated variants of `MergeTree` ([Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2599)). +#### Bug fixes: {#bug-fixes-14} -#### Improvements: +- Removed unnecessary escaping of the connection string parameters for ODBC, which made it impossible to establish a connection. This error occurred in version 18.6.0. +- Fixed the logic for processing `REPLACE PARTITION` commands in the replication queue. If there are two `REPLACE` commands for the same partition, the incorrect logic could cause one of them to remain in the replication queue and not be executed. [\#2814](https://github.com/ClickHouse/ClickHouse/pull/2814) +- Fixed a merge bug when all data parts were empty (parts that were formed from a merge or from `ALTER DELETE` if all data was deleted). This bug appeared in version 18.1.0. [\#2930](https://github.com/ClickHouse/ClickHouse/pull/2930) +- Fixed an error for concurrent `Set` or `Join`. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2823) +- Fixed the `Block structure mismatch in UNION stream: different number of columns` error that occurred for `UNION ALL` queries inside a sub-query if one of the `SELECT` queries contains duplicate column names. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2094) +- Fixed a memory leak if an exception occurred when connecting to a MySQL server. +- Fixed incorrect clickhouse-client response code in case of a query error. +- Fixed incorrect behavior of materialized views containing DISTINCT. [\#2795](https://github.com/ClickHouse/ClickHouse/issues/2795) -* Improved performance, reduced memory consumption, and correct memory consumption tracking with use of the IN operator when a table index could be used ([#2584](https://github.com/ClickHouse/ClickHouse/pull/2584)). -* Removed redundant checking of checksums when adding a data part. This is important when there are a large number of replicas, because in these cases the total number of checks was equal to N^2. -* Added support for `Array(Tuple(...))` arguments for the `arrayEnumerateUniq` function ([#2573](https://github.com/ClickHouse/ClickHouse/pull/2573)). -* Added `Nullable` support for the `runningDifference` function ([#2594](https://github.com/ClickHouse/ClickHouse/pull/2594)). 
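A brief sketch of the `Nullable` support for `runningDifference` mentioned in the entry above; the values are made up for illustration:

```sql
-- runningDifference now accepts a Nullable column without error.
SELECT value, runningDifference(value) AS diff
FROM
(
    SELECT arrayJoin([toNullable(1), 3, NULL, 10]) AS value
);
```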
-* Improved query analysis performance when there is a very large number of expressions ([#2572](https://github.com/ClickHouse/ClickHouse/pull/2572)). -* Faster selection of data parts for merging in `ReplicatedMergeTree` tables. Faster recovery of the ZooKeeper session ([#2597](https://github.com/ClickHouse/ClickHouse/pull/2597)). -* The `format_version.txt` file for `MergeTree` tables is re-created if it is missing, which makes sense if ClickHouse is launched after copying the directory structure without files ([Ciprian Hacman](https://github.com/ClickHouse/ClickHouse/pull/2593)). +#### Backward incompatible changes {#backward-incompatible-changes-4} -#### Bug fixes: +- Removed support for CHECK TABLE queries for Distributed tables. -* Fixed a bug when working with ZooKeeper that could make it impossible to recover the session and readonly states of tables before restarting the server. -* Fixed a bug when working with ZooKeeper that could result in old nodes not being deleted if the session is interrupted. -* Fixed an error in the `quantileTDigest` function for Float arguments (this bug was introduced in version 1.1.54388) ([Mikhail Surin](https://github.com/ClickHouse/ClickHouse/pull/2553)). -* Fixed a bug in the index for MergeTree tables if the primary key column is located inside the function for converting types between signed and unsigned integers of the same size ([#2603](https://github.com/ClickHouse/ClickHouse/pull/2603)). -* Fixed segfault if `macros` are used but they aren't in the config file ([#2570](https://github.com/ClickHouse/ClickHouse/pull/2570)). -* Fixed switching to the default database when reconnecting the client ([#2583](https://github.com/ClickHouse/ClickHouse/pull/2583)). -* Fixed a bug that occurred when the `use_index_for_in_with_subqueries` setting was disabled. +#### Build changes: {#build-changes-3} -#### Security fix: +- The allocator has been replaced: `jemalloc` is now used instead of `tcmalloc`. In some scenarios, this increases speed up to 20%. However, there are queries that have slowed by up to 20%. Memory consumption has been reduced by approximately 10% in some scenarios, with improved stability. With highly competitive loads, CPU usage in userspace and in system shows just a slight increase. [\#2773](https://github.com/ClickHouse/ClickHouse/pull/2773) +- Use of libressl from a submodule. [\#1983](https://github.com/ClickHouse/ClickHouse/pull/1983) [\#2807](https://github.com/ClickHouse/ClickHouse/pull/2807) +- Use of unixodbc from a submodule. [\#2789](https://github.com/ClickHouse/ClickHouse/pull/2789) +- Use of mariadb-connector-c from a submodule. [\#2785](https://github.com/ClickHouse/ClickHouse/pull/2785) +- Added functional test files to the repository that depend on the availability of test data (for the time being, without the test data itself). -* Sending files is no longer possible when connected to MySQL (`LOAD DATA LOCAL INFILE`). +## ClickHouse release 18.6 {#clickhouse-release-18-6} -### ClickHouse release 1.1.54388, 2018-06-28 +### ClickHouse release 18.6.0, 2018-08-02 {#clickhouse-release-18-6-0-2018-08-02} -#### New features: +#### New features: {#new-features-6} -* Support for the `ALTER TABLE t DELETE WHERE` query for replicated tables. Added the `system.mutations` table to track progress of this type of queries. -* Support for the `ALTER TABLE t [REPLACE|ATTACH] PARTITION` query for \*MergeTree tables. 
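As a concrete illustration of the `ALTER TABLE t DELETE WHERE` support and the `system.mutations` table mentioned above; the table and column names are hypothetical:

```sql
-- Issue a mutation on a replicated MergeTree table, then track its progress.
ALTER TABLE hits_replicated DELETE WHERE event_date < '2018-01-01';

SELECT mutation_id, command, parts_to_do, is_done
FROM system.mutations
WHERE table = 'hits_replicated';
```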
-* Support for the `TRUNCATE TABLE` query ([Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2260)) -* Several new `SYSTEM` queries for replicated tables (`RESTART REPLICAS`, `SYNC REPLICA`, `[STOP|START] [MERGES|FETCHES|SENDS REPLICATED|REPLICATION QUEUES]`). -* Added the ability to write to a table with the MySQL engine and the corresponding table function ([sundy-li](https://github.com/ClickHouse/ClickHouse/pull/2294)). -* Added the `url()` table function and the `URL` table engine ([Alexander Sapin](https://github.com/ClickHouse/ClickHouse/pull/2501)). -* Added the `windowFunnel` aggregate function ([sundy-li](https://github.com/ClickHouse/ClickHouse/pull/2352)). -* New `startsWith` and `endsWith` functions for strings ([Vadim Plakhtinsky](https://github.com/ClickHouse/ClickHouse/pull/2429)). -* The `numbers()` table function now allows you to specify the offset ([Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2535)). -* The password to `clickhouse-client` can be entered interactively. -* Server logs can now be sent to syslog ([Alexander Krasheninnikov](https://github.com/ClickHouse/ClickHouse/pull/2459)). -* Support for logging in dictionaries with a shared library source ([Alexander Sapin](https://github.com/ClickHouse/ClickHouse/pull/2472)). -* Support for custom CSV delimiters ([Ivan Zhukov](https://github.com/ClickHouse/ClickHouse/pull/2263)) -* Added the `date_time_input_format` setting. If you switch this setting to `'best_effort'`, DateTime values will be read in a wide range of formats. -* Added the `clickhouse-obfuscator` utility for data obfuscation. Usage example: publishing data used in performance tests. +- Added support for ON expressions for the JOIN ON syntax: + `JOIN ON Expr([table.]column ...) = Expr([table.]column, ...) [AND Expr([table.]column, ...) = Expr([table.]column, ...) ...]` + The expression must be a chain of equalities joined by the AND operator. Each side of the equality can be an arbitrary expression over the columns of one of the tables. The use of fully qualified column names is supported (`table.name`, `database.table.name`, `table_alias.name`, `subquery_alias.name`) for the right table. [\#2742](https://github.com/ClickHouse/ClickHouse/pull/2742) +- HTTPS can be enabled for replication. [\#2760](https://github.com/ClickHouse/ClickHouse/pull/2760) -#### Experimental features: +#### Improvements: {#improvements-6} -* Added the ability to calculate `and` arguments only where they are needed ([Anastasia Tsarkova](https://github.com/ClickHouse/ClickHouse/pull/2272)) -* JIT compilation to native code is now available for some expressions ([pyos](https://github.com/ClickHouse/ClickHouse/pull/2277)). +- The server passes the patch component of its version to the client. Data about the patch version component is in `system.processes` and `query_log`. [\#2646](https://github.com/ClickHouse/ClickHouse/pull/2646) -#### Bug fixes: +## ClickHouse release 18.5 {#clickhouse-release-18-5} -* Duplicates no longer appear for a query with `DISTINCT` and `ORDER BY`. -* Queries with `ARRAY JOIN` and `arrayFilter` no longer return an incorrect result. -* Fixed an error when reading an array column from a Nested structure ([#2066](https://github.com/ClickHouse/ClickHouse/issues/2066)). -* Fixed an error when analyzing queries with a HAVING clause like `HAVING tuple IN (...)`. -* Fixed an error when analyzing queries with recursive aliases. 
-* Fixed an error when reading from ReplacingMergeTree with a condition in PREWHERE that filters all rows ([#2525](https://github.com/ClickHouse/ClickHouse/issues/2525)). -* User profile settings were not applied when using sessions in the HTTP interface. -* Fixed how settings are applied from the command line parameters in clickhouse-local. -* The ZooKeeper client library now uses the session timeout received from the server. -* Fixed a bug in the ZooKeeper client library when the client waited for the server response longer than the timeout. -* Fixed pruning of parts for queries with conditions on partition key columns ([#2342](https://github.com/ClickHouse/ClickHouse/issues/2342)). -* Merges are now possible after `CLEAR COLUMN IN PARTITION` ([#2315](https://github.com/ClickHouse/ClickHouse/issues/2315)). -* Type mapping in the ODBC table function has been fixed ([sundy-li](https://github.com/ClickHouse/ClickHouse/pull/2268)). -* Type comparisons have been fixed for `DateTime` with and without the time zone ([Alexander Bocharov](https://github.com/ClickHouse/ClickHouse/pull/2400)). -* Fixed syntactic parsing and formatting of the `CAST` operator. -* Fixed insertion into a materialized view for the Distributed table engine ([Babacar Diassé](https://github.com/ClickHouse/ClickHouse/pull/2411)). -* Fixed a race condition when writing data from the `Kafka` engine to materialized views ([Yangkuan Liu](https://github.com/ClickHouse/ClickHouse/pull/2448)). -* Fixed SSRF in the remote() table function. -* Fixed exit behavior of `clickhouse-client` in multiline mode ([#2510](https://github.com/ClickHouse/ClickHouse/issues/2510)). +### ClickHouse release 18.5.1, 2018-07-31 {#clickhouse-release-18-5-1-2018-07-31} -#### Improvements: +#### New features: {#new-features-7} -* Background tasks in replicated tables are now performed in a thread pool instead of in separate threads ([Silviu Caragea](https://github.com/ClickHouse/ClickHouse/pull/1722)). -* Improved LZ4 compression performance. -* Faster analysis for queries with a large number of JOINs and sub-queries. -* The DNS cache is now updated automatically when there are too many network errors. -* Table inserts no longer occur if the insert into one of the materialized views is not possible because it has too many parts. -* Corrected the discrepancy in the event counters `Query`, `SelectQuery`, and `InsertQuery`. -* Expressions like `tuple IN (SELECT tuple)` are allowed if the tuple types match. -* A server with replicated tables can start even if you haven't configured ZooKeeper. -* When calculating the number of available CPU cores, limits on cgroups are now taken into account ([Atri Sharma](https://github.com/ClickHouse/ClickHouse/pull/2325)). -* Added chown for config directories in the systemd config file ([Mikhail Shiryaev](https://github.com/ClickHouse/ClickHouse/pull/2421)). - -#### Build changes: - -* The gcc8 compiler can be used for builds. -* Added the ability to build llvm from submodule. -* The version of the librdkafka library has been updated to v0.11.4. -* Added the ability to use the system libcpuid library. The library version has been updated to 0.4.0. -* Fixed the build using the vectorclass library ([Babacar Diassé](https://github.com/ClickHouse/ClickHouse/pull/2274)). -* Cmake now generates files for ninja by default (like when using `-G Ninja`). -* Added the ability to use the libtinfo library instead of libtermcap ([Georgy Kondratiev](https://github.com/ClickHouse/ClickHouse/pull/2519)). 
-* Fixed a header file conflict in Fedora Rawhide ([#2520](https://github.com/ClickHouse/ClickHouse/issues/2520)). +- Added the hash function `murmurHash2_32` [\#2756](https://github.com/ClickHouse/ClickHouse/pull/2756). -#### Backward incompatible changes: +#### Improvements: {#improvements-7} -* Removed escaping in `Vertical` and `Pretty*` formats and deleted the `VerticalRaw` format. -* If servers with version 1.1.54388 (or newer) and servers with an older version are used simultaneously in a distributed query and the query has the `cast(x, 'Type')` expression without the `AS` keyword and doesn't have the word `cast` in uppercase, an exception will be thrown with a message like `Not found column cast(0, 'UInt8') in block`. Solution: Update the server on the entire cluster. +- Now you can use the `from_env` [\#2741](https://github.com/ClickHouse/ClickHouse/pull/2741) attribute to set values in config files from environment variables. +- Added case-insensitive versions of the `coalesce`, `ifNull`, and `nullIf functions` [\#2752](https://github.com/ClickHouse/ClickHouse/pull/2752). -### ClickHouse release 1.1.54385, 2018-06-01 +#### Bug fixes: {#bug-fixes-15} -#### Bug fixes: +- Fixed a possible bug when starting a replica [\#2759](https://github.com/ClickHouse/ClickHouse/pull/2759). -* Fixed an error that in some cases caused ZooKeeper operations to block. +## ClickHouse release 18.4 {#clickhouse-release-18-4} -### ClickHouse release 1.1.54383, 2018-05-22 +### ClickHouse release 18.4.0, 2018-07-28 {#clickhouse-release-18-4-0-2018-07-28} -#### Bug fixes: +#### New features: {#new-features-8} -* Fixed a slowdown of replication queue if a table has many replicas. +- Added system tables: `formats`, `data_type_families`, `aggregate_function_combinators`, `table_functions`, `table_engines`, `collations` [\#2721](https://github.com/ClickHouse/ClickHouse/pull/2721). +- Added the ability to use a table function instead of a table as an argument of a `remote` or `cluster table function` [\#2708](https://github.com/ClickHouse/ClickHouse/pull/2708). +- Support for `HTTP Basic` authentication in the replication protocol [\#2727](https://github.com/ClickHouse/ClickHouse/pull/2727). +- The `has` function now allows searching for a numeric value in an array of `Enum` values [Maxim Khrisanfov](https://github.com/ClickHouse/ClickHouse/pull/2699). +- Support for adding arbitrary message separators when reading from `Kafka` [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2701). -### ClickHouse release 1.1.54381, 2018-05-14 +#### Improvements: {#improvements-8} -#### Bug fixes: +- The `ALTER TABLE t DELETE WHERE` query does not rewrite data parts that were not affected by the WHERE condition [\#2694](https://github.com/ClickHouse/ClickHouse/pull/2694). +- The `use_minimalistic_checksums_in_zookeeper` option for `ReplicatedMergeTree` tables is enabled by default. This setting was added in version 1.1.54378, 2018-04-16. Versions that are older than 1.1.54378 can no longer be installed. +- Support for running `KILL` and `OPTIMIZE` queries that specify `ON CLUSTER` [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2689). -* Fixed a nodes leak in ZooKeeper when ClickHouse loses connection to ZooKeeper server. +#### Bug fixes: {#bug-fixes-16} -### ClickHouse release 1.1.54380, 2018-04-21 +- Fixed the error `Column ... is not under an aggregate function and not in GROUP BY` for aggregation with an IN expression. This bug appeared in version 18.1.0. 
([bbdd780b](https://github.com/ClickHouse/ClickHouse/commit/bbdd780be0be06a0f336775941cdd536878dd2c2)) +- Fixed a bug in the `windowFunnel` aggregate function [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2735). +- Fixed a bug in the `anyHeavy` aggregate function ([a2101df2](https://github.com/ClickHouse/ClickHouse/commit/a2101df25a6a0fba99aa71f8793d762af2b801ee)) +- Fixed server crash when using the `countArray()` aggregate function. -#### New features: +#### Backward incompatible changes: {#backward-incompatible-changes-5} -* Added the table function `file(path, format, structure)`. An example reading bytes from `/dev/urandom`: `ln -s /dev/urandom /var/lib/clickhouse/user_files/random``clickhouse-client -q "SELECT * FROM file('random', 'RowBinary', 'd UInt8') LIMIT 10"`. +- Parameters for the `Kafka` engine were changed from `Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format[, kafka_schema, kafka_num_consumers])` to `Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format[, kafka_row_delimiter, kafka_schema, kafka_num_consumers])`. If your tables use the `kafka_schema` or `kafka_num_consumers` parameters, you have to manually edit the metadata files `path/metadata/database/table.sql` and add the `kafka_row_delimiter` parameter with the value `''`. -#### Improvements: +## ClickHouse release 18.1 {#clickhouse-release-18-1} -* Subqueries can be wrapped in `()` brackets to enhance query readability. For example: `(SELECT 1) UNION ALL (SELECT 1)`. -* Simple `SELECT` queries from the `system.processes` table are not included in the `max_concurrent_queries` limit. - -#### Bug fixes: - -* Fixed incorrect behavior of the `IN` operator when select from `MATERIALIZED VIEW`. -* Fixed incorrect filtering by partition index in expressions like `partition_key_column IN (...)`. -* Fixed inability to execute `OPTIMIZE` query on non-leader replica if `REANAME` was performed on the table. -* Fixed the authorization error when executing `OPTIMIZE` or `ALTER` queries on a non-leader replica. -* Fixed freezing of `KILL QUERY`. -* Fixed an error in ZooKeeper client library which led to loss of watches, freezing of distributed DDL queue, and slowdowns in the replication queue if a non-empty `chroot` prefix is used in the ZooKeeper configuration. - -#### Backward incompatible changes: - -* Removed support for expressions like `(a, b) IN (SELECT (a, b))` (you can use the equivalent expression `(a, b) IN (SELECT a, b)`). In previous releases, these expressions led to undetermined `WHERE` filtering or caused errors. - -### ClickHouse release 1.1.54378, 2018-04-16 - -#### New features: - -* Logging level can be changed without restarting the server. -* Added the `SHOW CREATE DATABASE` query. -* The `query_id` can be passed to `clickhouse-client` (elBroom). -* New setting: `max_network_bandwidth_for_all_users`. -* Added support for `ALTER TABLE ... PARTITION ... ` for `MATERIALIZED VIEW`. -* Added information about the size of data parts in uncompressed form in the system table. -* Server-to-server encryption support for distributed tables (`1` in the replica config in ``). -* Configuration of the table level for the `ReplicatedMergeTree` family in order to minimize the amount of data stored in Zookeeper: : `use_minimalistic_checksums_in_zookeeper = 1` -* Configuration of the `clickhouse-client` prompt. By default, server names are now output to the prompt. The server's display name can be changed. It's also sent in the `X-ClickHouse-Display-Name` HTTP header (Kirill Shvakov).
-* Multiple comma-separated `topics` can be specified for the `Kafka` engine (Tobias Adamson) -* When a query is stopped by `KILL QUERY` or `replace_running_query`, the client receives the `Query was canceled` exception instead of an incomplete result. - -#### Improvements: - -* `ALTER TABLE ... DROP/DETACH PARTITION` queries are run at the front of the replication queue. -* `SELECT ... FINAL` and `OPTIMIZE ... FINAL` can be used even when the table has a single data part. -* A `query_log` table is recreated on the fly if it was deleted manually (Kirill Shvakov). -* The `lengthUTF8` function runs faster (zhang2014). -* Improved performance of synchronous inserts in `Distributed` tables (`insert_distributed_sync = 1`) when there is a very large number of shards. -* The server accepts the `send_timeout` and `receive_timeout` settings from the client and applies them when connecting to the client (they are applied in reverse order: the server socket's `send_timeout` is set to the `receive_timeout` value received from the client, and vice versa). -* More robust crash recovery for asynchronous insertion into `Distributed` tables. -* The return type of the `countEqual` function changed from `UInt32` to `UInt64` (谢磊). - -#### Bug fixes: - -* Fixed an error with `IN` when the left side of the expression is `Nullable`. -* Correct results are now returned when using tuples with `IN` when some of the tuple components are in the table index. -* The `max_execution_time` limit now works correctly with distributed queries. -* Fixed errors when calculating the size of composite columns in the `system.columns` table. -* Fixed an error when creating a temporary table `CREATE TEMPORARY TABLE IF NOT EXISTS.` -* Fixed errors in `StorageKafka` (##2075) -* Fixed server crashes from invalid arguments of certain aggregate functions. -* Fixed the error that prevented the `DETACH DATABASE` query from stopping background tasks for `ReplicatedMergeTree` tables. -* `Too many parts` state is less likely to happen when inserting into aggregated materialized views (##2084). -* Corrected recursive handling of substitutions in the config if a substitution must be followed by another substitution on the same level. -* Corrected the syntax in the metadata file when creating a `VIEW` that uses a query with `UNION ALL`. -* `SummingMergeTree` now works correctly for summation of nested data structures with a composite key. -* Fixed the possibility of a race condition when choosing the leader for `ReplicatedMergeTree` tables. - -#### Build changes: - -* The build supports `ninja` instead of `make` and uses `ninja` by default for building releases. -* Renamed packages: `clickhouse-server-base` in `clickhouse-common-static`; `clickhouse-server-common` in `clickhouse-server`; `clickhouse-common-dbg` in `clickhouse-common-static-dbg`. To install, use `clickhouse-server clickhouse-client`. Packages with the old names will still load in the repositories for backward compatibility. - -#### Backward incompatible changes: - -* Removed the special interpretation of an IN expression if an array is specified on the left side. Previously, the expression `arr IN (set)` was interpreted as "at least one `arr` element belongs to the `set`". To get the same behavior in the new version, write `arrayExists(x -> x IN (set), arr)`. -* Disabled the incorrect use of the socket option `SO_REUSEPORT`, which was incorrectly enabled by default in the Poco library. 
Note that on Linux there is no longer any reason to simultaneously specify the addresses `::` and `0.0.0.0` for listen – use just `::`, which allows listening to the connection both over IPv4 and IPv6 (with the default kernel config settings). You can also revert to the behavior from previous versions by specifying `1` in the config. - -### ClickHouse release 1.1.54370, 2018-03-16 - -#### New features: - -* Added the `system.macros` table and auto updating of macros when the config file is changed. -* Added the `SYSTEM RELOAD CONFIG` query. -* Added the `maxIntersections(left_col, right_col)` aggregate function, which returns the maximum number of simultaneously intersecting intervals `[left; right]`. The `maxIntersectionsPosition(left, right)` function returns the beginning of the "maximum" interval. ([Michael Furmur](https://github.com/ClickHouse/ClickHouse/pull/2012)). - -#### Improvements: - -* When inserting data in a `Replicated` table, fewer requests are made to `ZooKeeper` (and most of the user-level errors have disappeared from the `ZooKeeper` log). -* Added the ability to create aliases for data sets. Example: `WITH (1, 2, 3) AS set SELECT number IN set FROM system.numbers LIMIT 10`. - -#### Bug fixes: - -* Fixed the `Illegal PREWHERE` error when reading from Merge tables for `Distributed`tables. -* Added fixes that allow you to start clickhouse-server in IPv4-only Docker containers. -* Fixed a race condition when reading from system `system.parts_columns tables.` -* Removed double buffering during a synchronous insert to a `Distributed` table, which could have caused the connection to timeout. -* Fixed a bug that caused excessively long waits for an unavailable replica before beginning a `SELECT` query. -* Fixed incorrect dates in the `system.parts` table. -* Fixed a bug that made it impossible to insert data in a `Replicated` table if `chroot` was non-empty in the configuration of the `ZooKeeper` cluster. -* Fixed the vertical merging algorithm for an empty `ORDER BY` table. -* Restored the ability to use dictionaries in queries to remote tables, even if these dictionaries are not present on the requestor server. This functionality was lost in release 1.1.54362. -* Restored the behavior for queries like `SELECT * FROM remote('server2', default.table) WHERE col IN (SELECT col2 FROM default.table)` when the right side of the `IN` should use a remote `default.table` instead of a local one. This behavior was broken in version 1.1.54358. -* Removed extraneous error-level logging of `Not found column ... in block`. - -### Clickhouse Release 1.1.54362, 2018-03-11 - -#### New features: - -* Aggregation without `GROUP BY` for an empty set (such as `SELECT count(*) FROM table WHERE 0`) now returns a result with one row with null values for aggregate functions, in compliance with the SQL standard. To restore the old behavior (return an empty result), set `empty_result_for_aggregation_by_empty_set` to 1. -* Added type conversion for `UNION ALL`. Different alias names are allowed in `SELECT` positions in `UNION ALL`, in compliance with the SQL standard. -* Arbitrary expressions are supported in `LIMIT BY` clauses. Previously, it was only possible to use columns resulting from `SELECT`. -* An index of `MergeTree` tables is used when `IN` is applied to a tuple of expressions from the columns of the primary key. Example: `WHERE (UserID, EventDate) IN ((123, '2000-01-01'), ...)` (Anastasiya Tsarkova). -* Added the `clickhouse-copier` tool for copying between clusters and resharding data (beta). 
-* Added consistent hashing functions: `yandexConsistentHash`, `jumpConsistentHash`, `sumburConsistentHash`. They can be used as a sharding key in order to reduce the amount of network traffic during subsequent reshardings. -* Added functions: `arrayAny`, `arrayAll`, `hasAny`, `hasAll`, `arrayIntersect`, `arrayResize`. -* Added the `arrayCumSum` function (Javi Santana). -* Added the `parseDateTimeBestEffort`, `parseDateTimeBestEffortOrZero`, and `parseDateTimeBestEffortOrNull` functions to read the DateTime from a string containing text in a wide variety of possible formats. -* Data can be partially reloaded from external dictionaries during updating (load just the records in which the value of the specified field greater than in the previous download) (Arsen Hakobyan). -* Added the `cluster` table function. Example: `cluster(cluster_name, db, table)`. The `remote` table function can accept the cluster name as the first argument, if it is specified as an identifier. -* The `remote` and `cluster` table functions can be used in `INSERT` queries. -* Added the `create_table_query` and `engine_full` virtual columns to the `system.tables`table . The `metadata_modification_time` column is virtual. -* Added the `data_path` and `metadata_path` columns to `system.tables`and` system.databases` tables, and added the `path` column to the `system.parts` and `system.parts_columns` tables. -* Added additional information about merges in the `system.part_log` table. -* An arbitrary partitioning key can be used for the `system.query_log` table (Kirill Shvakov). -* The `SHOW TABLES` query now also shows temporary tables. Added temporary tables and the `is_temporary` column to `system.tables` (zhang2014). -* Added `DROP TEMPORARY TABLE` and `EXISTS TEMPORARY TABLE` queries (zhang2014). -* Support for `SHOW CREATE TABLE` for temporary tables (zhang2014). -* Added the `system_profile` configuration parameter for the settings used by internal processes. -* Support for loading `object_id` as an attribute in `MongoDB` dictionaries (Pavel Litvinenko). -* Reading `null` as the default value when loading data for an external dictionary with the `MongoDB` source (Pavel Litvinenko). -* Reading `DateTime` values in the `Values` format from a Unix timestamp without single quotes. -* Failover is supported in `remote` table functions for cases when some of the replicas are missing the requested table. -* Configuration settings can be overridden in the command line when you run `clickhouse-server`. Example: `clickhouse-server -- --logger.level=information`. -* Implemented the `empty` function from a `FixedString` argument: the function returns 1 if the string consists entirely of null bytes (zhang2014). -* Added the `listen_try`configuration parameter for listening to at least one of the listen addresses without quitting, if some of the addresses can't be listened to (useful for systems with disabled support for IPv4 or IPv6). -* Added the `VersionedCollapsingMergeTree` table engine. -* Support for rows and arbitrary numeric types for the `library` dictionary source. -* `MergeTree` tables can be used without a primary key (you need to specify `ORDER BY tuple()`). -* A `Nullable` type can be `CAST` to a non-`Nullable` type if the argument is not `NULL`. -* `RENAME TABLE` can be performed for `VIEW`. -* Added the `throwIf` function. -* Added the `odbc_default_field_size` option, which allows you to extend the maximum size of the value loaded from an ODBC source (by default, it is 1024). 
-* The `system.processes` table and `SHOW PROCESSLIST` now have the `is_cancelled` and `peak_memory_usage` columns. - -#### Improvements: - -* Limits and quotas on the result are no longer applied to intermediate data for `INSERT SELECT` queries or for `SELECT` subqueries. -* Fewer false triggers of `force_restore_data` when checking the status of `Replicated` tables when the server starts. -* Added the `allow_distributed_ddl` option. -* Nondeterministic functions are not allowed in expressions for `MergeTree` table keys. -* Files with substitutions from `config.d` directories are loaded in alphabetical order. -* Improved performance of the `arrayElement` function in the case of a constant multidimensional array with an empty array as one of the elements. Example: `[[1], []][x]`. -* The server starts faster now when using configuration files with very large substitutions (for instance, very large lists of IP networks). -* When running a query, table valued functions run once. Previously, `remote` and `mysql` table valued functions performed the same query twice to retrieve the table structure from a remote server. -* The `MkDocs` documentation generator is used. -* When you try to delete a table column that `DEFAULT`/`MATERIALIZED` expressions of other columns depend on, an exception is thrown (zhang2014). -* Added the ability to parse an empty line in text formats as the number 0 for `Float` data types. This feature was previously available but was lost in release 1.1.54342. -* `Enum` values can be used in `min`, `max`, `sum` and some other functions. In these cases, it uses the corresponding numeric values. This feature was previously available but was lost in the release 1.1.54337. -* Added `max_expanded_ast_elements` to restrict the size of the AST after recursively expanding aliases. - -#### Bug fixes: - -* Fixed cases when unnecessary columns were removed from subqueries in error, or not removed from subqueries containing `UNION ALL`. -* Fixed a bug in merges for `ReplacingMergeTree` tables. -* Fixed synchronous insertions in `Distributed` tables (`insert_distributed_sync = 1`). -* Fixed segfault for certain uses of `FULL` and `RIGHT JOIN` with duplicate columns in subqueries. -* Fixed segfault for certain uses of `replace_running_query` and `KILL QUERY`. -* Fixed the order of the `source` and `last_exception` columns in the `system.dictionaries` table. -* Fixed a bug when the `DROP DATABASE` query did not delete the file with metadata. -* Fixed the `DROP DATABASE` query for `Dictionary` databases. -* Fixed the low precision of `uniqHLL12` and `uniqCombined` functions for cardinalities greater than 100 million items (Alex Bocharov). -* Fixed the calculation of implicit default values when necessary to simultaneously calculate default explicit expressions in `INSERT` queries (zhang2014). -* Fixed a rare case when a query to a `MergeTree` table couldn't finish (chenxing-xc). -* Fixed a crash that occurred when running a `CHECK` query for `Distributed` tables if all shards are local (chenxing.xc). -* Fixed a slight performance regression with functions that use regular expressions. -* Fixed a performance regression when creating multidimensional arrays from complex expressions. -* Fixed a bug that could cause an extra `FORMAT` section to appear in an `.sql` file with metadata. -* Fixed a bug that caused the `max_table_size_to_drop` limit to apply when trying to delete a `MATERIALIZED VIEW` looking at an explicitly specified table. 
-* Fixed incompatibility with old clients (old clients were sometimes sent data with the `DateTime('timezone')` type, which they do not understand). -* Fixed a bug when reading `Nested` column elements of structures that were added using `ALTER` but that are empty for the old partitions, when the conditions for these columns moved to `PREWHERE`. -* Fixed a bug when filtering tables by virtual `_table` columns in queries to `Merge` tables. -* Fixed a bug when using `ALIAS` columns in `Distributed` tables. -* Fixed a bug that made dynamic compilation impossible for queries with aggregate functions from the `quantile` family. -* Fixed a race condition in the query execution pipeline that occurred in very rare cases when using `Merge` tables with a large number of tables, and when using `GLOBAL` subqueries. -* Fixed a crash when passing arrays of different sizes to an `arrayReduce` function when using aggregate functions from multiple arguments. -* Prohibited the use of queries with `UNION ALL` in a `MATERIALIZED VIEW`. -* Fixed an error during initialization of the `part_log` system table when the server starts (by default, `part_log` is disabled). - -#### Backward incompatible changes: - -* Removed the `distributed_ddl_allow_replicated_alter` option. This behavior is enabled by default. -* Removed the `strict_insert_defaults` setting. If you were using this functionality, write to `clickhouse-feedback@yandex-team.com`. -* Removed the `UnsortedMergeTree` engine. - -### Clickhouse Release 1.1.54343, 2018-02-05 - -* Added macros support for defining cluster names in distributed DDL queries and constructors of Distributed tables: `CREATE TABLE distr ON CLUSTER '{cluster}' (...) ENGINE = Distributed('{cluster}', 'db', 'table')`. -* Now queries like `SELECT ... FROM table WHERE expr IN (subquery)` are processed using the `table` index. -* Improved processing of duplicates when inserting to Replicated tables, so they no longer slow down execution of the replication queue. - -### Clickhouse Release 1.1.54342, 2018-01-22 +### ClickHouse release 18.1.0, 2018-07-23 {#clickhouse-release-18-1-0-2018-07-23} + +#### New features: {#new-features-9} + +- Support for the `ALTER TABLE t DELETE WHERE` query for non-replicated MergeTree tables ([\#2634](https://github.com/ClickHouse/ClickHouse/pull/2634)). +- Support for arbitrary types for the `uniq*` family of aggregate functions ([\#2010](https://github.com/ClickHouse/ClickHouse/issues/2010)). +- Support for arbitrary types in comparison operators ([\#2026](https://github.com/ClickHouse/ClickHouse/issues/2026)). +- The `users.xml` file allows setting a subnet mask in the format `10.0.0.1/255.255.255.0`. This is necessary for using masks for IPv6 networks with zeros in the middle ([\#2637](https://github.com/ClickHouse/ClickHouse/pull/2637)). +- Added the `arrayDistinct` function ([\#2670](https://github.com/ClickHouse/ClickHouse/pull/2670)). +- The SummingMergeTree engine can now work with AggregateFunction type columns ([Constantin S. Pan](https://github.com/ClickHouse/ClickHouse/pull/2566)). + +#### Improvements: {#improvements-9} + +- Changed the numbering scheme for release versions. Now the first part contains the year of release (A.D., Moscow timezone, minus 2000), the second part contains the number for major changes (increases for most releases), and the third part is the patch version. Releases are still backward compatible, unless otherwise stated in the changelog. 
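+  As a rough illustration of the scheme (the output value below is only an example), the running server reports its version with:
+
+  ```sql
+  SELECT version();
+  -- e.g. '18.1.0': 18 = year minus 2000, 1 = major-change counter, 0 = patch level
+  ```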
+- Faster conversions of floating-point numbers to a string ([Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2664)). +- If some rows were skipped during an insert due to parsing errors (this is possible with the `input_allow_errors_num` and `input_allow_errors_ratio` settings enabled), the number of skipped rows is now written to the server log ([Leonardo Cecchi](https://github.com/ClickHouse/ClickHouse/pull/2669)). + +#### Bug fixes: {#bug-fixes-17} + +- Fixed the TRUNCATE command for temporary tables ([Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2624)). +- Fixed a rare deadlock in the ZooKeeper client library that occurred when there was a network error while reading the response ([c315200](https://github.com/ClickHouse/ClickHouse/commit/c315200e64b87e44bdf740707fc857d1fdf7e947)). +- Fixed an error during a CAST to Nullable types ([\#1322](https://github.com/ClickHouse/ClickHouse/issues/1322)). +- Fixed the incorrect result of the `maxIntersection()` function when the boundaries of intervals coincided ([Michael Furmur](https://github.com/ClickHouse/ClickHouse/pull/2657)). +- Fixed incorrect transformation of the OR expression chain in a function argument ([chenxing-xc](https://github.com/ClickHouse/ClickHouse/pull/2663)). +- Fixed performance degradation for queries containing `IN (subquery)` expressions inside another subquery ([\#2571](https://github.com/ClickHouse/ClickHouse/issues/2571)). +- Fixed incompatibility between servers with different versions in distributed queries that use a `CAST` function that isn’t in uppercase letters ([fe8c4d6](https://github.com/ClickHouse/ClickHouse/commit/fe8c4d64e434cacd4ceef34faa9005129f2190a5)). +- Added missing quoting of identifiers for queries to an external DBMS ([\#2635](https://github.com/ClickHouse/ClickHouse/issues/2635)). + +#### Backward incompatible changes: {#backward-incompatible-changes-6} + +- Converting a string containing the number zero to DateTime does not work. Example: `SELECT toDateTime('0')`. This is also the reason that `DateTime DEFAULT '0'` does not work in tables, as well as `0` in dictionaries. Solution: replace `0` with `0000-00-00 00:00:00`. + +## ClickHouse release 1.1 {#clickhouse-release-1-1} + +### ClickHouse release 1.1.54394, 2018-07-12 {#clickhouse-release-1-1-54394-2018-07-12} + +#### New features: {#new-features-10} + +- Added the `histogram` aggregate function ([Mikhail Surin](https://github.com/ClickHouse/ClickHouse/pull/2521)). +- Now `OPTIMIZE TABLE ... FINAL` can be used without specifying partitions for `ReplicatedMergeTree` ([Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2600)). + +#### Bug fixes: {#bug-fixes-18} + +- Fixed a problem with a very small timeout for sockets (one second) for reading and writing when sending and downloading replicated data, which made it impossible to download larger parts if there is a load on the network or disk (it resulted in cyclical attempts to download parts). This error occurred in version 1.1.54388. +- Fixed issues when using chroot in ZooKeeper if you inserted duplicate data blocks in the table. +- The `has` function now works correctly for an array with Nullable elements ([\#2115](https://github.com/ClickHouse/ClickHouse/issues/2115)). +- The `system.tables` table now works correctly when used in distributed queries. The `metadata_modification_time` and `engine_full` columns are now non-virtual. Fixed an error that occurred if only these columns were queried from the table. 
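+  A minimal sanity check for this fix (the database name is arbitrary) is selecting only these formerly virtual columns:
+
+  ```sql
+  SELECT metadata_modification_time, engine_full
+  FROM system.tables
+  WHERE database = 'default';
+  ```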
+- Fixed how an empty `TinyLog` table works after inserting an empty data block ([\#2563](https://github.com/ClickHouse/ClickHouse/issues/2563)). +- The `system.zookeeper` table works if the value of the node in ZooKeeper is NULL. + +### ClickHouse release 1.1.54390, 2018-07-06 {#clickhouse-release-1-1-54390-2018-07-06} + +#### New features: {#new-features-11} + +- Queries can be sent in `multipart/form-data` format (in the `query` field), which is useful if external data is also sent for query processing ([Olga Hvostikova](https://github.com/ClickHouse/ClickHouse/pull/2490)). +- Added the ability to enable or disable processing single or double quotes when reading data in CSV format. You can configure this in the `format_csv_allow_single_quotes` and `format_csv_allow_double_quotes` settings ([Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2574)). +- Now `OPTIMIZE TABLE ... FINAL` can be used without specifying the partition for non-replicated variants of `MergeTree` ([Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2599)). + +#### Improvements: {#improvements-10} + +- Improved performance, reduced memory consumption, and correct memory consumption tracking with use of the IN operator when a table index could be used ([\#2584](https://github.com/ClickHouse/ClickHouse/pull/2584)). +- Removed redundant checking of checksums when adding a data part. This is important when there are a large number of replicas, because in these cases the total number of checks was equal to N^2. +- Added support for `Array(Tuple(...))` arguments for the `arrayEnumerateUniq` function ([\#2573](https://github.com/ClickHouse/ClickHouse/pull/2573)). +- Added `Nullable` support for the `runningDifference` function ([\#2594](https://github.com/ClickHouse/ClickHouse/pull/2594)). +- Improved query analysis performance when there is a very large number of expressions ([\#2572](https://github.com/ClickHouse/ClickHouse/pull/2572)). +- Faster selection of data parts for merging in `ReplicatedMergeTree` tables. Faster recovery of the ZooKeeper session ([\#2597](https://github.com/ClickHouse/ClickHouse/pull/2597)). +- The `format_version.txt` file for `MergeTree` tables is re-created if it is missing, which makes sense if ClickHouse is launched after copying the directory structure without files ([Ciprian Hacman](https://github.com/ClickHouse/ClickHouse/pull/2593)). + +#### Bug fixes: {#bug-fixes-19} + +- Fixed a bug when working with ZooKeeper that could make it impossible to recover the session and readonly states of tables before restarting the server. +- Fixed a bug when working with ZooKeeper that could result in old nodes not being deleted if the session is interrupted. +- Fixed an error in the `quantileTDigest` function for Float arguments (this bug was introduced in version 1.1.54388) ([Mikhail Surin](https://github.com/ClickHouse/ClickHouse/pull/2553)). +- Fixed a bug in the index for MergeTree tables if the primary key column is located inside the function for converting types between signed and unsigned integers of the same size ([\#2603](https://github.com/ClickHouse/ClickHouse/pull/2603)). +- Fixed segfault if `macros` are used but they aren’t in the config file ([\#2570](https://github.com/ClickHouse/ClickHouse/pull/2570)). +- Fixed switching to the default database when reconnecting the client ([\#2583](https://github.com/ClickHouse/ClickHouse/pull/2583)). +- Fixed a bug that occurred when the `use_index_for_in_with_subqueries` setting was disabled. 
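+  A sketch of the affected scenario (table and column names are hypothetical):
+
+  ```sql
+  SET use_index_for_in_with_subqueries = 0;
+  SELECT count()
+  FROM hits
+  WHERE UserID IN (SELECT UserID FROM hits_sample);
+  ```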
+ +#### Security fix: {#security-fix-1} + +- Sending files is no longer possible when connected to MySQL (`LOAD DATA LOCAL INFILE`). + +### ClickHouse release 1.1.54388, 2018-06-28 {#clickhouse-release-1-1-54388-2018-06-28} + +#### New features: {#new-features-12} + +- Support for the `ALTER TABLE t DELETE WHERE` query for replicated tables. Added the `system.mutations` table to track progress of this type of queries. +- Support for the `ALTER TABLE t [REPLACE|ATTACH] PARTITION` query for \*MergeTree tables. +- Support for the `TRUNCATE TABLE` query ([Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2260)) +- Several new `SYSTEM` queries for replicated tables (`RESTART REPLICAS`, `SYNC REPLICA`, `[STOP|START] [MERGES|FETCHES|SENDS REPLICATED|REPLICATION QUEUES]`). +- Added the ability to write to a table with the MySQL engine and the corresponding table function ([sundy-li](https://github.com/ClickHouse/ClickHouse/pull/2294)). +- Added the `url()` table function and the `URL` table engine ([Alexander Sapin](https://github.com/ClickHouse/ClickHouse/pull/2501)). +- Added the `windowFunnel` aggregate function ([sundy-li](https://github.com/ClickHouse/ClickHouse/pull/2352)). +- New `startsWith` and `endsWith` functions for strings ([Vadim Plakhtinsky](https://github.com/ClickHouse/ClickHouse/pull/2429)). +- The `numbers()` table function now allows you to specify the offset ([Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2535)). +- The password to `clickhouse-client` can be entered interactively. +- Server logs can now be sent to syslog ([Alexander Krasheninnikov](https://github.com/ClickHouse/ClickHouse/pull/2459)). +- Support for logging in dictionaries with a shared library source ([Alexander Sapin](https://github.com/ClickHouse/ClickHouse/pull/2472)). +- Support for custom CSV delimiters ([Ivan Zhukov](https://github.com/ClickHouse/ClickHouse/pull/2263)) +- Added the `date_time_input_format` setting. If you switch this setting to `'best_effort'`, DateTime values will be read in a wide range of formats. +- Added the `clickhouse-obfuscator` utility for data obfuscation. Usage example: publishing data used in performance tests. + +#### Experimental features: {#experimental-features-2} + +- Added the ability to calculate `and` arguments only where they are needed ([Anastasia Tsarkova](https://github.com/ClickHouse/ClickHouse/pull/2272)) +- JIT compilation to native code is now available for some expressions ([pyos](https://github.com/ClickHouse/ClickHouse/pull/2277)). + +#### Bug fixes: {#bug-fixes-20} + +- Duplicates no longer appear for a query with `DISTINCT` and `ORDER BY`. +- Queries with `ARRAY JOIN` and `arrayFilter` no longer return an incorrect result. +- Fixed an error when reading an array column from a Nested structure ([\#2066](https://github.com/ClickHouse/ClickHouse/issues/2066)). +- Fixed an error when analyzing queries with a HAVING clause like `HAVING tuple IN (...)`. +- Fixed an error when analyzing queries with recursive aliases. +- Fixed an error when reading from ReplacingMergeTree with a condition in PREWHERE that filters all rows ([\#2525](https://github.com/ClickHouse/ClickHouse/issues/2525)). +- User profile settings were not applied when using sessions in the HTTP interface. +- Fixed how settings are applied from the command line parameters in clickhouse-local. +- The ZooKeeper client library now uses the session timeout received from the server. 
+- Fixed a bug in the ZooKeeper client library when the client waited for the server response longer than the timeout. +- Fixed pruning of parts for queries with conditions on partition key columns ([\#2342](https://github.com/ClickHouse/ClickHouse/issues/2342)). +- Merges are now possible after `CLEAR COLUMN IN PARTITION` ([\#2315](https://github.com/ClickHouse/ClickHouse/issues/2315)). +- Type mapping in the ODBC table function has been fixed ([sundy-li](https://github.com/ClickHouse/ClickHouse/pull/2268)). +- Type comparisons have been fixed for `DateTime` with and without the time zone ([Alexander Bocharov](https://github.com/ClickHouse/ClickHouse/pull/2400)). +- Fixed syntactic parsing and formatting of the `CAST` operator. +- Fixed insertion into a materialized view for the Distributed table engine ([Babacar Diassé](https://github.com/ClickHouse/ClickHouse/pull/2411)). +- Fixed a race condition when writing data from the `Kafka` engine to materialized views ([Yangkuan Liu](https://github.com/ClickHouse/ClickHouse/pull/2448)). +- Fixed SSRF in the remote() table function. +- Fixed exit behavior of `clickhouse-client` in multiline mode ([\#2510](https://github.com/ClickHouse/ClickHouse/issues/2510)). + +#### Improvements: {#improvements-11} + +- Background tasks in replicated tables are now performed in a thread pool instead of in separate threads ([Silviu Caragea](https://github.com/ClickHouse/ClickHouse/pull/1722)). +- Improved LZ4 compression performance. +- Faster analysis for queries with a large number of JOINs and sub-queries. +- The DNS cache is now updated automatically when there are too many network errors. +- Table inserts no longer occur if the insert into one of the materialized views is not possible because it has too many parts. +- Corrected the discrepancy in the event counters `Query`, `SelectQuery`, and `InsertQuery`. +- Expressions like `tuple IN (SELECT tuple)` are allowed if the tuple types match. +- A server with replicated tables can start even if you haven’t configured ZooKeeper. +- When calculating the number of available CPU cores, limits on cgroups are now taken into account ([Atri Sharma](https://github.com/ClickHouse/ClickHouse/pull/2325)). +- Added chown for config directories in the systemd config file ([Mikhail Shiryaev](https://github.com/ClickHouse/ClickHouse/pull/2421)). + +#### Build changes: {#build-changes-4} + +- The gcc8 compiler can be used for builds. +- Added the ability to build llvm from submodule. +- The version of the librdkafka library has been updated to v0.11.4. +- Added the ability to use the system libcpuid library. The library version has been updated to 0.4.0. +- Fixed the build using the vectorclass library ([Babacar Diassé](https://github.com/ClickHouse/ClickHouse/pull/2274)). +- Cmake now generates files for ninja by default (like when using `-G Ninja`). +- Added the ability to use the libtinfo library instead of libtermcap ([Georgy Kondratiev](https://github.com/ClickHouse/ClickHouse/pull/2519)). +- Fixed a header file conflict in Fedora Rawhide ([\#2520](https://github.com/ClickHouse/ClickHouse/issues/2520)). + +#### Backward incompatible changes: {#backward-incompatible-changes-7} + +- Removed escaping in `Vertical` and `Pretty*` formats and deleted the `VerticalRaw` format. 
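+  For example (the string literal is arbitrary), special characters in values are now printed as-is rather than escaped in these formats:
+
+  ```sql
+  SELECT 'first line\nsecond line' AS s
+  FORMAT Vertical;
+  ```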
+- If servers with version 1.1.54388 (or newer) and servers with an older version are used simultaneously in a distributed query and the query has the `cast(x, 'Type')` expression without the `AS` keyword and doesn’t have the word `cast` in uppercase, an exception will be thrown with a message like `Not found column cast(0, 'UInt8') in block`. Solution: Update the server on the entire cluster. + +### ClickHouse release 1.1.54385, 2018-06-01 {#clickhouse-release-1-1-54385-2018-06-01} + +#### Bug fixes: {#bug-fixes-21} + +- Fixed an error that in some cases caused ZooKeeper operations to block. + +### ClickHouse release 1.1.54383, 2018-05-22 {#clickhouse-release-1-1-54383-2018-05-22} + +#### Bug fixes: {#bug-fixes-22} + +- Fixed a slowdown of the replication queue if a table has many replicas. + +### ClickHouse release 1.1.54381, 2018-05-14 {#clickhouse-release-1-1-54381-2018-05-14} + +#### Bug fixes: {#bug-fixes-23} + +- Fixed a node leak in ZooKeeper when ClickHouse loses the connection to the ZooKeeper server. + +### ClickHouse release 1.1.54380, 2018-04-21 {#clickhouse-release-1-1-54380-2018-04-21} + +#### New features: {#new-features-13} + +- Added the table function `file(path, format, structure)`. An example reading bytes from `/dev/urandom`: first run `ln -s /dev/urandom /var/lib/clickhouse/user_files/random`, then `clickhouse-client -q "SELECT * FROM file('random', 'RowBinary', 'd UInt8') LIMIT 10"`. + +#### Improvements: {#improvements-12} + +- Subqueries can be wrapped in `()` brackets to enhance query readability. For example: `(SELECT 1) UNION ALL (SELECT 1)`. +- Simple `SELECT` queries from the `system.processes` table are not included in the `max_concurrent_queries` limit. + +#### Bug fixes: {#bug-fixes-24} + +- Fixed incorrect behavior of the `IN` operator when selecting from a `MATERIALIZED VIEW`. +- Fixed incorrect filtering by partition index in expressions like `partition_key_column IN (...)`. +- Fixed the inability to execute an `OPTIMIZE` query on a non-leader replica if `RENAME` was performed on the table. +- Fixed the authorization error when executing `OPTIMIZE` or `ALTER` queries on a non-leader replica. +- Fixed freezing of `KILL QUERY`. +- Fixed an error in the ZooKeeper client library which led to loss of watches, freezing of the distributed DDL queue, and slowdowns in the replication queue if a non-empty `chroot` prefix is used in the ZooKeeper configuration. + +#### Backward incompatible changes: {#backward-incompatible-changes-8} + +- Removed support for expressions like `(a, b) IN (SELECT (a, b))` (you can use the equivalent expression `(a, b) IN (SELECT a, b)`). In previous releases, these expressions led to undetermined `WHERE` filtering or caused errors. + +### ClickHouse release 1.1.54378, 2018-04-16 {#clickhouse-release-1-1-54378-2018-04-16} + +#### New features: {#new-features-14} + +- Logging level can be changed without restarting the server. +- Added the `SHOW CREATE DATABASE` query. +- The `query_id` can be passed to `clickhouse-client` (elBroom). +- New setting: `max_network_bandwidth_for_all_users`. +- Added support for `ALTER TABLE ... PARTITION ...` for `MATERIALIZED VIEW`. +- Added information about the size of data parts in uncompressed form in the system table. +- Server-to-server encryption support for distributed tables (`1` in the replica config in ``).
+- Configuration of the table level for the `ReplicatedMergeTree` family in order to minimize the amount of data stored in Zookeeper: : `use_minimalistic_checksums_in_zookeeper = 1` +- Configuration of the `clickhouse-client` prompt. By default, server names are now output to the prompt. The server’s display name can be changed. It’s also sent in the `X-ClickHouse-Display-Name` HTTP header (Kirill Shvakov). +- Multiple comma-separated `topics` can be specified for the `Kafka` engine (Tobias Adamson) +- When a query is stopped by `KILL QUERY` or `replace_running_query`, the client receives the `Query was canceled` exception instead of an incomplete result. + +#### Improvements: {#improvements-13} + +- `ALTER TABLE ... DROP/DETACH PARTITION` queries are run at the front of the replication queue. +- `SELECT ... FINAL` and `OPTIMIZE ... FINAL` can be used even when the table has a single data part. +- A `query_log` table is recreated on the fly if it was deleted manually (Kirill Shvakov). +- The `lengthUTF8` function runs faster (zhang2014). +- Improved performance of synchronous inserts in `Distributed` tables (`insert_distributed_sync = 1`) when there is a very large number of shards. +- The server accepts the `send_timeout` and `receive_timeout` settings from the client and applies them when connecting to the client (they are applied in reverse order: the server socket’s `send_timeout` is set to the `receive_timeout` value received from the client, and vice versa). +- More robust crash recovery for asynchronous insertion into `Distributed` tables. +- The return type of the `countEqual` function changed from `UInt32` to `UInt64` (谢磊). + +#### Bug fixes: {#bug-fixes-25} + +- Fixed an error with `IN` when the left side of the expression is `Nullable`. +- Correct results are now returned when using tuples with `IN` when some of the tuple components are in the table index. +- The `max_execution_time` limit now works correctly with distributed queries. +- Fixed errors when calculating the size of composite columns in the `system.columns` table. +- Fixed an error when creating a temporary table `CREATE TEMPORARY TABLE IF NOT EXISTS.` +- Fixed errors in `StorageKafka` (\#\#2075) +- Fixed server crashes from invalid arguments of certain aggregate functions. +- Fixed the error that prevented the `DETACH DATABASE` query from stopping background tasks for `ReplicatedMergeTree` tables. +- `Too many parts` state is less likely to happen when inserting into aggregated materialized views (\#\#2084). +- Corrected recursive handling of substitutions in the config if a substitution must be followed by another substitution on the same level. +- Corrected the syntax in the metadata file when creating a `VIEW` that uses a query with `UNION ALL`. +- `SummingMergeTree` now works correctly for summation of nested data structures with a composite key. +- Fixed the possibility of a race condition when choosing the leader for `ReplicatedMergeTree` tables. + +#### Build changes: {#build-changes-5} + +- The build supports `ninja` instead of `make` and uses `ninja` by default for building releases. +- Renamed packages: `clickhouse-server-base` in `clickhouse-common-static`; `clickhouse-server-common` in `clickhouse-server`; `clickhouse-common-dbg` in `clickhouse-common-static-dbg`. To install, use `clickhouse-server clickhouse-client`. Packages with the old names will still load in the repositories for backward compatibility. 
+ +#### Backward incompatible changes: {#backward-incompatible-changes-9} + +- Removed the special interpretation of an IN expression if an array is specified on the left side. Previously, the expression `arr IN (set)` was interpreted as “at least one `arr` element belongs to the `set`”. To get the same behavior in the new version, write `arrayExists(x -> x IN (set), arr)`. +- Disabled the incorrect use of the socket option `SO_REUSEPORT`, which was incorrectly enabled by default in the Poco library. Note that on Linux there is no longer any reason to simultaneously specify the addresses `::` and `0.0.0.0` for listen – use just `::`, which allows listening for connections over both IPv4 and IPv6 (with the default kernel config settings). You can also revert to the behavior from previous versions by specifying `1` in the config. + +### ClickHouse release 1.1.54370, 2018-03-16 {#clickhouse-release-1-1-54370-2018-03-16} + +#### New features: {#new-features-15} + +- Added the `system.macros` table and auto updating of macros when the config file is changed. +- Added the `SYSTEM RELOAD CONFIG` query. +- Added the `maxIntersections(left_col, right_col)` aggregate function, which returns the maximum number of simultaneously intersecting intervals `[left; right]`. The `maxIntersectionsPosition(left, right)` function returns the beginning of the “maximum” interval. ([Michael Furmur](https://github.com/ClickHouse/ClickHouse/pull/2012)). + +#### Improvements: {#improvements-14} + +- When inserting data in a `Replicated` table, fewer requests are made to `ZooKeeper` (and most of the user-level errors have disappeared from the `ZooKeeper` log). +- Added the ability to create aliases for data sets. Example: `WITH (1, 2, 3) AS set SELECT number IN set FROM system.numbers LIMIT 10`. + +#### Bug fixes: {#bug-fixes-26} + +- Fixed the `Illegal PREWHERE` error when reading from Merge tables for `Distributed` tables. +- Added fixes that allow you to start clickhouse-server in IPv4-only Docker containers. +- Fixed a race condition when reading from the `system.parts_columns` system table. +- Removed double buffering during a synchronous insert to a `Distributed` table, which could have caused the connection to time out. +- Fixed a bug that caused excessively long waits for an unavailable replica before beginning a `SELECT` query. +- Fixed incorrect dates in the `system.parts` table. +- Fixed a bug that made it impossible to insert data in a `Replicated` table if `chroot` was non-empty in the configuration of the `ZooKeeper` cluster. +- Fixed the vertical merging algorithm for an empty `ORDER BY` table. +- Restored the ability to use dictionaries in queries to remote tables, even if these dictionaries are not present on the requestor server. This functionality was lost in release 1.1.54362. +- Restored the behavior for queries like `SELECT * FROM remote('server2', default.table) WHERE col IN (SELECT col2 FROM default.table)` when the right side of the `IN` should use a remote `default.table` instead of a local one. This behavior was broken in version 1.1.54358. +- Removed extraneous error-level logging of `Not found column ... in block`. + +### Clickhouse Release 1.1.54362, 2018-03-11 {#clickhouse-release-1-1-54362-2018-03-11} + +#### New features: {#new-features-16} + +- Aggregation without `GROUP BY` for an empty set (such as `SELECT count(*) FROM table WHERE 0`) now returns a result with one row with null values for aggregate functions, in compliance with the SQL standard.
To restore the old behavior (return an empty result), set `empty_result_for_aggregation_by_empty_set` to 1. +- Added type conversion for `UNION ALL`. Different alias names are allowed in `SELECT` positions in `UNION ALL`, in compliance with the SQL standard. +- Arbitrary expressions are supported in `LIMIT BY` clauses. Previously, it was only possible to use columns resulting from `SELECT`. +- An index of `MergeTree` tables is used when `IN` is applied to a tuple of expressions from the columns of the primary key. Example: `WHERE (UserID, EventDate) IN ((123, '2000-01-01'), ...)` (Anastasiya Tsarkova). +- Added the `clickhouse-copier` tool for copying between clusters and resharding data (beta). +- Added consistent hashing functions: `yandexConsistentHash`, `jumpConsistentHash`, `sumburConsistentHash`. They can be used as a sharding key in order to reduce the amount of network traffic during subsequent reshardings. +- Added functions: `arrayAny`, `arrayAll`, `hasAny`, `hasAll`, `arrayIntersect`, `arrayResize`. +- Added the `arrayCumSum` function (Javi Santana). +- Added the `parseDateTimeBestEffort`, `parseDateTimeBestEffortOrZero`, and `parseDateTimeBestEffortOrNull` functions to read the DateTime from a string containing text in a wide variety of possible formats. +- Data can be partially reloaded from external dictionaries during updating (load just the records in which the value of the specified field is greater than in the previous download) (Arsen Hakobyan). +- Added the `cluster` table function. Example: `cluster(cluster_name, db, table)`. The `remote` table function can accept the cluster name as the first argument, if it is specified as an identifier. +- The `remote` and `cluster` table functions can be used in `INSERT` queries. +- Added the `create_table_query` and `engine_full` virtual columns to the `system.tables` table. The `metadata_modification_time` column is virtual. +- Added the `data_path` and `metadata_path` columns to the `system.tables` and `system.databases` tables, and added the `path` column to the `system.parts` and `system.parts_columns` tables. +- Added additional information about merges in the `system.part_log` table. +- An arbitrary partitioning key can be used for the `system.query_log` table (Kirill Shvakov). +- The `SHOW TABLES` query now also shows temporary tables. Added temporary tables and the `is_temporary` column to `system.tables` (zhang2014). +- Added `DROP TEMPORARY TABLE` and `EXISTS TEMPORARY TABLE` queries (zhang2014). +- Support for `SHOW CREATE TABLE` for temporary tables (zhang2014). +- Added the `system_profile` configuration parameter for the settings used by internal processes. +- Support for loading `object_id` as an attribute in `MongoDB` dictionaries (Pavel Litvinenko). +- Reading `null` as the default value when loading data for an external dictionary with the `MongoDB` source (Pavel Litvinenko). +- Reading `DateTime` values in the `Values` format from a Unix timestamp without single quotes. +- Failover is supported in `remote` table functions for cases when some of the replicas are missing the requested table. +- Configuration settings can be overridden in the command line when you run `clickhouse-server`. Example: `clickhouse-server -- --logger.level=information`. +- Implemented the `empty` function from a `FixedString` argument: the function returns 1 if the string consists entirely of null bytes (zhang2014).
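+  A small sketch of the behavior described above:
+
+  ```sql
+  SELECT
+      empty(toFixedString('', 3))   AS all_null_bytes, -- 1
+      empty(toFixedString('ab', 3)) AS has_data;       -- 0
+  ```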
+- Added the `listen_try` configuration parameter for listening to at least one of the listen addresses without quitting, if some of the addresses can’t be listened to (useful for systems with disabled support for IPv4 or IPv6).
+- Added the `VersionedCollapsingMergeTree` table engine.
+- Support for rows and arbitrary numeric types for the `library` dictionary source.
+- `MergeTree` tables can be used without a primary key (you need to specify `ORDER BY tuple()`).
+- A `Nullable` type can be `CAST` to a non-`Nullable` type if the argument is not `NULL`.
+- `RENAME TABLE` can be performed for `VIEW`.
+- Added the `throwIf` function.
+- Added the `odbc_default_field_size` option, which allows you to extend the maximum size of the value loaded from an ODBC source (by default, it is 1024).
+- The `system.processes` table and `SHOW PROCESSLIST` now have the `is_cancelled` and `peak_memory_usage` columns.
+
+#### Improvements: {#improvements-15}
+
+- Limits and quotas on the result are no longer applied to intermediate data for `INSERT SELECT` queries or for `SELECT` subqueries.
+- Fewer false triggers of `force_restore_data` when checking the status of `Replicated` tables when the server starts.
+- Added the `allow_distributed_ddl` option.
+- Nondeterministic functions are not allowed in expressions for `MergeTree` table keys.
+- Files with substitutions from `config.d` directories are loaded in alphabetical order.
+- Improved performance of the `arrayElement` function in the case of a constant multidimensional array with an empty array as one of the elements. Example: `[[1], []][x]`.
+- The server starts faster now when using configuration files with very large substitutions (for instance, very large lists of IP networks).
+- When running a query, table valued functions run once. Previously, `remote` and `mysql` table valued functions performed the same query twice to retrieve the table structure from a remote server.
+- The `MkDocs` documentation generator is used.
+- When you try to delete a table column that `DEFAULT`/`MATERIALIZED` expressions of other columns depend on, an exception is thrown (zhang2014).
+- Added the ability to parse an empty line in text formats as the number 0 for `Float` data types. This feature was previously available but was lost in release 1.1.54342.
+- `Enum` values can be used in `min`, `max`, `sum` and some other functions. In these cases, the corresponding numeric values are used. This feature was previously available but was lost in release 1.1.54337.
+- Added `max_expanded_ast_elements` to restrict the size of the AST after recursively expanding aliases.
+
+#### Bug fixes: {#bug-fixes-27}
+
+- Fixed cases when unnecessary columns were removed from subqueries in error, or not removed from subqueries containing `UNION ALL`.
+- Fixed a bug in merges for `ReplacingMergeTree` tables.
+- Fixed synchronous insertions in `Distributed` tables (`insert_distributed_sync = 1`).
+- Fixed segfault for certain uses of `FULL` and `RIGHT JOIN` with duplicate columns in subqueries.
+- Fixed segfault for certain uses of `replace_running_query` and `KILL QUERY`.
+- Fixed the order of the `source` and `last_exception` columns in the `system.dictionaries` table.
+- Fixed a bug when the `DROP DATABASE` query did not delete the file with metadata.
+- Fixed the `DROP DATABASE` query for `Dictionary` databases.
+- Fixed the low precision of `uniqHLL12` and `uniqCombined` functions for cardinalities greater than 100 million items (Alex Bocharov).
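+
+A minimal sketch of a `MergeTree` table without a primary key, as allowed by the `ORDER BY tuple()` form mentioned in the new features above (the table and column names are illustrative):
+
+``` sql
+CREATE TABLE unsorted_log
+(
+    event_date Date,
+    message String
+)
+ENGINE = MergeTree()
+PARTITION BY toYYYYMM(event_date)
+ORDER BY tuple()  -- no primary key / sort order
+```
+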
+- Fixed the calculation of implicit default values when necessary to simultaneously calculate default explicit expressions in `INSERT` queries (zhang2014). +- Fixed a rare case when a query to a `MergeTree` table couldn’t finish (chenxing-xc). +- Fixed a crash that occurred when running a `CHECK` query for `Distributed` tables if all shards are local (chenxing.xc). +- Fixed a slight performance regression with functions that use regular expressions. +- Fixed a performance regression when creating multidimensional arrays from complex expressions. +- Fixed a bug that could cause an extra `FORMAT` section to appear in an `.sql` file with metadata. +- Fixed a bug that caused the `max_table_size_to_drop` limit to apply when trying to delete a `MATERIALIZED VIEW` looking at an explicitly specified table. +- Fixed incompatibility with old clients (old clients were sometimes sent data with the `DateTime('timezone')` type, which they do not understand). +- Fixed a bug when reading `Nested` column elements of structures that were added using `ALTER` but that are empty for the old partitions, when the conditions for these columns moved to `PREWHERE`. +- Fixed a bug when filtering tables by virtual `_table` columns in queries to `Merge` tables. +- Fixed a bug when using `ALIAS` columns in `Distributed` tables. +- Fixed a bug that made dynamic compilation impossible for queries with aggregate functions from the `quantile` family. +- Fixed a race condition in the query execution pipeline that occurred in very rare cases when using `Merge` tables with a large number of tables, and when using `GLOBAL` subqueries. +- Fixed a crash when passing arrays of different sizes to an `arrayReduce` function when using aggregate functions from multiple arguments. +- Prohibited the use of queries with `UNION ALL` in a `MATERIALIZED VIEW`. +- Fixed an error during initialization of the `part_log` system table when the server starts (by default, `part_log` is disabled). + +#### Backward incompatible changes: {#backward-incompatible-changes-10} + +- Removed the `distributed_ddl_allow_replicated_alter` option. This behavior is enabled by default. +- Removed the `strict_insert_defaults` setting. If you were using this functionality, write to `clickhouse-feedback@yandex-team.com`. +- Removed the `UnsortedMergeTree` engine. + +### Clickhouse Release 1.1.54343, 2018-02-05 {#clickhouse-release-1-1-54343-2018-02-05} + +- Added macros support for defining cluster names in distributed DDL queries and constructors of Distributed tables: `CREATE TABLE distr ON CLUSTER '{cluster}' (...) ENGINE = Distributed('{cluster}', 'db', 'table')`. +- Now queries like `SELECT ... FROM table WHERE expr IN (subquery)` are processed using the `table` index. +- Improved processing of duplicates when inserting to Replicated tables, so they no longer slow down execution of the replication queue. + +### Clickhouse Release 1.1.54342, 2018-01-22 {#clickhouse-release-1-1-54342-2018-01-22} This release contains bug fixes for the previous release 1.1.54337: -* Fixed a regression in 1.1.54337: if the default user has readonly access, then the server refuses to start up with the message `Cannot create database in readonly mode`. -* Fixed a regression in 1.1.54337: on systems with systemd, logs are always written to syslog regardless of the configuration; the watchdog script still uses init.d. -* Fixed a regression in 1.1.54337: wrong default configuration in the Docker image. 
-* Fixed nondeterministic behavior of GraphiteMergeTree (you can see it in log messages `Data after merge is not byte-identical to the data on another replicas`). -* Fixed a bug that may lead to inconsistent merges after OPTIMIZE query to Replicated tables (you may see it in log messages `Part ... intersects the previous part`). -* Buffer tables now work correctly when MATERIALIZED columns are present in the destination table (by zhang2014). -* Fixed a bug in implementation of NULL. +- Fixed a regression in 1.1.54337: if the default user has readonly access, then the server refuses to start up with the message `Cannot create database in readonly mode`. +- Fixed a regression in 1.1.54337: on systems with systemd, logs are always written to syslog regardless of the configuration; the watchdog script still uses init.d. +- Fixed a regression in 1.1.54337: wrong default configuration in the Docker image. +- Fixed nondeterministic behavior of GraphiteMergeTree (you can see it in log messages `Data after merge is not byte-identical to the data on another replicas`). +- Fixed a bug that may lead to inconsistent merges after OPTIMIZE query to Replicated tables (you may see it in log messages `Part ... intersects the previous part`). +- Buffer tables now work correctly when MATERIALIZED columns are present in the destination table (by zhang2014). +- Fixed a bug in implementation of NULL. -### Clickhouse Release 1.1.54337, 2018-01-18 +### Clickhouse Release 1.1.54337, 2018-01-18 {#clickhouse-release-1-1-54337-2018-01-18} -#### New features: +#### New features: {#new-features-17} -* Added support for storage of multi-dimensional arrays and tuples (`Tuple` data type) in tables. -* Support for table functions for `DESCRIBE` and `INSERT` queries. Added support for subqueries in `DESCRIBE`. Examples: `DESC TABLE remote('host', default.hits)`; `DESC TABLE (SELECT 1)`; `INSERT INTO TABLE FUNCTION remote('host', default.hits)`. Support for `INSERT INTO TABLE` in addition to `INSERT INTO`. -* Improved support for time zones. The `DateTime` data type can be annotated with the timezone that is used for parsing and formatting in text formats. Example: `DateTime('Europe/Moscow')`. When timezones are specified in functions for `DateTime` arguments, the return type will track the timezone, and the value will be displayed as expected. -* Added the functions `toTimeZone`, `timeDiff`, `toQuarter`, `toRelativeQuarterNum`. The `toRelativeHour`/`Minute`/`Second` functions can take a value of type `Date` as an argument. The `now` function name is case-sensitive. -* Added the `toStartOfFifteenMinutes` function (Kirill Shvakov). -* Added the `clickhouse format` tool for formatting queries. -* Added the `format_schema_path` configuration parameter (Marek Vavruşa). It is used for specifying a schema in `Cap'n Proto` format. Schema files can be located only in the specified directory. -* Added support for config substitutions (`incl` and `conf.d`) for configuration of external dictionaries and models (Pavel Yakunin). -* Added a column with documentation for the `system.settings` table (Kirill Shvakov). -* Added the `system.parts_columns` table with information about column sizes in each data part of `MergeTree` tables. -* Added the `system.models` table with information about loaded `CatBoost` machine learning models. -* Added the `mysql` and `odbc` table function and corresponding `MySQL` and `ODBC` table engines for accessing remote databases. This functionality is in the beta stage. 
-* Added the possibility to pass an argument of type `AggregateFunction` for the `groupArray` aggregate function (so you can create an array of states of some aggregate function). -* Removed restrictions on various combinations of aggregate function combinators. For example, you can use `avgForEachIf` as well as `avgIfForEach` aggregate functions, which have different behaviors. -* The `-ForEach` aggregate function combinator is extended for the case of aggregate functions of multiple arguments. -* Added support for aggregate functions of `Nullable` arguments even for cases when the function returns a non-`Nullable` result (added with the contribution of Silviu Caragea). Example: `groupArray`, `groupUniqArray`, `topK`. -* Added the `max_client_network_bandwidth` for `clickhouse-client` (Kirill Shvakov). -* Users with the ` readonly = 2` setting are allowed to work with TEMPORARY tables (CREATE, DROP, INSERT...) (Kirill Shvakov). -* Added support for using multiple consumers with the `Kafka` engine. Extended configuration options for `Kafka` (Marek Vavruša). -* Added the `intExp3` and `intExp4` functions. -* Added the `sumKahan` aggregate function. -* Added the to * Number* OrNull functions, where * Number* is a numeric type. -* Added support for `WITH` clauses for an `INSERT SELECT` query (author: zhang2014). -* Added settings: `http_connection_timeout`, `http_send_timeout`, `http_receive_timeout`. In particular, these settings are used for downloading data parts for replication. Changing these settings allows for faster failover if the network is overloaded. -* Added support for `ALTER` for tables of type `Null` (Anastasiya Tsarkova). -* The `reinterpretAsString` function is extended for all data types that are stored contiguously in memory. -* Added the `--silent` option for the `clickhouse-local` tool. It suppresses printing query execution info in stderr. -* Added support for reading values of type `Date` from text in a format where the month and/or day of the month is specified using a single digit instead of two digits (Amos Bird). +- Added support for storage of multi-dimensional arrays and tuples (`Tuple` data type) in tables. +- Support for table functions for `DESCRIBE` and `INSERT` queries. Added support for subqueries in `DESCRIBE`. Examples: `DESC TABLE remote('host', default.hits)`; `DESC TABLE (SELECT 1)`; `INSERT INTO TABLE FUNCTION remote('host', default.hits)`. Support for `INSERT INTO TABLE` in addition to `INSERT INTO`. +- Improved support for time zones. The `DateTime` data type can be annotated with the timezone that is used for parsing and formatting in text formats. Example: `DateTime('Europe/Moscow')`. When timezones are specified in functions for `DateTime` arguments, the return type will track the timezone, and the value will be displayed as expected. +- Added the functions `toTimeZone`, `timeDiff`, `toQuarter`, `toRelativeQuarterNum`. The `toRelativeHour`/`Minute`/`Second` functions can take a value of type `Date` as an argument. The `now` function name is case-sensitive. +- Added the `toStartOfFifteenMinutes` function (Kirill Shvakov). +- Added the `clickhouse format` tool for formatting queries. +- Added the `format_schema_path` configuration parameter (Marek Vavruşa). It is used for specifying a schema in `Cap'n Proto` format. Schema files can be located only in the specified directory. +- Added support for config substitutions (`incl` and `conf.d`) for configuration of external dictionaries and models (Pavel Yakunin). 
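+
+A minimal sketch of the time-zone-annotated `DateTime` type and the `toTimeZone` function described above (the table and column names are illustrative):
+
+``` sql
+CREATE TABLE events
+(
+    t DateTime('Europe/Moscow')  -- parsed and formatted in this time zone in text formats
+)
+ENGINE = MergeTree()
+ORDER BY t;
+
+SELECT toTimeZone(now(), 'Europe/Moscow');
+```
+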
+- Added a column with documentation for the `system.settings` table (Kirill Shvakov). +- Added the `system.parts_columns` table with information about column sizes in each data part of `MergeTree` tables. +- Added the `system.models` table with information about loaded `CatBoost` machine learning models. +- Added the `mysql` and `odbc` table function and corresponding `MySQL` and `ODBC` table engines for accessing remote databases. This functionality is in the beta stage. +- Added the possibility to pass an argument of type `AggregateFunction` for the `groupArray` aggregate function (so you can create an array of states of some aggregate function). +- Removed restrictions on various combinations of aggregate function combinators. For example, you can use `avgForEachIf` as well as `avgIfForEach` aggregate functions, which have different behaviors. +- The `-ForEach` aggregate function combinator is extended for the case of aggregate functions of multiple arguments. +- Added support for aggregate functions of `Nullable` arguments even for cases when the function returns a non-`Nullable` result (added with the contribution of Silviu Caragea). Example: `groupArray`, `groupUniqArray`, `topK`. +- Added the `max_client_network_bandwidth` for `clickhouse-client` (Kirill Shvakov). +- Users with the `readonly = 2` setting are allowed to work with TEMPORARY tables (CREATE, DROP, INSERT…) (Kirill Shvakov). +- Added support for using multiple consumers with the `Kafka` engine. Extended configuration options for `Kafka` (Marek Vavruša). +- Added the `intExp3` and `intExp4` functions. +- Added the `sumKahan` aggregate function. +- Added the to \* Number\* OrNull functions, where \* Number\* is a numeric type. +- Added support for `WITH` clauses for an `INSERT SELECT` query (author: zhang2014). +- Added settings: `http_connection_timeout`, `http_send_timeout`, `http_receive_timeout`. In particular, these settings are used for downloading data parts for replication. Changing these settings allows for faster failover if the network is overloaded. +- Added support for `ALTER` for tables of type `Null` (Anastasiya Tsarkova). +- The `reinterpretAsString` function is extended for all data types that are stored contiguously in memory. +- Added the `--silent` option for the `clickhouse-local` tool. It suppresses printing query execution info in stderr. +- Added support for reading values of type `Date` from text in a format where the month and/or day of the month is specified using a single digit instead of two digits (Amos Bird). -#### Performance optimizations: +#### Performance optimizations: {#performance-optimizations} -* Improved performance of aggregate functions `min`, `max`, `any`, `anyLast`, `anyHeavy`, `argMin`, `argMax` from string arguments. -* Improved performance of the functions `isInfinite`, `isFinite`, `isNaN`, `roundToExp2`. -* Improved performance of parsing and formatting `Date` and `DateTime` type values in text format. -* Improved performance and precision of parsing floating point numbers. -* Lowered memory usage for `JOIN` in the case when the left and right parts have columns with identical names that are not contained in `USING` . -* Improved performance of aggregate functions `varSamp`, `varPop`, `stddevSamp`, `stddevPop`, `covarSamp`, `covarPop`, `corr` by reducing computational stability. The old functions are available under the names `varSampStable`, `varPopStable`, `stddevSampStable`, `stddevPopStable`, `covarSampStable`, `covarPopStable`, `corrStable`. 
+- Improved performance of aggregate functions `min`, `max`, `any`, `anyLast`, `anyHeavy`, `argMin`, `argMax` from string arguments. +- Improved performance of the functions `isInfinite`, `isFinite`, `isNaN`, `roundToExp2`. +- Improved performance of parsing and formatting `Date` and `DateTime` type values in text format. +- Improved performance and precision of parsing floating point numbers. +- Lowered memory usage for `JOIN` in the case when the left and right parts have columns with identical names that are not contained in `USING` . +- Improved performance of aggregate functions `varSamp`, `varPop`, `stddevSamp`, `stddevPop`, `covarSamp`, `covarPop`, `corr` by reducing computational stability. The old functions are available under the names `varSampStable`, `varPopStable`, `stddevSampStable`, `stddevPopStable`, `covarSampStable`, `covarPopStable`, `corrStable`. -#### Bug fixes: +#### Bug fixes: {#bug-fixes-28} -* Fixed data deduplication after running a `DROP` or `DETACH PARTITION` query. In the previous version, dropping a partition and inserting the same data again was not working because inserted blocks were considered duplicates. -* Fixed a bug that could lead to incorrect interpretation of the `WHERE` clause for ` CREATE MATERIALIZED VIEW` queries with `POPULATE` . -* Fixed a bug in using the `root_path` parameter in the `zookeeper_servers` configuration. -* Fixed unexpected results of passing the `Date` argument to `toStartOfDay` . -* Fixed the `addMonths` and `subtractMonths` functions and the arithmetic for ` INTERVAL n MONTH` in cases when the result has the previous year. -* Added missing support for the `UUID` data type for `DISTINCT` , `JOIN` , and `uniq` aggregate functions and external dictionaries (Evgeniy Ivanov). Support for `UUID` is still incomplete. -* Fixed `SummingMergeTree` behavior in cases when the rows summed to zero. -* Various fixes for the `Kafka` engine (Marek Vavruša). -* Fixed incorrect behavior of the `Join` table engine (Amos Bird). -* Fixed incorrect allocator behavior under FreeBSD and OS X. -* The `extractAll` function now supports empty matches. -* Fixed an error that blocked usage of `libressl` instead of `openssl` . -* Fixed the ` CREATE TABLE AS SELECT` query from temporary tables. -* Fixed non-atomicity of updating the replication queue. This could lead to replicas being out of sync until the server restarts. -* Fixed possible overflow in `gcd` , `lcm` and `modulo` (`%` operator) (Maks Skorokhod). -* `-preprocessed` files are now created after changing `umask` (`umask` can be changed in the config). -* Fixed a bug in the background check of parts (`MergeTreePartChecker` ) when using a custom partition key. -* Fixed parsing of tuples (values of the `Tuple` data type) in text formats. -* Improved error messages about incompatible types passed to `multiIf` , `array` and some other functions. -* Redesigned support for `Nullable` types. Fixed bugs that may lead to a server crash. Fixed almost all other bugs related to ` NULL` support: incorrect type conversions in INSERT SELECT, insufficient support for Nullable in HAVING and PREWHERE, `join_use_nulls` mode, Nullable types as arguments of `OR` operator, etc. -* Fixed various bugs related to internal semantics of data types. Examples: unnecessary summing of `Enum` type fields in `SummingMergeTree` ; alignment of `Enum` types in `Pretty` formats, etc. -* Stricter checks for allowed combinations of composite columns. 
-* Fixed the overflow when specifying a very large parameter for the `FixedString` data type. -* Fixed a bug in the `topK` aggregate function in a generic case. -* Added the missing check for equality of array sizes in arguments of n-ary variants of aggregate functions with an `-Array` combinator. -* Fixed a bug in `--pager` for `clickhouse-client` (author: ks1322). -* Fixed the precision of the `exp10` function. -* Fixed the behavior of the `visitParamExtract` function for better compliance with documentation. -* Fixed the crash when incorrect data types are specified. -* Fixed the behavior of `DISTINCT` in the case when all columns are constants. -* Fixed query formatting in the case of using the `tupleElement` function with a complex constant expression as the tuple element index. -* Fixed a bug in `Dictionary` tables for `range_hashed` dictionaries. -* Fixed a bug that leads to excessive rows in the result of `FULL` and ` RIGHT JOIN` (Amos Bird). -* Fixed a server crash when creating and removing temporary files in `config.d` directories during config reload. -* Fixed the ` SYSTEM DROP DNS CACHE` query: the cache was flushed but addresses of cluster nodes were not updated. -* Fixed the behavior of ` MATERIALIZED VIEW` after executing ` DETACH TABLE` for the table under the view (Marek Vavruša). +- Fixed data deduplication after running a `DROP` or `DETACH PARTITION` query. In the previous version, dropping a partition and inserting the same data again was not working because inserted blocks were considered duplicates. +- Fixed a bug that could lead to incorrect interpretation of the `WHERE` clause for `CREATE MATERIALIZED VIEW` queries with `POPULATE` . +- Fixed a bug in using the `root_path` parameter in the `zookeeper_servers` configuration. +- Fixed unexpected results of passing the `Date` argument to `toStartOfDay` . +- Fixed the `addMonths` and `subtractMonths` functions and the arithmetic for `INTERVAL n MONTH` in cases when the result has the previous year. +- Added missing support for the `UUID` data type for `DISTINCT` , `JOIN` , and `uniq` aggregate functions and external dictionaries (Evgeniy Ivanov). Support for `UUID` is still incomplete. +- Fixed `SummingMergeTree` behavior in cases when the rows summed to zero. +- Various fixes for the `Kafka` engine (Marek Vavruša). +- Fixed incorrect behavior of the `Join` table engine (Amos Bird). +- Fixed incorrect allocator behavior under FreeBSD and OS X. +- The `extractAll` function now supports empty matches. +- Fixed an error that blocked usage of `libressl` instead of `openssl` . +- Fixed the `CREATE TABLE AS SELECT` query from temporary tables. +- Fixed non-atomicity of updating the replication queue. This could lead to replicas being out of sync until the server restarts. +- Fixed possible overflow in `gcd` , `lcm` and `modulo` (`%` operator) (Maks Skorokhod). +- `-preprocessed` files are now created after changing `umask` (`umask` can be changed in the config). +- Fixed a bug in the background check of parts (`MergeTreePartChecker` ) when using a custom partition key. +- Fixed parsing of tuples (values of the `Tuple` data type) in text formats. +- Improved error messages about incompatible types passed to `multiIf` , `array` and some other functions. +- Redesigned support for `Nullable` types. Fixed bugs that may lead to a server crash. 
Fixed almost all other bugs related to `NULL` support: incorrect type conversions in INSERT SELECT, insufficient support for Nullable in HAVING and PREWHERE, `join_use_nulls` mode, Nullable types as arguments of `OR` operator, etc. +- Fixed various bugs related to internal semantics of data types. Examples: unnecessary summing of `Enum` type fields in `SummingMergeTree` ; alignment of `Enum` types in `Pretty` formats, etc. +- Stricter checks for allowed combinations of composite columns. +- Fixed the overflow when specifying a very large parameter for the `FixedString` data type. +- Fixed a bug in the `topK` aggregate function in a generic case. +- Added the missing check for equality of array sizes in arguments of n-ary variants of aggregate functions with an `-Array` combinator. +- Fixed a bug in `--pager` for `clickhouse-client` (author: ks1322). +- Fixed the precision of the `exp10` function. +- Fixed the behavior of the `visitParamExtract` function for better compliance with documentation. +- Fixed the crash when incorrect data types are specified. +- Fixed the behavior of `DISTINCT` in the case when all columns are constants. +- Fixed query formatting in the case of using the `tupleElement` function with a complex constant expression as the tuple element index. +- Fixed a bug in `Dictionary` tables for `range_hashed` dictionaries. +- Fixed a bug that leads to excessive rows in the result of `FULL` and `RIGHT JOIN` (Amos Bird). +- Fixed a server crash when creating and removing temporary files in `config.d` directories during config reload. +- Fixed the `SYSTEM DROP DNS CACHE` query: the cache was flushed but addresses of cluster nodes were not updated. +- Fixed the behavior of `MATERIALIZED VIEW` after executing `DETACH TABLE` for the table under the view (Marek Vavruša). -#### Build improvements: +#### Build improvements: {#build-improvements-4} -* The `pbuilder` tool is used for builds. The build process is almost completely independent of the build host environment. -* A single build is used for different OS versions. Packages and binaries have been made compatible with a wide range of Linux systems. -* Added the `clickhouse-test` package. It can be used to run functional tests. -* The source tarball can now be published to the repository. It can be used to reproduce the build without using GitHub. -* Added limited integration with Travis CI. Due to limits on build time in Travis, only the debug build is tested and a limited subset of tests are run. -* Added support for `Cap'n'Proto` in the default build. -* Changed the format of documentation sources from `Restricted Text` to `Markdown`. -* Added support for `systemd` (Vladimir Smirnov). It is disabled by default due to incompatibility with some OS images and can be enabled manually. -* For dynamic code generation, `clang` and `lld` are embedded into the `clickhouse` binary. They can also be invoked as ` clickhouse clang` and ` clickhouse lld` . -* Removed usage of GNU extensions from the code. Enabled the `-Wextra` option. When building with `clang` the default is `libc++` instead of `libstdc++`. -* Extracted `clickhouse_parsers` and `clickhouse_common_io` libraries to speed up builds of various tools. +- The `pbuilder` tool is used for builds. The build process is almost completely independent of the build host environment. +- A single build is used for different OS versions. Packages and binaries have been made compatible with a wide range of Linux systems. +- Added the `clickhouse-test` package. 
It can be used to run functional tests. +- The source tarball can now be published to the repository. It can be used to reproduce the build without using GitHub. +- Added limited integration with Travis CI. Due to limits on build time in Travis, only the debug build is tested and a limited subset of tests are run. +- Added support for `Cap'n'Proto` in the default build. +- Changed the format of documentation sources from `Restricted Text` to `Markdown`. +- Added support for `systemd` (Vladimir Smirnov). It is disabled by default due to incompatibility with some OS images and can be enabled manually. +- For dynamic code generation, `clang` and `lld` are embedded into the `clickhouse` binary. They can also be invoked as `clickhouse clang` and `clickhouse lld` . +- Removed usage of GNU extensions from the code. Enabled the `-Wextra` option. When building with `clang` the default is `libc++` instead of `libstdc++`. +- Extracted `clickhouse_parsers` and `clickhouse_common_io` libraries to speed up builds of various tools. -#### Backward incompatible changes: +#### Backward incompatible changes: {#backward-incompatible-changes-11} -* The format for marks in `Log` type tables that contain `Nullable` columns was changed in a backward incompatible way. If you have these tables, you should convert them to the `TinyLog` type before starting up the new server version. To do this, replace `ENGINE = Log` with `ENGINE = TinyLog` in the corresponding `.sql` file in the `metadata` directory. If your table doesn't have `Nullable` columns or if the type of your table is not `Log`, then you don't need to do anything. -* Removed the `experimental_allow_extended_storage_definition_syntax` setting. Now this feature is enabled by default. -* The `runningIncome` function was renamed to `runningDifferenceStartingWithFirstvalue` to avoid confusion. -* Removed the ` FROM ARRAY JOIN arr` syntax when ARRAY JOIN is specified directly after FROM with no table (Amos Bird). -* Removed the `BlockTabSeparated` format that was used solely for demonstration purposes. -* Changed the state format for aggregate functions `varSamp`, `varPop`, `stddevSamp`, `stddevPop`, `covarSamp`, `covarPop`, `corr`. If you have stored states of these aggregate functions in tables (using the `AggregateFunction` data type or materialized views with corresponding states), please write to clickhouse-feedback@yandex-team.com. -* In previous server versions there was an undocumented feature: if an aggregate function depends on parameters, you can still specify it without parameters in the AggregateFunction data type. Example: `AggregateFunction(quantiles, UInt64)` instead of `AggregateFunction(quantiles(0.5, 0.9), UInt64)`. This feature was lost. Although it was undocumented, we plan to support it again in future releases. -* Enum data types cannot be used in min/max aggregate functions. This ability will be returned in the next release. +- The format for marks in `Log` type tables that contain `Nullable` columns was changed in a backward incompatible way. If you have these tables, you should convert them to the `TinyLog` type before starting up the new server version. To do this, replace `ENGINE = Log` with `ENGINE = TinyLog` in the corresponding `.sql` file in the `metadata` directory. If your table doesn’t have `Nullable` columns or if the type of your table is not `Log`, then you don’t need to do anything. +- Removed the `experimental_allow_extended_storage_definition_syntax` setting. Now this feature is enabled by default. 
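+
+A hypothetical before/after for the `Log` → `TinyLog` conversion described above (the path, table name, and columns are illustrative; metadata files contain `ATTACH TABLE` statements):
+
+``` sql
+-- /var/lib/clickhouse/metadata/default/events.sql, before:
+ATTACH TABLE events (d Date, v Nullable(String)) ENGINE = Log
+-- after (only the engine name changes):
+ATTACH TABLE events (d Date, v Nullable(String)) ENGINE = TinyLog
+```
+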
+- The `runningIncome` function was renamed to `runningDifferenceStartingWithFirstvalue` to avoid confusion. +- Removed the `FROM ARRAY JOIN arr` syntax when ARRAY JOIN is specified directly after FROM with no table (Amos Bird). +- Removed the `BlockTabSeparated` format that was used solely for demonstration purposes. +- Changed the state format for aggregate functions `varSamp`, `varPop`, `stddevSamp`, `stddevPop`, `covarSamp`, `covarPop`, `corr`. If you have stored states of these aggregate functions in tables (using the `AggregateFunction` data type or materialized views with corresponding states), please write to clickhouse-feedback@yandex-team.com. +- In previous server versions there was an undocumented feature: if an aggregate function depends on parameters, you can still specify it without parameters in the AggregateFunction data type. Example: `AggregateFunction(quantiles, UInt64)` instead of `AggregateFunction(quantiles(0.5, 0.9), UInt64)`. This feature was lost. Although it was undocumented, we plan to support it again in future releases. +- Enum data types cannot be used in min/max aggregate functions. This ability will be returned in the next release. -#### Please note when upgrading: +#### Please note when upgrading: {#please-note-when-upgrading} -* When doing a rolling update on a cluster, at the point when some of the replicas are running the old version of ClickHouse and some are running the new version, replication is temporarily stopped and the message ` unknown parameter 'shard'` appears in the log. Replication will continue after all replicas of the cluster are updated. -* If different versions of ClickHouse are running on the cluster servers, it is possible that distributed queries using the following functions will have incorrect results: `varSamp`, `varPop`, `stddevSamp`, `stddevPop`, `covarSamp`, `covarPop`, `corr`. You should update all cluster nodes. +- When doing a rolling update on a cluster, at the point when some of the replicas are running the old version of ClickHouse and some are running the new version, replication is temporarily stopped and the message `unknown parameter 'shard'` appears in the log. Replication will continue after all replicas of the cluster are updated. +- If different versions of ClickHouse are running on the cluster servers, it is possible that distributed queries using the following functions will have incorrect results: `varSamp`, `varPop`, `stddevSamp`, `stddevPop`, `covarSamp`, `covarPop`, `corr`. You should update all cluster nodes. diff --git a/docs/en/changelog/2019.md b/docs/en/changelog/2019.md index def3e7baad0..7cc7d6f3bac 100644 --- a/docs/en/changelog/2019.md +++ b/docs/en/changelog/2019.md @@ -1,1945 +1,2061 @@ -## ClickHouse release v19.17 - -### ClickHouse release v19.17.6.36, 2019-12-27 - -#### Bug Fix -* Fixed potential buffer overflow in decompress. Malicious user can pass fabricated compressed data that could cause read after buffer. This issue was found by Eldar Zaitov from Yandex information security team. [#8404](https://github.com/ClickHouse/ClickHouse/pull/8404) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed possible server crash (`std::terminate`) when the server cannot send or write data in JSON or XML format with values of String data type (that require UTF-8 validation) or when compressing result data with Brotli algorithm or in some other rare cases. 
[#8384](https://github.com/ClickHouse/ClickHouse/pull/8384) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed dictionaries with source from a clickhouse `VIEW`, now reading such dictionaries doesn't cause the error `There is no query`. [#8351](https://github.com/ClickHouse/ClickHouse/pull/8351) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fixed checking if a client host is allowed by host_regexp specified in users.xml. [#8241](https://github.com/ClickHouse/ClickHouse/pull/8241), [#8342](https://github.com/ClickHouse/ClickHouse/pull/8342) ([Vitaly Baranov](https://github.com/vitlibar)) -* `RENAME TABLE` for a distributed table now renames the folder containing inserted data before sending to shards. This fixes an issue with successive renames `tableA->tableB`, `tableC->tableA`. [#8306](https://github.com/ClickHouse/ClickHouse/pull/8306) ([tavplubix](https://github.com/tavplubix)) -* `range_hashed` external dictionaries created by DDL queries now allow ranges of arbitrary numeric types. [#8275](https://github.com/ClickHouse/ClickHouse/pull/8275) ([alesapin](https://github.com/alesapin)) -* Fixed `INSERT INTO table SELECT ... FROM mysql(...)` table function. [#8234](https://github.com/ClickHouse/ClickHouse/pull/8234) ([tavplubix](https://github.com/tavplubix)) -* Fixed segfault in `INSERT INTO TABLE FUNCTION file()` while inserting into a file which doesn't exist. Now in this case file would be created and then insert would be processed. [#8177](https://github.com/ClickHouse/ClickHouse/pull/8177) ([Olga Khvostikova](https://github.com/stavrolia)) -* Fixed bitmapAnd error when intersecting an aggregated bitmap and a scalar bitmap. [#8082](https://github.com/ClickHouse/ClickHouse/pull/8082) ([Yue Huang](https://github.com/moon03432)) -* Fixed segfault when `EXISTS` query was used without `TABLE` or `DICTIONARY` qualifier, just like `EXISTS t`. [#8213](https://github.com/ClickHouse/ClickHouse/pull/8213) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed return type for functions `rand` and `randConstant` in case of nullable argument. Now functions always return `UInt32` and never `Nullable(UInt32)`. [#8204](https://github.com/ClickHouse/ClickHouse/pull/8204) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fixed `DROP DICTIONARY IF EXISTS db.dict`, now it doesn't throw exception if `db` doesn't exist. [#8185](https://github.com/ClickHouse/ClickHouse/pull/8185) ([Vitaly Baranov](https://github.com/vitlibar)) -* If a table wasn't completely dropped because of server crash, the server will try to restore and load it [#8176](https://github.com/ClickHouse/ClickHouse/pull/8176) ([tavplubix](https://github.com/tavplubix)) -* Fixed a trivial count query for a distributed table if there are more than two shard local table. [#8164](https://github.com/ClickHouse/ClickHouse/pull/8164) ([小路](https://github.com/nicelulu)) -* Fixed bug that lead to a data race in DB::BlockStreamProfileInfo::calculateRowsBeforeLimit() [#8143](https://github.com/ClickHouse/ClickHouse/pull/8143) ([Alexander Kazakov](https://github.com/Akazz)) -* Fixed `ALTER table MOVE part` executed immediately after merging the specified part, which could cause moving a part which the specified part merged into. Now it correctly moves the specified part. [#8104](https://github.com/ClickHouse/ClickHouse/pull/8104) ([Vladimir Chebotarev](https://github.com/excitoon)) -* Expressions for dictionaries can be specified as strings now. 
This is useful for calculation of attributes while extracting data from non-ClickHouse sources because it allows to use non-ClickHouse syntax for those expressions. [#8098](https://github.com/ClickHouse/ClickHouse/pull/8098) ([alesapin](https://github.com/alesapin)) -* Fixed a very rare race in `clickhouse-copier` because of an overflow in ZXid. [#8088](https://github.com/ClickHouse/ClickHouse/pull/8088) ([Ding Xiang Fei](https://github.com/dingxiangfei2009)) -* Fixed the bug when after the query failed (due to "Too many simultaneous queries" for example) it would not read external tables info, and the -next request would interpret this info as the beginning of the next query causing an error like `Unknown packet from client`. [#8084](https://github.com/ClickHouse/ClickHouse/pull/8084) ([Azat Khuzhin](https://github.com/azat)) -* Avoid null dereference after "Unknown packet X from server" [#8071](https://github.com/ClickHouse/ClickHouse/pull/8071) ([Azat Khuzhin](https://github.com/azat)) -* Restore support of all ICU locales, add the ability to apply collations for constant expressions and add language name to system.collations table. [#8051](https://github.com/ClickHouse/ClickHouse/pull/8051) ([alesapin](https://github.com/alesapin)) -* Number of streams for read from `StorageFile` and `StorageHDFS` is now limited, to avoid exceeding the memory limit. [#7981](https://github.com/ClickHouse/ClickHouse/pull/7981) ([alesapin](https://github.com/alesapin)) -* Fixed `CHECK TABLE` query for `*MergeTree` tables without key. [#7979](https://github.com/ClickHouse/ClickHouse/pull/7979) ([alesapin](https://github.com/alesapin)) -* Removed the mutation number from a part name in case there were no mutations. This removing improved the compatibility with older versions. [#8250](https://github.com/ClickHouse/ClickHouse/pull/8250) ([alesapin](https://github.com/alesapin)) -* Fixed the bug that mutations are skipped for some attached parts due to their data_version are larger than the table mutation version. [#7812](https://github.com/ClickHouse/ClickHouse/pull/7812) ([Zhichang Yu](https://github.com/yuzhichang)) -* Allow starting the server with redundant copies of parts after moving them to another device. [#7810](https://github.com/ClickHouse/ClickHouse/pull/7810) ([Vladimir Chebotarev](https://github.com/excitoon)) -* Fixed the error "Sizes of columns doesn't match" that might appear when using aggregate function columns. [#7790](https://github.com/ClickHouse/ClickHouse/pull/7790) ([Boris Granveaud](https://github.com/bgranvea)) -* Now an exception will be thrown in case of using WITH TIES alongside LIMIT BY. And now it's possible to use TOP with LIMIT BY. [#7637](https://github.com/ClickHouse/ClickHouse/pull/7637) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) -* Fix dictionary reload if it has `invalidate_query`, which stopped updates and some exception on previous update tries. [#8029](https://github.com/ClickHouse/ClickHouse/pull/8029) ([alesapin](https://github.com/alesapin)) - -### ClickHouse release v19.17.4.11, 2019-11-22 - -#### Backward Incompatible Change -* Using column instead of AST to store scalar subquery results for better performance. Setting `enable_scalar_subquery_optimization` was added in 19.17 and it was enabled by default. It leads to errors like [this](https://github.com/ClickHouse/ClickHouse/issues/7851) during upgrade to 19.17.2 or 19.17.3 from previous versions. 
This setting was disabled by default in 19.17.4, to make possible upgrading from 19.16 and older versions without errors. [#7392](https://github.com/ClickHouse/ClickHouse/pull/7392) ([Amos Bird](https://github.com/amosbird)) - -#### New Feature -* Add the ability to create dictionaries with DDL queries. [#7360](https://github.com/ClickHouse/ClickHouse/pull/7360) ([alesapin](https://github.com/alesapin)) -* Make `bloom_filter` type of index supporting `LowCardinality` and `Nullable` [#7363](https://github.com/ClickHouse/ClickHouse/issues/7363) [#7561](https://github.com/ClickHouse/ClickHouse/pull/7561) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Add function `isValidJSON` to check that passed string is a valid json. [#5910](https://github.com/ClickHouse/ClickHouse/issues/5910) [#7293](https://github.com/ClickHouse/ClickHouse/pull/7293) ([Vdimir](https://github.com/Vdimir)) -* Implement `arrayCompact` function [#7328](https://github.com/ClickHouse/ClickHouse/pull/7328) ([Memo](https://github.com/Joeywzr)) -* Created function `hex` for Decimal numbers. It works like `hex(reinterpretAsString())`, but doesn't delete last zero bytes. [#7355](https://github.com/ClickHouse/ClickHouse/pull/7355) ([Mikhail Korotov](https://github.com/millb)) -* Add `arrayFill` and `arrayReverseFill` functions, which replace elements by other elements in front/back of them in the array. [#7380](https://github.com/ClickHouse/ClickHouse/pull/7380) ([hcz](https://github.com/hczhcz)) -* Add `CRC32IEEE()`/`CRC64()` support [#7480](https://github.com/ClickHouse/ClickHouse/pull/7480) ([Azat Khuzhin](https://github.com/azat)) -* Implement `char` function similar to one in [mysql](https://dev.mysql.com/doc/refman/8.0/en/string-functions.html#function_char) [#7486](https://github.com/ClickHouse/ClickHouse/pull/7486) ([sundyli](https://github.com/sundy-li)) -* Add `bitmapTransform` function. It transforms an array of values in a bitmap to another array of values, the result is a new bitmap [#7598](https://github.com/ClickHouse/ClickHouse/pull/7598) ([Zhichang Yu](https://github.com/yuzhichang)) -* Implemented `javaHashUTF16LE()` function [#7651](https://github.com/ClickHouse/ClickHouse/pull/7651) ([achimbab](https://github.com/achimbab)) -* Add `_shard_num` virtual column for the Distributed engine [#7624](https://github.com/ClickHouse/ClickHouse/pull/7624) ([Azat Khuzhin](https://github.com/azat)) - -#### Experimental Feature -* Support for processors (new query execution pipeline) in `MergeTree`. [#7181](https://github.com/ClickHouse/ClickHouse/pull/7181) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) - -#### Bug Fix -* Fix incorrect float parsing in `Values` [#7817](https://github.com/ClickHouse/ClickHouse/issues/7817) [#7870](https://github.com/ClickHouse/ClickHouse/pull/7870) ([tavplubix](https://github.com/tavplubix)) -* Fix rare deadlock which can happen when trace_log is enabled. [#7838](https://github.com/ClickHouse/ClickHouse/pull/7838) ([filimonov](https://github.com/filimonov)) -* Prevent message duplication when producing Kafka table has any MVs selecting from it [#7265](https://github.com/ClickHouse/ClickHouse/pull/7265) ([Ivan](https://github.com/abyss7)) -* Support for `Array(LowCardinality(Nullable(String)))` in `IN`. 
Resolves [#7364](https://github.com/ClickHouse/ClickHouse/issues/7364) [#7366](https://github.com/ClickHouse/ClickHouse/pull/7366) ([achimbab](https://github.com/achimbab)) -* Add handling of `SQL_TINYINT` and `SQL_BIGINT`, and fix handling of `SQL_FLOAT` data source types in ODBC Bridge. [#7491](https://github.com/ClickHouse/ClickHouse/pull/7491) ([Denis Glazachev](https://github.com/traceon)) -* Fix aggregation (`avg` and quantiles) over empty decimal columns [#7431](https://github.com/ClickHouse/ClickHouse/pull/7431) ([Andrey Konyaev](https://github.com/akonyaev90)) -* Fix `INSERT` into Distributed with `MATERIALIZED` columns [#7377](https://github.com/ClickHouse/ClickHouse/pull/7377) ([Azat Khuzhin](https://github.com/azat)) -* Make `MOVE PARTITION` work if some parts of partition are already on destination disk or volume [#7434](https://github.com/ClickHouse/ClickHouse/pull/7434) ([Vladimir Chebotarev](https://github.com/excitoon)) -* Fixed bug with hardlinks failing to be created during mutations in `ReplicatedMergeTree` in multi-disk configurations. [#7558](https://github.com/ClickHouse/ClickHouse/pull/7558) ([Vladimir Chebotarev](https://github.com/excitoon)) -* Fixed a bug with a mutation on a MergeTree when whole part remains unchanged and best space is being found on another disk [#7602](https://github.com/ClickHouse/ClickHouse/pull/7602) ([Vladimir Chebotarev](https://github.com/excitoon)) -* Fixed bug with `keep_free_space_ratio` not being read from disks configuration [#7645](https://github.com/ClickHouse/ClickHouse/pull/7645) ([Vladimir Chebotarev](https://github.com/excitoon)) -* Fix bug with table contains only `Tuple` columns or columns with complex paths. Fixes [7541](https://github.com/ClickHouse/ClickHouse/issues/7541). [#7545](https://github.com/ClickHouse/ClickHouse/pull/7545) ([alesapin](https://github.com/alesapin)) -* Do not account memory for Buffer engine in max_memory_usage limit [#7552](https://github.com/ClickHouse/ClickHouse/pull/7552) ([Azat Khuzhin](https://github.com/azat)) -* Fix final mark usage in `MergeTree` tables ordered by `tuple()`. In rare cases it could lead to `Can't adjust last granule` error while select. [#7639](https://github.com/ClickHouse/ClickHouse/pull/7639) ([Anton Popov](https://github.com/CurtizJ)) -* Fix bug in mutations that have predicate with actions that require context (for example functions for json), which may lead to crashes or strange exceptions. [#7664](https://github.com/ClickHouse/ClickHouse/pull/7664) ([alesapin](https://github.com/alesapin)) -* Fix mismatch of database and table names escaping in `data/` and `shadow/` directories [#7575](https://github.com/ClickHouse/ClickHouse/pull/7575) ([Alexander Burmak](https://github.com/Alex-Burmak)) -* Support duplicated keys in RIGHT|FULL JOINs, e.g. ```ON t.x = u.x AND t.x = u.y```. Fix crash in this case. [#7586](https://github.com/ClickHouse/ClickHouse/pull/7586) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix `Not found column in block` when joining on expression with RIGHT or FULL JOIN. [#7641](https://github.com/ClickHouse/ClickHouse/pull/7641) ([Artem Zuikov](https://github.com/4ertus2)) -* One more attempt to fix infinite loop in `PrettySpace` format [#7591](https://github.com/ClickHouse/ClickHouse/pull/7591) ([Olga Khvostikova](https://github.com/stavrolia)) -* Fix bug in `concat` function when all arguments were `FixedString` of the same size. 
[#7635](https://github.com/ClickHouse/ClickHouse/pull/7635) ([alesapin](https://github.com/alesapin)) -* Fixed exception in case of using 1 argument while defining S3, URL and HDFS storages. [#7618](https://github.com/ClickHouse/ClickHouse/pull/7618) ([Vladimir Chebotarev](https://github.com/excitoon)) -* Fix scope of the InterpreterSelectQuery for views with query [#7601](https://github.com/ClickHouse/ClickHouse/pull/7601) ([Azat Khuzhin](https://github.com/azat)) - -#### Improvement -* `Nullable` columns recognized and NULL-values handled correctly by ODBC-bridge [#7402](https://github.com/ClickHouse/ClickHouse/pull/7402) ([Vasily Nemkov](https://github.com/Enmk)) -* Write current batch for distributed send atomically [#7600](https://github.com/ClickHouse/ClickHouse/pull/7600) ([Azat Khuzhin](https://github.com/azat)) -* Throw an exception if we cannot detect table for column name in query. [#7358](https://github.com/ClickHouse/ClickHouse/pull/7358) ([Artem Zuikov](https://github.com/4ertus2)) -* Add `merge_max_block_size` setting to `MergeTreeSettings` [#7412](https://github.com/ClickHouse/ClickHouse/pull/7412) ([Artem Zuikov](https://github.com/4ertus2)) -* Queries with `HAVING` and without `GROUP BY` assume group by constant. So, `SELECT 1 HAVING 1` now returns a result. [#7496](https://github.com/ClickHouse/ClickHouse/pull/7496) ([Amos Bird](https://github.com/amosbird)) -* Support parsing `(X,)` as tuple similar to python. [#7501](https://github.com/ClickHouse/ClickHouse/pull/7501), [#7562](https://github.com/ClickHouse/ClickHouse/pull/7562) ([Amos Bird](https://github.com/amosbird)) -* Make `range` function behaviors almost like pythonic one. [#7518](https://github.com/ClickHouse/ClickHouse/pull/7518) ([sundyli](https://github.com/sundy-li)) -* Add `constraints` columns to table `system.settings` [#7553](https://github.com/ClickHouse/ClickHouse/pull/7553) ([Vitaly Baranov](https://github.com/vitlibar)) -* Better Null format for tcp handler, so that it's possible to use `select ignore() from table format Null` for perf measure via clickhouse-client [#7606](https://github.com/ClickHouse/ClickHouse/pull/7606) ([Amos Bird](https://github.com/amosbird)) -* Queries like `CREATE TABLE ... AS (SELECT (1, 2))` are parsed correctly [#7542](https://github.com/ClickHouse/ClickHouse/pull/7542) ([hcz](https://github.com/hczhcz)) - -#### Performance Improvement -* The performance of aggregation over short string keys is improved. [#6243](https://github.com/ClickHouse/ClickHouse/pull/6243) ([Alexander Kuzmenkov](https://github.com/akuzm), [Amos Bird](https://github.com/amosbird)) -* Run another pass of syntax/expression analysis to get potential optimizations after constant predicates are folded. [#7497](https://github.com/ClickHouse/ClickHouse/pull/7497) ([Amos Bird](https://github.com/amosbird)) -* Use storage meta info to evaluate trivial `SELECT count() FROM table;` [#7510](https://github.com/ClickHouse/ClickHouse/pull/7510) ([Amos Bird](https://github.com/amosbird), [alexey-milovidov](https://github.com/alexey-milovidov)) -* Vectorize processing `arrayReduce` similar to Aggregator `addBatch`. [#7608](https://github.com/ClickHouse/ClickHouse/pull/7608) ([Amos Bird](https://github.com/amosbird)) -* Minor improvements in performance of `Kafka` consumption [#7475](https://github.com/ClickHouse/ClickHouse/pull/7475) ([Ivan](https://github.com/abyss7)) - -#### Build/Testing/Packaging Improvement -* Add support for cross-compiling to the CPU architecture AARCH64. Refactor packager script. 
[#7370](https://github.com/ClickHouse/ClickHouse/pull/7370) [#7539](https://github.com/ClickHouse/ClickHouse/pull/7539) ([Ivan](https://github.com/abyss7)) -* Unpack darwin-x86_64 and linux-aarch64 toolchains into mounted Docker volume when building packages [#7534](https://github.com/ClickHouse/ClickHouse/pull/7534) ([Ivan](https://github.com/abyss7)) -* Update Docker Image for Binary Packager [#7474](https://github.com/ClickHouse/ClickHouse/pull/7474) ([Ivan](https://github.com/abyss7)) -* Fixed compile errors on MacOS Catalina [#7585](https://github.com/ClickHouse/ClickHouse/pull/7585) ([Ernest Poletaev](https://github.com/ernestp)) -* Some refactoring in query analysis logic: split complex class into several simple ones. [#7454](https://github.com/ClickHouse/ClickHouse/pull/7454) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix build without submodules [#7295](https://github.com/ClickHouse/ClickHouse/pull/7295) ([proller](https://github.com/proller)) -* Better `add_globs` in CMake files [#7418](https://github.com/ClickHouse/ClickHouse/pull/7418) ([Amos Bird](https://github.com/amosbird)) -* Remove hardcoded paths in `unwind` target [#7460](https://github.com/ClickHouse/ClickHouse/pull/7460) ([Konstantin Podshumok](https://github.com/podshumok)) -* Allow to use mysql format without ssl [#7524](https://github.com/ClickHouse/ClickHouse/pull/7524) ([proller](https://github.com/proller)) - -#### Other -* Added ANTLR4 grammar for ClickHouse SQL dialect [#7595](https://github.com/ClickHouse/ClickHouse/issues/7595) [#7596](https://github.com/ClickHouse/ClickHouse/pull/7596) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -## ClickHouse release v19.16 - -#### Clickhouse release v19.16.14.65, 2020-03-05 - -* Fix distributed subqueries incompatibility with older CH versions. Fixes [#7851](https://github.com/ClickHouse/ClickHouse/issues/7851) -[(tabplubix)](https://github.com/tavplubix) -* When executing `CREATE` query, fold constant expressions in storage engine arguments. Replace empty database name with current database. Fixes [#6508](https://github.com/ClickHouse/ClickHouse/issues/6508), [#3492](https://github.com/ClickHouse/ClickHouse/issues/3492). Also fix check for local address in `ClickHouseDictionarySource`. -[#9262](https://github.com/ClickHouse/ClickHouse/pull/9262) [(tabplubix)](https://github.com/tavplubix) -* Now background merges in `*MergeTree` table engines family preserve storage policy volume order more accurately. -[#8549](https://github.com/ClickHouse/ClickHouse/pull/8549) ([Vladimir Chebotarev](https://github.com/excitoon)) -* Prevent losing data in `Kafka` in rare cases when exception happens after reading suffix but before commit. Fixes [#9378](https://github.com/ClickHouse/ClickHouse/issues/9378). Related: [#7175](https://github.com/ClickHouse/ClickHouse/issues/7175) -[#9507](https://github.com/ClickHouse/ClickHouse/pull/9507) [(filimonov)](https://github.com/filimonov) -* Fix bug leading to server termination when trying to use / drop `Kafka` table created with wrong parameters. Fixes [#9494](https://github.com/ClickHouse/ClickHouse/issues/9494). Incorporates [#9507](https://github.com/ClickHouse/ClickHouse/issues/9507). -[#9513](https://github.com/ClickHouse/ClickHouse/pull/9513) [(filimonov)](https://github.com/filimonov) -* Allow using `MaterializedView` with subqueries above `Kafka` tables. 
-[#8197](https://github.com/ClickHouse/ClickHouse/pull/8197) ([filimonov](https://github.com/filimonov)) - -#### New Feature -* Add `deduplicate_blocks_in_dependent_materialized_views` option to control the behaviour of idempotent inserts into tables with materialized views. This new feature was added to the bugfix release by a special request from Altinity. -[#9070](https://github.com/ClickHouse/ClickHouse/pull/9070) [(urykhy)](https://github.com/urykhy) - -### ClickHouse release v19.16.2.2, 2019-10-30 - -#### Backward Incompatible Change -* Add missing arity validation for count/counIf. - [#7095](https://github.com/ClickHouse/ClickHouse/issues/7095) -[#7298](https://github.com/ClickHouse/ClickHouse/pull/7298) ([Vdimir](https://github.com/Vdimir)) -* Remove legacy `asterisk_left_columns_only` setting (it was disabled by default). - [#7335](https://github.com/ClickHouse/ClickHouse/pull/7335) ([Artem -Zuikov](https://github.com/4ertus2)) -* Format strings for Template data format are now specified in files. - [#7118](https://github.com/ClickHouse/ClickHouse/pull/7118) -([tavplubix](https://github.com/tavplubix)) - -#### New Feature -* Introduce uniqCombined64() to calculate cardinality greater than UINT_MAX. - [#7213](https://github.com/ClickHouse/ClickHouse/pull/7213), -[#7222](https://github.com/ClickHouse/ClickHouse/pull/7222) ([Azat -Khuzhin](https://github.com/azat)) -* Support Bloom filter indexes on Array columns. - [#6984](https://github.com/ClickHouse/ClickHouse/pull/6984) -([achimbab](https://github.com/achimbab)) -* Add a function `getMacro(name)` that returns String with the value of corresponding `` - from server configuration. [#7240](https://github.com/ClickHouse/ClickHouse/pull/7240) -([alexey-milovidov](https://github.com/alexey-milovidov)) -* Set two configuration options for a dictionary based on an HTTP source: `credentials` and - `http-headers`. [#7092](https://github.com/ClickHouse/ClickHouse/pull/7092) ([Guillaume -Tassery](https://github.com/YiuRULE)) -* Add a new ProfileEvent `Merge` that counts the number of launched background merges. - [#7093](https://github.com/ClickHouse/ClickHouse/pull/7093) ([Mikhail -Korotov](https://github.com/millb)) -* Add fullHostName function that returns a fully qualified domain name. - [#7263](https://github.com/ClickHouse/ClickHouse/issues/7263) -[#7291](https://github.com/ClickHouse/ClickHouse/pull/7291) ([sundyli](https://github.com/sundy-li)) -* Add function `arraySplit` and `arrayReverseSplit` which split an array by "cut off" - conditions. They are useful in time sequence handling. -[#7294](https://github.com/ClickHouse/ClickHouse/pull/7294) ([hcz](https://github.com/hczhcz)) -* Add new functions that return the Array of all matched indices in multiMatch family of functions. - [#7299](https://github.com/ClickHouse/ClickHouse/pull/7299) ([Danila -Kutenin](https://github.com/danlark1)) -* Add a new database engine `Lazy` that is optimized for storing a large number of small -Log - tables. [#7171](https://github.com/ClickHouse/ClickHouse/pull/7171) ([Nikita -Vasilev](https://github.com/nikvas0)) -* Add aggregate functions groupBitmapAnd, -Or, -Xor for bitmap columns. [#7109](https://github.com/ClickHouse/ClickHouse/pull/7109) ([Zhichang -Yu](https://github.com/yuzhichang)) -* Add aggregate function combinators -OrNull and -OrDefault, which return null - or default values when there is nothing to aggregate. 
-[#7331](https://github.com/ClickHouse/ClickHouse/pull/7331) -([hcz](https://github.com/hczhcz)) -* Introduce CustomSeparated data format that supports custom escaping and - delimiter rules. [#7118](https://github.com/ClickHouse/ClickHouse/pull/7118) -([tavplubix](https://github.com/tavplubix)) -* Support Redis as source of external dictionary. [#4361](https://github.com/ClickHouse/ClickHouse/pull/4361) [#6962](https://github.com/ClickHouse/ClickHouse/pull/6962) ([comunodi](https://github.com/comunodi), [Anton -Popov](https://github.com/CurtizJ)) - -#### Bug Fix -* Fix wrong query result if it has `WHERE IN (SELECT ...)` section and `optimize_read_in_order` is - used. [#7371](https://github.com/ClickHouse/ClickHouse/pull/7371) ([Anton -Popov](https://github.com/CurtizJ)) -* Disabled MariaDB authentication plugin, which depends on files outside of project. - [#7140](https://github.com/ClickHouse/ClickHouse/pull/7140) ([Yuriy -Baranov](https://github.com/yurriy)) -* Fix exception `Cannot convert column ... because it is constant but values of constants are - different in source and result` which could rarely happen when functions `now()`, `today()`, -`yesterday()`, `randConstant()` are used. -[#7156](https://github.com/ClickHouse/ClickHouse/pull/7156) ([Nikolai -Kochetov](https://github.com/KochetovNicolai)) -* Fixed issue of using HTTP keep alive timeout instead of TCP keep alive timeout. - [#7351](https://github.com/ClickHouse/ClickHouse/pull/7351) ([Vasily -Nemkov](https://github.com/Enmk)) -* Fixed a segmentation fault in groupBitmapOr (issue [#7109](https://github.com/ClickHouse/ClickHouse/issues/7109)). - [#7289](https://github.com/ClickHouse/ClickHouse/pull/7289) ([Zhichang -Yu](https://github.com/yuzhichang)) -* For materialized views the commit for Kafka is called after all data were written. - [#7175](https://github.com/ClickHouse/ClickHouse/pull/7175) ([Ivan](https://github.com/abyss7)) -* Fixed wrong `duration_ms` value in `system.part_log` table. It was ten times off. - [#7172](https://github.com/ClickHouse/ClickHouse/pull/7172) ([Vladimir -Chebotarev](https://github.com/excitoon)) -* A quick fix to resolve crash in LIVE VIEW table and re-enabling all LIVE VIEW tests. - [#7201](https://github.com/ClickHouse/ClickHouse/pull/7201) -([vzakaznikov](https://github.com/vzakaznikov)) -* Serialize NULL values correctly in min/max indexes of MergeTree parts. - [#7234](https://github.com/ClickHouse/ClickHouse/pull/7234) ([Alexander -Kuzmenkov](https://github.com/akuzm)) -* Don't put virtual columns to .sql metadata when table is created as `CREATE TABLE AS`. - [#7183](https://github.com/ClickHouse/ClickHouse/pull/7183) ([Ivan](https://github.com/abyss7)) -* Fix segmentation fault in `ATTACH PART` query. - [#7185](https://github.com/ClickHouse/ClickHouse/pull/7185) -([alesapin](https://github.com/alesapin)) -* Fix wrong result for some queries given by the optimization of empty IN subqueries and empty - INNER/RIGHT JOIN. [#7284](https://github.com/ClickHouse/ClickHouse/pull/7284) ([Nikolai -Kochetov](https://github.com/KochetovNicolai)) -* Fixing AddressSanitizer error in the LIVE VIEW getHeader() method. - [#7271](https://github.com/ClickHouse/ClickHouse/pull/7271) -([vzakaznikov](https://github.com/vzakaznikov)) - -#### Improvement -* Add a message in case of queue_wait_max_ms wait takes place. - [#7390](https://github.com/ClickHouse/ClickHouse/pull/7390) ([Azat -Khuzhin](https://github.com/azat)) -* Made setting `s3_min_upload_part_size` table-level. 
- [#7059](https://github.com/ClickHouse/ClickHouse/pull/7059) ([Vladimir -Chebotarev](https://github.com/excitoon)) -* Check TTL in StorageFactory. [#7304](https://github.com/ClickHouse/ClickHouse/pull/7304) - ([sundyli](https://github.com/sundy-li)) -* Squash left-hand blocks in partial merge join (optimization). - [#7122](https://github.com/ClickHouse/ClickHouse/pull/7122) ([Artem -Zuikov](https://github.com/4ertus2)) -* Do not allow non-deterministic functions in mutations of Replicated table engines, because this - can introduce inconsistencies between replicas. -[#7247](https://github.com/ClickHouse/ClickHouse/pull/7247) ([Alexander -Kazakov](https://github.com/Akazz)) -* Disable memory tracker while converting exception stack trace to string. It can prevent the loss - of error messages of type `Memory limit exceeded` on server, which caused the `Attempt to read -after eof` exception on client. [#7264](https://github.com/ClickHouse/ClickHouse/pull/7264) -([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Miscellaneous format improvements. Resolves - [#6033](https://github.com/ClickHouse/ClickHouse/issues/6033), -[#2633](https://github.com/ClickHouse/ClickHouse/issues/2633), -[#6611](https://github.com/ClickHouse/ClickHouse/issues/6611), -[#6742](https://github.com/ClickHouse/ClickHouse/issues/6742) -[#7215](https://github.com/ClickHouse/ClickHouse/pull/7215) -([tavplubix](https://github.com/tavplubix)) -* ClickHouse ignores values on the right side of IN operator that are not convertible to the left - side type. Make it work properly for compound types -- Array and Tuple. -[#7283](https://github.com/ClickHouse/ClickHouse/pull/7283) ([Alexander -Kuzmenkov](https://github.com/akuzm)) -* Support missing inequalities for ASOF JOIN. It's possible to join less-or-equal variant and strict - greater and less variants for ASOF column in ON syntax. -[#7282](https://github.com/ClickHouse/ClickHouse/pull/7282) ([Artem -Zuikov](https://github.com/4ertus2)) -* Optimize partial merge join. [#7070](https://github.com/ClickHouse/ClickHouse/pull/7070) - ([Artem Zuikov](https://github.com/4ertus2)) -* Do not use more than 98K of memory in uniqCombined functions. - [#7236](https://github.com/ClickHouse/ClickHouse/pull/7236), -[#7270](https://github.com/ClickHouse/ClickHouse/pull/7270) ([Azat -Khuzhin](https://github.com/azat)) -* Flush parts of right-hand joining table on disk in PartialMergeJoin (if there is not enough - memory). Load data back when needed. [#7186](https://github.com/ClickHouse/ClickHouse/pull/7186) -([Artem Zuikov](https://github.com/4ertus2)) - -#### Performance Improvement -* Speed up joinGet with const arguments by avoiding data duplication. - [#7359](https://github.com/ClickHouse/ClickHouse/pull/7359) ([Amos -Bird](https://github.com/amosbird)) -* Return early if the subquery is empty. - [#7007](https://github.com/ClickHouse/ClickHouse/pull/7007) ([小路](https://github.com/nicelulu)) -* Optimize parsing of SQL expression in Values. - [#6781](https://github.com/ClickHouse/ClickHouse/pull/6781) -([tavplubix](https://github.com/tavplubix)) - -#### Build/Testing/Packaging Improvement -* Disable some contribs for cross-compilation to Mac OS. - [#7101](https://github.com/ClickHouse/ClickHouse/pull/7101) ([Ivan](https://github.com/abyss7)) -* Add missing linking with PocoXML for clickhouse_common_io. - [#7200](https://github.com/ClickHouse/ClickHouse/pull/7200) ([Azat -Khuzhin](https://github.com/azat)) -* Accept multiple test filter arguments in clickhouse-test. 
- [#7226](https://github.com/ClickHouse/ClickHouse/pull/7226) ([Alexander -Kuzmenkov](https://github.com/akuzm)) -* Enable musl and jemalloc for ARM. [#7300](https://github.com/ClickHouse/ClickHouse/pull/7300) - ([Amos Bird](https://github.com/amosbird)) -* Added `--client-option` parameter to `clickhouse-test` to pass additional parameters to client. - [#7277](https://github.com/ClickHouse/ClickHouse/pull/7277) ([Nikolai -Kochetov](https://github.com/KochetovNicolai)) -* Preserve existing configs on rpm package upgrade. - [#7103](https://github.com/ClickHouse/ClickHouse/pull/7103) -([filimonov](https://github.com/filimonov)) -* Fix errors detected by PVS. [#7153](https://github.com/ClickHouse/ClickHouse/pull/7153) ([Artem - Zuikov](https://github.com/4ertus2)) -* Fix build for Darwin. [#7149](https://github.com/ClickHouse/ClickHouse/pull/7149) - ([Ivan](https://github.com/abyss7)) -* glibc 2.29 compatibility. [#7142](https://github.com/ClickHouse/ClickHouse/pull/7142) ([Amos - Bird](https://github.com/amosbird)) -* Make sure dh_clean does not touch potential source files. - [#7205](https://github.com/ClickHouse/ClickHouse/pull/7205) ([Amos -Bird](https://github.com/amosbird)) -* Attempt to avoid conflict when updating from altinity rpm - it has config file packaged separately - in clickhouse-server-common. [#7073](https://github.com/ClickHouse/ClickHouse/pull/7073) -([filimonov](https://github.com/filimonov)) -* Optimize some header files for faster rebuilds. - [#7212](https://github.com/ClickHouse/ClickHouse/pull/7212), -[#7231](https://github.com/ClickHouse/ClickHouse/pull/7231) ([Alexander -Kuzmenkov](https://github.com/akuzm)) -* Add performance tests for Date and DateTime. [#7332](https://github.com/ClickHouse/ClickHouse/pull/7332) ([Vasily - Nemkov](https://github.com/Enmk)) -* Fix some tests that contained non-deterministic mutations. - [#7132](https://github.com/ClickHouse/ClickHouse/pull/7132) ([Alexander -Kazakov](https://github.com/Akazz)) -* Add build with MemorySanitizer to CI. [#7066](https://github.com/ClickHouse/ClickHouse/pull/7066) - ([Alexander Kuzmenkov](https://github.com/akuzm)) -* Avoid use of uninitialized values in MetricsTransmitter. - [#7158](https://github.com/ClickHouse/ClickHouse/pull/7158) ([Azat -Khuzhin](https://github.com/azat)) -* Fix some issues in Fields found by MemorySanitizer. - [#7135](https://github.com/ClickHouse/ClickHouse/pull/7135), -[#7179](https://github.com/ClickHouse/ClickHouse/pull/7179) ([Alexander -Kuzmenkov](https://github.com/akuzm)), [#7376](https://github.com/ClickHouse/ClickHouse/pull/7376) -([Amos Bird](https://github.com/amosbird)) -* Fix undefined behavior in murmurhash32. [#7388](https://github.com/ClickHouse/ClickHouse/pull/7388) ([Amos - Bird](https://github.com/amosbird)) -* Fix undefined behavior in StoragesInfoStream. [#7384](https://github.com/ClickHouse/ClickHouse/pull/7384) - ([tavplubix](https://github.com/tavplubix)) -* Fixed constant expressions folding for external database engines (MySQL, ODBC, JDBC). In previous - versions it wasn't working for multiple constant expressions and was not working at all for Date, -DateTime and UUID. This fixes [#7245](https://github.com/ClickHouse/ClickHouse/issues/7245) -[#7252](https://github.com/ClickHouse/ClickHouse/pull/7252) -([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixing ThreadSanitizer data race error in the LIVE VIEW when accessing no_users_thread variable. 
- [#7353](https://github.com/ClickHouse/ClickHouse/pull/7353) -([vzakaznikov](https://github.com/vzakaznikov)) -* Get rid of malloc symbols in libcommon - [#7134](https://github.com/ClickHouse/ClickHouse/pull/7134), -[#7065](https://github.com/ClickHouse/ClickHouse/pull/7065) ([Amos -Bird](https://github.com/amosbird)) -* Add global flag ENABLE_LIBRARIES for disabling all libraries. - [#7063](https://github.com/ClickHouse/ClickHouse/pull/7063) -([proller](https://github.com/proller)) - -#### Code cleanup -* Generalize configuration repository to prepare for DDL for Dictionaries. [#7155](https://github.com/ClickHouse/ClickHouse/pull/7155) - ([alesapin](https://github.com/alesapin)) -* Parser for dictionaries DDL without any semantic. - [#7209](https://github.com/ClickHouse/ClickHouse/pull/7209) -([alesapin](https://github.com/alesapin)) -* Split ParserCreateQuery into different smaller parsers. - [#7253](https://github.com/ClickHouse/ClickHouse/pull/7253) -([alesapin](https://github.com/alesapin)) -* Small refactoring and renaming near external dictionaries. - [#7111](https://github.com/ClickHouse/ClickHouse/pull/7111) -([alesapin](https://github.com/alesapin)) -* Refactor some code to prepare for role-based access control. [#7235](https://github.com/ClickHouse/ClickHouse/pull/7235) ([Vitaly - Baranov](https://github.com/vitlibar)) -* Some improvements in DatabaseOrdinary code. - [#7086](https://github.com/ClickHouse/ClickHouse/pull/7086) ([Nikita -Vasilev](https://github.com/nikvas0)) -* Do not use iterators in find() and emplace() methods of hash tables. -[#7026](https://github.com/ClickHouse/ClickHouse/pull/7026) ([Alexander -Kuzmenkov](https://github.com/akuzm)) -* Fix getMultipleValuesFromConfig in case when parameter root is not empty. [#7374](https://github.com/ClickHouse/ClickHouse/pull/7374) -([Mikhail Korotov](https://github.com/millb)) -* Remove some copy-paste (TemporaryFile and TemporaryFileStream) - [#7166](https://github.com/ClickHouse/ClickHouse/pull/7166) ([Artem -Zuikov](https://github.com/4ertus2)) -* Improved code readability a little bit (`MergeTreeData::getActiveContainingPart`). - [#7361](https://github.com/ClickHouse/ClickHouse/pull/7361) ([Vladimir -Chebotarev](https://github.com/excitoon)) -* Wait for all scheduled jobs, which are using local objects, if `ThreadPool::schedule(...)` throws - an exception. Rename `ThreadPool::schedule(...)` to `ThreadPool::scheduleOrThrowOnError(...)` and -fix comments to make obvious that it may throw. -[#7350](https://github.com/ClickHouse/ClickHouse/pull/7350) -([tavplubix](https://github.com/tavplubix)) - -## ClickHouse release 19.15 - -### ClickHouse release 19.15.4.10, 2019-10-31 - -#### Bug Fix -* Added handling of SQL_TINYINT and SQL_BIGINT, and fix handling of SQL_FLOAT data source types in ODBC Bridge. -[#7491](https://github.com/ClickHouse/ClickHouse/pull/7491) ([Denis Glazachev](https://github.com/traceon)) -* Allowed to have some parts on destination disk or volume in MOVE PARTITION. -[#7434](https://github.com/ClickHouse/ClickHouse/pull/7434) ([Vladimir Chebotarev](https://github.com/excitoon)) -* Fixed NULL-values in nullable columns through ODBC-bridge. -[#7402](https://github.com/ClickHouse/ClickHouse/pull/7402) ([Vasily Nemkov](https://github.com/Enmk)) -* Fixed INSERT into Distributed non local node with MATERIALIZED columns. -[#7377](https://github.com/ClickHouse/ClickHouse/pull/7377) ([Azat Khuzhin](https://github.com/azat)) -* Fixed function getMultipleValuesFromConfig. 
-[#7374](https://github.com/ClickHouse/ClickHouse/pull/7374) ([Mikhail Korotov](https://github.com/millb))
-* Fixed issue of using HTTP keep alive timeout instead of TCP keep alive timeout.
-[#7351](https://github.com/ClickHouse/ClickHouse/pull/7351) ([Vasily Nemkov](https://github.com/Enmk))
-* Wait for all jobs to finish on exception (fixes rare segfaults).
-[#7350](https://github.com/ClickHouse/ClickHouse/pull/7350) ([tavplubix](https://github.com/tavplubix))
-* Don't push to MVs when inserting into Kafka table.
-[#7265](https://github.com/ClickHouse/ClickHouse/pull/7265) ([Ivan](https://github.com/abyss7))
-* Disable memory tracker for exception stack.
-[#7264](https://github.com/ClickHouse/ClickHouse/pull/7264) ([Nikolai Kochetov](https://github.com/KochetovNicolai))
-* Fixed bad code in transforming query for external database.
-[#7252](https://github.com/ClickHouse/ClickHouse/pull/7252) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Avoid use of uninitialized values in MetricsTransmitter.
-[#7158](https://github.com/ClickHouse/ClickHouse/pull/7158) ([Azat Khuzhin](https://github.com/azat))
-* Added example config with macros for tests ([alexey-milovidov](https://github.com/alexey-milovidov))
-
-### ClickHouse release 19.15.3.6, 2019-10-09
-
-#### Bug Fix
-* Fixed bad_variant in hashed dictionary.
-([alesapin](https://github.com/alesapin))
-* Fixed bug with segmentation fault in ATTACH PART query.
-([alesapin](https://github.com/alesapin))
-* Fixed time calculation in `MergeTreeData`.
-([Vladimir Chebotarev](https://github.com/excitoon))
-* Commit to Kafka explicitly after the writing is finalized.
-[#7175](https://github.com/ClickHouse/ClickHouse/pull/7175) ([Ivan](https://github.com/abyss7))
-* Serialize NULL values correctly in min/max indexes of MergeTree parts.
-[#7234](https://github.com/ClickHouse/ClickHouse/pull/7234) ([Alexander Kuzmenkov](https://github.com/akuzm))
-
-### ClickHouse release 19.15.2.2, 2019-10-01
-
-#### New Feature
-* Tiered storage: support to use multiple storage volumes for tables with MergeTree engine. It's possible to store fresh data on SSD and automatically move old data to HDD. ([example](https://clickhouse.github.io/clickhouse-presentations/meetup30/new_features/#12)). [#4918](https://github.com/ClickHouse/ClickHouse/pull/4918) ([Igr](https://github.com/ObjatieGroba)) [#6489](https://github.com/ClickHouse/ClickHouse/pull/6489) ([alesapin](https://github.com/alesapin))
-* Add table function `input` for reading incoming data in `INSERT SELECT` query. [#5450](https://github.com/ClickHouse/ClickHouse/pull/5450) ([palasonic1](https://github.com/palasonic1)) [#6832](https://github.com/ClickHouse/ClickHouse/pull/6832) ([Anton Popov](https://github.com/CurtizJ))
-* Add a `sparse_hashed` dictionary layout, that is functionally equivalent to the `hashed` layout, but is more memory efficient. It uses about half as much memory at the cost of slower value retrieval. [#6894](https://github.com/ClickHouse/ClickHouse/pull/6894) ([Azat Khuzhin](https://github.com/azat))
-* Implement ability to define a list of users for access to dictionaries. Only the currently connected database is used. [#6907](https://github.com/ClickHouse/ClickHouse/pull/6907) ([Guillaume Tassery](https://github.com/YiuRULE))
-* Add `LIMIT` option to `SHOW` query. [#6944](https://github.com/ClickHouse/ClickHouse/pull/6944) ([Philipp Malkovsky](https://github.com/malkfilipp))
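-  For instance (a minimal illustration of the new option):
-  ```sql
-  SHOW TABLES FROM system LIKE '%log%' LIMIT 5
-  ```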
-* Add `bitmapSubsetLimit(bitmap, range_start, limit)` function that returns a subset of the smallest `limit` values in the set that are no smaller than `range_start`. [#6957](https://github.com/ClickHouse/ClickHouse/pull/6957) ([Zhichang Yu](https://github.com/yuzhichang))
-* Add `bitmapMin` and `bitmapMax` functions. [#6970](https://github.com/ClickHouse/ClickHouse/pull/6970) ([Zhichang Yu](https://github.com/yuzhichang))
-* Add function `repeat` related to [issue-6648](https://github.com/ClickHouse/ClickHouse/issues/6648) [#6999](https://github.com/ClickHouse/ClickHouse/pull/6999) ([flynn](https://github.com/ucasFL))
-
-#### Experimental Feature
-* Implement (in memory) Merge Join variant that does not change current pipeline. Result is partially sorted by merge key. Set `partial_merge_join = 1` to use this feature. The Merge Join is still in development. [#6940](https://github.com/ClickHouse/ClickHouse/pull/6940) ([Artem Zuikov](https://github.com/4ertus2))
-* Add `S3` engine and table function. It is still in development (no authentication support yet). [#5596](https://github.com/ClickHouse/ClickHouse/pull/5596) ([Vladimir Chebotarev](https://github.com/excitoon))
-
-#### Improvement
-* Every message read from Kafka is inserted atomically. This resolves almost all known issues with Kafka engine. [#6950](https://github.com/ClickHouse/ClickHouse/pull/6950) ([Ivan](https://github.com/abyss7))
-* Improvements for failover of Distributed queries. Shorten recovery time, also it is now configurable and can be seen in `system.clusters`. [#6399](https://github.com/ClickHouse/ClickHouse/pull/6399) ([Vasily Nemkov](https://github.com/Enmk))
-* Support numeric values for Enums directly in `IN` section. #6766 [#6941](https://github.com/ClickHouse/ClickHouse/pull/6941) ([dimarub2000](https://github.com/dimarub2000))
-* Support (optional, disabled by default) redirects on URL storage. [#6914](https://github.com/ClickHouse/ClickHouse/pull/6914) ([maqroll](https://github.com/maqroll))
-* Add information message when client with an older version connects to a server. [#6893](https://github.com/ClickHouse/ClickHouse/pull/6893) ([Philipp Malkovsky](https://github.com/malkfilipp))
-* Remove maximum backoff sleep time limit for sending data in Distributed tables [#6895](https://github.com/ClickHouse/ClickHouse/pull/6895) ([Azat Khuzhin](https://github.com/azat))
-* Add ability to send profile events (counters) with cumulative values to graphite. It can be enabled under `<events_cumulative>` in server `config.xml`. [#6969](https://github.com/ClickHouse/ClickHouse/pull/6969) ([Azat Khuzhin](https://github.com/azat))
-* Automatically cast type `T` to `LowCardinality(T)` while inserting data into a column of type `LowCardinality(T)` in Native format via HTTP. [#6891](https://github.com/ClickHouse/ClickHouse/pull/6891) ([Nikolai Kochetov](https://github.com/KochetovNicolai))
-* Add ability to use function `hex` without using `reinterpretAsString` for `Float32`, `Float64`. [#7024](https://github.com/ClickHouse/ClickHouse/pull/7024) ([Mikhail Korotov](https://github.com/millb))
-
-#### Build/Testing/Packaging Improvement
-* Add gdb-index to clickhouse binary with debug info. It will speed up startup time of `gdb`. [#6947](https://github.com/ClickHouse/ClickHouse/pull/6947) ([alesapin](https://github.com/alesapin))
-* Speed up deb packaging with patched dpkg-deb which uses `pigz`.
[#6960](https://github.com/ClickHouse/ClickHouse/pull/6960) ([alesapin](https://github.com/alesapin)) -* Set `enable_fuzzing = 1` to enable libfuzzer instrumentation of all the project code. [#7042](https://github.com/ClickHouse/ClickHouse/pull/7042) ([kyprizel](https://github.com/kyprizel)) -* Add split build smoke test in CI. [#7061](https://github.com/ClickHouse/ClickHouse/pull/7061) ([alesapin](https://github.com/alesapin)) -* Add build with MemorySanitizer to CI. [#7066](https://github.com/ClickHouse/ClickHouse/pull/7066) ([Alexander Kuzmenkov](https://github.com/akuzm)) -* Replace `libsparsehash` with `sparsehash-c11` [#6965](https://github.com/ClickHouse/ClickHouse/pull/6965) ([Azat Khuzhin](https://github.com/azat)) - -#### Bug Fix -* Fixed performance degradation of index analysis on complex keys on large tables. This fixes #6924. [#7075](https://github.com/ClickHouse/ClickHouse/pull/7075) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix logical error causing segfaults when selecting from Kafka empty topic. [#6909](https://github.com/ClickHouse/ClickHouse/pull/6909) ([Ivan](https://github.com/abyss7)) -* Fix too early MySQL connection close in `MySQLBlockInputStream.cpp`. [#6882](https://github.com/ClickHouse/ClickHouse/pull/6882) ([Clément Rodriguez](https://github.com/clemrodriguez)) -* Returned support for very old Linux kernels (fix [#6841](https://github.com/ClickHouse/ClickHouse/issues/6841)) [#6853](https://github.com/ClickHouse/ClickHouse/pull/6853) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix possible data loss in `insert select` query in case of empty block in input stream. #6834 #6862 [#6911](https://github.com/ClickHouse/ClickHouse/pull/6911) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix for function `АrrayEnumerateUniqRanked` with empty arrays in params [#6928](https://github.com/ClickHouse/ClickHouse/pull/6928) ([proller](https://github.com/proller)) -* Fix complex queries with array joins and global subqueries. [#6934](https://github.com/ClickHouse/ClickHouse/pull/6934) ([Ivan](https://github.com/abyss7)) -* Fix `Unknown identifier` error in ORDER BY and GROUP BY with multiple JOINs [#7022](https://github.com/ClickHouse/ClickHouse/pull/7022) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed `MSan` warning while executing function with `LowCardinality` argument. [#7062](https://github.com/ClickHouse/ClickHouse/pull/7062) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) - -#### Backward Incompatible Change -* Changed serialization format of bitmap* aggregate function states to improve performance. Serialized states of bitmap* from previous versions cannot be read. [#6908](https://github.com/ClickHouse/ClickHouse/pull/6908) ([Zhichang Yu](https://github.com/yuzhichang)) - -## ClickHouse release 19.14 -### ClickHouse release 19.14.7.15, 2019-10-02 - -#### Bug Fix -* This release also contains all bug fixes from 19.11.12.69. -* Fixed compatibility for distributed queries between 19.14 and earlier versions. This fixes [#7068](https://github.com/ClickHouse/ClickHouse/issues/7068). [#7069](https://github.com/ClickHouse/ClickHouse/pull/7069) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -### ClickHouse release 19.14.6.12, 2019-09-19 - -#### Bug Fix -* Fix for function `АrrayEnumerateUniqRanked` with empty arrays in params. 
[#6928](https://github.com/ClickHouse/ClickHouse/pull/6928) ([proller](https://github.com/proller))
-* Fixed subquery name in queries with `ARRAY JOIN` and `GLOBAL IN subquery` with alias. Use subquery alias for external table name if it is specified. [#6934](https://github.com/ClickHouse/ClickHouse/pull/6934) ([Ivan](https://github.com/abyss7))
-
-#### Build/Testing/Packaging Improvement
-* Fix [flapping](https://clickhouse-test-reports.s3.yandex.net/6944/aab95fd5175a513413c7395a73a82044bdafb906/functional_stateless_tests_(debug).html) test `00715_fetch_merged_or_mutated_part_zookeeper` by rewriting it to a shell script because it needs to wait for mutations to apply. [#6977](https://github.com/ClickHouse/ClickHouse/pull/6977) ([Alexander Kazakov](https://github.com/Akazz))
-* Fixed UBSan and MemSan failure in function `groupUniqArray` with empty array argument. It was caused by placing an empty `PaddedPODArray` into the hash table zero cell because the constructor for the zero cell value was not called. [#6937](https://github.com/ClickHouse/ClickHouse/pull/6937) ([Amos Bird](https://github.com/amosbird))
-
-### ClickHouse release 19.14.3.3, 2019-09-10
-
-#### New Feature
-* `WITH FILL` modifier for `ORDER BY`. (continuation of [#5069](https://github.com/ClickHouse/ClickHouse/issues/5069)) [#6610](https://github.com/ClickHouse/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ))
-* `WITH TIES` modifier for `LIMIT`. (continuation of [#5069](https://github.com/ClickHouse/ClickHouse/issues/5069)) [#6610](https://github.com/ClickHouse/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ))
-* Parse unquoted `NULL` literal as NULL (if setting `format_csv_unquoted_null_literal_as_null=1`). Initialize null fields with default values if data type of this field is not nullable (if setting `input_format_null_as_default=1`). [#5990](https://github.com/ClickHouse/ClickHouse/issues/5990) [#6055](https://github.com/ClickHouse/ClickHouse/pull/6055) ([tavplubix](https://github.com/tavplubix))
-* Support for wildcards in paths of table functions `file` and `hdfs`. If the path contains wildcards, the table will be readonly. Example of usage: `select * from hdfs('hdfs://hdfs1:9000/some_dir/another_dir/*/file{0..9}{0..9}')` and `select * from file('some_dir/{some_file,another_file,yet_another}.tsv', 'TSV', 'value UInt32')`. [#6092](https://github.com/ClickHouse/ClickHouse/pull/6092) ([Olga Khvostikova](https://github.com/stavrolia))
-* New `system.metric_log` table which stores values of `system.events` and `system.metrics` with specified time interval. [#6363](https://github.com/ClickHouse/ClickHouse/issues/6363) [#6467](https://github.com/ClickHouse/ClickHouse/pull/6467) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [#6530](https://github.com/ClickHouse/ClickHouse/pull/6530) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Allow to write ClickHouse text logs to `system.text_log` table. [#6037](https://github.com/ClickHouse/ClickHouse/issues/6037) [#6103](https://github.com/ClickHouse/ClickHouse/pull/6103) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [#6164](https://github.com/ClickHouse/ClickHouse/pull/6164) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Show private symbols in stack traces (this is done via parsing symbol tables of ELF files). Added information about file and line number in stack traces if debug info is present. Speedup symbol name lookup with indexing symbols present in program. Added new SQL functions for introspection: `demangle` and `addressToLine`. Renamed function `symbolizeAddress` to `addressToSymbol` for consistency. Function `addressToSymbol` will return a mangled name for performance reasons and you have to apply `demangle`. Added setting `allow_introspection_functions` which is turned off by default. [#6201](https://github.com/ClickHouse/ClickHouse/pull/6201) ([alexey-milovidov](https://github.com/alexey-milovidov))
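-  With that setting enabled, one possible introspection query over `system.trace_log` (a sketch) is:
-  ```sql
-  SELECT
-      count() AS samples,
-      arrayStringConcat(arrayMap(x -> demangle(addressToSymbol(x)), trace), '\n') AS stack
-  FROM system.trace_log
-  GROUP BY trace
-  ORDER BY samples DESC
-  LIMIT 3
-  SETTINGS allow_introspection_functions = 1
-  ```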
-* Table function `values` (the name is case-insensitive). It allows reading from a `VALUES` list proposed in [#5984](https://github.com/ClickHouse/ClickHouse/issues/5984). Example: `SELECT * FROM VALUES('a UInt64, s String', (1, 'one'), (2, 'two'), (3, 'three'))`. [#6217](https://github.com/ClickHouse/ClickHouse/issues/6217). [#6209](https://github.com/ClickHouse/ClickHouse/pull/6209) ([dimarub2000](https://github.com/dimarub2000))
-* Added an ability to alter storage settings. Syntax: `ALTER TABLE <table> MODIFY SETTING <setting> = <value>`. [#6366](https://github.com/ClickHouse/ClickHouse/pull/6366) [#6669](https://github.com/ClickHouse/ClickHouse/pull/6669) [#6685](https://github.com/ClickHouse/ClickHouse/pull/6685) ([alesapin](https://github.com/alesapin))
-* Support for removing detached parts. Syntax: `ALTER TABLE <table> DROP DETACHED PART '<part_name>'`. [#6158](https://github.com/ClickHouse/ClickHouse/pull/6158) ([tavplubix](https://github.com/tavplubix))
-* Table constraints. Allow adding constraints to table definition which will be checked at insert. [#5273](https://github.com/ClickHouse/ClickHouse/pull/5273) ([Gleb Novikov](https://github.com/NanoBjorn)) [#6652](https://github.com/ClickHouse/ClickHouse/pull/6652) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Support for cascaded materialized views. [#6324](https://github.com/ClickHouse/ClickHouse/pull/6324) ([Amos Bird](https://github.com/amosbird))
-* Turn on query profiler by default to sample every query execution thread once a second. [#6283](https://github.com/ClickHouse/ClickHouse/pull/6283) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Input format `ORC`. [#6454](https://github.com/ClickHouse/ClickHouse/pull/6454) [#6703](https://github.com/ClickHouse/ClickHouse/pull/6703) ([akonyaev90](https://github.com/akonyaev90))
-* Added two new functions: `sigmoid` and `tanh` (that are useful for machine learning applications). [#6254](https://github.com/ClickHouse/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Functions `hasToken(haystack, token)` and `hasTokenCaseInsensitive(haystack, token)` to check if the given token is in the haystack. Token is a maximal length substring between two non alphanumeric ASCII characters (or boundaries of haystack). Token must be a constant string. Supported by tokenbf_v1 index specialization. [#6596](https://github.com/ClickHouse/ClickHouse/pull/6596), [#6662](https://github.com/ClickHouse/ClickHouse/pull/6662) ([Vasily Nemkov](https://github.com/Enmk))
-* New function `neighbor(value, offset[, default_value])`. Allows reaching the previous/next value within a column in a block of data.
[#5925](https://github.com/ClickHouse/ClickHouse/pull/5925) ([Alex Krash](https://github.com/alex-krash)) [6685365ab8c5b74f9650492c88a012596eb1b0c6](https://github.com/ClickHouse/ClickHouse/commit/6685365ab8c5b74f9650492c88a012596eb1b0c6) [341e2e4587a18065c2da1ca888c73389f48ce36c](https://github.com/ClickHouse/ClickHouse/commit/341e2e4587a18065c2da1ca888c73389f48ce36c) [Alexey Milovidov](https://github.com/alexey-milovidov) -* Created a function `currentUser()`, returning login of authorized user. Added alias `user()` for compatibility with MySQL. [#6470](https://github.com/ClickHouse/ClickHouse/pull/6470) ([Alex Krash](https://github.com/alex-krash)) -* New aggregate functions `quantilesExactInclusive` and `quantilesExactExclusive` which were proposed in [#5885](https://github.com/ClickHouse/ClickHouse/issues/5885). [#6477](https://github.com/ClickHouse/ClickHouse/pull/6477) ([dimarub2000](https://github.com/dimarub2000)) -* Function `bitmapRange(bitmap, range_begin, range_end)` which returns new set with specified range (not include the `range_end`). [#6314](https://github.com/ClickHouse/ClickHouse/pull/6314) ([Zhichang Yu](https://github.com/yuzhichang)) -* Function `geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precision)` which creates array of precision-long strings of geohash-boxes covering provided area. [#6127](https://github.com/ClickHouse/ClickHouse/pull/6127) ([Vasily Nemkov](https://github.com/Enmk)) -* Implement support for INSERT query with `Kafka` tables. [#6012](https://github.com/ClickHouse/ClickHouse/pull/6012) ([Ivan](https://github.com/abyss7)) -* Added support for `_partition` and `_timestamp` virtual columns to Kafka engine. [#6400](https://github.com/ClickHouse/ClickHouse/pull/6400) ([Ivan](https://github.com/abyss7)) -* Possibility to remove sensitive data from `query_log`, server logs, process list with regexp-based rules. [#5710](https://github.com/ClickHouse/ClickHouse/pull/5710) ([filimonov](https://github.com/filimonov)) - -#### Experimental Feature -* Input and output data format `Template`. It allows to specify custom format string for input and output. [#4354](https://github.com/ClickHouse/ClickHouse/issues/4354) [#6727](https://github.com/ClickHouse/ClickHouse/pull/6727) ([tavplubix](https://github.com/tavplubix)) -* Implementation of `LIVE VIEW` tables that were originally proposed in [#2898](https://github.com/ClickHouse/ClickHouse/pull/2898), prepared in [#3925](https://github.com/ClickHouse/ClickHouse/issues/3925), and then updated in [#5541](https://github.com/ClickHouse/ClickHouse/issues/5541). See [#5541](https://github.com/ClickHouse/ClickHouse/issues/5541) for detailed description. [#5541](https://github.com/ClickHouse/ClickHouse/issues/5541) ([vzakaznikov](https://github.com/vzakaznikov)) [#6425](https://github.com/ClickHouse/ClickHouse/pull/6425) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) [#6656](https://github.com/ClickHouse/ClickHouse/pull/6656) ([vzakaznikov](https://github.com/vzakaznikov)) Note that `LIVE VIEW` feature may be removed in next versions. - -#### Bug Fix -* This release also contains all bug fixes from 19.13 and 19.11. -* Fix segmentation fault when the table has skip indices and vertical merge happens. [#6723](https://github.com/ClickHouse/ClickHouse/pull/6723) ([alesapin](https://github.com/alesapin)) -* Fix per-column TTL with non-trivial column defaults. Previously in case of force TTL merge with `OPTIMIZE ... 
FINAL` query, expired values were replaced by type defaults instead of user-specified column defaults. [#6796](https://github.com/ClickHouse/ClickHouse/pull/6796) ([Anton Popov](https://github.com/CurtizJ))
-* Fix Kafka messages duplication problem on normal server restart. [#6597](https://github.com/ClickHouse/ClickHouse/pull/6597) ([Ivan](https://github.com/abyss7))
-* Fixed infinite loop when reading Kafka messages. Do not pause/resume consumer on subscription at all - otherwise it may get paused indefinitely in some scenarios. [#6354](https://github.com/ClickHouse/ClickHouse/pull/6354) ([Ivan](https://github.com/abyss7))
-* Fix `Key expression contains comparison between inconvertible types` exception in `bitmapContains` function. [#6136](https://github.com/ClickHouse/ClickHouse/issues/6136) [#6146](https://github.com/ClickHouse/ClickHouse/issues/6146) [#6156](https://github.com/ClickHouse/ClickHouse/pull/6156) ([dimarub2000](https://github.com/dimarub2000))
-* Fix segfault with enabled `optimize_skip_unused_shards` and missing sharding key. [#6384](https://github.com/ClickHouse/ClickHouse/pull/6384) ([Anton Popov](https://github.com/CurtizJ))
-* Fixed wrong code in mutations that may lead to memory corruption. Fixed segfault with read of address `0x14c0` that may happen due to concurrent `DROP TABLE` and `SELECT` from `system.parts` or `system.parts_columns`. Fixed race condition in preparation of mutation queries. Fixed deadlock caused by `OPTIMIZE` of Replicated tables and concurrent modification operations like ALTERs. [#6514](https://github.com/ClickHouse/ClickHouse/pull/6514) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Removed extra verbose logging in MySQL interface [#6389](https://github.com/ClickHouse/ClickHouse/pull/6389) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Return the ability to parse boolean settings from 'true' and 'false' in the configuration file. [#6278](https://github.com/ClickHouse/ClickHouse/pull/6278) ([alesapin](https://github.com/alesapin))
-* Fix crash in `quantile` and `median` function over `Nullable(Decimal128)`. [#6378](https://github.com/ClickHouse/ClickHouse/pull/6378) ([Artem Zuikov](https://github.com/4ertus2))
-* Fixed possible incomplete result returned by `SELECT` query with `WHERE` condition on primary key that contained conversion to Float type. It was caused by incorrect checking of monotonicity in `toFloat` function. [#6248](https://github.com/ClickHouse/ClickHouse/issues/6248) [#6374](https://github.com/ClickHouse/ClickHouse/pull/6374) ([dimarub2000](https://github.com/dimarub2000))
-* Check `max_expanded_ast_elements` setting for mutations. Clear mutations after `TRUNCATE TABLE`. [#6205](https://github.com/ClickHouse/ClickHouse/pull/6205) ([Winter Zhang](https://github.com/zhang2014))
-* Fix JOIN results for key columns when used with `join_use_nulls`. Attach Nulls instead of column defaults. [#6249](https://github.com/ClickHouse/ClickHouse/pull/6249) ([Artem Zuikov](https://github.com/4ertus2))
-* Fix for skip indices with vertical merge and alter. Fix for `Bad size of marks file` exception.
[#6594](https://github.com/ClickHouse/ClickHouse/issues/6594) [#6713](https://github.com/ClickHouse/ClickHouse/pull/6713) ([alesapin](https://github.com/alesapin)) -* Fix rare crash in `ALTER MODIFY COLUMN` and vertical merge when one of merged/altered parts is empty (0 rows) [#6746](https://github.com/ClickHouse/ClickHouse/issues/6746) [#6780](https://github.com/ClickHouse/ClickHouse/pull/6780) ([alesapin](https://github.com/alesapin)) -* Fixed bug in conversion of `LowCardinality` types in `AggregateFunctionFactory`. This fixes [#6257](https://github.com/ClickHouse/ClickHouse/issues/6257). [#6281](https://github.com/ClickHouse/ClickHouse/pull/6281) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix wrong behavior and possible segfaults in `topK` and `topKWeighted` aggregated functions. [#6404](https://github.com/ClickHouse/ClickHouse/pull/6404) ([Anton Popov](https://github.com/CurtizJ)) -* Fixed unsafe code around `getIdentifier` function. [#6401](https://github.com/ClickHouse/ClickHouse/issues/6401) [#6409](https://github.com/ClickHouse/ClickHouse/pull/6409) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed bug in MySQL wire protocol (is used while connecting to ClickHouse form MySQL client). Caused by heap buffer overflow in `PacketPayloadWriteBuffer`. [#6212](https://github.com/ClickHouse/ClickHouse/pull/6212) ([Yuriy Baranov](https://github.com/yurriy)) -* Fixed memory leak in `bitmapSubsetInRange` function. [#6819](https://github.com/ClickHouse/ClickHouse/pull/6819) ([Zhichang Yu](https://github.com/yuzhichang)) -* Fix rare bug when mutation executed after granularity change. [#6816](https://github.com/ClickHouse/ClickHouse/pull/6816) ([alesapin](https://github.com/alesapin)) -* Allow protobuf message with all fields by default. [#6132](https://github.com/ClickHouse/ClickHouse/pull/6132) ([Vitaly Baranov](https://github.com/vitlibar)) -* Resolve a bug with `nullIf` function when we send a `NULL` argument on the second argument. [#6446](https://github.com/ClickHouse/ClickHouse/pull/6446) ([Guillaume Tassery](https://github.com/YiuRULE)) -* Fix rare bug with wrong memory allocation/deallocation in complex key cache dictionaries with string fields which leads to infinite memory consumption (looks like memory leak). Bug reproduces when string size was a power of two starting from eight (8, 16, 32, etc). [#6447](https://github.com/ClickHouse/ClickHouse/pull/6447) ([alesapin](https://github.com/alesapin)) -* Fixed Gorilla encoding on small sequences which caused exception `Cannot write after end of buffer`. [#6398](https://github.com/ClickHouse/ClickHouse/issues/6398) [#6444](https://github.com/ClickHouse/ClickHouse/pull/6444) ([Vasily Nemkov](https://github.com/Enmk)) -* Allow to use not nullable types in JOINs with `join_use_nulls` enabled. [#6705](https://github.com/ClickHouse/ClickHouse/pull/6705) ([Artem Zuikov](https://github.com/4ertus2)) -* Disable `Poco::AbstractConfiguration` substitutions in query in `clickhouse-client`. [#6706](https://github.com/ClickHouse/ClickHouse/pull/6706) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Avoid deadlock in `REPLACE PARTITION`. [#6677](https://github.com/ClickHouse/ClickHouse/pull/6677) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Using `arrayReduce` for constant arguments may lead to segfault. 
[#6242](https://github.com/ClickHouse/ClickHouse/issues/6242) [#6326](https://github.com/ClickHouse/ClickHouse/pull/6326) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix inconsistent parts which can appear if replica was restored after `DROP PARTITION`. [#6522](https://github.com/ClickHouse/ClickHouse/issues/6522) [#6523](https://github.com/ClickHouse/ClickHouse/pull/6523) ([tavplubix](https://github.com/tavplubix)) -* Fixed hang in `JSONExtractRaw` function. [#6195](https://github.com/ClickHouse/ClickHouse/issues/6195) [#6198](https://github.com/ClickHouse/ClickHouse/pull/6198) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix bug with incorrect skip indices serialization and aggregation with adaptive granularity. [#6594](https://github.com/ClickHouse/ClickHouse/issues/6594). [#6748](https://github.com/ClickHouse/ClickHouse/pull/6748) ([alesapin](https://github.com/alesapin)) -* Fix `WITH ROLLUP` and `WITH CUBE` modifiers of `GROUP BY` with two-level aggregation. [#6225](https://github.com/ClickHouse/ClickHouse/pull/6225) ([Anton Popov](https://github.com/CurtizJ)) -* Fix bug with writing secondary indices marks with adaptive granularity. [#6126](https://github.com/ClickHouse/ClickHouse/pull/6126) ([alesapin](https://github.com/alesapin)) -* Fix initialization order while server startup. Since `StorageMergeTree::background_task_handle` is initialized in `startup()` the `MergeTreeBlockOutputStream::write()` may try to use it before initialization. Just check if it is initialized. [#6080](https://github.com/ClickHouse/ClickHouse/pull/6080) ([Ivan](https://github.com/abyss7)) -* Clearing the data buffer from the previous read operation that was completed with an error. [#6026](https://github.com/ClickHouse/ClickHouse/pull/6026) ([Nikolay](https://github.com/bopohaa)) -* Fix bug with enabling adaptive granularity when creating a new replica for Replicated\*MergeTree table. [#6394](https://github.com/ClickHouse/ClickHouse/issues/6394) [#6452](https://github.com/ClickHouse/ClickHouse/pull/6452) ([alesapin](https://github.com/alesapin)) -* Fixed possible crash during server startup in case of exception happened in `libunwind` during exception at access to uninitialized `ThreadStatus` structure. [#6456](https://github.com/ClickHouse/ClickHouse/pull/6456) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) -* Fix crash in `yandexConsistentHash` function. Found by fuzz test. [#6304](https://github.com/ClickHouse/ClickHouse/issues/6304) [#6305](https://github.com/ClickHouse/ClickHouse/pull/6305) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed the possibility of hanging queries when server is overloaded and global thread pool becomes near full. This have higher chance to happen on clusters with large number of shards (hundreds), because distributed queries allocate a thread per connection to each shard. For example, this issue may reproduce if a cluster of 330 shards is processing 30 concurrent distributed queries. This issue affects all versions starting from 19.2. [#6301](https://github.com/ClickHouse/ClickHouse/pull/6301) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed logic of `arrayEnumerateUniqRanked` function. [#6423](https://github.com/ClickHouse/ClickHouse/pull/6423) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix segfault when decoding symbol table. 
[#6603](https://github.com/ClickHouse/ClickHouse/pull/6603) ([Amos Bird](https://github.com/amosbird))
-* Fixed irrelevant exception in cast of `LowCardinality(Nullable)` to not-Nullable column in case it doesn't contain Nulls (e.g. in a query like `SELECT CAST(CAST('Hello' AS LowCardinality(Nullable(String))) AS String)`). [#6094](https://github.com/ClickHouse/ClickHouse/issues/6094) [#6119](https://github.com/ClickHouse/ClickHouse/pull/6119) ([Nikolai Kochetov](https://github.com/KochetovNicolai))
-* Removed extra quoting of description in `system.settings` table. [#6696](https://github.com/ClickHouse/ClickHouse/issues/6696) [#6699](https://github.com/ClickHouse/ClickHouse/pull/6699) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Avoid possible deadlock in `TRUNCATE` of Replicated table. [#6695](https://github.com/ClickHouse/ClickHouse/pull/6695) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Fix reading in order of sorting key. [#6189](https://github.com/ClickHouse/ClickHouse/pull/6189) ([Anton Popov](https://github.com/CurtizJ))
-* Fix `ALTER TABLE ... UPDATE` query for tables with `enable_mixed_granularity_parts=1`. [#6543](https://github.com/ClickHouse/ClickHouse/pull/6543) ([alesapin](https://github.com/alesapin))
-* Fix bug opened by [#4405](https://github.com/ClickHouse/ClickHouse/pull/4405) (since 19.4.0). Reproduces in queries to Distributed tables over MergeTree tables when we don't query any columns (`SELECT 1`). [#6236](https://github.com/ClickHouse/ClickHouse/pull/6236) ([alesapin](https://github.com/alesapin))
-* Fixed overflow in integer division of signed type by unsigned type. The behaviour was exactly as in C or C++ language (integer promotion rules) that may be surprising. Please note that the overflow is still possible when dividing a large signed number by a large unsigned number or vice versa (but that case is less usual). The issue existed in all server versions. [#6214](https://github.com/ClickHouse/ClickHouse/issues/6214) [#6233](https://github.com/ClickHouse/ClickHouse/pull/6233) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Limit maximum sleep time for throttling when `max_execution_speed` or `max_execution_speed_bytes` is set. Fixed false errors like `Estimated query execution time (inf seconds) is too long`. [#5547](https://github.com/ClickHouse/ClickHouse/issues/5547) [#6232](https://github.com/ClickHouse/ClickHouse/pull/6232) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Fixed issues about using `MATERIALIZED` columns and aliases in `MaterializedView`. [#448](https://github.com/ClickHouse/ClickHouse/issues/448) [#3484](https://github.com/ClickHouse/ClickHouse/issues/3484) [#3450](https://github.com/ClickHouse/ClickHouse/issues/3450) [#2878](https://github.com/ClickHouse/ClickHouse/issues/2878) [#2285](https://github.com/ClickHouse/ClickHouse/issues/2285) [#3796](https://github.com/ClickHouse/ClickHouse/pull/3796) ([Amos Bird](https://github.com/amosbird)) [#6316](https://github.com/ClickHouse/ClickHouse/pull/6316) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Fix `FormatFactory` behaviour for input streams which are not implemented as processor. [#6495](https://github.com/ClickHouse/ClickHouse/pull/6495) ([Nikolai Kochetov](https://github.com/KochetovNicolai))
-* Fixed typo. [#6631](https://github.com/ClickHouse/ClickHouse/pull/6631) ([Alex Ryndin](https://github.com/alexryndin))
-* Typo in the error message ( is -> are ).
[#6839](https://github.com/ClickHouse/ClickHouse/pull/6839) ([Denis Zhuravlev](https://github.com/den-crane)) -* Fixed error while parsing of columns list from string if type contained a comma (this issue was relevant for `File`, `URL`, `HDFS` storages) [#6217](https://github.com/ClickHouse/ClickHouse/issues/6217). [#6209](https://github.com/ClickHouse/ClickHouse/pull/6209) ([dimarub2000](https://github.com/dimarub2000)) - -#### Security Fix -* This release also contains all bug security fixes from 19.13 and 19.11. -* Fixed the possibility of a fabricated query to cause server crash due to stack overflow in SQL parser. Fixed the possibility of stack overflow in Merge and Distributed tables, materialized views and conditions for row-level security that involve subqueries. [#6433](https://github.com/ClickHouse/ClickHouse/pull/6433) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -#### Improvement -* Correct implementation of ternary logic for `AND/OR`. [#6048](https://github.com/ClickHouse/ClickHouse/pull/6048) ([Alexander Kazakov](https://github.com/Akazz)) -* Now values and rows with expired TTL will be removed after `OPTIMIZE ... FINAL` query from old parts without TTL infos or with outdated TTL infos, e.g. after `ALTER ... MODIFY TTL` query. Added queries `SYSTEM STOP/START TTL MERGES` to disallow/allow assign merges with TTL and filter expired values in all merges. [#6274](https://github.com/ClickHouse/ClickHouse/pull/6274) ([Anton Popov](https://github.com/CurtizJ)) -* Possibility to change the location of ClickHouse history file for client using `CLICKHOUSE_HISTORY_FILE` env. [#6840](https://github.com/ClickHouse/ClickHouse/pull/6840) ([filimonov](https://github.com/filimonov)) -* Remove `dry_run` flag from `InterpreterSelectQuery`. ... [#6375](https://github.com/ClickHouse/ClickHouse/pull/6375) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Support `ASOF JOIN` with `ON` section. [#6211](https://github.com/ClickHouse/ClickHouse/pull/6211) ([Artem Zuikov](https://github.com/4ertus2)) -* Better support of skip indexes for mutations and replication. Support for `MATERIALIZE/CLEAR INDEX ... IN PARTITION` query. `UPDATE x = x` recalculates all indices that use column `x`. [#5053](https://github.com/ClickHouse/ClickHouse/pull/5053) ([Nikita Vasilev](https://github.com/nikvas0)) -* Allow to `ATTACH` live views (for example, at the server startup) regardless to `allow_experimental_live_view` setting. [#6754](https://github.com/ClickHouse/ClickHouse/pull/6754) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* For stack traces gathered by query profiler, do not include stack frames generated by the query profiler itself. [#6250](https://github.com/ClickHouse/ClickHouse/pull/6250) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Now table functions `values`, `file`, `url`, `hdfs` have support for ALIAS columns. [#6255](https://github.com/ClickHouse/ClickHouse/pull/6255) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Throw an exception if `config.d` file doesn't have the corresponding root element as the config file. [#6123](https://github.com/ClickHouse/ClickHouse/pull/6123) ([dimarub2000](https://github.com/dimarub2000)) -* Print extra info in exception message for `no space left on device`. 
[#6182](https://github.com/ClickHouse/ClickHouse/issues/6182), [#6252](https://github.com/ClickHouse/ClickHouse/issues/6252) [#6352](https://github.com/ClickHouse/ClickHouse/pull/6352) ([tavplubix](https://github.com/tavplubix))
-* When determining shards of a `Distributed` table to be covered by a read query (for `optimize_skip_unused_shards` = 1) ClickHouse now checks conditions from both `prewhere` and `where` clauses of the select statement. [#6521](https://github.com/ClickHouse/ClickHouse/pull/6521) ([Alexander Kazakov](https://github.com/Akazz))
-* Enabled `SIMDJSON` for machines without AVX2 but with SSE 4.2 and PCLMUL instruction set. [#6285](https://github.com/ClickHouse/ClickHouse/issues/6285) [#6320](https://github.com/ClickHouse/ClickHouse/pull/6320) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* ClickHouse can work on filesystems without `O_DIRECT` support (such as ZFS and BtrFS) without additional tuning. [#4449](https://github.com/ClickHouse/ClickHouse/issues/4449) [#6730](https://github.com/ClickHouse/ClickHouse/pull/6730) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Support push down predicate for final subquery. [#6120](https://github.com/ClickHouse/ClickHouse/pull/6120) ([TCeason](https://github.com/TCeason)) [#6162](https://github.com/ClickHouse/ClickHouse/pull/6162) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Better `JOIN ON` keys extraction [#6131](https://github.com/ClickHouse/ClickHouse/pull/6131) ([Artem Zuikov](https://github.com/4ertus2))
-* Updated `SIMDJSON`. [#6285](https://github.com/ClickHouse/ClickHouse/issues/6285). [#6306](https://github.com/ClickHouse/ClickHouse/pull/6306) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Optimize selecting of smallest column for `SELECT count()` query. [#6344](https://github.com/ClickHouse/ClickHouse/pull/6344) ([Amos Bird](https://github.com/amosbird))
-* Added `strict` parameter to `windowFunnel()`. When `strict` is set, `windowFunnel()` applies conditions only to unique values (see the usage sketch below). [#6548](https://github.com/ClickHouse/ClickHouse/pull/6548) ([achimbab](https://github.com/achimbab))
-* Safer interface of `mysqlxx::Pool`. [#6150](https://github.com/ClickHouse/ClickHouse/pull/6150) ([avasiliev](https://github.com/avasiliev))
-* The width of the options list printed with `--help` now corresponds to the terminal size. [#6590](https://github.com/ClickHouse/ClickHouse/pull/6590) ([dimarub2000](https://github.com/dimarub2000))
-* Disable "read in order" optimization for aggregation without keys. [#6599](https://github.com/ClickHouse/ClickHouse/pull/6599) ([Anton Popov](https://github.com/CurtizJ))
-* HTTP status code for `INCORRECT_DATA` and `TYPE_MISMATCH` error codes was changed from default `500 Internal Server Error` to `400 Bad Request`. [#6271](https://github.com/ClickHouse/ClickHouse/pull/6271) ([Alexander Rodin](https://github.com/a-rodin))
-* Move Join object from `ExpressionAction` into `AnalyzedJoin`. `ExpressionAnalyzer` and `ExpressionAction` do not know about `Join` class anymore. Its logic is hidden by `AnalyzedJoin` iface. [#6801](https://github.com/ClickHouse/ClickHouse/pull/6801) ([Artem Zuikov](https://github.com/4ertus2))
-* Fixed possible deadlock of distributed queries when one of shards is localhost but the query is sent via network connection. [#6759](https://github.com/ClickHouse/ClickHouse/pull/6759) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Changed semantics of multiple-table `RENAME` to avoid possible deadlocks. [#6757](https://github.com/ClickHouse/ClickHouse/issues/6757). [#6756](https://github.com/ClickHouse/ClickHouse/pull/6756) ([alexey-milovidov](https://github.com/alexey-milovidov))
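-  A usage sketch for the new `strict` mode of `windowFunnel()` mentioned above (the `events` table and its `user_id`, `ts`, `event` columns are illustrative):
-  ```sql
-  SELECT
-      user_id,
-      windowFunnel(3600, 'strict')(ts, event = 'view', event = 'cart', event = 'purchase') AS level
-  FROM events
-  GROUP BY user_id
-  ```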
-* Rewritten MySQL compatibility server to prevent loading full packet payload in memory. Decreased memory consumption for each connection to approximately `2 * DBMS_DEFAULT_BUFFER_SIZE` (read/write buffers). [#5811](https://github.com/ClickHouse/ClickHouse/pull/5811) ([Yuriy Baranov](https://github.com/yurriy))
-* Move AST alias interpreting logic out of parser that doesn't have to know anything about query semantics. [#6108](https://github.com/ClickHouse/ClickHouse/pull/6108) ([Artem Zuikov](https://github.com/4ertus2))
-* Slightly more safe parsing of `NamesAndTypesList`. [#6408](https://github.com/ClickHouse/ClickHouse/issues/6408). [#6410](https://github.com/ClickHouse/ClickHouse/pull/6410) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* `clickhouse-copier`: Allow using `where_condition` from config with `partition_key` alias in query for checking partition existence (earlier it was used only in data-reading queries). [#6577](https://github.com/ClickHouse/ClickHouse/pull/6577) ([proller](https://github.com/proller))
-* Added optional message argument in `throwIf` (see the usage sketch below). ([#5772](https://github.com/ClickHouse/ClickHouse/issues/5772)) [#6329](https://github.com/ClickHouse/ClickHouse/pull/6329) ([Vdimir](https://github.com/Vdimir))
-* A server exception received while sending insertion data is now processed in the client as well. [#5891](https://github.com/ClickHouse/ClickHouse/issues/5891) [#6711](https://github.com/ClickHouse/ClickHouse/pull/6711) ([dimarub2000](https://github.com/dimarub2000))
-* Added a metric `DistributedFilesToInsert` that shows the total number of files in filesystem that are selected to send to remote servers by Distributed tables. The number is summed across all shards. [#6600](https://github.com/ClickHouse/ClickHouse/pull/6600) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Move most of JOINs prepare logic from `ExpressionAction/ExpressionAnalyzer` to `AnalyzedJoin`. [#6785](https://github.com/ClickHouse/ClickHouse/pull/6785) ([Artem Zuikov](https://github.com/4ertus2))
-* Fix TSan [warning](https://clickhouse-test-reports.s3.yandex.net/6399/c1c1d1daa98e199e620766f1bd06a5921050a00d/functional_stateful_tests_(thread).html) 'lock-order-inversion'. [#6740](https://github.com/ClickHouse/ClickHouse/pull/6740) ([Vasily Nemkov](https://github.com/Enmk))
-* Better information messages about lack of Linux capabilities. Logging fatal errors with "fatal" level, that will make it easier to find in `system.text_log`. [#6441](https://github.com/ClickHouse/ClickHouse/pull/6441) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* When dumping temporary data to the disk is enabled to restrict memory usage during `GROUP BY` and `ORDER BY`, the free disk space was not checked. The fix adds a new setting `min_free_disk_space`: when the free disk space is smaller than the threshold, the query will stop and throw `ErrorCodes::NOT_ENOUGH_SPACE`. [#6678](https://github.com/ClickHouse/ClickHouse/pull/6678) ([Weiqing Xu](https://github.com/weiqxu)) [#6691](https://github.com/ClickHouse/ClickHouse/pull/6691) ([alexey-milovidov](https://github.com/alexey-milovidov))
-* Removed recursive rwlock by thread. It makes no sense, because threads are reused between queries. A `SELECT` query may acquire a lock in one thread, hold a lock from another thread and exit from the first thread. At the same time, the first thread can be reused by a `DROP` query. This will lead to false "Attempt to acquire exclusive lock recursively" messages. [#6771](https://github.com/ClickHouse/ClickHouse/pull/6771) ([alexey-milovidov](https://github.com/alexey-milovidov))
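-  A sketch of the new optional message argument of `throwIf`, referenced above:
-  ```sql
-  SELECT throwIf(number = 3, 'Too many rows') FROM numbers(10)
-  ```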
At the same time, the first thread can be reused by a `DROP` query. This will lead to false "Attempt to acquire exclusive lock recursively" messages. [#6771](https://github.com/ClickHouse/ClickHouse/pull/6771) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Split `ExpressionAnalyzer.appendJoin()`. Prepare a place in `ExpressionAnalyzer` for `MergeJoin`. [#6524](https://github.com/ClickHouse/ClickHouse/pull/6524) ([Artem Zuikov](https://github.com/4ertus2)) -* Added `mysql_native_password` authentication plugin to the MySQL compatibility server. [#6194](https://github.com/ClickHouse/ClickHouse/pull/6194) ([Yuriy Baranov](https://github.com/yurriy)) -* Fewer `clock_gettime` calls; fixed ABI compatibility between debug/release in `Allocator` (insignificant issue). [#6197](https://github.com/ClickHouse/ClickHouse/pull/6197) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Move `collectUsedColumns` from `ExpressionAnalyzer` to `SyntaxAnalyzer`. `SyntaxAnalyzer` makes `required_source_columns` itself now. [#6416](https://github.com/ClickHouse/ClickHouse/pull/6416) ([Artem Zuikov](https://github.com/4ertus2)) -* Add setting `joined_subquery_requires_alias` to require aliases for subselects and table functions in `FROM` when more than one table is present (i.e. queries with JOINs). [#6733](https://github.com/ClickHouse/ClickHouse/pull/6733) ([Artem Zuikov](https://github.com/4ertus2)) -* Extract `GetAggregatesVisitor` class from `ExpressionAnalyzer`. [#6458](https://github.com/ClickHouse/ClickHouse/pull/6458) ([Artem Zuikov](https://github.com/4ertus2)) -* `system.query_log`: change data type of `type` column to `Enum`. [#6265](https://github.com/ClickHouse/ClickHouse/pull/6265) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) -* Static linking of `sha256_password` authentication plugin. [#6512](https://github.com/ClickHouse/ClickHouse/pull/6512) ([Yuriy Baranov](https://github.com/yurriy)) -* Avoid an extra dependency for the setting `compile` to work. In previous versions, the user may get errors like `cannot open crti.o`, `unable to find library -lc` etc. [#6309](https://github.com/ClickHouse/ClickHouse/pull/6309) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* More validation of the input that may come from a malicious replica. [#6303](https://github.com/ClickHouse/ClickHouse/pull/6303) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Now the `clickhouse-obfuscator` file is available in the `clickhouse-client` package. In previous versions it was available as `clickhouse obfuscator` (with whitespace). [#5816](https://github.com/ClickHouse/ClickHouse/issues/5816) [#6609](https://github.com/ClickHouse/ClickHouse/pull/6609) ([dimarub2000](https://github.com/dimarub2000)) -* Fixed deadlock when we have at least two queries that read at least two tables in different order and another query that performs a DDL operation on one of the tables. Fixed another very rare deadlock. [#6764](https://github.com/ClickHouse/ClickHouse/pull/6764) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added `os_thread_ids` column to `system.processes` and `system.query_log` for better debugging possibilities. [#6763](https://github.com/ClickHouse/ClickHouse/pull/6763) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* A workaround for PHP mysqlnd extension bugs which occur when `sha256_password` is used as a default authentication plugin (described in [#6031](https://github.com/ClickHouse/ClickHouse/issues/6031)).
[#6113](https://github.com/ClickHouse/ClickHouse/pull/6113) ([Yuriy Baranov](https://github.com/yurriy)) -* Remove unneeded place with changed nullability columns. [#6693](https://github.com/ClickHouse/ClickHouse/pull/6693) ([Artem Zuikov](https://github.com/4ertus2)) -* Set default value of `queue_max_wait_ms` to zero, because the current value (five seconds) makes no sense. There are rare circumstances when this setting has any use. Added settings `replace_running_query_max_wait_ms`, `kafka_max_wait_ms` and `connection_pool_max_wait_ms` for disambiguation. [#6692](https://github.com/ClickHouse/ClickHouse/pull/6692) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Extract `SelectQueryExpressionAnalyzer` from `ExpressionAnalyzer`. Keep the last one for non-select queries. [#6499](https://github.com/ClickHouse/ClickHouse/pull/6499) ([Artem Zuikov](https://github.com/4ertus2)) -* Removed duplicate input and output formats. [#6239](https://github.com/ClickHouse/ClickHouse/pull/6239) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Allow user to override `poll_interval` and `idle_connection_timeout` settings on connection. [#6230](https://github.com/ClickHouse/ClickHouse/pull/6230) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* `MergeTree` now has an additional option `ttl_only_drop_parts` (disabled by default) to avoid partial pruning of parts, so that they are dropped completely when all the rows in a part are expired. [#6191](https://github.com/ClickHouse/ClickHouse/pull/6191) ([Sergi Vladykin](https://github.com/svladykin)) -* Type checks for set index functions. Throw an exception if a function got a wrong type. This fixes the fuzz test with UBSan. [#6511](https://github.com/ClickHouse/ClickHouse/pull/6511) ([Nikita Vasilev](https://github.com/nikvas0)) - -#### Performance Improvement -* Optimize queries with an `ORDER BY expressions` clause, where `expressions` have a coinciding prefix with the sorting key in `MergeTree` tables. This optimization is controlled by the `optimize_read_in_order` setting. [#6054](https://github.com/ClickHouse/ClickHouse/pull/6054) [#6629](https://github.com/ClickHouse/ClickHouse/pull/6629) ([Anton Popov](https://github.com/CurtizJ)) -* Allow using multiple threads during parts loading and removal. [#6372](https://github.com/ClickHouse/ClickHouse/issues/6372) [#6074](https://github.com/ClickHouse/ClickHouse/issues/6074) [#6438](https://github.com/ClickHouse/ClickHouse/pull/6438) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Implemented a batch variant of updating aggregate function states. It may lead to performance benefits. [#6435](https://github.com/ClickHouse/ClickHouse/pull/6435) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Using the `FastOps` library for functions `exp`, `log`, `sigmoid`, `tanh`. FastOps is a fast vector math library from Michael Parakhin (Yandex CTO). Improved performance of `exp` and `log` functions more than 6 times. The functions `exp` and `log` from a `Float32` argument will return `Float32` (in previous versions they always returned `Float64`). Now `exp(nan)` may return `inf`. The result of the `exp` and `log` functions may not be the nearest machine representable number to the true answer.
[#6254](https://github.com/ClickHouse/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov)) Using Danila Kutenin's variant to make fastops work [#6317](https://github.com/ClickHouse/ClickHouse/pull/6317) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Disable consecutive key optimization for `UInt8/16`. [#6298](https://github.com/ClickHouse/ClickHouse/pull/6298) [#6701](https://github.com/ClickHouse/ClickHouse/pull/6701) ([akuzm](https://github.com/akuzm)) -* Improved performance of the `simdjson` library by getting rid of dynamic allocation in `ParsedJson::Iterator`. [#6479](https://github.com/ClickHouse/ClickHouse/pull/6479) ([Vitaly Baranov](https://github.com/vitlibar)) -* Pre-fault pages when allocating memory with `mmap()`. [#6667](https://github.com/ClickHouse/ClickHouse/pull/6667) ([akuzm](https://github.com/akuzm)) -* Fix performance bug in `Decimal` comparison. [#6380](https://github.com/ClickHouse/ClickHouse/pull/6380) ([Artem Zuikov](https://github.com/4ertus2)) - -#### Build/Testing/Packaging Improvement -* Remove Compiler (runtime template instantiation) because we've won over its performance. [#6646](https://github.com/ClickHouse/ClickHouse/pull/6646) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added performance test to show the degradation of performance in gcc-9 in a more isolated way. [#6302](https://github.com/ClickHouse/ClickHouse/pull/6302) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added table function `numbers_mt`, which is a multithreaded version of `numbers`. Updated performance tests with hash functions. [#6554](https://github.com/ClickHouse/ClickHouse/pull/6554) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Comparison mode in `clickhouse-benchmark` [#6220](https://github.com/ClickHouse/ClickHouse/issues/6220) [#6343](https://github.com/ClickHouse/ClickHouse/pull/6343) ([dimarub2000](https://github.com/dimarub2000)) -* Best effort for printing stack traces. Also added `SIGPROF` as a debugging signal to print the stack trace of a running thread. [#6529](https://github.com/ClickHouse/ClickHouse/pull/6529) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Every function in its own file, part 10. [#6321](https://github.com/ClickHouse/ClickHouse/pull/6321) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Remove duplicated const `TABLE_IS_READ_ONLY`. [#6566](https://github.com/ClickHouse/ClickHouse/pull/6566) ([filimonov](https://github.com/filimonov)) -* Formatting changes for the `StringHashMap` PR [#5417](https://github.com/ClickHouse/ClickHouse/issues/5417). [#6700](https://github.com/ClickHouse/ClickHouse/pull/6700) ([akuzm](https://github.com/akuzm)) -* Better subquery for join creation in `ExpressionAnalyzer`. [#6824](https://github.com/ClickHouse/ClickHouse/pull/6824) ([Artem Zuikov](https://github.com/4ertus2)) -* Remove a redundant condition (found by PVS Studio). [#6775](https://github.com/ClickHouse/ClickHouse/pull/6775) ([akuzm](https://github.com/akuzm)) -* Separate the hash table interface for `ReverseIndex`. [#6672](https://github.com/ClickHouse/ClickHouse/pull/6672) ([akuzm](https://github.com/akuzm)) -* Refactoring of settings. [#6689](https://github.com/ClickHouse/ClickHouse/pull/6689) ([alesapin](https://github.com/alesapin)) -* Add comments for `set` index functions. [#6319](https://github.com/ClickHouse/ClickHouse/pull/6319) ([Nikita Vasilev](https://github.com/nikvas0)) -* Increase OOM score in debug version on Linux.
[#6152](https://github.com/ClickHouse/ClickHouse/pull/6152) ([akuzm](https://github.com/akuzm)) -* HDFS HA now works in debug build. [#6650](https://github.com/ClickHouse/ClickHouse/pull/6650) ([Weiqing Xu](https://github.com/weiqxu)) -* Added a test to `transform_query_for_external_database`. [#6388](https://github.com/ClickHouse/ClickHouse/pull/6388) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add test for multiple materialized views for Kafka table. [#6509](https://github.com/ClickHouse/ClickHouse/pull/6509) ([Ivan](https://github.com/abyss7)) -* Make a better build scheme. [#6500](https://github.com/ClickHouse/ClickHouse/pull/6500) ([Ivan](https://github.com/abyss7)) -* Fixed `test_external_dictionaries` integration in case it was executed under a non-root user. [#6507](https://github.com/ClickHouse/ClickHouse/pull/6507) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* The bug reproduces when the total size of written packets exceeds `DBMS_DEFAULT_BUFFER_SIZE`. [#6204](https://github.com/ClickHouse/ClickHouse/pull/6204) ([Yuriy Baranov](https://github.com/yurriy)) -* Added a test for the `RENAME` table race condition [#6752](https://github.com/ClickHouse/ClickHouse/pull/6752) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Avoid data race on Settings in `KILL QUERY`. [#6753](https://github.com/ClickHouse/ClickHouse/pull/6753) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add integration test for handling errors by a cache dictionary. [#6755](https://github.com/ClickHouse/ClickHouse/pull/6755) ([Vitaly Baranov](https://github.com/vitlibar)) -* Disable parsing of ELF object files on Mac OS, because it makes no sense. [#6578](https://github.com/ClickHouse/ClickHouse/pull/6578) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Attempt to make the changelog generator better. [#6327](https://github.com/ClickHouse/ClickHouse/pull/6327) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added the `-Wshadow` switch to GCC. [#6325](https://github.com/ClickHouse/ClickHouse/pull/6325) ([kreuzerkrieg](https://github.com/kreuzerkrieg)) -* Removed obsolete code for `mimalloc` support. [#6715](https://github.com/ClickHouse/ClickHouse/pull/6715) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* `zlib-ng` determines x86 capabilities and saves this info to global variables. This is done in the deflateInit call, which may be made by different threads simultaneously. To avoid multithreaded writes, do it on library startup. [#6141](https://github.com/ClickHouse/ClickHouse/pull/6141) ([akuzm](https://github.com/akuzm)) -* Regression test for a bug in JOIN which was fixed in [#5192](https://github.com/ClickHouse/ClickHouse/issues/5192). [#6147](https://github.com/ClickHouse/ClickHouse/pull/6147) ([Bakhtiyor Ruziev](https://github.com/theruziev)) -* Fixed MSan report. [#6144](https://github.com/ClickHouse/ClickHouse/pull/6144) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix flapping TTL test. [#6782](https://github.com/ClickHouse/ClickHouse/pull/6782) ([Anton Popov](https://github.com/CurtizJ)) -* Fixed false data race in the `MergeTreeDataPart::is_frozen` field. [#6583](https://github.com/ClickHouse/ClickHouse/pull/6583) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed timeouts in the fuzz test. In the previous version, it managed to find a false hangup in the query `SELECT * FROM numbers_mt(gccMurmurHash(''))`.
[#6582](https://github.com/ClickHouse/ClickHouse/pull/6582) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added debug checks to `static_cast` of columns. [#6581](https://github.com/ClickHouse/ClickHouse/pull/6581) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Support for Oracle Linux in official RPM packages. [#6356](https://github.com/ClickHouse/ClickHouse/issues/6356) [#6585](https://github.com/ClickHouse/ClickHouse/pull/6585) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Changed json perftests from `once` to `loop` type. [#6536](https://github.com/ClickHouse/ClickHouse/pull/6536) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* `odbc-bridge.cpp` defines `main()` so it should not be included in `clickhouse-lib`. [#6538](https://github.com/ClickHouse/ClickHouse/pull/6538) ([Orivej Desh](https://github.com/orivej)) -* Test for crash in `FULL|RIGHT JOIN` with nulls in the right table's keys. [#6362](https://github.com/ClickHouse/ClickHouse/pull/6362) ([Artem Zuikov](https://github.com/4ertus2)) -* Added a test for the limit on expansion of aliases just in case. [#6442](https://github.com/ClickHouse/ClickHouse/pull/6442) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Switched from `boost::filesystem` to `std::filesystem` where appropriate. [#6253](https://github.com/ClickHouse/ClickHouse/pull/6253) [#6385](https://github.com/ClickHouse/ClickHouse/pull/6385) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added RPM packages to the website. [#6251](https://github.com/ClickHouse/ClickHouse/pull/6251) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add a test for the fixed `Unknown identifier` exception in the `IN` section. [#6708](https://github.com/ClickHouse/ClickHouse/pull/6708) ([Artem Zuikov](https://github.com/4ertus2)) -* Simplify `shared_ptr_helper` because people face difficulties understanding it. [#6675](https://github.com/ClickHouse/ClickHouse/pull/6675) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added performance tests for the fixed Gorilla and DoubleDelta codecs. [#6179](https://github.com/ClickHouse/ClickHouse/pull/6179) ([Vasily Nemkov](https://github.com/Enmk)) -* Split the integration test `test_dictionaries` into 4 separate tests. [#6776](https://github.com/ClickHouse/ClickHouse/pull/6776) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fix PVS-Studio warning in `PipelineExecutor`. [#6777](https://github.com/ClickHouse/ClickHouse/pull/6777) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Allow to use `library` dictionary source with ASan. [#6482](https://github.com/ClickHouse/ClickHouse/pull/6482) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added option to generate changelog from a list of PRs. [#6350](https://github.com/ClickHouse/ClickHouse/pull/6350) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Lock the `TinyLog` storage when reading. [#6226](https://github.com/ClickHouse/ClickHouse/pull/6226) ([akuzm](https://github.com/akuzm)) -* Check for broken symlinks in CI. [#6634](https://github.com/ClickHouse/ClickHouse/pull/6634) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Increase timeout for the "stack overflow" test because it may take a long time in debug build. [#6637](https://github.com/ClickHouse/ClickHouse/pull/6637) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added a check for double whitespaces.
[#6643](https://github.com/ClickHouse/ClickHouse/pull/6643) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix `new/delete` memory tracking when building with sanitizers. Tracking is not clear. It only prevents memory limit exceptions in tests. [#6450](https://github.com/ClickHouse/ClickHouse/pull/6450) ([Artem Zuikov](https://github.com/4ertus2)) -* Enable back the check of undefined symbols while linking. [#6453](https://github.com/ClickHouse/ClickHouse/pull/6453) ([Ivan](https://github.com/abyss7)) -* Avoid rebuilding `hyperscan` every day. [#6307](https://github.com/ClickHouse/ClickHouse/pull/6307) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed UBSan report in `ProtobufWriter`. [#6163](https://github.com/ClickHouse/ClickHouse/pull/6163) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Don't allow using the query profiler with sanitizers because it is not compatible. [#6769](https://github.com/ClickHouse/ClickHouse/pull/6769) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add test for reloading a dictionary after a failure by timer. [#6114](https://github.com/ClickHouse/ClickHouse/pull/6114) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fix inconsistency in `PipelineExecutor::prepareProcessor` argument type. [#6494](https://github.com/ClickHouse/ClickHouse/pull/6494) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Added a test for bad URIs. [#6493](https://github.com/ClickHouse/ClickHouse/pull/6493) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added more checks to the `CAST` function. This should get more information about the segmentation fault in the fuzz test. [#6346](https://github.com/ClickHouse/ClickHouse/pull/6346) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Added `gcc-9` support to the `docker/builder` container that builds the image locally. [#6333](https://github.com/ClickHouse/ClickHouse/pull/6333) ([Gleb Novikov](https://github.com/NanoBjorn)) -* Test for primary key with `LowCardinality(String)`. [#5044](https://github.com/ClickHouse/ClickHouse/issues/5044) [#6219](https://github.com/ClickHouse/ClickHouse/pull/6219) ([dimarub2000](https://github.com/dimarub2000)) -* Fixed tests affected by slow stack traces printing. [#6315](https://github.com/ClickHouse/ClickHouse/pull/6315) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add a test case for the crash in `groupUniqArray` fixed in [#6029](https://github.com/ClickHouse/ClickHouse/pull/6029). [#4402](https://github.com/ClickHouse/ClickHouse/issues/4402) [#6129](https://github.com/ClickHouse/ClickHouse/pull/6129) ([akuzm](https://github.com/akuzm)) -* Fixed indices mutations tests. [#6645](https://github.com/ClickHouse/ClickHouse/pull/6645) ([Nikita Vasilev](https://github.com/nikvas0)) -* In performance test, do not read the query log for queries we didn't run. [#6427](https://github.com/ClickHouse/ClickHouse/pull/6427) ([akuzm](https://github.com/akuzm)) -* Materialized views can now be created with any low cardinality types regardless of the setting about suspicious low cardinality types. [#6428](https://github.com/ClickHouse/ClickHouse/pull/6428) ([Olga Khvostikova](https://github.com/stavrolia)) -* Updated tests for the `send_logs_level` setting. [#6207](https://github.com/ClickHouse/ClickHouse/pull/6207) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix build under gcc-8.2. [#6196](https://github.com/ClickHouse/ClickHouse/pull/6196) ([Max Akhmedov](https://github.com/zlobober)) -* Fix build with internal libc++.
[#6724](https://github.com/ClickHouse/ClickHouse/pull/6724) ([Ivan](https://github.com/abyss7)) -* Fix shared build with `rdkafka` library [#6101](https://github.com/ClickHouse/ClickHouse/pull/6101) ([Ivan](https://github.com/abyss7)) -* Fixes for Mac OS build (incomplete). [#6390](https://github.com/ClickHouse/ClickHouse/pull/6390) ([alexey-milovidov](https://github.com/alexey-milovidov)) [#6429](https://github.com/ClickHouse/ClickHouse/pull/6429) ([alex-zaitsev](https://github.com/alex-zaitsev)) -* Fix "splitted" build. [#6618](https://github.com/ClickHouse/ClickHouse/pull/6618) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Other build fixes: [#6186](https://github.com/ClickHouse/ClickHouse/pull/6186) ([Amos Bird](https://github.com/amosbird)) [#6486](https://github.com/ClickHouse/ClickHouse/pull/6486) [#6348](https://github.com/ClickHouse/ClickHouse/pull/6348) ([vxider](https://github.com/Vxider)) [#6744](https://github.com/ClickHouse/ClickHouse/pull/6744) ([Ivan](https://github.com/abyss7)) [#6016](https://github.com/ClickHouse/ClickHouse/pull/6016) [#6421](https://github.com/ClickHouse/ClickHouse/pull/6421) [#6491](https://github.com/ClickHouse/ClickHouse/pull/6491) ([proller](https://github.com/proller)) - -#### Backward Incompatible Change -* Removed the rarely used table function `catBoostPool` and storage `CatBoostPool`. If you have used this table function, please write an email to `clickhouse-feedback@yandex-team.com`. Note that CatBoost integration remains and will be supported. [#6279](https://github.com/ClickHouse/ClickHouse/pull/6279) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Disable `ANY RIGHT JOIN` and `ANY FULL JOIN` by default. Set the `any_join_distinct_right_table_keys` setting to enable them. [#5126](https://github.com/ClickHouse/ClickHouse/issues/5126) [#6351](https://github.com/ClickHouse/ClickHouse/pull/6351) ([Artem Zuikov](https://github.com/4ertus2)) - -## ClickHouse release 19.13 -### ClickHouse release 19.13.6.51, 2019-10-02 - -#### Bug Fix -* This release also contains all bug fixes from 19.11.12.69. - -### ClickHouse release 19.13.5.44, 2019-09-20 - -#### Bug Fix -* This release also contains all bug fixes from 19.14.6.12. -* Fixed possible inconsistent state of a table while executing a `DROP` query for a replicated table while ZooKeeper is not accessible. [#6045](https://github.com/ClickHouse/ClickHouse/issues/6045) [#6413](https://github.com/ClickHouse/ClickHouse/pull/6413) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) -* Fix for data race in StorageMerge [#6717](https://github.com/ClickHouse/ClickHouse/pull/6717) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix a bug introduced in the query profiler which leads to endless recv from a socket. [#6386](https://github.com/ClickHouse/ClickHouse/pull/6386) ([alesapin](https://github.com/alesapin)) -* Fix excessive CPU usage while executing the `JSONExtractRaw` function over a boolean value. [#6208](https://github.com/ClickHouse/ClickHouse/pull/6208) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fixes the regression while pushing to a materialized view. [#6415](https://github.com/ClickHouse/ClickHouse/pull/6415) ([Ivan](https://github.com/abyss7)) -* Table function `url` had a vulnerability that allowed an attacker to inject arbitrary HTTP headers in the request. This issue was found by [Nikita Tikhomirov](https://github.com/NSTikhomirov).
[#6466](https://github.com/ClickHouse/ClickHouse/pull/6466) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix useless `AST` check in Set index. [#6510](https://github.com/ClickHouse/ClickHouse/issues/6510) [#6651](https://github.com/ClickHouse/ClickHouse/pull/6651) ([Nikita Vasilev](https://github.com/nikvas0)) -* Fixed parsing of `AggregateFunction` values embedded in query. [#6575](https://github.com/ClickHouse/ClickHouse/issues/6575) [#6773](https://github.com/ClickHouse/ClickHouse/pull/6773) ([Zhichang Yu](https://github.com/yuzhichang)) -* Fixed wrong behaviour of `trim` functions family. [#6647](https://github.com/ClickHouse/ClickHouse/pull/6647) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -### ClickHouse release 19.13.4.32, 2019-09-10 - -#### Bug Fix -* This release also contains all bug security fixes from 19.11.9.52 and 19.11.10.54. -* Fixed data race in `system.parts` table and `ALTER` query. [#6245](https://github.com/ClickHouse/ClickHouse/issues/6245) [#6513](https://github.com/ClickHouse/ClickHouse/pull/6513) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed mismatched header in streams happened in case of reading from empty distributed table with sample and prewhere. [#6167](https://github.com/ClickHouse/ClickHouse/issues/6167) ([Lixiang Qian](https://github.com/fancyqlx)) [#6823](https://github.com/ClickHouse/ClickHouse/pull/6823) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fixed crash when using `IN` clause with a subquery with a tuple. [#6125](https://github.com/ClickHouse/ClickHouse/issues/6125) [#6550](https://github.com/ClickHouse/ClickHouse/pull/6550) ([tavplubix](https://github.com/tavplubix)) -* Fix case with same column names in `GLOBAL JOIN ON` section. [#6181](https://github.com/ClickHouse/ClickHouse/pull/6181) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix crash when casting types to `Decimal` that do not support it. Throw exception instead. [#6297](https://github.com/ClickHouse/ClickHouse/pull/6297) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed crash in `extractAll()` function. [#6644](https://github.com/ClickHouse/ClickHouse/pull/6644) ([Artem Zuikov](https://github.com/4ertus2)) -* Query transformation for `MySQL`, `ODBC`, `JDBC` table functions now works properly for `SELECT WHERE` queries with multiple `AND` expressions. [#6381](https://github.com/ClickHouse/ClickHouse/issues/6381) [#6676](https://github.com/ClickHouse/ClickHouse/pull/6676) ([dimarub2000](https://github.com/dimarub2000)) -* Added previous declaration checks for MySQL 8 integration. [#6569](https://github.com/ClickHouse/ClickHouse/pull/6569) ([Rafael David Tinoco](https://github.com/rafaeldtinoco)) - -#### Security Fix -* Fix two vulnerabilities in codecs in decompression phase (malicious user can fabricate compressed data that will lead to buffer overflow in decompression). [#6670](https://github.com/ClickHouse/ClickHouse/pull/6670) ([Artem Zuikov](https://github.com/4ertus2)) - - -### ClickHouse release 19.13.3.26, 2019-08-22 - -#### Bug Fix -* Fix `ALTER TABLE ... UPDATE` query for tables with `enable_mixed_granularity_parts=1`. [#6543](https://github.com/ClickHouse/ClickHouse/pull/6543) ([alesapin](https://github.com/alesapin)) -* Fix NPE when using IN clause with a subquery with a tuple. 
[#6125](https://github.com/ClickHouse/ClickHouse/issues/6125) [#6550](https://github.com/ClickHouse/ClickHouse/pull/6550) ([tavplubix](https://github.com/tavplubix)) -* Fixed an issue that if a stale replica becomes alive, it may still have data parts that were removed by DROP PARTITION. [#6522](https://github.com/ClickHouse/ClickHouse/issues/6522) [#6523](https://github.com/ClickHouse/ClickHouse/pull/6523) ([tavplubix](https://github.com/tavplubix)) -* Fixed issue with parsing CSV [#6426](https://github.com/ClickHouse/ClickHouse/issues/6426) [#6559](https://github.com/ClickHouse/ClickHouse/pull/6559) ([tavplubix](https://github.com/tavplubix)) -* Fixed data race in system.parts table and ALTER query. This fixes [#6245](https://github.com/ClickHouse/ClickHouse/issues/6245). [#6513](https://github.com/ClickHouse/ClickHouse/pull/6513) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed wrong code in mutations that may lead to memory corruption. Fixed segfault with read of address `0x14c0` that may happen due to concurrent `DROP TABLE` and `SELECT` from `system.parts` or `system.parts_columns`. Fixed race condition in preparation of mutation queries. Fixed deadlock caused by `OPTIMIZE` of Replicated tables and concurrent modification operations like ALTERs. [#6514](https://github.com/ClickHouse/ClickHouse/pull/6514) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed possible data loss after `ALTER DELETE` query on a table with a skipping index. [#6224](https://github.com/ClickHouse/ClickHouse/issues/6224) [#6282](https://github.com/ClickHouse/ClickHouse/pull/6282) ([Nikita Vasilev](https://github.com/nikvas0)) - -#### Security Fix -* If an attacker has write access to ZooKeeper and is able to run a custom server available from the network where ClickHouse runs, they can create a custom-built malicious server that will act as a ClickHouse replica and register it in ZooKeeper. When another replica fetches a data part from the malicious replica, it can force clickhouse-server to write to an arbitrary path on the filesystem. Found by Eldar Zaitov, information security team at Yandex. [#6247](https://github.com/ClickHouse/ClickHouse/pull/6247) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -### ClickHouse release 19.13.2.19, 2019-08-14 - -#### New Feature -* Sampling profiler on query level. [Example](https://gist.github.com/alexey-milovidov/92758583dd41c24c360fdb8d6a4da194). [#4247](https://github.com/ClickHouse/ClickHouse/issues/4247) ([laplab](https://github.com/laplab)) [#6124](https://github.com/ClickHouse/ClickHouse/pull/6124) ([alexey-milovidov](https://github.com/alexey-milovidov)) [#6250](https://github.com/ClickHouse/ClickHouse/pull/6250) [#6283](https://github.com/ClickHouse/ClickHouse/pull/6283) [#6386](https://github.com/ClickHouse/ClickHouse/pull/6386) -* Allow to specify a list of columns with the `COLUMNS('regexp')` expression that works like a more sophisticated variant of the `*` asterisk. [#5951](https://github.com/ClickHouse/ClickHouse/pull/5951) ([mfridental](https://github.com/mfridental)), ([alexey-milovidov](https://github.com/alexey-milovidov)) -* `CREATE TABLE AS table_function()` is now possible [#6057](https://github.com/ClickHouse/ClickHouse/pull/6057) ([dimarub2000](https://github.com/dimarub2000)) -* Adam optimizer for stochastic gradient descent is used by default in `stochasticLinearRegression()` and `stochasticLogisticRegression()` aggregate functions, because it shows good quality with almost no tuning.
[#6000](https://github.com/ClickHouse/ClickHouse/pull/6000) ([Quid37](https://github.com/Quid37)) -* Added functions for working with the custom week number [#5212](https://github.com/ClickHouse/ClickHouse/pull/5212) ([Andy Yang](https://github.com/andyyzh)) -* `RENAME` queries now work with all storages. [#5953](https://github.com/ClickHouse/ClickHouse/pull/5953) ([Ivan](https://github.com/abyss7)) -* Now the client receives logs from the server at any desired level by setting `send_logs_level`, regardless of the log level specified in the server settings. [#5964](https://github.com/ClickHouse/ClickHouse/pull/5964) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) - -#### Backward Incompatible Change -* The setting `input_format_defaults_for_omitted_fields` is enabled by default. Inserts in Distributed tables need this setting to be the same across the cluster (you need to set it before a rolling update). It enables calculation of complex default expressions for omitted fields in `JSONEachRow` and `CSV*` formats. It should be the expected behavior but may lead to a negligible performance difference. [#6043](https://github.com/ClickHouse/ClickHouse/pull/6043) ([Artem Zuikov](https://github.com/4ertus2)), [#5625](https://github.com/ClickHouse/ClickHouse/pull/5625) ([akuzm](https://github.com/akuzm)) - -#### Experimental features -* New query processing pipeline. Use the `experimental_use_processors=1` option to enable it. Use it at your own risk. [#4914](https://github.com/ClickHouse/ClickHouse/pull/4914) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) - -#### Bug Fix -* Kafka integration has been fixed in this version. -* Fixed `DoubleDelta` encoding of `Int64` for large `DoubleDelta` values, improved `DoubleDelta` encoding for random data for `Int32`. [#5998](https://github.com/ClickHouse/ClickHouse/pull/5998) ([Vasily Nemkov](https://github.com/Enmk)) -* Fixed overestimation of `max_rows_to_read` if the setting `merge_tree_uniform_read_distribution` is set to 0. [#6019](https://github.com/ClickHouse/ClickHouse/pull/6019) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -#### Improvement -* Throw an exception if a `config.d` file doesn't have the same root element as the config file [#6123](https://github.com/ClickHouse/ClickHouse/pull/6123) ([dimarub2000](https://github.com/dimarub2000)) - -#### Performance Improvement -* Optimize `count()`. Now it uses the smallest column (if possible). [#6028](https://github.com/ClickHouse/ClickHouse/pull/6028) ([Amos Bird](https://github.com/amosbird)) - -#### Build/Testing/Packaging Improvement -* Report memory usage in performance tests. [#5899](https://github.com/ClickHouse/ClickHouse/pull/5899) ([akuzm](https://github.com/akuzm)) -* Fix build with external `libcxx` [#6010](https://github.com/ClickHouse/ClickHouse/pull/6010) ([Ivan](https://github.com/abyss7)) -* Fix shared build with `rdkafka` library [#6101](https://github.com/ClickHouse/ClickHouse/pull/6101) ([Ivan](https://github.com/abyss7)) - -## ClickHouse release 19.11 - -### ClickHouse release 19.11.13.74, 2019-11-01 - -#### Bug Fix -* Fixed rare crash in `ALTER MODIFY COLUMN` and vertical merge when one of the merged/altered parts is empty (0 rows). [#6780](https://github.com/ClickHouse/ClickHouse/pull/6780) ([alesapin](https://github.com/alesapin)) -* Manual update of `SIMDJSON`. This fixes possible flooding of stderr files with bogus json diagnostic messages.
[#7548](https://github.com/ClickHouse/ClickHouse/pull/7548) ([Alexander Kazakov](https://github.com/Akazz)) -* Fixed a bug with the `mrk` file extension for mutations ([alesapin](https://github.com/alesapin)) - -### ClickHouse release 19.11.12.69, 2019-10-02 - -#### Bug Fix -* Fixed performance degradation of index analysis on complex keys on large tables. This fixes [#6924](https://github.com/ClickHouse/ClickHouse/issues/6924). [#7075](https://github.com/ClickHouse/ClickHouse/pull/7075) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Avoid rare SIGSEGV while sending data in tables with Distributed engine (`Failed to send batch: file with index XXXXX is absent`). [#7032](https://github.com/ClickHouse/ClickHouse/pull/7032) ([Azat Khuzhin](https://github.com/azat)) -* Fix `Unknown identifier` with multiple joins. This fixes [#5254](https://github.com/ClickHouse/ClickHouse/issues/5254). [#7022](https://github.com/ClickHouse/ClickHouse/pull/7022) ([Artem Zuikov](https://github.com/4ertus2)) - -### ClickHouse release 19.11.11.57, 2019-09-13 -* Fix logical error causing segfaults when selecting from an empty Kafka topic. [#6902](https://github.com/ClickHouse/ClickHouse/issues/6902) [#6909](https://github.com/ClickHouse/ClickHouse/pull/6909) ([Ivan](https://github.com/abyss7)) -* Fix for function `arrayEnumerateUniqRanked` with empty arrays in params. [#6928](https://github.com/ClickHouse/ClickHouse/pull/6928) ([proller](https://github.com/proller)) - -### ClickHouse release 19.11.10.54, 2019-09-10 - -#### Bug Fix -* Store offsets for Kafka messages manually to be able to commit them all at once for all partitions. Fixes potential duplication in the "one consumer - many partitions" scenario. [#6872](https://github.com/ClickHouse/ClickHouse/pull/6872) ([Ivan](https://github.com/abyss7)) - -### ClickHouse release 19.11.9.52, 2019-09-06 -* Improve error handling in cache dictionaries. [#6737](https://github.com/ClickHouse/ClickHouse/pull/6737) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fixed bug in function `arrayEnumerateUniqRanked`. [#6779](https://github.com/ClickHouse/ClickHouse/pull/6779) ([proller](https://github.com/proller)) -* Fix `JSONExtract` function while extracting a `Tuple` from JSON. [#6718](https://github.com/ClickHouse/ClickHouse/pull/6718) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fixed possible data loss after `ALTER DELETE` query on a table with a skipping index. [#6224](https://github.com/ClickHouse/ClickHouse/issues/6224) [#6282](https://github.com/ClickHouse/ClickHouse/pull/6282) ([Nikita Vasilev](https://github.com/nikvas0)) -* Fixed performance test. [#6392](https://github.com/ClickHouse/ClickHouse/pull/6392) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Parquet: Fix reading boolean columns. [#6579](https://github.com/ClickHouse/ClickHouse/pull/6579) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed wrong behaviour of `nullIf` function for constant arguments. [#6518](https://github.com/ClickHouse/ClickHouse/pull/6518) ([Guillaume Tassery](https://github.com/YiuRULE)) [#6580](https://github.com/ClickHouse/ClickHouse/pull/6580) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix Kafka messages duplication problem on normal server restart. [#6597](https://github.com/ClickHouse/ClickHouse/pull/6597) ([Ivan](https://github.com/abyss7)) -* Fixed an issue when a long `ALTER UPDATE` or `ALTER DELETE` may prevent regular merges from running. Prevent mutations from executing if there are not enough free threads available.
[#6502](https://github.com/ClickHouse/ClickHouse/issues/6502) [#6617](https://github.com/ClickHouse/ClickHouse/pull/6617) ([tavplubix](https://github.com/tavplubix)) -* Fixed error with processing "timezone" in server configuration file. [#6709](https://github.com/ClickHouse/ClickHouse/pull/6709) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix kafka tests. [#6805](https://github.com/ClickHouse/ClickHouse/pull/6805) ([Ivan](https://github.com/abyss7)) - -#### Security Fix -* If the attacker has write access to ZooKeeper and is able to run custom server available from the network where ClickHouse runs, it can create custom-built malicious server that will act as ClickHouse replica and register it in ZooKeeper. When another replica will fetch data part from malicious replica, it can force clickhouse-server to write to arbitrary path on filesystem. Found by Eldar Zaitov, information security team at Yandex. [#6247](https://github.com/ClickHouse/ClickHouse/pull/6247) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -### ClickHouse release 19.11.8.46, 2019-08-22 - -#### Bug Fix -* Fix `ALTER TABLE ... UPDATE` query for tables with `enable_mixed_granularity_parts=1`. [#6543](https://github.com/ClickHouse/ClickHouse/pull/6543) ([alesapin](https://github.com/alesapin)) -* Fix NPE when using IN clause with a subquery with a tuple. [#6125](https://github.com/ClickHouse/ClickHouse/issues/6125) [#6550](https://github.com/ClickHouse/ClickHouse/pull/6550) ([tavplubix](https://github.com/tavplubix)) -* Fixed an issue that if a stale replica becomes alive, it may still have data parts that were removed by DROP PARTITION. [#6522](https://github.com/ClickHouse/ClickHouse/issues/6522) [#6523](https://github.com/ClickHouse/ClickHouse/pull/6523) ([tavplubix](https://github.com/tavplubix)) -* Fixed issue with parsing CSV [#6426](https://github.com/ClickHouse/ClickHouse/issues/6426) [#6559](https://github.com/ClickHouse/ClickHouse/pull/6559) ([tavplubix](https://github.com/tavplubix)) -* Fixed data race in system.parts table and ALTER query. This fixes [#6245](https://github.com/ClickHouse/ClickHouse/issues/6245). [#6513](https://github.com/ClickHouse/ClickHouse/pull/6513) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed wrong code in mutations that may lead to memory corruption. Fixed segfault with read of address `0x14c0` that may happed due to concurrent `DROP TABLE` and `SELECT` from `system.parts` or `system.parts_columns`. Fixed race condition in preparation of mutation queries. Fixed deadlock caused by `OPTIMIZE` of Replicated tables and concurrent modification operations like ALTERs. [#6514](https://github.com/ClickHouse/ClickHouse/pull/6514) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -### ClickHouse release 19.11.7.40, 2019-08-14 - -#### Bug fix -* Kafka integration has been fixed in this version. -* Fix segfault when using `arrayReduce` for constant arguments. [#6326](https://github.com/ClickHouse/ClickHouse/pull/6326) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed `toFloat()` monotonicity. [#6374](https://github.com/ClickHouse/ClickHouse/pull/6374) ([dimarub2000](https://github.com/dimarub2000)) -* Fix segfault with enabled `optimize_skip_unused_shards` and missing sharding key. [#6384](https://github.com/ClickHouse/ClickHouse/pull/6384) ([CurtizJ](https://github.com/CurtizJ)) -* Fixed logic of `arrayEnumerateUniqRanked` function. 
[#6423](https://github.com/ClickHouse/ClickHouse/pull/6423) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Removed extra verbose logging from the MySQL handler. [#6389](https://github.com/ClickHouse/ClickHouse/pull/6389) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix wrong behavior and possible segfaults in `topK` and `topKWeighted` aggregate functions. [#6404](https://github.com/ClickHouse/ClickHouse/pull/6404) ([CurtizJ](https://github.com/CurtizJ)) -* Do not expose virtual columns in `system.columns` table. This is required for backward compatibility. [#6406](https://github.com/ClickHouse/ClickHouse/pull/6406) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix bug with memory allocation for string fields in complex key cache dictionary. [#6447](https://github.com/ClickHouse/ClickHouse/pull/6447) ([alesapin](https://github.com/alesapin)) -* Fix bug with enabling adaptive granularity when creating a new replica for a `Replicated*MergeTree` table. [#6452](https://github.com/ClickHouse/ClickHouse/pull/6452) ([alesapin](https://github.com/alesapin)) -* Fix infinite loop when reading Kafka messages. [#6354](https://github.com/ClickHouse/ClickHouse/pull/6354) ([abyss7](https://github.com/abyss7)) -* Fixed the possibility of a fabricated query causing a server crash due to stack overflow in the SQL parser, and the possibility of stack overflow in `Merge` and `Distributed` tables [#6433](https://github.com/ClickHouse/ClickHouse/pull/6433) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed Gorilla encoding error on small sequences. [#6444](https://github.com/ClickHouse/ClickHouse/pull/6444) ([Enmk](https://github.com/Enmk)) - -#### Improvement -* Allow user to override `poll_interval` and `idle_connection_timeout` settings on connection. [#6230](https://github.com/ClickHouse/ClickHouse/pull/6230) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -### ClickHouse release 19.11.5.28, 2019-08-05 - -#### Bug fix -* Fixed the possibility of hanging queries when the server is overloaded. [#6301](https://github.com/ClickHouse/ClickHouse/pull/6301) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix FPE in yandexConsistentHash function. This fixes [#6304](https://github.com/ClickHouse/ClickHouse/issues/6304). [#6126](https://github.com/ClickHouse/ClickHouse/pull/6126) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed bug in conversion of `LowCardinality` types in `AggregateFunctionFactory`. This fixes [#6257](https://github.com/ClickHouse/ClickHouse/issues/6257). [#6281](https://github.com/ClickHouse/ClickHouse/pull/6281) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix parsing of `bool` settings from `true` and `false` strings in configuration files. [#6278](https://github.com/ClickHouse/ClickHouse/pull/6278) ([alesapin](https://github.com/alesapin)) -* Fix rare bug with incompatible stream headers in queries to a `Distributed` table over a `MergeTree` table when part of `WHERE` moves to `PREWHERE`. [#6236](https://github.com/ClickHouse/ClickHouse/pull/6236) ([alesapin](https://github.com/alesapin)) -* Fixed overflow in integer division of a signed type by an unsigned type (see the illustrative query after this release's notes). This fixes [#6214](https://github.com/ClickHouse/ClickHouse/issues/6214). [#6233](https://github.com/ClickHouse/ClickHouse/pull/6233) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -#### Backward Incompatible Change -* `Kafka` is still broken.
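For illustration only (not part of the original release notes): a minimal sketch of the kind of mixed-signedness integer division covered by the 19.11.5.28 fix above. The concrete types and literal values are arbitrary assumptions chosen for the example, not a reproduction of the original bug.

```sql
-- Illustrative sketch only: a signed dividend combined with an unsigned divisor,
-- the kind of expression addressed by the signed/unsigned division fix above.
SELECT
    intDiv(toInt32(-1000), toUInt8(7)) AS quotient,
    toInt32(-1000) % toUInt8(7)        AS remainder;
```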
- -### ClickHouse release 19.11.4.24, 2019-08-01 - -#### Bug Fix -* Fix a bug with writing secondary index marks with adaptive granularity. [#6126](https://github.com/ClickHouse/ClickHouse/pull/6126) ([alesapin](https://github.com/alesapin)) -* Fix `WITH ROLLUP` and `WITH CUBE` modifiers of `GROUP BY` with two-level aggregation. [#6225](https://github.com/ClickHouse/ClickHouse/pull/6225) ([Anton Popov](https://github.com/CurtizJ)) -* Fixed hang in `JSONExtractRaw` function. Fixed [#6195](https://github.com/ClickHouse/ClickHouse/issues/6195) [#6198](https://github.com/ClickHouse/ClickHouse/pull/6198) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix segfault in ExternalLoader::reloadOutdated(). [#6082](https://github.com/ClickHouse/ClickHouse/pull/6082) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fixed the case when the server may close listening sockets but not shut down and continue serving remaining queries. You may end up with two running clickhouse-server processes. Sometimes, the server may return an error `bad_function_call` for remaining queries. [#6231](https://github.com/ClickHouse/ClickHouse/pull/6231) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed useless and incorrect condition on the update field for initial loading of external dictionaries via ODBC, MySQL, ClickHouse and HTTP. This fixes [#6069](https://github.com/ClickHouse/ClickHouse/issues/6069) [#6083](https://github.com/ClickHouse/ClickHouse/pull/6083) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed irrelevant exception in cast of `LowCardinality(Nullable)` to a not-Nullable column in case it doesn't contain Nulls (e.g. in a query like `SELECT CAST(CAST('Hello' AS LowCardinality(Nullable(String))) AS String)`). [#6094](https://github.com/ClickHouse/ClickHouse/issues/6094) [#6119](https://github.com/ClickHouse/ClickHouse/pull/6119) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix non-deterministic result of the "uniq" aggregate function in extremely rare cases. The bug was present in all ClickHouse versions. [#6058](https://github.com/ClickHouse/ClickHouse/pull/6058) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed segfault when a slightly too high CIDR is set in the function `IPv6CIDRToRange`. [#6068](https://github.com/ClickHouse/ClickHouse/pull/6068) ([Guillaume Tassery](https://github.com/YiuRULE)) -* Fixed small memory leak when the server throws many exceptions from many different contexts. [#6144](https://github.com/ClickHouse/ClickHouse/pull/6144) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix the situation when the consumer got paused before subscription and was not resumed afterwards. [#6075](https://github.com/ClickHouse/ClickHouse/pull/6075) ([Ivan](https://github.com/abyss7)) Note that Kafka is broken in this version. -* Clearing the Kafka data buffer from the previous read operation that was completed with an error [#6026](https://github.com/ClickHouse/ClickHouse/pull/6026) ([Nikolay](https://github.com/bopohaa)) Note that Kafka is broken in this version. -* Since `StorageMergeTree::background_task_handle` is initialized in `startup()`, the `MergeTreeBlockOutputStream::write()` may try to use it before initialization. Just check if it is initialized. [#6080](https://github.com/ClickHouse/ClickHouse/pull/6080) ([Ivan](https://github.com/abyss7)) - -#### Build/Testing/Packaging Improvement -* Added official `rpm` packages.
[#5740](https://github.com/ClickHouse/ClickHouse/pull/5740) ([proller](https://github.com/proller)) ([alesapin](https://github.com/alesapin)) -* Add the ability to build `.rpm` and `.tgz` packages with the `packager` script. [#5769](https://github.com/ClickHouse/ClickHouse/pull/5769) ([alesapin](https://github.com/alesapin)) -* Fixes for "Arcadia" build system. [#6223](https://github.com/ClickHouse/ClickHouse/pull/6223) ([proller](https://github.com/proller)) - -#### Backward Incompatible Change -* `Kafka` is broken in this version. - - -### ClickHouse release 19.11.3.11, 2019-07-18 - -#### New Feature -* Added support for prepared statements. [#5331](https://github.com/ClickHouse/ClickHouse/pull/5331/) ([Alexander](https://github.com/sanych73)) [#5630](https://github.com/ClickHouse/ClickHouse/pull/5630) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* `DoubleDelta` and `Gorilla` column codecs [#5600](https://github.com/ClickHouse/ClickHouse/pull/5600) ([Vasily Nemkov](https://github.com/Enmk)) -* Added `os_thread_priority` setting that allows controlling the "nice" value of query processing threads that is used by the OS to adjust dynamic scheduling priority. It requires `CAP_SYS_NICE` capabilities to work. This implements [#5858](https://github.com/ClickHouse/ClickHouse/issues/5858) [#5909](https://github.com/ClickHouse/ClickHouse/pull/5909) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Implement `_topic`, `_offset`, `_key` columns for the Kafka engine [#5382](https://github.com/ClickHouse/ClickHouse/pull/5382) ([Ivan](https://github.com/abyss7)) Note that Kafka is broken in this version. -* Add aggregate function combinator `-Resample` [#5590](https://github.com/ClickHouse/ClickHouse/pull/5590) ([hcz](https://github.com/hczhcz)) -* Aggregate functions `groupArrayMovingSum(win_size)(x)` and `groupArrayMovingAvg(win_size)(x)`, which calculate moving sum/avg with or without window-size limitation. [#5595](https://github.com/ClickHouse/ClickHouse/pull/5595) ([inv2004](https://github.com/inv2004)) -* Add synonym `arrayFlatten` <-> `flatten` [#5764](https://github.com/ClickHouse/ClickHouse/pull/5764) ([hcz](https://github.com/hczhcz)) -* Integrate H3 function `geoToH3` from Uber. [#4724](https://github.com/ClickHouse/ClickHouse/pull/4724) ([Remen Ivan](https://github.com/BHYCHIK)) [#5805](https://github.com/ClickHouse/ClickHouse/pull/5805) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -#### Bug Fix -* Implement DNS cache with asynchronous update. A separate thread resolves all hosts and updates the DNS cache periodically (setting `dns_cache_update_period`). It should help when the IP of hosts changes frequently. [#5857](https://github.com/ClickHouse/ClickHouse/pull/5857) ([Anton Popov](https://github.com/CurtizJ)) -* Fix segfault in `Delta` codec which affects columns with values less than 32 bits in size. The bug led to random memory corruption. [#5786](https://github.com/ClickHouse/ClickHouse/pull/5786) ([alesapin](https://github.com/alesapin)) -* Fix segfault in TTL merge with non-physical columns in the block. [#5819](https://github.com/ClickHouse/ClickHouse/pull/5819) ([Anton Popov](https://github.com/CurtizJ)) -* Fix rare bug in checking of a part with a `LowCardinality` column. Previously `checkDataPart` always failed for a part with a `LowCardinality` column. [#5832](https://github.com/ClickHouse/ClickHouse/pull/5832) ([alesapin](https://github.com/alesapin)) -* Avoid hanging connections when the server thread pool is full.
It is important for connections from the `remote` table function or connections to a shard without replicas when there is a long connection timeout. This fixes [#5878](https://github.com/ClickHouse/ClickHouse/issues/5878) [#5881](https://github.com/ClickHouse/ClickHouse/pull/5881) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Support for constant arguments to the `evalMLModel` function. This fixes [#5817](https://github.com/ClickHouse/ClickHouse/issues/5817) [#5820](https://github.com/ClickHouse/ClickHouse/pull/5820) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed the issue when ClickHouse determines the default time zone as `UCT` instead of `UTC`. This fixes [#5804](https://github.com/ClickHouse/ClickHouse/issues/5804). [#5828](https://github.com/ClickHouse/ClickHouse/pull/5828) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed buffer underflow in `visitParamExtractRaw`. This fixes [#5901](https://github.com/ClickHouse/ClickHouse/issues/5901) [#5902](https://github.com/ClickHouse/ClickHouse/pull/5902) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Now distributed `DROP/ALTER/TRUNCATE/OPTIMIZE ON CLUSTER` queries will be executed directly on the leader replica. [#5757](https://github.com/ClickHouse/ClickHouse/pull/5757) ([alesapin](https://github.com/alesapin)) -* Fix `coalesce` for `ColumnConst` with `ColumnNullable` + related changes. [#5755](https://github.com/ClickHouse/ClickHouse/pull/5755) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix the `ReadBufferFromKafkaConsumer` so that it keeps reading new messages after `commit()` even if it was stalled before [#5852](https://github.com/ClickHouse/ClickHouse/pull/5852) ([Ivan](https://github.com/abyss7)) -* Fix `FULL` and `RIGHT` JOIN results when joining on `Nullable` keys in the right table. [#5859](https://github.com/ClickHouse/ClickHouse/pull/5859) ([Artem Zuikov](https://github.com/4ertus2)) -* Possible fix of infinite sleeping of low-priority queries. [#5842](https://github.com/ClickHouse/ClickHouse/pull/5842) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix race condition which caused some queries to not appear in query_log after a `SYSTEM FLUSH LOGS` query. [#5456](https://github.com/ClickHouse/ClickHouse/issues/5456) [#5685](https://github.com/ClickHouse/ClickHouse/pull/5685) ([Anton Popov](https://github.com/CurtizJ)) -* Fixed `heap-use-after-free` ASan warning in ClusterCopier caused by a watch which tries to use an already removed copier object. [#5871](https://github.com/ClickHouse/ClickHouse/pull/5871) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fixed wrong `StringRef` pointer returned by some implementations of `IColumn::deserializeAndInsertFromArena`. This bug affected only unit tests. [#5973](https://github.com/ClickHouse/ClickHouse/pull/5973) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Prevent source and intermediate array join columns from masking columns with the same name. [#5941](https://github.com/ClickHouse/ClickHouse/pull/5941) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix insert and select queries to the MySQL engine with MySQL-style identifier quoting. [#5704](https://github.com/ClickHouse/ClickHouse/pull/5704) ([Winter Zhang](https://github.com/zhang2014)) -* Now the `CHECK TABLE` query can work with the MergeTree engine family. It returns check status and a message, if any, for each part (or file in case of simpler engines). Also, fix a bug in fetch of a broken part.
[#5865](https://github.com/ClickHouse/ClickHouse/pull/5865) ([alesapin](https://github.com/alesapin)) -* Fix SPLIT_SHARED_LIBRARIES runtime [#5793](https://github.com/ClickHouse/ClickHouse/pull/5793) ([Danila Kutenin](https://github.com/danlark1)) -* Fixed time zone initialization when `/etc/localtime` is a relative symlink like `../usr/share/zoneinfo/Europe/Moscow` [#5922](https://github.com/ClickHouse/ClickHouse/pull/5922) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* clickhouse-copier: Fix use-after-free on shutdown [#5752](https://github.com/ClickHouse/ClickHouse/pull/5752) ([proller](https://github.com/proller)) -* Updated `simdjson`. Fixed the issue that some invalid JSONs with zero bytes were parsed successfully. [#5938](https://github.com/ClickHouse/ClickHouse/pull/5938) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix shutdown of SystemLogs [#5802](https://github.com/ClickHouse/ClickHouse/pull/5802) ([Anton Popov](https://github.com/CurtizJ)) -* Fix hanging when a condition in invalidate_query depends on a dictionary. [#6011](https://github.com/ClickHouse/ClickHouse/pull/6011) ([Vitaly Baranov](https://github.com/vitlibar)) - -#### Improvement -* Allow unresolvable addresses in the cluster configuration. They will be considered unavailable, and resolution will be retried at every connection attempt. This is especially useful for Kubernetes. This fixes [#5714](https://github.com/ClickHouse/ClickHouse/issues/5714) [#5924](https://github.com/ClickHouse/ClickHouse/pull/5924) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Close idle TCP connections (with a one hour timeout by default). This is especially important for large clusters with multiple distributed tables on every server, because every server can possibly keep a connection pool to every other server, and after peak query concurrency, connections will stall. This fixes [#5879](https://github.com/ClickHouse/ClickHouse/issues/5879) [#5880](https://github.com/ClickHouse/ClickHouse/pull/5880) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Better quality of the `topK` function. Changed the SpaceSaving set behavior to remove the last element if the new element has a bigger weight. [#5833](https://github.com/ClickHouse/ClickHouse/issues/5833) [#5850](https://github.com/ClickHouse/ClickHouse/pull/5850) ([Guillaume Tassery](https://github.com/YiuRULE)) -* URL functions for working with domains can now work for incomplete URLs without a scheme [#5725](https://github.com/ClickHouse/ClickHouse/pull/5725) ([alesapin](https://github.com/alesapin)) -* Checksums added to the `system.parts_columns` table. [#5874](https://github.com/ClickHouse/ClickHouse/pull/5874) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) -* Added `Enum` data type as a synonym for `Enum8` or `Enum16`. [#5886](https://github.com/ClickHouse/ClickHouse/pull/5886) ([dimarub2000](https://github.com/dimarub2000)) -* Full bit transpose variant for the `T64` codec. Could lead to better compression with `zstd`. [#5742](https://github.com/ClickHouse/ClickHouse/pull/5742) ([Artem Zuikov](https://github.com/4ertus2)) -* A condition on the `startsWith` function can now use the primary key. This fixes [#5310](https://github.com/ClickHouse/ClickHouse/issues/5310) and [#5882](https://github.com/ClickHouse/ClickHouse/issues/5882) [#5919](https://github.com/ClickHouse/ClickHouse/pull/5919) ([dimarub2000](https://github.com/dimarub2000)) -* Allow using `clickhouse-copier` with cross-replication cluster topology by permitting an empty database name.
[#5745](https://github.com/ClickHouse/ClickHouse/pull/5745) ([nvartolomei](https://github.com/nvartolomei)) -* Use `UTC` as default timezone on a system without `tzdata` (e.g. bare Docker container). Before this patch, error message `Could not determine local time zone` was printed and server or client refused to start. [#5827](https://github.com/ClickHouse/ClickHouse/pull/5827) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Returned back support for floating point argument in function `quantileTiming` for backward compatibility. [#5911](https://github.com/ClickHouse/ClickHouse/pull/5911) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Show which table is missing column in error messages. [#5768](https://github.com/ClickHouse/ClickHouse/pull/5768) ([Ivan](https://github.com/abyss7)) -* Disallow run query with same query_id by various users [#5430](https://github.com/ClickHouse/ClickHouse/pull/5430) ([proller](https://github.com/proller)) -* More robust code for sending metrics to Graphite. It will work even during long multiple `RENAME TABLE` operation. [#5875](https://github.com/ClickHouse/ClickHouse/pull/5875) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* More informative error messages will be displayed when ThreadPool cannot schedule a task for execution. This fixes [#5305](https://github.com/ClickHouse/ClickHouse/issues/5305) [#5801](https://github.com/ClickHouse/ClickHouse/pull/5801) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Inverting ngramSearch to be more intuitive [#5807](https://github.com/ClickHouse/ClickHouse/pull/5807) ([Danila Kutenin](https://github.com/danlark1)) -* Add user parsing in HDFS engine builder [#5946](https://github.com/ClickHouse/ClickHouse/pull/5946) ([akonyaev90](https://github.com/akonyaev90)) -* Update default value of `max_ast_elements parameter` [#5933](https://github.com/ClickHouse/ClickHouse/pull/5933) ([Artem Konovalov](https://github.com/izebit)) -* Added a notion of obsolete settings. The obsolete setting `allow_experimental_low_cardinality_type` can be used with no effect. [0f15c01c6802f7ce1a1494c12c846be8c98944cd](https://github.com/ClickHouse/ClickHouse/commit/0f15c01c6802f7ce1a1494c12c846be8c98944cd) [Alexey Milovidov](https://github.com/alexey-milovidov) - -#### Performance Improvement -* Increase number of streams to SELECT from Merge table for more uniform distribution of threads. Added setting `max_streams_multiplier_for_merge_tables`. This fixes [#5797](https://github.com/ClickHouse/ClickHouse/issues/5797) [#5915](https://github.com/ClickHouse/ClickHouse/pull/5915) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -#### Build/Testing/Packaging Improvement -* Add a backward compatibility test for client-server interaction with different versions of clickhouse. [#5868](https://github.com/ClickHouse/ClickHouse/pull/5868) ([alesapin](https://github.com/alesapin)) -* Test coverage information in every commit and pull request. [#5896](https://github.com/ClickHouse/ClickHouse/pull/5896) ([alesapin](https://github.com/alesapin)) -* Cooperate with address sanitizer to support our custom allocators (`Arena` and `ArenaWithFreeLists`) for better debugging of "use-after-free" errors. 
[#5728](https://github.com/ClickHouse/ClickHouse/pull/5728) ([akuzm](https://github.com/akuzm)) -* Switch to [LLVM libunwind implementation](https://github.com/llvm-mirror/libunwind) for C++ exception handling and for stack traces printing [#4828](https://github.com/ClickHouse/ClickHouse/pull/4828) ([Nikita Lapkov](https://github.com/laplab)) -* Add two more warnings from -Weverything [#5923](https://github.com/ClickHouse/ClickHouse/pull/5923) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Allow to build ClickHouse with Memory Sanitizer. [#3949](https://github.com/ClickHouse/ClickHouse/pull/3949) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed ubsan report about `bitTest` function in fuzz test. [#5943](https://github.com/ClickHouse/ClickHouse/pull/5943) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Docker: added possibility to init a ClickHouse instance which requires authentication. [#5727](https://github.com/ClickHouse/ClickHouse/pull/5727) ([Korviakov Andrey](https://github.com/shurshun)) -* Update librdkafka to version 1.1.0 [#5872](https://github.com/ClickHouse/ClickHouse/pull/5872) ([Ivan](https://github.com/abyss7)) -* Add global timeout for integration tests and disable some of them in tests code. [#5741](https://github.com/ClickHouse/ClickHouse/pull/5741) ([alesapin](https://github.com/alesapin)) -* Fix some ThreadSanitizer failures. [#5854](https://github.com/ClickHouse/ClickHouse/pull/5854) ([akuzm](https://github.com/akuzm)) -* The `--no-undefined` option forces the linker to check all external names for existence while linking. It's very useful to track real dependencies between libraries in the split build mode. [#5855](https://github.com/ClickHouse/ClickHouse/pull/5855) ([Ivan](https://github.com/abyss7)) -* Added performance test for [#5797](https://github.com/ClickHouse/ClickHouse/issues/5797) [#5914](https://github.com/ClickHouse/ClickHouse/pull/5914) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed compatibility with gcc-7. [#5840](https://github.com/ClickHouse/ClickHouse/pull/5840) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added support for gcc-9. This fixes [#5717](https://github.com/ClickHouse/ClickHouse/issues/5717) [#5774](https://github.com/ClickHouse/ClickHouse/pull/5774) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed error when libunwind can be linked incorrectly. [#5948](https://github.com/ClickHouse/ClickHouse/pull/5948) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed a few warnings found by PVS-Studio. [#5921](https://github.com/ClickHouse/ClickHouse/pull/5921) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added initial support for `clang-tidy` static analyzer. [#5806](https://github.com/ClickHouse/ClickHouse/pull/5806) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Convert BSD/Linux endian macros( 'be64toh' and 'htobe64') to the Mac OS X equivalents [#5785](https://github.com/ClickHouse/ClickHouse/pull/5785) ([Fu Chen](https://github.com/fredchenbj)) -* Improved integration tests guide. [#5796](https://github.com/ClickHouse/ClickHouse/pull/5796) ([Vladimir Chebotarev](https://github.com/excitoon)) -* Fixing build at macosx + gcc9 [#5822](https://github.com/ClickHouse/ClickHouse/pull/5822) ([filimonov](https://github.com/filimonov)) -* Fix a hard-to-spot typo: aggreAGte -> aggregate. 
[#5753](https://github.com/ClickHouse/ClickHouse/pull/5753) ([akuzm](https://github.com/akuzm)) -* Fix freebsd build [#5760](https://github.com/ClickHouse/ClickHouse/pull/5760) ([proller](https://github.com/proller)) -* Add link to experimental YouTube channel to website [#5845](https://github.com/ClickHouse/ClickHouse/pull/5845) ([Ivan Blinkov](https://github.com/blinkov)) -* CMake: add option for coverage flags: WITH_COVERAGE [#5776](https://github.com/ClickHouse/ClickHouse/pull/5776) ([proller](https://github.com/proller)) -* Fix initial size of some inline PODArray's. [#5787](https://github.com/ClickHouse/ClickHouse/pull/5787) ([akuzm](https://github.com/akuzm)) -* clickhouse-server.postinst: fix os detection for centos 6 [#5788](https://github.com/ClickHouse/ClickHouse/pull/5788) ([proller](https://github.com/proller)) -* Added Arch linux package generation. [#5719](https://github.com/ClickHouse/ClickHouse/pull/5719) ([Vladimir Chebotarev](https://github.com/excitoon)) -* Split Common/config.h by libs (dbms) [#5715](https://github.com/ClickHouse/ClickHouse/pull/5715) ([proller](https://github.com/proller)) -* Fixes for "Arcadia" build platform [#5795](https://github.com/ClickHouse/ClickHouse/pull/5795) ([proller](https://github.com/proller)) -* Fixes for unconventional build (gcc9, no submodules) [#5792](https://github.com/ClickHouse/ClickHouse/pull/5792) ([proller](https://github.com/proller)) -* Require explicit type in unalignedStore because it was proven to be bug-prone [#5791](https://github.com/ClickHouse/ClickHouse/pull/5791) ([akuzm](https://github.com/akuzm)) -* Fixes MacOS build [#5830](https://github.com/ClickHouse/ClickHouse/pull/5830) ([filimonov](https://github.com/filimonov)) -* Performance test concerning the new JIT feature with bigger dataset, as requested here [#5263](https://github.com/ClickHouse/ClickHouse/issues/5263) [#5887](https://github.com/ClickHouse/ClickHouse/pull/5887) ([Guillaume Tassery](https://github.com/YiuRULE)) -* Run stateful tests in stress test [12693e568722f11e19859742f56428455501fd2a](https://github.com/ClickHouse/ClickHouse/commit/12693e568722f11e19859742f56428455501fd2a) ([alesapin](https://github.com/alesapin)) - -#### Backward Incompatible Change -* `Kafka` is broken in this version. -* Enable `adaptive_index_granularity` = 10MB by default for new `MergeTree` tables. If you created new MergeTree tables on version 19.11+, downgrade to versions prior to 19.6 will be impossible. [#5628](https://github.com/ClickHouse/ClickHouse/pull/5628) ([alesapin](https://github.com/alesapin)) -* Removed obsolete undocumented embedded dictionaries that were used by Yandex.Metrica. The functions `OSIn`, `SEIn`, `OSToRoot`, `SEToRoot`, `OSHierarchy`, `SEHierarchy` are no longer available. If you are using these functions, write email to clickhouse-feedback@yandex-team.com. Note: at the last moment we decided to keep these functions for a while. [#5780](https://github.com/ClickHouse/ClickHouse/pull/5780) ([alexey-milovidov](https://github.com/alexey-milovidov)) - - -## ClickHouse release 19.10 -### ClickHouse release 19.10.1.5, 2019-07-12 - -#### New Feature -* Add new column codec: `T64`. Made for (U)IntX/EnumX/Data(Time)/DecimalX columns. It should be good for columns with constant or small range values. Codec itself allows enlarge or shrink data type without re-compression. 
[#5557](https://github.com/ClickHouse/ClickHouse/pull/5557) ([Artem Zuikov](https://github.com/4ertus2)) -* Add database engine `MySQL` that allow to view all the tables in remote MySQL server [#5599](https://github.com/ClickHouse/ClickHouse/pull/5599) ([Winter Zhang](https://github.com/zhang2014)) -* `bitmapContains` implementation. It's 2x faster than `bitmapHasAny` if the second bitmap contains one element. [#5535](https://github.com/ClickHouse/ClickHouse/pull/5535) ([Zhichang Yu](https://github.com/yuzhichang)) -* Support for `crc32` function (with behaviour exactly as in MySQL or PHP). Do not use it if you need a hash function. [#5661](https://github.com/ClickHouse/ClickHouse/pull/5661) ([Remen Ivan](https://github.com/BHYCHIK)) -* Implemented `SYSTEM START/STOP DISTRIBUTED SENDS` queries to control asynchronous inserts into `Distributed` tables. [#4935](https://github.com/ClickHouse/ClickHouse/pull/4935) ([Winter Zhang](https://github.com/zhang2014)) - -#### Bug Fix -* Ignore query execution limits and max parts size for merge limits while executing mutations. [#5659](https://github.com/ClickHouse/ClickHouse/pull/5659) ([Anton Popov](https://github.com/CurtizJ)) -* Fix bug which may lead to deduplication of normal blocks (extremely rare) and insertion of duplicate blocks (more often). [#5549](https://github.com/ClickHouse/ClickHouse/pull/5549) ([alesapin](https://github.com/alesapin)) -* Fix of function `arrayEnumerateUniqRanked` for arguments with empty arrays [#5559](https://github.com/ClickHouse/ClickHouse/pull/5559) ([proller](https://github.com/proller)) -* Don't subscribe to Kafka topics without intent to poll any messages. [#5698](https://github.com/ClickHouse/ClickHouse/pull/5698) ([Ivan](https://github.com/abyss7)) -* Make setting `join_use_nulls` get no effect for types that cannot be inside Nullable [#5700](https://github.com/ClickHouse/ClickHouse/pull/5700) ([Olga Khvostikova](https://github.com/stavrolia)) -* Fixed `Incorrect size of index granularity` errors [#5720](https://github.com/ClickHouse/ClickHouse/pull/5720) ([coraxster](https://github.com/coraxster)) -* Fix Float to Decimal convert overflow [#5607](https://github.com/ClickHouse/ClickHouse/pull/5607) ([coraxster](https://github.com/coraxster)) -* Flush buffer when `WriteBufferFromHDFS`'s destructor is called. This fixes writing into `HDFS`. [#5684](https://github.com/ClickHouse/ClickHouse/pull/5684) ([Xindong Peng](https://github.com/eejoin)) - -#### Improvement -* Treat empty cells in `CSV` as default values when the setting `input_format_defaults_for_omitted_fields` is enabled. [#5625](https://github.com/ClickHouse/ClickHouse/pull/5625) ([akuzm](https://github.com/akuzm)) -* Non-blocking loading of external dictionaries. [#5567](https://github.com/ClickHouse/ClickHouse/pull/5567) ([Vitaly Baranov](https://github.com/vitlibar)) -* Network timeouts can be dynamically changed for already established connections according to the settings. [#4558](https://github.com/ClickHouse/ClickHouse/pull/4558) ([Konstantin Podshumok](https://github.com/podshumok)) -* Using "public_suffix_list" for functions `firstSignificantSubdomain`, `cutToFirstSignificantSubdomain`. It's using a perfect hash table generated by `gperf` with a list generated from the file: [https://publicsuffix.org/list/public_suffix_list.dat](https://publicsuffix.org/list/public_suffix_list.dat). (for example, now we recognize the domain `ac.uk` as non-significant). 
[#5030](https://github.com/ClickHouse/ClickHouse/pull/5030) ([Guillaume Tassery](https://github.com/YiuRULE)) -* Adopted `IPv6` data type in system tables; unified client info columns in `system.processes` and `system.query_log` [#5640](https://github.com/ClickHouse/ClickHouse/pull/5640) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Using sessions for connections with MySQL compatibility protocol. #5476 [#5646](https://github.com/ClickHouse/ClickHouse/pull/5646) ([Yuriy Baranov](https://github.com/yurriy)) -* Support more `ALTER` queries `ON CLUSTER`. [#5593](https://github.com/ClickHouse/ClickHouse/pull/5593) [#5613](https://github.com/ClickHouse/ClickHouse/pull/5613) ([sundyli](https://github.com/sundy-li)) -* Support `` section in `clickhouse-local` config file. [#5540](https://github.com/ClickHouse/ClickHouse/pull/5540) ([proller](https://github.com/proller)) -* Allow run query with `remote` table function in `clickhouse-local` [#5627](https://github.com/ClickHouse/ClickHouse/pull/5627) ([proller](https://github.com/proller)) - -#### Performance Improvement -* Add the possibility to write the final mark at the end of MergeTree columns. It allows to avoid useless reads for keys that are out of table data range. It is enabled only if adaptive index granularity is in use. [#5624](https://github.com/ClickHouse/ClickHouse/pull/5624) ([alesapin](https://github.com/alesapin)) -* Improved performance of MergeTree tables on very slow filesystems by reducing number of `stat` syscalls. [#5648](https://github.com/ClickHouse/ClickHouse/pull/5648) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed performance degradation in reading from MergeTree tables that was introduced in version 19.6. Fixes #5631. [#5633](https://github.com/ClickHouse/ClickHouse/pull/5633) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -#### Build/Testing/Packaging Improvement -* Implemented `TestKeeper` as an implementation of ZooKeeper interface used for testing [#5643](https://github.com/ClickHouse/ClickHouse/pull/5643) ([alexey-milovidov](https://github.com/alexey-milovidov)) ([levushkin aleksej](https://github.com/alexey-milovidov)) -* From now on `.sql` tests can be run isolated by server, in parallel, with random database. It allows to run them faster, add new tests with custom server configurations, and be sure that different tests doesn't affect each other. [#5554](https://github.com/ClickHouse/ClickHouse/pull/5554) ([Ivan](https://github.com/abyss7)) -* Remove `` and `` from performance tests [#5672](https://github.com/ClickHouse/ClickHouse/pull/5672) ([Olga Khvostikova](https://github.com/stavrolia)) -* Fixed "select_format" performance test for `Pretty` formats [#5642](https://github.com/ClickHouse/ClickHouse/pull/5642) ([alexey-milovidov](https://github.com/alexey-milovidov)) - - -## ClickHouse release 19.9 -### ClickHouse release 19.9.3.31, 2019-07-05 - -#### Bug Fix -* Fix segfault in Delta codec which affects columns with values less than 32 bits size. The bug led to random memory corruption. [#5786](https://github.com/ClickHouse/ClickHouse/pull/5786) ([alesapin](https://github.com/alesapin)) -* Fix rare bug in checking of part with LowCardinality column. [#5832](https://github.com/ClickHouse/ClickHouse/pull/5832) ([alesapin](https://github.com/alesapin)) -* Fix segfault in TTL merge with non-physical columns in block. 
[#5819](https://github.com/ClickHouse/ClickHouse/pull/5819) ([Anton Popov](https://github.com/CurtizJ)) -* Fix potential infinite sleeping of low-priority queries. [#5842](https://github.com/ClickHouse/ClickHouse/pull/5842) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix how ClickHouse determines default time zone as UCT instead of UTC. [#5828](https://github.com/ClickHouse/ClickHouse/pull/5828) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix bug about executing distributed DROP/ALTER/TRUNCATE/OPTIMIZE ON CLUSTER queries on follower replica before leader replica. Now they will be executed directly on leader replica. [#5757](https://github.com/ClickHouse/ClickHouse/pull/5757) ([alesapin](https://github.com/alesapin)) -* Fix race condition, which cause that some queries may not appear in query_log instantly after SYSTEM FLUSH LOGS query. [#5685](https://github.com/ClickHouse/ClickHouse/pull/5685) ([Anton Popov](https://github.com/CurtizJ)) -* Added missing support for constant arguments to `evalMLModel` function. [#5820](https://github.com/ClickHouse/ClickHouse/pull/5820) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -### ClickHouse release 19.9.2.4, 2019-06-24 - -#### New Feature -* Print information about frozen parts in `system.parts` table. [#5471](https://github.com/ClickHouse/ClickHouse/pull/5471) ([proller](https://github.com/proller)) -* Ask client password on clickhouse-client start on tty if not set in arguments [#5092](https://github.com/ClickHouse/ClickHouse/pull/5092) ([proller](https://github.com/proller)) -* Implement `dictGet` and `dictGetOrDefault` functions for Decimal types. [#5394](https://github.com/ClickHouse/ClickHouse/pull/5394) ([Artem Zuikov](https://github.com/4ertus2)) - -#### Improvement -* Debian init: Add service stop timeout [#5522](https://github.com/ClickHouse/ClickHouse/pull/5522) ([proller](https://github.com/proller)) -* Add setting forbidden by default to create table with suspicious types for LowCardinality [#5448](https://github.com/ClickHouse/ClickHouse/pull/5448) ([Olga Khvostikova](https://github.com/stavrolia)) -* Regression functions return model weights when not used as State in function `evalMLMethod`. [#5411](https://github.com/ClickHouse/ClickHouse/pull/5411) ([Quid37](https://github.com/Quid37)) -* Rename and improve regression methods. [#5492](https://github.com/ClickHouse/ClickHouse/pull/5492) ([Quid37](https://github.com/Quid37)) -* Clearer interfaces of string searchers. [#5586](https://github.com/ClickHouse/ClickHouse/pull/5586) ([Danila Kutenin](https://github.com/danlark1)) - -#### Bug Fix -* Fix potential data loss in Kafka [#5445](https://github.com/ClickHouse/ClickHouse/pull/5445) ([Ivan](https://github.com/abyss7)) -* Fix potential infinite loop in `PrettySpace` format when called with zero columns [#5560](https://github.com/ClickHouse/ClickHouse/pull/5560) ([Olga Khvostikova](https://github.com/stavrolia)) -* Fixed UInt32 overflow bug in linear models. Allow eval ML model for non-const model argument. [#5516](https://github.com/ClickHouse/ClickHouse/pull/5516) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* `ALTER TABLE ... 
DROP INDEX IF EXISTS ...` should not raise an exception if provided index does not exist [#5524](https://github.com/ClickHouse/ClickHouse/pull/5524) ([Gleb Novikov](https://github.com/NanoBjorn)) -* Fix segfault with `bitmapHasAny` in scalar subquery [#5528](https://github.com/ClickHouse/ClickHouse/pull/5528) ([Zhichang Yu](https://github.com/yuzhichang)) -* Fixed error when replication connection pool doesn't retry to resolve host, even when DNS cache was dropped. [#5534](https://github.com/ClickHouse/ClickHouse/pull/5534) ([alesapin](https://github.com/alesapin)) -* Fixed `ALTER ... MODIFY TTL` on ReplicatedMergeTree. [#5539](https://github.com/ClickHouse/ClickHouse/pull/5539) ([Anton Popov](https://github.com/CurtizJ)) -* Fix INSERT into Distributed table with MATERIALIZED column [#5429](https://github.com/ClickHouse/ClickHouse/pull/5429) ([Azat Khuzhin](https://github.com/azat)) -* Fix bad alloc when truncate Join storage [#5437](https://github.com/ClickHouse/ClickHouse/pull/5437) ([TCeason](https://github.com/TCeason)) -* In recent versions of package tzdata some of files are symlinks now. The current mechanism for detecting default timezone gets broken and gives wrong names for some timezones. Now at least we force the timezone name to the contents of TZ if provided. [#5443](https://github.com/ClickHouse/ClickHouse/pull/5443) ([Ivan](https://github.com/abyss7)) -* Fix some extremely rare cases with MultiVolnitsky searcher when the constant needles in sum are at least 16KB long. The algorithm missed or overwrote the previous results which can lead to the incorrect result of `multiSearchAny`. [#5588](https://github.com/ClickHouse/ClickHouse/pull/5588) ([Danila Kutenin](https://github.com/danlark1)) -* Fix the issue when settings for ExternalData requests couldn't use ClickHouse settings. Also, for now, settings `date_time_input_format` and `low_cardinality_allow_in_native_format` cannot be used because of the ambiguity of names (in external data it can be interpreted as table format and in the query it can be a setting). [#5455](https://github.com/ClickHouse/ClickHouse/pull/5455) ([Danila Kutenin](https://github.com/danlark1)) -* Fix bug when parts were removed only from FS without dropping them from Zookeeper. [#5520](https://github.com/ClickHouse/ClickHouse/pull/5520) ([alesapin](https://github.com/alesapin)) -* Remove debug logging from MySQL protocol [#5478](https://github.com/ClickHouse/ClickHouse/pull/5478) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Skip ZNONODE during DDL query processing [#5489](https://github.com/ClickHouse/ClickHouse/pull/5489) ([Azat Khuzhin](https://github.com/azat)) -* Fix mix `UNION ALL` result column type. There were cases with inconsistent data and column types of resulting columns. [#5503](https://github.com/ClickHouse/ClickHouse/pull/5503) ([Artem Zuikov](https://github.com/4ertus2)) -* Throw an exception on wrong integers in `dictGetT` functions instead of crash. [#5446](https://github.com/ClickHouse/ClickHouse/pull/5446) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix wrong element_count and load_factor for hashed dictionary in `system.dictionaries` table. [#5440](https://github.com/ClickHouse/ClickHouse/pull/5440) ([Azat Khuzhin](https://github.com/azat)) - -#### Build/Testing/Packaging Improvement -* Fixed build without `Brotli` HTTP compression support (`ENABLE_BROTLI=OFF` cmake variable). 
[#5521](https://github.com/ClickHouse/ClickHouse/pull/5521) ([Anton Yuzhaninov](https://github.com/citrin)) -* Include roaring.h as roaring/roaring.h [#5523](https://github.com/ClickHouse/ClickHouse/pull/5523) ([Orivej Desh](https://github.com/orivej)) -* Fix gcc9 warnings in hyperscan (#line directive is evil!) [#5546](https://github.com/ClickHouse/ClickHouse/pull/5546) ([Danila Kutenin](https://github.com/danlark1)) -* Fix all warnings when compiling with gcc-9. Fix some contrib issues. Fix gcc9 ICE and submit it to bugzilla. [#5498](https://github.com/ClickHouse/ClickHouse/pull/5498) ([Danila Kutenin](https://github.com/danlark1)) -* Fixed linking with lld [#5477](https://github.com/ClickHouse/ClickHouse/pull/5477) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Remove unused specializations in dictionaries [#5452](https://github.com/ClickHouse/ClickHouse/pull/5452) ([Artem Zuikov](https://github.com/4ertus2)) -* Improvement performance tests for formatting and parsing tables for different types of files [#5497](https://github.com/ClickHouse/ClickHouse/pull/5497) ([Olga Khvostikova](https://github.com/stavrolia)) -* Fixes for parallel test run [#5506](https://github.com/ClickHouse/ClickHouse/pull/5506) ([proller](https://github.com/proller)) -* Docker: use configs from clickhouse-test [#5531](https://github.com/ClickHouse/ClickHouse/pull/5531) ([proller](https://github.com/proller)) -* Fix compile for FreeBSD [#5447](https://github.com/ClickHouse/ClickHouse/pull/5447) ([proller](https://github.com/proller)) -* Upgrade boost to 1.70 [#5570](https://github.com/ClickHouse/ClickHouse/pull/5570) ([proller](https://github.com/proller)) -* Fix build clickhouse as submodule [#5574](https://github.com/ClickHouse/ClickHouse/pull/5574) ([proller](https://github.com/proller)) -* Improve JSONExtract performance tests [#5444](https://github.com/ClickHouse/ClickHouse/pull/5444) ([Vitaly Baranov](https://github.com/vitlibar)) - -## ClickHouse release 19.8 -### ClickHouse release 19.8.3.8, 2019-06-11 - -#### New Features -* Added functions to work with JSON [#4686](https://github.com/ClickHouse/ClickHouse/pull/4686) ([hcz](https://github.com/hczhcz)) [#5124](https://github.com/ClickHouse/ClickHouse/pull/5124). ([Vitaly Baranov](https://github.com/vitlibar)) -* Add a function basename, with a similar behaviour to a basename function, which exists in a lot of languages (`os.path.basename` in python, `basename` in PHP, etc...). Work with both an UNIX-like path or a Windows path. [#5136](https://github.com/ClickHouse/ClickHouse/pull/5136) ([Guillaume Tassery](https://github.com/YiuRULE)) -* Added `LIMIT n, m BY` or `LIMIT m OFFSET n BY` syntax to set offset of n for LIMIT BY clause. [#5138](https://github.com/ClickHouse/ClickHouse/pull/5138) ([Anton Popov](https://github.com/CurtizJ)) -* Added new data type `SimpleAggregateFunction`, which allows to have columns with light aggregation in an `AggregatingMergeTree`. This can only be used with simple functions like `any`, `anyLast`, `sum`, `min`, `max`. [#4629](https://github.com/ClickHouse/ClickHouse/pull/4629) ([Boris Granveaud](https://github.com/bgranvea)) -* Added support for non-constant arguments in function `ngramDistance` [#5198](https://github.com/ClickHouse/ClickHouse/pull/5198) ([Danila Kutenin](https://github.com/danlark1)) -* Added functions `skewPop`, `skewSamp`, `kurtPop` and `kurtSamp` to compute for sequence skewness, sample skewness, kurtosis and sample kurtosis respectively. 
[#5200](https://github.com/ClickHouse/ClickHouse/pull/5200) ([hcz](https://github.com/hczhcz)) -* Support rename operation for `MaterializeView` storage. [#5209](https://github.com/ClickHouse/ClickHouse/pull/5209) ([Guillaume Tassery](https://github.com/YiuRULE)) -* Added server which allows connecting to ClickHouse using MySQL client. [#4715](https://github.com/ClickHouse/ClickHouse/pull/4715) ([Yuriy Baranov](https://github.com/yurriy)) -* Add `toDecimal*OrZero` and `toDecimal*OrNull` functions. [#5291](https://github.com/ClickHouse/ClickHouse/pull/5291) ([Artem Zuikov](https://github.com/4ertus2)) -* Support Decimal types in functions: `quantile`, `quantiles`, `median`, `quantileExactWeighted`, `quantilesExactWeighted`, medianExactWeighted. [#5304](https://github.com/ClickHouse/ClickHouse/pull/5304) ([Artem Zuikov](https://github.com/4ertus2)) -* Added `toValidUTF8` function, which replaces all invalid UTF-8 characters by replacement character � (U+FFFD). [#5322](https://github.com/ClickHouse/ClickHouse/pull/5322) ([Danila Kutenin](https://github.com/danlark1)) -* Added `format` function. Formatting constant pattern (simplified Python format pattern) with the strings listed in the arguments. [#5330](https://github.com/ClickHouse/ClickHouse/pull/5330) ([Danila Kutenin](https://github.com/danlark1)) -* Added `system.detached_parts` table containing information about detached parts of `MergeTree` tables. [#5353](https://github.com/ClickHouse/ClickHouse/pull/5353) ([akuzm](https://github.com/akuzm)) -* Added `ngramSearch` function to calculate the non-symmetric difference between needle and haystack. [#5418](https://github.com/ClickHouse/ClickHouse/pull/5418)[#5422](https://github.com/ClickHouse/ClickHouse/pull/5422) ([Danila Kutenin](https://github.com/danlark1)) -* Implementation of basic machine learning methods (stochastic linear regression and logistic regression) using aggregate functions interface. Has different strategies for updating model weights (simple gradient descent, momentum method, Nesterov method). Also supports mini-batches of custom size. [#4943](https://github.com/ClickHouse/ClickHouse/pull/4943) ([Quid37](https://github.com/Quid37)) -* Implementation of `geohashEncode` and `geohashDecode` functions. [#5003](https://github.com/ClickHouse/ClickHouse/pull/5003) ([Vasily Nemkov](https://github.com/Enmk)) -* Added aggregate function `timeSeriesGroupSum`, which can aggregate different time series that sample timestamp not alignment. It will use linear interpolation between two sample timestamp and then sum time-series together. Added aggregate function `timeSeriesGroupRateSum`, which calculates the rate of time-series and then sum rates together. [#4542](https://github.com/ClickHouse/ClickHouse/pull/4542) ([Yangkuan Liu](https://github.com/LiuYangkuan)) -* Added functions `IPv4CIDRtoIPv4Range` and `IPv6CIDRtoIPv6Range` to calculate the lower and higher bounds for an IP in the subnet using a CIDR. [#5095](https://github.com/ClickHouse/ClickHouse/pull/5095) ([Guillaume Tassery](https://github.com/YiuRULE)) -* Add a X-ClickHouse-Summary header when we send a query using HTTP with enabled setting `send_progress_in_http_headers`. Return the usual information of X-ClickHouse-Progress, with additional information like how many rows and bytes were inserted in the query. 
[#5116](https://github.com/ClickHouse/ClickHouse/pull/5116) ([Guillaume Tassery](https://github.com/YiuRULE)) - -#### Improvements -* Added `max_parts_in_total` setting for MergeTree family of tables (default: 100 000) that prevents unsafe specification of partition key #5166. [#5171](https://github.com/ClickHouse/ClickHouse/pull/5171) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* `clickhouse-obfuscator`: derive seed for individual columns by combining initial seed with column name, not column position. This is intended to transform datasets with multiple related tables, so that tables will remain JOINable after transformation. [#5178](https://github.com/ClickHouse/ClickHouse/pull/5178) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added functions `JSONExtractRaw`, `JSONExtractKeyAndValues`. Renamed functions `jsonExtract` to `JSONExtract`. When something goes wrong these functions return the correspondent values, not `NULL`. Modified function `JSONExtract`, now it gets the return type from its last parameter and doesn't inject nullables. Implemented fallback to RapidJSON in case AVX2 instructions are not available. Simdjson library updated to a new version. [#5235](https://github.com/ClickHouse/ClickHouse/pull/5235) ([Vitaly Baranov](https://github.com/vitlibar)) -* Now `if` and `multiIf` functions don't rely on the condition's `Nullable`, but rely on the branches for sql compatibility. [#5238](https://github.com/ClickHouse/ClickHouse/pull/5238) ([Jian Wu](https://github.com/janplus)) -* `In` predicate now generates `Null` result from `Null` input like the `Equal` function. [#5152](https://github.com/ClickHouse/ClickHouse/pull/5152) ([Jian Wu](https://github.com/janplus)) -* Check the time limit every (flush_interval / poll_timeout) number of rows from Kafka. This allows to break the reading from Kafka consumer more frequently and to check the time limits for the top-level streams [#5249](https://github.com/ClickHouse/ClickHouse/pull/5249) ([Ivan](https://github.com/abyss7)) -* Link rdkafka with bundled SASL. It should allow to use SASL SCRAM authentication [#5253](https://github.com/ClickHouse/ClickHouse/pull/5253) ([Ivan](https://github.com/abyss7)) -* Batched version of RowRefList for ALL JOINS. [#5267](https://github.com/ClickHouse/ClickHouse/pull/5267) ([Artem Zuikov](https://github.com/4ertus2)) -* clickhouse-server: more informative listen error messages. [#5268](https://github.com/ClickHouse/ClickHouse/pull/5268) ([proller](https://github.com/proller)) -* Support dictionaries in clickhouse-copier for functions in `` [#5270](https://github.com/ClickHouse/ClickHouse/pull/5270) ([proller](https://github.com/proller)) -* Add new setting `kafka_commit_every_batch` to regulate Kafka committing policy. -It allows to set commit mode: after every batch of messages is handled, or after the whole block is written to the storage. It's a trade-off between losing some messages or reading them twice in some extreme situations. [#5308](https://github.com/ClickHouse/ClickHouse/pull/5308) ([Ivan](https://github.com/abyss7)) -* Make `windowFunnel` support other Unsigned Integer Types. [#5320](https://github.com/ClickHouse/ClickHouse/pull/5320) ([sundyli](https://github.com/sundy-li)) -* Allow to shadow virtual column `_table` in Merge engine. 
[#5325](https://github.com/ClickHouse/ClickHouse/pull/5325) ([Ivan](https://github.com/abyss7)) -* Make `sequenceMatch` aggregate functions support other unsigned Integer types [#5339](https://github.com/ClickHouse/ClickHouse/pull/5339) ([sundyli](https://github.com/sundy-li)) -* Better error messages if checksum mismatch is most likely caused by hardware failures. [#5355](https://github.com/ClickHouse/ClickHouse/pull/5355) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Check that underlying tables support sampling for `StorageMerge` [#5366](https://github.com/ClickHouse/ClickHouse/pull/5366) ([Ivan](https://github.com/abyss7)) -* Сlose MySQL connections after their usage in external dictionaries. It is related to issue #893. [#5395](https://github.com/ClickHouse/ClickHouse/pull/5395) ([Clément Rodriguez](https://github.com/clemrodriguez)) -* Improvements of MySQL Wire Protocol. Changed name of format to MySQLWire. Using RAII for calling RSA_free. Disabling SSL if context cannot be created. [#5419](https://github.com/ClickHouse/ClickHouse/pull/5419) ([Yuriy Baranov](https://github.com/yurriy)) -* clickhouse-client: allow to run with unaccessable history file (read-only, no disk space, file is directory, ...). [#5431](https://github.com/ClickHouse/ClickHouse/pull/5431) ([proller](https://github.com/proller)) -* Respect query settings in asynchronous INSERTs into Distributed tables. [#4936](https://github.com/ClickHouse/ClickHouse/pull/4936) ([TCeason](https://github.com/TCeason)) -* Renamed functions `leastSqr` to `simpleLinearRegression`, `LinearRegression` to `linearRegression`, `LogisticRegression` to `logisticRegression`. [#5391](https://github.com/ClickHouse/ClickHouse/pull/5391) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) - -#### Performance Improvements -* Parallelize processing of parts of non-replicated MergeTree tables in ALTER MODIFY query. [#4639](https://github.com/ClickHouse/ClickHouse/pull/4639) ([Ivan Kush](https://github.com/IvanKush)) -* Optimizations in regular expressions extraction. [#5193](https://github.com/ClickHouse/ClickHouse/pull/5193) [#5191](https://github.com/ClickHouse/ClickHouse/pull/5191) ([Danila Kutenin](https://github.com/danlark1)) -* Do not add right join key column to join result if it's used only in join on section. [#5260](https://github.com/ClickHouse/ClickHouse/pull/5260) ([Artem Zuikov](https://github.com/4ertus2)) -* Freeze the Kafka buffer after first empty response. It avoids multiple invokations of `ReadBuffer::next()` for empty result in some row-parsing streams. [#5283](https://github.com/ClickHouse/ClickHouse/pull/5283) ([Ivan](https://github.com/abyss7)) -* `concat` function optimization for multiple arguments. [#5357](https://github.com/ClickHouse/ClickHouse/pull/5357) ([Danila Kutenin](https://github.com/danlark1)) -* Query optimisation. Allow push down IN statement while rewriting commа/cross join into inner one. [#5396](https://github.com/ClickHouse/ClickHouse/pull/5396) ([Artem Zuikov](https://github.com/4ertus2)) -* Upgrade our LZ4 implementation with reference one to have faster decompression. [#5070](https://github.com/ClickHouse/ClickHouse/pull/5070) ([Danila Kutenin](https://github.com/danlark1)) -* Implemented MSD radix sort (based on kxsort), and partial sorting. 
[#5129](https://github.com/ClickHouse/ClickHouse/pull/5129) ([Evgenii Pravda](https://github.com/kvinty)) - -#### Bug Fixes -* Fix push require columns with join [#5192](https://github.com/ClickHouse/ClickHouse/pull/5192) ([Winter Zhang](https://github.com/zhang2014)) -* Fixed bug, when ClickHouse is run by systemd, the command `sudo service clickhouse-server forcerestart` was not working as expected. [#5204](https://github.com/ClickHouse/ClickHouse/pull/5204) ([proller](https://github.com/proller)) -* Fix http error codes in DataPartsExchange (interserver http server on 9009 port always returned code 200, even on errors). [#5216](https://github.com/ClickHouse/ClickHouse/pull/5216) ([proller](https://github.com/proller)) -* Fix SimpleAggregateFunction for String longer than MAX_SMALL_STRING_SIZE [#5311](https://github.com/ClickHouse/ClickHouse/pull/5311) ([Azat Khuzhin](https://github.com/azat)) -* Fix error for `Decimal` to `Nullable(Decimal)` conversion in IN. Support other Decimal to Decimal conversions (including different scales). [#5350](https://github.com/ClickHouse/ClickHouse/pull/5350) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed FPU clobbering in simdjson library that lead to wrong calculation of `uniqHLL` and `uniqCombined` aggregate function and math functions such as `log`. [#5354](https://github.com/ClickHouse/ClickHouse/pull/5354) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed handling mixed const/nonconst cases in JSON functions. [#5435](https://github.com/ClickHouse/ClickHouse/pull/5435) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fix `retention` function. Now all conditions that satisfy in a row of data are added to the data state. [#5119](https://github.com/ClickHouse/ClickHouse/pull/5119) ([小路](https://github.com/nicelulu)) -* Fix result type for `quantileExact` with Decimals. [#5304](https://github.com/ClickHouse/ClickHouse/pull/5304) ([Artem Zuikov](https://github.com/4ertus2)) - -#### Documentation -* Translate documentation for `CollapsingMergeTree` to chinese. [#5168](https://github.com/ClickHouse/ClickHouse/pull/5168) ([张风啸](https://github.com/AlexZFX)) -* Translate some documentation about table engines to chinese. - [#5134](https://github.com/ClickHouse/ClickHouse/pull/5134) - [#5328](https://github.com/ClickHouse/ClickHouse/pull/5328) +## ClickHouse release v19.17 {#clickhouse-release-v19-17} + +### ClickHouse release v19.17.6.36, 2019-12-27 {#clickhouse-release-v19-17-6-36-2019-12-27} + +#### Bug Fix {#bug-fix} + +- Fixed potential buffer overflow in decompress. Malicious user can pass fabricated compressed data that could cause read after buffer. This issue was found by Eldar Zaitov from Yandex information security team. [\#8404](https://github.com/ClickHouse/ClickHouse/pull/8404) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed possible server crash (`std::terminate`) when the server cannot send or write data in JSON or XML format with values of String data type (that require UTF-8 validation) or when compressing result data with Brotli algorithm or in some other rare cases. [\#8384](https://github.com/ClickHouse/ClickHouse/pull/8384) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed dictionaries with source from a clickhouse `VIEW`, now reading such dictionaries doesn’t cause the error `There is no query`. 
[\#8351](https://github.com/ClickHouse/ClickHouse/pull/8351) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fixed checking if a client host is allowed by host\_regexp specified in users.xml. [\#8241](https://github.com/ClickHouse/ClickHouse/pull/8241), [\#8342](https://github.com/ClickHouse/ClickHouse/pull/8342) ([Vitaly Baranov](https://github.com/vitlibar)) +- `RENAME TABLE` for a distributed table now renames the folder containing inserted data before sending to shards. This fixes an issue with successive renames `tableA->tableB`, `tableC->tableA`. [\#8306](https://github.com/ClickHouse/ClickHouse/pull/8306) ([tavplubix](https://github.com/tavplubix)) +- `range_hashed` external dictionaries created by DDL queries now allow ranges of arbitrary numeric types. [\#8275](https://github.com/ClickHouse/ClickHouse/pull/8275) ([alesapin](https://github.com/alesapin)) +- Fixed `INSERT INTO table SELECT ... FROM mysql(...)` table function. [\#8234](https://github.com/ClickHouse/ClickHouse/pull/8234) ([tavplubix](https://github.com/tavplubix)) +- Fixed segfault in `INSERT INTO TABLE FUNCTION file()` while inserting into a file which doesn’t exist. Now in this case the file would be created and then the insert would be processed. [\#8177](https://github.com/ClickHouse/ClickHouse/pull/8177) ([Olga Khvostikova](https://github.com/stavrolia)) +- Fixed bitmapAnd error when intersecting an aggregated bitmap and a scalar bitmap. [\#8082](https://github.com/ClickHouse/ClickHouse/pull/8082) ([Yue Huang](https://github.com/moon03432)) +- Fixed segfault when `EXISTS` query was used without `TABLE` or `DICTIONARY` qualifier, just like `EXISTS t`. [\#8213](https://github.com/ClickHouse/ClickHouse/pull/8213) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed return type for functions `rand` and `randConstant` in case of nullable argument. Now functions always return `UInt32` and never `Nullable(UInt32)`. [\#8204](https://github.com/ClickHouse/ClickHouse/pull/8204) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fixed `DROP DICTIONARY IF EXISTS db.dict`, now it doesn’t throw exception if `db` doesn’t exist. [\#8185](https://github.com/ClickHouse/ClickHouse/pull/8185) ([Vitaly Baranov](https://github.com/vitlibar)) +- If a table wasn’t completely dropped because of server crash, the server will try to restore and load it [\#8176](https://github.com/ClickHouse/ClickHouse/pull/8176) ([tavplubix](https://github.com/tavplubix)) +- Fixed a trivial count query for a distributed table if there are more than two local shard tables. [\#8164](https://github.com/ClickHouse/ClickHouse/pull/8164) ([小路](https://github.com/nicelulu)) +- Fixed a bug that led to a data race in DB::BlockStreamProfileInfo::calculateRowsBeforeLimit() [\#8143](https://github.com/ClickHouse/ClickHouse/pull/8143) ([Alexander Kazakov](https://github.com/Akazz)) +- Fixed `ALTER table MOVE part` executed immediately after merging the specified part, which could cause moving a part which the specified part had been merged into. Now it correctly moves the specified part. [\#8104](https://github.com/ClickHouse/ClickHouse/pull/8104) ([Vladimir Chebotarev](https://github.com/excitoon)) +- Expressions for dictionaries can be specified as strings now. This is useful for calculation of attributes while extracting data from non-ClickHouse sources because it allows to use non-ClickHouse syntax for those expressions.
[\#8098](https://github.com/ClickHouse/ClickHouse/pull/8098) ([alesapin](https://github.com/alesapin)) +- Fixed a very rare race in `clickhouse-copier` because of an overflow in ZXid. [\#8088](https://github.com/ClickHouse/ClickHouse/pull/8088) ([Ding Xiang Fei](https://github.com/dingxiangfei2009)) +- Fixed the bug when, after a query failed (due to “Too many simultaneous queries” for example), it would not read external tables info, and the + next request would interpret this info as the beginning of the next query, causing an error like `Unknown packet from client`. [\#8084](https://github.com/ClickHouse/ClickHouse/pull/8084) ([Azat Khuzhin](https://github.com/azat)) +- Avoid null dereference after “Unknown packet X from server” [\#8071](https://github.com/ClickHouse/ClickHouse/pull/8071) ([Azat Khuzhin](https://github.com/azat)) +- Restore support of all ICU locales, add the ability to apply collations for constant expressions and add language name to system.collations table. [\#8051](https://github.com/ClickHouse/ClickHouse/pull/8051) ([alesapin](https://github.com/alesapin)) +- Number of streams for reading from `StorageFile` and `StorageHDFS` is now limited, to avoid exceeding the memory limit. [\#7981](https://github.com/ClickHouse/ClickHouse/pull/7981) ([alesapin](https://github.com/alesapin)) +- Fixed `CHECK TABLE` query for `*MergeTree` tables without key. [\#7979](https://github.com/ClickHouse/ClickHouse/pull/7979) ([alesapin](https://github.com/alesapin)) +- Removed the mutation number from a part name in case there were no mutations. This removal improved compatibility with older versions. [\#8250](https://github.com/ClickHouse/ClickHouse/pull/8250) ([alesapin](https://github.com/alesapin)) +- Fixed the bug that mutations are skipped for some attached parts due to their data\_version being larger than the table mutation version. [\#7812](https://github.com/ClickHouse/ClickHouse/pull/7812) ([Zhichang Yu](https://github.com/yuzhichang)) +- Allow starting the server with redundant copies of parts after moving them to another device. [\#7810](https://github.com/ClickHouse/ClickHouse/pull/7810) ([Vladimir Chebotarev](https://github.com/excitoon)) +- Fixed the error “Sizes of columns doesn’t match” that might appear when using aggregate function columns. [\#7790](https://github.com/ClickHouse/ClickHouse/pull/7790) ([Boris Granveaud](https://github.com/bgranvea)) +- Now an exception will be thrown in case of using WITH TIES alongside LIMIT BY. And now it’s possible to use TOP with LIMIT BY. [\#7637](https://github.com/ClickHouse/ClickHouse/pull/7637) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) +- Fix dictionary reload if it has `invalidate_query`, which stopped updates and raised an exception on previous update attempts. [\#8029](https://github.com/ClickHouse/ClickHouse/pull/8029) ([alesapin](https://github.com/alesapin)) + +### ClickHouse release v19.17.4.11, 2019-11-22 {#clickhouse-release-v19-17-4-11-2019-11-22} + +#### Backward Incompatible Change {#backward-incompatible-change} + +- Using column instead of AST to store scalar subquery results for better performance. Setting `enable_scalar_subquery_optimization` was added in 19.17 and it was enabled by default. It leads to errors like [this](https://github.com/ClickHouse/ClickHouse/issues/7851) during upgrade to 19.17.2 or 19.17.3 from previous versions. This setting was disabled by default in 19.17.4, to make it possible to upgrade from 19.16 and older versions without errors.
[\#7392](https://github.com/ClickHouse/ClickHouse/pull/7392) ([Amos Bird](https://github.com/amosbird)) + +#### New Feature {#new-feature} + +- Add the ability to create dictionaries with DDL queries. [\#7360](https://github.com/ClickHouse/ClickHouse/pull/7360) ([alesapin](https://github.com/alesapin)) +- Make `bloom_filter` type of index support `LowCardinality` and `Nullable` [\#7363](https://github.com/ClickHouse/ClickHouse/issues/7363) [\#7561](https://github.com/ClickHouse/ClickHouse/pull/7561) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Add function `isValidJSON` to check that the passed string is valid JSON. [\#5910](https://github.com/ClickHouse/ClickHouse/issues/5910) [\#7293](https://github.com/ClickHouse/ClickHouse/pull/7293) ([Vdimir](https://github.com/Vdimir)) +- Implement `arrayCompact` function [\#7328](https://github.com/ClickHouse/ClickHouse/pull/7328) ([Memo](https://github.com/Joeywzr)) +- Created function `hex` for Decimal numbers. It works like `hex(reinterpretAsString())`, but doesn’t delete trailing zero bytes. [\#7355](https://github.com/ClickHouse/ClickHouse/pull/7355) ([Mikhail Korotov](https://github.com/millb)) +- Add `arrayFill` and `arrayReverseFill` functions, which replace elements by other elements in front of/behind them in the array. [\#7380](https://github.com/ClickHouse/ClickHouse/pull/7380) ([hcz](https://github.com/hczhcz)) +- Add `CRC32IEEE()`/`CRC64()` support [\#7480](https://github.com/ClickHouse/ClickHouse/pull/7480) ([Azat Khuzhin](https://github.com/azat)) +- Implement `char` function similar to the one in [mysql](https://dev.mysql.com/doc/refman/8.0/en/string-functions.html#function_char) [\#7486](https://github.com/ClickHouse/ClickHouse/pull/7486) ([sundyli](https://github.com/sundy-li)) +- Add `bitmapTransform` function. It transforms an array of values in a bitmap to another array of values; the result is a new bitmap [\#7598](https://github.com/ClickHouse/ClickHouse/pull/7598) ([Zhichang Yu](https://github.com/yuzhichang)) +- Implemented `javaHashUTF16LE()` function [\#7651](https://github.com/ClickHouse/ClickHouse/pull/7651) ([achimbab](https://github.com/achimbab)) +- Add `_shard_num` virtual column for the Distributed engine [\#7624](https://github.com/ClickHouse/ClickHouse/pull/7624) ([Azat Khuzhin](https://github.com/azat)) + +#### Experimental Feature {#experimental-feature} + +- Support for processors (new query execution pipeline) in `MergeTree`. [\#7181](https://github.com/ClickHouse/ClickHouse/pull/7181) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) + +#### Bug Fix {#bug-fix-1} + +- Fix incorrect float parsing in `Values` [\#7817](https://github.com/ClickHouse/ClickHouse/issues/7817) [\#7870](https://github.com/ClickHouse/ClickHouse/pull/7870) ([tavplubix](https://github.com/tavplubix)) +- Fix rare deadlock which can happen when trace\_log is enabled. [\#7838](https://github.com/ClickHouse/ClickHouse/pull/7838) ([filimonov](https://github.com/filimonov)) +- Prevent message duplication when producing to a Kafka table that has any MVs selecting from it [\#7265](https://github.com/ClickHouse/ClickHouse/pull/7265) ([Ivan](https://github.com/abyss7)) +- Support for `Array(LowCardinality(Nullable(String)))` in `IN`. Resolves [\#7364](https://github.com/ClickHouse/ClickHouse/issues/7364) [\#7366](https://github.com/ClickHouse/ClickHouse/pull/7366) ([achimbab](https://github.com/achimbab)) +- Add handling of `SQL_TINYINT` and `SQL_BIGINT`, and fix handling of `SQL_FLOAT` data source types in ODBC Bridge.
[\#7491](https://github.com/ClickHouse/ClickHouse/pull/7491) ([Denis Glazachev](https://github.com/traceon)) +- Fix aggregation (`avg` and quantiles) over empty decimal columns [\#7431](https://github.com/ClickHouse/ClickHouse/pull/7431) ([Andrey Konyaev](https://github.com/akonyaev90)) +- Fix `INSERT` into Distributed with `MATERIALIZED` columns [\#7377](https://github.com/ClickHouse/ClickHouse/pull/7377) ([Azat Khuzhin](https://github.com/azat)) +- Make `MOVE PARTITION` work if some parts of the partition are already on the destination disk or volume [\#7434](https://github.com/ClickHouse/ClickHouse/pull/7434) ([Vladimir Chebotarev](https://github.com/excitoon)) +- Fixed bug with hardlinks failing to be created during mutations in `ReplicatedMergeTree` in multi-disk configurations. [\#7558](https://github.com/ClickHouse/ClickHouse/pull/7558) ([Vladimir Chebotarev](https://github.com/excitoon)) +- Fixed a bug with a mutation on a MergeTree when the whole part remains unchanged and the best space is being found on another disk [\#7602](https://github.com/ClickHouse/ClickHouse/pull/7602) ([Vladimir Chebotarev](https://github.com/excitoon)) +- Fixed bug with `keep_free_space_ratio` not being read from disks configuration [\#7645](https://github.com/ClickHouse/ClickHouse/pull/7645) ([Vladimir Chebotarev](https://github.com/excitoon)) +- Fix bug with tables containing only `Tuple` columns or columns with complex paths. Fixes [7541](https://github.com/ClickHouse/ClickHouse/issues/7541). [\#7545](https://github.com/ClickHouse/ClickHouse/pull/7545) ([alesapin](https://github.com/alesapin)) +- Do not account memory for Buffer engine in max\_memory\_usage limit [\#7552](https://github.com/ClickHouse/ClickHouse/pull/7552) ([Azat Khuzhin](https://github.com/azat)) +- Fix final mark usage in `MergeTree` tables ordered by `tuple()`. In rare cases it could lead to `Can't adjust last granule` error while selecting. [\#7639](https://github.com/ClickHouse/ClickHouse/pull/7639) ([Anton Popov](https://github.com/CurtizJ)) +- Fix bug in mutations that have a predicate with actions that require context (for example functions for JSON), which may lead to crashes or strange exceptions. [\#7664](https://github.com/ClickHouse/ClickHouse/pull/7664) ([alesapin](https://github.com/alesapin)) +- Fix mismatch of database and table names escaping in `data/` and `shadow/` directories [\#7575](https://github.com/ClickHouse/ClickHouse/pull/7575) ([Alexander Burmak](https://github.com/Alex-Burmak)) +- Support duplicated keys in RIGHT\|FULL JOINs, e.g. `ON t.x = u.x AND t.x = u.y`. Fix crash in this case. [\#7586](https://github.com/ClickHouse/ClickHouse/pull/7586) ([Artem Zuikov](https://github.com/4ertus2)) +- Fix `Not found column in block` when joining on expression with RIGHT or FULL JOIN. [\#7641](https://github.com/ClickHouse/ClickHouse/pull/7641) ([Artem Zuikov](https://github.com/4ertus2)) +- One more attempt to fix infinite loop in `PrettySpace` format [\#7591](https://github.com/ClickHouse/ClickHouse/pull/7591) ([Olga Khvostikova](https://github.com/stavrolia)) +- Fix bug in `concat` function when all arguments were `FixedString` of the same size. [\#7635](https://github.com/ClickHouse/ClickHouse/pull/7635) ([alesapin](https://github.com/alesapin)) +- Fixed exception in case of using 1 argument while defining S3, URL and HDFS storages.
[\#7618](https://github.com/ClickHouse/ClickHouse/pull/7618) ([Vladimir Chebotarev](https://github.com/excitoon)) +- Fix scope of the InterpreterSelectQuery for views with query [\#7601](https://github.com/ClickHouse/ClickHouse/pull/7601) ([Azat Khuzhin](https://github.com/azat)) + +#### Improvement {#improvement} + +- `Nullable` columns recognized and NULL-values handled correctly by ODBC-bridge [\#7402](https://github.com/ClickHouse/ClickHouse/pull/7402) ([Vasily Nemkov](https://github.com/Enmk)) +- Write current batch for distributed send atomically [\#7600](https://github.com/ClickHouse/ClickHouse/pull/7600) ([Azat Khuzhin](https://github.com/azat)) +- Throw an exception if we cannot detect table for column name in query. [\#7358](https://github.com/ClickHouse/ClickHouse/pull/7358) ([Artem Zuikov](https://github.com/4ertus2)) +- Add `merge_max_block_size` setting to `MergeTreeSettings` [\#7412](https://github.com/ClickHouse/ClickHouse/pull/7412) ([Artem Zuikov](https://github.com/4ertus2)) +- Queries with `HAVING` and without `GROUP BY` assume group by constant. So, `SELECT 1 HAVING 1` now returns a result. [\#7496](https://github.com/ClickHouse/ClickHouse/pull/7496) ([Amos Bird](https://github.com/amosbird)) +- Support parsing `(X,)` as tuple similar to python. [\#7501](https://github.com/ClickHouse/ClickHouse/pull/7501), [\#7562](https://github.com/ClickHouse/ClickHouse/pull/7562) ([Amos Bird](https://github.com/amosbird)) +- Make `range` function behaviors almost like pythonic one. [\#7518](https://github.com/ClickHouse/ClickHouse/pull/7518) ([sundyli](https://github.com/sundy-li)) +- Add `constraints` columns to table `system.settings` [\#7553](https://github.com/ClickHouse/ClickHouse/pull/7553) ([Vitaly Baranov](https://github.com/vitlibar)) +- Better Null format for tcp handler, so that it’s possible to use `select ignore() from table format Null` for perf measure via clickhouse-client [\#7606](https://github.com/ClickHouse/ClickHouse/pull/7606) ([Amos Bird](https://github.com/amosbird)) +- Queries like `CREATE TABLE ... AS (SELECT (1, 2))` are parsed correctly [\#7542](https://github.com/ClickHouse/ClickHouse/pull/7542) ([hcz](https://github.com/hczhcz)) + +#### Performance Improvement {#performance-improvement} + +- The performance of aggregation over short string keys is improved. [\#6243](https://github.com/ClickHouse/ClickHouse/pull/6243) ([Alexander Kuzmenkov](https://github.com/akuzm), [Amos Bird](https://github.com/amosbird)) +- Run another pass of syntax/expression analysis to get potential optimizations after constant predicates are folded. [\#7497](https://github.com/ClickHouse/ClickHouse/pull/7497) ([Amos Bird](https://github.com/amosbird)) +- Use storage meta info to evaluate trivial `SELECT count() FROM table;` [\#7510](https://github.com/ClickHouse/ClickHouse/pull/7510) ([Amos Bird](https://github.com/amosbird), [alexey-milovidov](https://github.com/alexey-milovidov)) +- Vectorize processing `arrayReduce` similar to Aggregator `addBatch`. [\#7608](https://github.com/ClickHouse/ClickHouse/pull/7608) ([Amos Bird](https://github.com/amosbird)) +- Minor improvements in performance of `Kafka` consumption [\#7475](https://github.com/ClickHouse/ClickHouse/pull/7475) ([Ivan](https://github.com/abyss7)) + +#### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement} + +- Add support for cross-compiling to the CPU architecture AARCH64. Refactor packager script. 
[\#7370](https://github.com/ClickHouse/ClickHouse/pull/7370) [\#7539](https://github.com/ClickHouse/ClickHouse/pull/7539) ([Ivan](https://github.com/abyss7)) +- Unpack darwin-x86\_64 and linux-aarch64 toolchains into mounted Docker volume when building packages [\#7534](https://github.com/ClickHouse/ClickHouse/pull/7534) ([Ivan](https://github.com/abyss7)) +- Update Docker Image for Binary Packager [\#7474](https://github.com/ClickHouse/ClickHouse/pull/7474) ([Ivan](https://github.com/abyss7)) +- Fixed compile errors on macOS Catalina [\#7585](https://github.com/ClickHouse/ClickHouse/pull/7585) ([Ernest Poletaev](https://github.com/ernestp)) +- Some refactoring in query analysis logic: split complex class into several simple ones. [\#7454](https://github.com/ClickHouse/ClickHouse/pull/7454) ([Artem Zuikov](https://github.com/4ertus2)) +- Fix build without submodules [\#7295](https://github.com/ClickHouse/ClickHouse/pull/7295) ([proller](https://github.com/proller)) +- Better `add_globs` in CMake files [\#7418](https://github.com/ClickHouse/ClickHouse/pull/7418) ([Amos Bird](https://github.com/amosbird)) +- Remove hardcoded paths in `unwind` target [\#7460](https://github.com/ClickHouse/ClickHouse/pull/7460) ([Konstantin Podshumok](https://github.com/podshumok)) +- Allow using MySQL format without SSL [\#7524](https://github.com/ClickHouse/ClickHouse/pull/7524) ([proller](https://github.com/proller)) + +#### Other {#other} + +- Added ANTLR4 grammar for ClickHouse SQL dialect [\#7595](https://github.com/ClickHouse/ClickHouse/issues/7595) [\#7596](https://github.com/ClickHouse/ClickHouse/pull/7596) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +## ClickHouse release v19.16 {#clickhouse-release-v19-16} + +### ClickHouse release v19.16.14.65, 2020-03-05 {#clickhouse-release-v19-16-14-65-2020-03-05} + +- Fix incompatibility of distributed subqueries with older ClickHouse versions. Fixes [\#7851](https://github.com/ClickHouse/ClickHouse/issues/7851) + [(tavplubix)](https://github.com/tavplubix) +- When executing `CREATE` query, fold constant expressions in storage engine arguments. Replace empty database name with current database. Fixes [\#6508](https://github.com/ClickHouse/ClickHouse/issues/6508), [\#3492](https://github.com/ClickHouse/ClickHouse/issues/3492). Also fix check for local address in `ClickHouseDictionarySource`. + [\#9262](https://github.com/ClickHouse/ClickHouse/pull/9262) [(tavplubix)](https://github.com/tavplubix) +- Now background merges in `*MergeTree` table engines family preserve storage policy volume order more accurately. + [\#8549](https://github.com/ClickHouse/ClickHouse/pull/8549) ([Vladimir Chebotarev](https://github.com/excitoon)) +- Prevent losing data in `Kafka` in rare cases when exception happens after reading suffix but before commit. Fixes [\#9378](https://github.com/ClickHouse/ClickHouse/issues/9378). Related: [\#7175](https://github.com/ClickHouse/ClickHouse/issues/7175) + [\#9507](https://github.com/ClickHouse/ClickHouse/pull/9507) [(filimonov)](https://github.com/filimonov) +- Fix bug leading to server termination when trying to use or drop a `Kafka` table created with wrong parameters. Fixes [\#9494](https://github.com/ClickHouse/ClickHouse/issues/9494). Incorporates [\#9507](https://github.com/ClickHouse/ClickHouse/issues/9507). + [\#9513](https://github.com/ClickHouse/ClickHouse/pull/9513) [(filimonov)](https://github.com/filimonov) +- Allow using `MaterializedView` with subqueries above `Kafka` tables.
+ [\#8197](https://github.com/ClickHouse/ClickHouse/pull/8197) ([filimonov](https://github.com/filimonov)) + +#### New Feature {#new-feature-1} + +- Add `deduplicate_blocks_in_dependent_materialized_views` option to control the behaviour of idempotent inserts into tables with materialized views. This new feature was added to the bugfix release by a special request from Altinity. + [\#9070](https://github.com/ClickHouse/ClickHouse/pull/9070) [(urykhy)](https://github.com/urykhy) + +### ClickHouse release v19.16.2.2, 2019-10-30 {#clickhouse-release-v19-16-2-2-2019-10-30} + +#### Backward Incompatible Change {#backward-incompatible-change-1} + +- Add missing arity validation for count/countIf. + [\#7095](https://github.com/ClickHouse/ClickHouse/issues/7095) + [\#7298](https://github.com/ClickHouse/ClickHouse/pull/7298) ([Vdimir](https://github.com/Vdimir)) +- Remove legacy `asterisk_left_columns_only` setting (it was disabled by default). + [\#7335](https://github.com/ClickHouse/ClickHouse/pull/7335) ([Artem + Zuikov](https://github.com/4ertus2)) +- Format strings for Template data format are now specified in files. + [\#7118](https://github.com/ClickHouse/ClickHouse/pull/7118) + ([tavplubix](https://github.com/tavplubix)) + +#### New Feature {#new-feature-2} + +- Introduce uniqCombined64() to calculate cardinality greater than UINT\_MAX. + [\#7213](https://github.com/ClickHouse/ClickHouse/pull/7213), + [\#7222](https://github.com/ClickHouse/ClickHouse/pull/7222) ([Azat + Khuzhin](https://github.com/azat)) +- Support Bloom filter indexes on Array columns. + [\#6984](https://github.com/ClickHouse/ClickHouse/pull/6984) + ([achimbab](https://github.com/achimbab)) +- Add a function `getMacro(name)` that returns String with the value of the corresponding `<macros>` + from server configuration. [\#7240](https://github.com/ClickHouse/ClickHouse/pull/7240) + ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Set two configuration options for a dictionary based on an HTTP source: `credentials` and + `http-headers`. [\#7092](https://github.com/ClickHouse/ClickHouse/pull/7092) ([Guillaume + Tassery](https://github.com/YiuRULE)) +- Add a new ProfileEvent `Merge` that counts the number of launched background merges. + [\#7093](https://github.com/ClickHouse/ClickHouse/pull/7093) ([Mikhail + Korotov](https://github.com/millb)) +- Add fullHostName function that returns a fully qualified domain name. + [\#7263](https://github.com/ClickHouse/ClickHouse/issues/7263) + [\#7291](https://github.com/ClickHouse/ClickHouse/pull/7291) ([sundyli](https://github.com/sundy-li)) +- Add functions `arraySplit` and `arrayReverseSplit`, which split an array by “cut off” + conditions. They are useful in time sequence handling; see the example below. + [\#7294](https://github.com/ClickHouse/ClickHouse/pull/7294) ([hcz](https://github.com/hczhcz)) +- Add new functions that return the Array of all matched indices in multiMatch family of functions. + [\#7299](https://github.com/ClickHouse/ClickHouse/pull/7299) ([Danila + Kutenin](https://github.com/danlark1)) +- Add a new database engine `Lazy` that is optimized for storing a large number of small -Log + tables. [\#7171](https://github.com/ClickHouse/ClickHouse/pull/7171) ([Nikita + Vasilev](https://github.com/nikvas0))
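A few of the additions above can be exercised directly; the sketch below is illustrative rather than taken from the PRs (the `Lazy` expiration value and the `replica` macro are assumptions, and the `arraySplit` result follows the behaviour described in its entry):

```sql
-- Lazy database engine: tables are kept in RAM only for the given number of seconds after last use
CREATE DATABASE small_logs ENGINE = Lazy(300);

-- arraySplit cuts the array before every position where the lambda returns non-zero
SELECT arraySplit((x, flag) -> flag, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]);  -- [[1, 2, 3], [4, 5]]

-- getMacro reads a value from the <macros> section of the server config (a 'replica' macro is assumed here)
SELECT getMacro('replica');
```

+- Add aggregate functions groupBitmapAnd, -Or, -Xor for bitmap columns.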
[\#7109](https://github.com/ClickHouse/ClickHouse/pull/7109) ([Zhichang + Yu](https://github.com/yuzhichang)) +- Add aggregate function combinators -OrNull and -OrDefault, which return null + or default values when there is nothing to aggregate. + [\#7331](https://github.com/ClickHouse/ClickHouse/pull/7331) + ([hcz](https://github.com/hczhcz)) +- Introduce CustomSeparated data format that supports custom escaping and + delimiter rules. [\#7118](https://github.com/ClickHouse/ClickHouse/pull/7118) + ([tavplubix](https://github.com/tavplubix)) +- Support Redis as source of external dictionary. [\#4361](https://github.com/ClickHouse/ClickHouse/pull/4361) [\#6962](https://github.com/ClickHouse/ClickHouse/pull/6962) ([comunodi](https://github.com/comunodi), [Anton + Popov](https://github.com/CurtizJ)) + +#### Bug Fix {#bug-fix-2} + +- Fix wrong query result if it has `WHERE IN (SELECT ...)` section and `optimize_read_in_order` is + used. [\#7371](https://github.com/ClickHouse/ClickHouse/pull/7371) ([Anton + Popov](https://github.com/CurtizJ)) +- Disabled MariaDB authentication plugin, which depends on files outside of project. + [\#7140](https://github.com/ClickHouse/ClickHouse/pull/7140) ([Yuriy + Baranov](https://github.com/yurriy)) +- Fix exception `Cannot convert column ... because it is constant but values of constants are different in source and result` which could rarely happen when functions `now()`, `today()`, + `yesterday()`, `randConstant()` are used. + [\#7156](https://github.com/ClickHouse/ClickHouse/pull/7156) ([Nikolai + Kochetov](https://github.com/KochetovNicolai)) +- Fixed issue of using HTTP keep alive timeout instead of TCP keep alive timeout. + [\#7351](https://github.com/ClickHouse/ClickHouse/pull/7351) ([Vasily + Nemkov](https://github.com/Enmk)) +- Fixed a segmentation fault in groupBitmapOr (issue [\#7109](https://github.com/ClickHouse/ClickHouse/issues/7109)). + [\#7289](https://github.com/ClickHouse/ClickHouse/pull/7289) ([Zhichang + Yu](https://github.com/yuzhichang)) +- For materialized views the commit for Kafka is called after all data were written. + [\#7175](https://github.com/ClickHouse/ClickHouse/pull/7175) ([Ivan](https://github.com/abyss7)) +- Fixed wrong `duration_ms` value in `system.part_log` table. It was ten times off. + [\#7172](https://github.com/ClickHouse/ClickHouse/pull/7172) ([Vladimir + Chebotarev](https://github.com/excitoon)) +- A quick fix to resolve crash in LIVE VIEW table and re-enabling all LIVE VIEW tests. + [\#7201](https://github.com/ClickHouse/ClickHouse/pull/7201) + ([vzakaznikov](https://github.com/vzakaznikov)) +- Serialize NULL values correctly in min/max indexes of MergeTree parts. + [\#7234](https://github.com/ClickHouse/ClickHouse/pull/7234) ([Alexander + Kuzmenkov](https://github.com/akuzm)) +- Don’t put virtual columns to .sql metadata when table is created as `CREATE TABLE AS`. + [\#7183](https://github.com/ClickHouse/ClickHouse/pull/7183) ([Ivan](https://github.com/abyss7)) +- Fix segmentation fault in `ATTACH PART` query. + [\#7185](https://github.com/ClickHouse/ClickHouse/pull/7185) + ([alesapin](https://github.com/alesapin)) +- Fix wrong result for some queries given by the optimization of empty IN subqueries and empty + INNER/RIGHT JOIN. [\#7284](https://github.com/ClickHouse/ClickHouse/pull/7284) ([Nikolai + Kochetov](https://github.com/KochetovNicolai)) +- Fixing AddressSanitizer error in the LIVE VIEW getHeader() method. 
+ [\#7271](https://github.com/ClickHouse/ClickHouse/pull/7271) + ([vzakaznikov](https://github.com/vzakaznikov)) + +#### Improvement {#improvement-1} + +- Add a message in case of queue\_wait\_max\_ms wait takes place. + [\#7390](https://github.com/ClickHouse/ClickHouse/pull/7390) ([Azat + Khuzhin](https://github.com/azat)) +- Made setting `s3_min_upload_part_size` table-level. + [\#7059](https://github.com/ClickHouse/ClickHouse/pull/7059) ([Vladimir + Chebotarev](https://github.com/excitoon)) +- Check TTL in StorageFactory. [\#7304](https://github.com/ClickHouse/ClickHouse/pull/7304) + ([sundyli](https://github.com/sundy-li)) +- Squash left-hand blocks in partial merge join (optimization). + [\#7122](https://github.com/ClickHouse/ClickHouse/pull/7122) ([Artem + Zuikov](https://github.com/4ertus2)) +- Do not allow non-deterministic functions in mutations of Replicated table engines, because this + can introduce inconsistencies between replicas. + [\#7247](https://github.com/ClickHouse/ClickHouse/pull/7247) ([Alexander + Kazakov](https://github.com/Akazz)) +- Disable memory tracker while converting exception stack trace to string. It can prevent the loss + of error messages of type `Memory limit exceeded` on server, which caused the `Attempt to read after eof` exception on client. [\#7264](https://github.com/ClickHouse/ClickHouse/pull/7264) + ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Miscellaneous format improvements. Resolves + [\#6033](https://github.com/ClickHouse/ClickHouse/issues/6033), + [\#2633](https://github.com/ClickHouse/ClickHouse/issues/2633), + [\#6611](https://github.com/ClickHouse/ClickHouse/issues/6611), + [\#6742](https://github.com/ClickHouse/ClickHouse/issues/6742) + [\#7215](https://github.com/ClickHouse/ClickHouse/pull/7215) + ([tavplubix](https://github.com/tavplubix)) +- ClickHouse ignores values on the right side of IN operator that are not convertible to the left + side type. Make it work properly for compound types – Array and Tuple. + [\#7283](https://github.com/ClickHouse/ClickHouse/pull/7283) ([Alexander + Kuzmenkov](https://github.com/akuzm)) +- Support missing inequalities for ASOF JOIN. It’s possible to join less-or-equal variant and strict + greater and less variants for ASOF column in ON syntax. + [\#7282](https://github.com/ClickHouse/ClickHouse/pull/7282) ([Artem + Zuikov](https://github.com/4ertus2)) +- Optimize partial merge join. [\#7070](https://github.com/ClickHouse/ClickHouse/pull/7070) + ([Artem Zuikov](https://github.com/4ertus2)) +- Do not use more than 98K of memory in uniqCombined functions. + [\#7236](https://github.com/ClickHouse/ClickHouse/pull/7236), + [\#7270](https://github.com/ClickHouse/ClickHouse/pull/7270) ([Azat + Khuzhin](https://github.com/azat)) +- Flush parts of right-hand joining table on disk in PartialMergeJoin (if there is not enough + memory). Load data back when needed. [\#7186](https://github.com/ClickHouse/ClickHouse/pull/7186) + ([Artem Zuikov](https://github.com/4ertus2)) + +#### Performance Improvement {#performance-improvement-1} + +- Speed up joinGet with const arguments by avoiding data duplication. + [\#7359](https://github.com/ClickHouse/ClickHouse/pull/7359) ([Amos + Bird](https://github.com/amosbird)) +- Return early if the subquery is empty. + [\#7007](https://github.com/ClickHouse/ClickHouse/pull/7007) ([小路](https://github.com/nicelulu)) +- Optimize parsing of SQL expression in Values. 
+ [\#6781](https://github.com/ClickHouse/ClickHouse/pull/6781) + ([tavplubix](https://github.com/tavplubix)) + +#### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-1} + +- Disable some contribs for cross-compilation to Mac OS. + [\#7101](https://github.com/ClickHouse/ClickHouse/pull/7101) ([Ivan](https://github.com/abyss7)) +- Add missing linking with PocoXML for clickhouse\_common\_io. + [\#7200](https://github.com/ClickHouse/ClickHouse/pull/7200) ([Azat + Khuzhin](https://github.com/azat)) +- Accept multiple test filter arguments in clickhouse-test. + [\#7226](https://github.com/ClickHouse/ClickHouse/pull/7226) ([Alexander + Kuzmenkov](https://github.com/akuzm)) +- Enable musl and jemalloc for ARM. [\#7300](https://github.com/ClickHouse/ClickHouse/pull/7300) + ([Amos Bird](https://github.com/amosbird)) +- Added `--client-option` parameter to `clickhouse-test` to pass additional parameters to client. + [\#7277](https://github.com/ClickHouse/ClickHouse/pull/7277) ([Nikolai + Kochetov](https://github.com/KochetovNicolai)) +- Preserve existing configs on rpm package upgrade. + [\#7103](https://github.com/ClickHouse/ClickHouse/pull/7103) + ([filimonov](https://github.com/filimonov)) +- Fix errors detected by PVS. [\#7153](https://github.com/ClickHouse/ClickHouse/pull/7153) ([Artem + Zuikov](https://github.com/4ertus2)) +- Fix build for Darwin. [\#7149](https://github.com/ClickHouse/ClickHouse/pull/7149) + ([Ivan](https://github.com/abyss7)) +- glibc 2.29 compatibility. [\#7142](https://github.com/ClickHouse/ClickHouse/pull/7142) ([Amos + Bird](https://github.com/amosbird)) +- Make sure dh\_clean does not touch potential source files. + [\#7205](https://github.com/ClickHouse/ClickHouse/pull/7205) ([Amos + Bird](https://github.com/amosbird)) +- Attempt to avoid conflict when updating from altinity rpm - it has config file packaged separately + in clickhouse-server-common. [\#7073](https://github.com/ClickHouse/ClickHouse/pull/7073) + ([filimonov](https://github.com/filimonov)) +- Optimize some header files for faster rebuilds. + [\#7212](https://github.com/ClickHouse/ClickHouse/pull/7212), + [\#7231](https://github.com/ClickHouse/ClickHouse/pull/7231) ([Alexander + Kuzmenkov](https://github.com/akuzm)) +- Add performance tests for Date and DateTime. [\#7332](https://github.com/ClickHouse/ClickHouse/pull/7332) ([Vasily + Nemkov](https://github.com/Enmk)) +- Fix some tests that contained non-deterministic mutations. + [\#7132](https://github.com/ClickHouse/ClickHouse/pull/7132) ([Alexander + Kazakov](https://github.com/Akazz)) +- Add build with MemorySanitizer to CI. [\#7066](https://github.com/ClickHouse/ClickHouse/pull/7066) + ([Alexander Kuzmenkov](https://github.com/akuzm)) +- Avoid use of uninitialized values in MetricsTransmitter. + [\#7158](https://github.com/ClickHouse/ClickHouse/pull/7158) ([Azat + Khuzhin](https://github.com/azat)) +- Fix some issues in Fields found by MemorySanitizer. + [\#7135](https://github.com/ClickHouse/ClickHouse/pull/7135), + [\#7179](https://github.com/ClickHouse/ClickHouse/pull/7179) ([Alexander + Kuzmenkov](https://github.com/akuzm)), [\#7376](https://github.com/ClickHouse/ClickHouse/pull/7376) + ([Amos Bird](https://github.com/amosbird)) +- Fix undefined behavior in murmurhash32. [\#7388](https://github.com/ClickHouse/ClickHouse/pull/7388) ([Amos + Bird](https://github.com/amosbird)) +- Fix undefined behavior in StoragesInfoStream. 
[\#7384](https://github.com/ClickHouse/ClickHouse/pull/7384) + ([tavplubix](https://github.com/tavplubix)) +- Fixed constant expressions folding for external database engines (MySQL, ODBC, JDBC). In previous + versions it wasn’t working for multiple constant expressions and was not working at all for Date, + DateTime and UUID. This fixes [\#7245](https://github.com/ClickHouse/ClickHouse/issues/7245) + [\#7252](https://github.com/ClickHouse/ClickHouse/pull/7252) + ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixing ThreadSanitizer data race error in the LIVE VIEW when accessing no\_users\_thread variable. + [\#7353](https://github.com/ClickHouse/ClickHouse/pull/7353) + ([vzakaznikov](https://github.com/vzakaznikov)) +- Get rid of malloc symbols in libcommon + [\#7134](https://github.com/ClickHouse/ClickHouse/pull/7134), + [\#7065](https://github.com/ClickHouse/ClickHouse/pull/7065) ([Amos + Bird](https://github.com/amosbird)) +- Add global flag ENABLE\_LIBRARIES for disabling all libraries. + [\#7063](https://github.com/ClickHouse/ClickHouse/pull/7063) + ([proller](https://github.com/proller)) + +#### Code cleanup {#code-cleanup} + +- Generalize configuration repository to prepare for DDL for Dictionaries. [\#7155](https://github.com/ClickHouse/ClickHouse/pull/7155) + ([alesapin](https://github.com/alesapin)) +- Parser for dictionaries DDL without any semantic. + [\#7209](https://github.com/ClickHouse/ClickHouse/pull/7209) + ([alesapin](https://github.com/alesapin)) +- Split ParserCreateQuery into different smaller parsers. + [\#7253](https://github.com/ClickHouse/ClickHouse/pull/7253) + ([alesapin](https://github.com/alesapin)) +- Small refactoring and renaming near external dictionaries. + [\#7111](https://github.com/ClickHouse/ClickHouse/pull/7111) + ([alesapin](https://github.com/alesapin)) +- Refactor some code to prepare for role-based access control. [\#7235](https://github.com/ClickHouse/ClickHouse/pull/7235) ([Vitaly + Baranov](https://github.com/vitlibar)) +- Some improvements in DatabaseOrdinary code. + [\#7086](https://github.com/ClickHouse/ClickHouse/pull/7086) ([Nikita + Vasilev](https://github.com/nikvas0)) +- Do not use iterators in find() and emplace() methods of hash tables. + [\#7026](https://github.com/ClickHouse/ClickHouse/pull/7026) ([Alexander + Kuzmenkov](https://github.com/akuzm)) +- Fix getMultipleValuesFromConfig in case when parameter root is not empty. [\#7374](https://github.com/ClickHouse/ClickHouse/pull/7374) + ([Mikhail Korotov](https://github.com/millb)) +- Remove some copy-paste (TemporaryFile and TemporaryFileStream) + [\#7166](https://github.com/ClickHouse/ClickHouse/pull/7166) ([Artem + Zuikov](https://github.com/4ertus2)) +- Improved code readability a little bit (`MergeTreeData::getActiveContainingPart`). + [\#7361](https://github.com/ClickHouse/ClickHouse/pull/7361) ([Vladimir + Chebotarev](https://github.com/excitoon)) +- Wait for all scheduled jobs, which are using local objects, if `ThreadPool::schedule(...)` throws + an exception. Rename `ThreadPool::schedule(...)` to `ThreadPool::scheduleOrThrowOnError(...)` and + fix comments to make obvious that it may throw. 
+ [\#7350](https://github.com/ClickHouse/ClickHouse/pull/7350) + ([tavplubix](https://github.com/tavplubix)) + +## ClickHouse release 19.15 {#clickhouse-release-19-15} + +### ClickHouse release 19.15.4.10, 2019-10-31 {#clickhouse-release-19-15-4-10-2019-10-31} + +#### Bug Fix {#bug-fix-3} + +- Added handling of SQL\_TINYINT and SQL\_BIGINT, and fix handling of SQL\_FLOAT data source types in ODBC Bridge. + [\#7491](https://github.com/ClickHouse/ClickHouse/pull/7491) ([Denis Glazachev](https://github.com/traceon)) +- Allowed to have some parts on destination disk or volume in MOVE PARTITION. + [\#7434](https://github.com/ClickHouse/ClickHouse/pull/7434) ([Vladimir Chebotarev](https://github.com/excitoon)) +- Fixed NULL-values in nullable columns through ODBC-bridge. + [\#7402](https://github.com/ClickHouse/ClickHouse/pull/7402) ([Vasily Nemkov](https://github.com/Enmk)) +- Fixed INSERT into Distributed non local node with MATERIALIZED columns. + [\#7377](https://github.com/ClickHouse/ClickHouse/pull/7377) ([Azat Khuzhin](https://github.com/azat)) +- Fixed function getMultipleValuesFromConfig. + [\#7374](https://github.com/ClickHouse/ClickHouse/pull/7374) ([Mikhail Korotov](https://github.com/millb)) +- Fixed issue of using HTTP keep alive timeout instead of TCP keep alive timeout. + [\#7351](https://github.com/ClickHouse/ClickHouse/pull/7351) ([Vasily Nemkov](https://github.com/Enmk)) +- Wait for all jobs to finish on exception (fixes rare segfaults). + [\#7350](https://github.com/ClickHouse/ClickHouse/pull/7350) ([tavplubix](https://github.com/tavplubix)) +- Don’t push to MVs when inserting into Kafka table. + [\#7265](https://github.com/ClickHouse/ClickHouse/pull/7265) ([Ivan](https://github.com/abyss7)) +- Disable memory tracker for exception stack. + [\#7264](https://github.com/ClickHouse/ClickHouse/pull/7264) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fixed bad code in transforming query for external database. + [\#7252](https://github.com/ClickHouse/ClickHouse/pull/7252) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Avoid use of uninitialized values in MetricsTransmitter. + [\#7158](https://github.com/ClickHouse/ClickHouse/pull/7158) ([Azat Khuzhin](https://github.com/azat)) +- Added example config with macros for tests ([alexey-milovidov](https://github.com/alexey-milovidov)) + +### ClickHouse release 19.15.3.6, 2019-10-09 {#clickhouse-release-19-15-3-6-2019-10-09} + +#### Bug Fix {#bug-fix-4} + +- Fixed bad\_variant in hashed dictionary. + ([alesapin](https://github.com/alesapin)) +- Fixed up bug with segmentation fault in ATTACH PART query. + ([alesapin](https://github.com/alesapin)) +- Fixed time calculation in `MergeTreeData`. + ([Vladimir Chebotarev](https://github.com/excitoon)) +- Commit to Kafka explicitly after the writing is finalized. + [\#7175](https://github.com/ClickHouse/ClickHouse/pull/7175) ([Ivan](https://github.com/abyss7)) +- Serialize NULL values correctly in min/max indexes of MergeTree parts. + [\#7234](https://github.com/ClickHouse/ClickHouse/pull/7234) ([Alexander Kuzmenkov](https://github.com/akuzm)) + +### ClickHouse release 19.15.2.2, 2019-10-01 {#clickhouse-release-19-15-2-2-2019-10-01} + +#### New Feature {#new-feature-3} + +- Tiered storage: support to use multiple storage volumes for tables with MergeTree engine. It’s possible to store fresh data on SSD and automatically move old data to HDD. ([example](https://clickhouse.github.io/clickhouse-presentations/meetup30/new_features/#12)). 
[\#4918](https://github.com/ClickHouse/ClickHouse/pull/4918) ([Igr](https://github.com/ObjatieGroba)) [\#6489](https://github.com/ClickHouse/ClickHouse/pull/6489) ([alesapin](https://github.com/alesapin)) +- Add table function `input` for reading incoming data in `INSERT SELECT` query. [\#5450](https://github.com/ClickHouse/ClickHouse/pull/5450) ([palasonic1](https://github.com/palasonic1)) [\#6832](https://github.com/ClickHouse/ClickHouse/pull/6832) ([Anton Popov](https://github.com/CurtizJ)) +- Add a `sparse_hashed` dictionary layout that is functionally equivalent to the `hashed` layout, but is more memory efficient. It uses about half as much memory at the cost of slower value retrieval. [\#6894](https://github.com/ClickHouse/ClickHouse/pull/6894) ([Azat Khuzhin](https://github.com/azat)) +- Implement the ability to define a list of users for access to dictionaries. Only the currently connected database is used. [\#6907](https://github.com/ClickHouse/ClickHouse/pull/6907) ([Guillaume Tassery](https://github.com/YiuRULE)) +- Add `LIMIT` option to `SHOW` query. [\#6944](https://github.com/ClickHouse/ClickHouse/pull/6944) ([Philipp Malkovsky](https://github.com/malkfilipp)) +- Add `bitmapSubsetLimit(bitmap, range_start, limit)` function that returns a subset of the `limit` smallest values in the set that are no smaller than `range_start`; see the example below. [\#6957](https://github.com/ClickHouse/ClickHouse/pull/6957) ([Zhichang Yu](https://github.com/yuzhichang)) +- Add `bitmapMin` and `bitmapMax` functions. [\#6970](https://github.com/ClickHouse/ClickHouse/pull/6970) ([Zhichang Yu](https://github.com/yuzhichang)) +- Add function `repeat` related to [issue-6648](https://github.com/ClickHouse/ClickHouse/issues/6648) [\#6999](https://github.com/ClickHouse/ClickHouse/pull/6999) ([flynn](https://github.com/ucasFL)) + +#### Experimental Feature {#experimental-feature-1} + +- Implement an (in-memory) Merge Join variant that does not change the current pipeline. The result is partially sorted by merge key. Set `partial_merge_join = 1` to use this feature. The Merge Join is still in development. [\#6940](https://github.com/ClickHouse/ClickHouse/pull/6940) ([Artem Zuikov](https://github.com/4ertus2)) +- Add `S3` engine and table function. It is still in development (no authentication support yet). [\#5596](https://github.com/ClickHouse/ClickHouse/pull/5596) ([Vladimir Chebotarev](https://github.com/excitoon)) + +#### Improvement {#improvement-2} + +- Every message read from Kafka is inserted atomically. This resolves almost all known issues with the Kafka engine. [\#6950](https://github.com/ClickHouse/ClickHouse/pull/6950) ([Ivan](https://github.com/abyss7)) +- Improvements for failover of Distributed queries. Recovery time is shortened; it is also now configurable and can be seen in `system.clusters`. [\#6399](https://github.com/ClickHouse/ClickHouse/pull/6399) ([Vasily Nemkov](https://github.com/Enmk)) +- Support numeric values for Enums directly in `IN` section. \#6766 [\#6941](https://github.com/ClickHouse/ClickHouse/pull/6941) ([dimarub2000](https://github.com/dimarub2000)) +- Support (optional, disabled by default) redirects on URL storage. [\#6914](https://github.com/ClickHouse/ClickHouse/pull/6914) ([maqroll](https://github.com/maqroll))
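The new bitmap helpers and the `repeat` function from the New Feature list above are easy to probe; an illustrative sketch (the results in the comments follow the semantics described in the entries, not output copied from the PRs, and the `toUInt32` casts are an assumption about the expected argument types):

```sql
SELECT repeat('ab', 3);                                                        -- 'ababab'
SELECT bitmapMin(bitmapBuild([3, 7, 42])), bitmapMax(bitmapBuild([3, 7, 42])); -- 3, 42
-- the 3 smallest values that are no smaller than 5
SELECT bitmapToArray(bitmapSubsetLimit(bitmapBuild([1, 5, 7, 9, 20]), toUInt32(5), toUInt32(3)));  -- [5, 7, 9]
```

+- Add information message when client with an older version connects to a server.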
[\#6893](https://github.com/ClickHouse/ClickHouse/pull/6893) ([Philipp Malkovsky](https://github.com/malkfilipp)) +- Remove maximum backoff sleep time limit for sending data in Distributed tables [\#6895](https://github.com/ClickHouse/ClickHouse/pull/6895) ([Azat Khuzhin](https://github.com/azat)) +- Add ability to send profile events (counters) with cumulative values to graphite. It can be enabled under `` in server `config.xml`. [\#6969](https://github.com/ClickHouse/ClickHouse/pull/6969) ([Azat Khuzhin](https://github.com/azat)) +- Automatically cast type `T` to `LowCardinality(T)` while inserting data into a column of type `LowCardinality(T)` in Native format via HTTP. [\#6891](https://github.com/ClickHouse/ClickHouse/pull/6891) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Add ability to use function `hex` without using `reinterpretAsString` for `Float32`, `Float64`. [\#7024](https://github.com/ClickHouse/ClickHouse/pull/7024) ([Mikhail Korotov](https://github.com/millb)) + +#### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-2} + +- Add gdb-index to clickhouse binary with debug info. It will speed up startup time of `gdb`. [\#6947](https://github.com/ClickHouse/ClickHouse/pull/6947) ([alesapin](https://github.com/alesapin)) +- Speed up deb packaging with patched dpkg-deb which uses `pigz`. [\#6960](https://github.com/ClickHouse/ClickHouse/pull/6960) ([alesapin](https://github.com/alesapin)) +- Set `enable_fuzzing = 1` to enable libfuzzer instrumentation of all the project code. [\#7042](https://github.com/ClickHouse/ClickHouse/pull/7042) ([kyprizel](https://github.com/kyprizel)) +- Add split build smoke test in CI. [\#7061](https://github.com/ClickHouse/ClickHouse/pull/7061) ([alesapin](https://github.com/alesapin)) +- Add build with MemorySanitizer to CI. [\#7066](https://github.com/ClickHouse/ClickHouse/pull/7066) ([Alexander Kuzmenkov](https://github.com/akuzm)) +- Replace `libsparsehash` with `sparsehash-c11` [\#6965](https://github.com/ClickHouse/ClickHouse/pull/6965) ([Azat Khuzhin](https://github.com/azat)) + +#### Bug Fix {#bug-fix-5} + +- Fixed performance degradation of index analysis on complex keys on large tables. This fixes \#6924. [\#7075](https://github.com/ClickHouse/ClickHouse/pull/7075) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix logical error causing segfaults when selecting from an empty Kafka topic. [\#6909](https://github.com/ClickHouse/ClickHouse/pull/6909) ([Ivan](https://github.com/abyss7)) +- Fix too early MySQL connection close in `MySQLBlockInputStream.cpp`. [\#6882](https://github.com/ClickHouse/ClickHouse/pull/6882) ([Clément Rodriguez](https://github.com/clemrodriguez)) +- Returned support for very old Linux kernels (fix [\#6841](https://github.com/ClickHouse/ClickHouse/issues/6841)) [\#6853](https://github.com/ClickHouse/ClickHouse/pull/6853) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix possible data loss in `insert select` query in case of empty block in input stream. \#6834 \#6862 [\#6911](https://github.com/ClickHouse/ClickHouse/pull/6911) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fix for function `arrayEnumerateUniqRanked` with empty arrays in params [\#6928](https://github.com/ClickHouse/ClickHouse/pull/6928) ([proller](https://github.com/proller)) +- Fix complex queries with array joins and global subqueries.
[\#6934](https://github.com/ClickHouse/ClickHouse/pull/6934) ([Ivan](https://github.com/abyss7)) +- Fix `Unknown identifier` error in ORDER BY and GROUP BY with multiple JOINs [\#7022](https://github.com/ClickHouse/ClickHouse/pull/7022) ([Artem Zuikov](https://github.com/4ertus2)) +- Fixed `MSan` warning while executing function with `LowCardinality` argument. [\#7062](https://github.com/ClickHouse/ClickHouse/pull/7062) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) + +#### Backward Incompatible Change {#backward-incompatible-change-2} + +- Changed serialization format of bitmap\* aggregate function states to improve performance. Serialized states of bitmap\* from previous versions cannot be read. [\#6908](https://github.com/ClickHouse/ClickHouse/pull/6908) ([Zhichang Yu](https://github.com/yuzhichang)) + +## ClickHouse release 19.14 {#clickhouse-release-19-14} + +### ClickHouse release 19.14.7.15, 2019-10-02 {#clickhouse-release-19-14-7-15-2019-10-02} + +#### Bug Fix {#bug-fix-6} + +- This release also contains all bug fixes from 19.11.12.69. +- Fixed compatibility for distributed queries between 19.14 and earlier versions. This fixes [\#7068](https://github.com/ClickHouse/ClickHouse/issues/7068). [\#7069](https://github.com/ClickHouse/ClickHouse/pull/7069) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +### ClickHouse release 19.14.6.12, 2019-09-19 {#clickhouse-release-19-14-6-12-2019-09-19} + +#### Bug Fix {#bug-fix-7} + +- Fix for function `arrayEnumerateUniqRanked` with empty arrays in params. [\#6928](https://github.com/ClickHouse/ClickHouse/pull/6928) ([proller](https://github.com/proller)) +- Fixed subquery name in queries with `ARRAY JOIN` and `GLOBAL IN subquery` with alias. Use subquery alias for external table name if it is specified. [\#6934](https://github.com/ClickHouse/ClickHouse/pull/6934) ([Ivan](https://github.com/abyss7)) + +#### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-3} + +- Fix [flapping](https://clickhouse-test-reports.s3.yandex.net/6944/aab95fd5175a513413c7395a73a82044bdafb906/functional_stateless_tests_(debug).html) test `00715_fetch_merged_or_mutated_part_zookeeper` by rewriting it to a shell script, because it needs to wait for mutations to apply. [\#6977](https://github.com/ClickHouse/ClickHouse/pull/6977) ([Alexander Kazakov](https://github.com/Akazz)) +- Fixed UBSan and MemSan failure in function `groupUniqArray` with empty array argument. It was caused by placing an empty `PaddedPODArray` into the hash table zero cell because the constructor for the zero cell value was not called. [\#6937](https://github.com/ClickHouse/ClickHouse/pull/6937) ([Amos Bird](https://github.com/amosbird)) + +### ClickHouse release 19.14.3.3, 2019-09-10 {#clickhouse-release-19-14-3-3-2019-09-10} + +#### New Feature {#new-feature-4} + +- `WITH FILL` modifier for `ORDER BY`. (continuation of [\#5069](https://github.com/ClickHouse/ClickHouse/issues/5069)) [\#6610](https://github.com/ClickHouse/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ)) +- `WITH TIES` modifier for `LIMIT`. (continuation of [\#5069](https://github.com/ClickHouse/ClickHouse/issues/5069)) [\#6610](https://github.com/ClickHouse/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ))
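Both modifiers above can be illustrated with self-contained queries; a small sketch of the expected behaviour (the comments describe what the features are documented to do, not captured test output):

```sql
-- WITH FILL inserts rows for the missing values 2, 3 and 5 between the existing ones
SELECT arrayJoin([1, 4, 6]) AS x ORDER BY x WITH FILL;

-- numbers 0..9 produce four rows with v = 0, so LIMIT 3 WITH TIES also returns the fourth tied row
SELECT number % 3 AS v FROM numbers(10) ORDER BY v LIMIT 3 WITH TIES;
```

+- Parse unquoted `NULL` literal as NULL (if setting `format_csv_unquoted_null_literal_as_null=1`). Initialize null fields with default values if data type of this field is not nullable (if setting `input_format_null_as_default=1`).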
[\#5990](https://github.com/ClickHouse/ClickHouse/issues/5990) [\#6055](https://github.com/ClickHouse/ClickHouse/pull/6055) ([tavplubix](https://github.com/tavplubix)) +- Support for wildcards in paths of table functions `file` and `hdfs`. If the path contains wildcards, the table will be readonly. Example of usage: `select * from hdfs('hdfs://hdfs1:9000/some_dir/another_dir/*/file{0..9}{0..9}')` and `select * from file('some_dir/{some_file,another_file,yet_another}.tsv', 'TSV', 'value UInt32')`. [\#6092](https://github.com/ClickHouse/ClickHouse/pull/6092) ([Olga Khvostikova](https://github.com/stavrolia)) +- New `system.metric_log` table which stores values of `system.events` and `system.metrics` with specified time interval. [\#6363](https://github.com/ClickHouse/ClickHouse/issues/6363) [\#6467](https://github.com/ClickHouse/ClickHouse/pull/6467) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [\#6530](https://github.com/ClickHouse/ClickHouse/pull/6530) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Allow to write ClickHouse text logs to `system.text_log` table. [\#6037](https://github.com/ClickHouse/ClickHouse/issues/6037) [\#6103](https://github.com/ClickHouse/ClickHouse/pull/6103) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [\#6164](https://github.com/ClickHouse/ClickHouse/pull/6164) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Show private symbols in stack traces (this is done via parsing symbol tables of ELF files). Added information about file and line number in stack traces if debug info is present. Speedup symbol name lookup with indexing symbols present in program. Added new SQL functions for introspection: `demangle` and `addressToLine`. Renamed function `symbolizeAddress` to `addressToSymbol` for consistency. Function `addressToSymbol` will return mangled name for performance reasons and you have to apply `demangle`. Added setting `allow_introspection_functions` which is turned off by default. [\#6201](https://github.com/ClickHouse/ClickHouse/pull/6201) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Table function `values` (the name is case-insensitive). It allows to read from `VALUES` list proposed in [\#5984](https://github.com/ClickHouse/ClickHouse/issues/5984). Example: `SELECT * FROM VALUES('a UInt64, s String', (1, 'one'), (2, 'two'), (3, 'three'))`. [\#6217](https://github.com/ClickHouse/ClickHouse/issues/6217). [\#6209](https://github.com/ClickHouse/ClickHouse/pull/6209) ([dimarub2000](https://github.com/dimarub2000)) +- Added an ability to alter storage settings. Syntax: `ALTER TABLE
<table> MODIFY SETTING <setting_name> = <value>`. [\#6366](https://github.com/ClickHouse/ClickHouse/pull/6366) [\#6669](https://github.com/ClickHouse/ClickHouse/pull/6669) [\#6685](https://github.com/ClickHouse/ClickHouse/pull/6685) ([alesapin](https://github.com/alesapin)) +- Support for removing detached parts. Syntax: `ALTER TABLE <table_name> DROP DETACHED PART '<part_id>'`. [\#6158](https://github.com/ClickHouse/ClickHouse/pull/6158) ([tavplubix](https://github.com/tavplubix)) +- Table constraints. Allows adding a constraint to the table definition which will be checked at insert. [\#5273](https://github.com/ClickHouse/ClickHouse/pull/5273) ([Gleb Novikov](https://github.com/NanoBjorn)) [\#6652](https://github.com/ClickHouse/ClickHouse/pull/6652) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Support for cascaded materialized views. [\#6324](https://github.com/ClickHouse/ClickHouse/pull/6324) ([Amos Bird](https://github.com/amosbird)) +- Turn on query profiler by default to sample every query execution thread once a second. [\#6283](https://github.com/ClickHouse/ClickHouse/pull/6283) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Input format `ORC`. [\#6454](https://github.com/ClickHouse/ClickHouse/pull/6454) [\#6703](https://github.com/ClickHouse/ClickHouse/pull/6703) ([akonyaev90](https://github.com/akonyaev90)) +- Added two new functions: `sigmoid` and `tanh` (that are useful for machine learning applications). [\#6254](https://github.com/ClickHouse/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Functions `hasToken(haystack, token)` and `hasTokenCaseInsensitive(haystack, token)` check whether a given token is in the haystack. A token is a maximal-length substring between two non-alphanumeric ASCII characters (or boundaries of the haystack). The token must be a constant string. Supported by the tokenbf\_v1 index specialization. [\#6596](https://github.com/ClickHouse/ClickHouse/pull/6596), [\#6662](https://github.com/ClickHouse/ClickHouse/pull/6662) ([Vasily Nemkov](https://github.com/Enmk)) +- New function `neighbor(value, offset[, default_value])`. Allows reaching the previous/next value within a column in a block of data; see the example below. [\#5925](https://github.com/ClickHouse/ClickHouse/pull/5925) ([Alex Krash](https://github.com/alex-krash)) [6685365ab8c5b74f9650492c88a012596eb1b0c6](https://github.com/ClickHouse/ClickHouse/commit/6685365ab8c5b74f9650492c88a012596eb1b0c6) [341e2e4587a18065c2da1ca888c73389f48ce36c](https://github.com/ClickHouse/ClickHouse/commit/341e2e4587a18065c2da1ca888c73389f48ce36c) [Alexey Milovidov](https://github.com/alexey-milovidov) +- Created a function `currentUser()`, returning the login of the authorized user. Added alias `user()` for compatibility with MySQL. [\#6470](https://github.com/ClickHouse/ClickHouse/pull/6470) ([Alex Krash](https://github.com/alex-krash)) +- New aggregate functions `quantilesExactInclusive` and `quantilesExactExclusive` which were proposed in [\#5885](https://github.com/ClickHouse/ClickHouse/issues/5885). [\#6477](https://github.com/ClickHouse/ClickHouse/pull/6477) ([dimarub2000](https://github.com/dimarub2000)) +- Function `bitmapRange(bitmap, range_begin, range_end)` which returns a new set with the specified range (not including `range_end`). [\#6314](https://github.com/ClickHouse/ClickHouse/pull/6314) ([Zhichang Yu](https://github.com/yuzhichang))
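Several of the new functions above can be probed from clickhouse-client; an illustrative sketch (the results in the comments follow the descriptions above, the `currentUser()` value depends on who is connected, and the `toUInt32`/`toUInt64` casts are assumptions about the expected argument types):

```sql
SELECT hasToken('Hello, ClickHouse world', 'ClickHouse');                                  -- 1
SELECT currentUser();                                                                      -- e.g. 'default'
SELECT number, neighbor(number, -1, toUInt64(999)) FROM numbers(4);                        -- previous value in the block, 999 for the first row
SELECT bitmapToArray(bitmapRange(bitmapBuild([1, 3, 5, 7, 9]), toUInt32(3), toUInt32(8))); -- [3, 5, 7]
```

+- Function `geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precision)` which creates array of precision-long strings of geohash-boxes covering provided area.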
[\#6127](https://github.com/ClickHouse/ClickHouse/pull/6127) ([Vasily Nemkov](https://github.com/Enmk)) +- Implement support for INSERT query with `Kafka` tables. [\#6012](https://github.com/ClickHouse/ClickHouse/pull/6012) ([Ivan](https://github.com/abyss7)) +- Added support for `_partition` and `_timestamp` virtual columns to Kafka engine. [\#6400](https://github.com/ClickHouse/ClickHouse/pull/6400) ([Ivan](https://github.com/abyss7)) +- Possibility to remove sensitive data from `query_log`, server logs, process list using regexp-based rules. [\#5710](https://github.com/ClickHouse/ClickHouse/pull/5710) ([filimonov](https://github.com/filimonov)) + +#### Experimental Feature {#experimental-feature-2} + +- Input and output data format `Template`. It allows specifying a custom format string for input and output. [\#4354](https://github.com/ClickHouse/ClickHouse/issues/4354) [\#6727](https://github.com/ClickHouse/ClickHouse/pull/6727) ([tavplubix](https://github.com/tavplubix)) +- Implementation of `LIVE VIEW` tables that were originally proposed in [\#2898](https://github.com/ClickHouse/ClickHouse/pull/2898), prepared in [\#3925](https://github.com/ClickHouse/ClickHouse/issues/3925), and then updated in [\#5541](https://github.com/ClickHouse/ClickHouse/issues/5541). See [\#5541](https://github.com/ClickHouse/ClickHouse/issues/5541) for detailed description. [\#5541](https://github.com/ClickHouse/ClickHouse/issues/5541) ([vzakaznikov](https://github.com/vzakaznikov)) [\#6425](https://github.com/ClickHouse/ClickHouse/pull/6425) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) [\#6656](https://github.com/ClickHouse/ClickHouse/pull/6656) ([vzakaznikov](https://github.com/vzakaznikov)) Note that the `LIVE VIEW` feature may be removed in future versions. + +#### Bug Fix {#bug-fix-8} + +- This release also contains all bug fixes from 19.13 and 19.11. +- Fix segmentation fault when the table has skip indices and vertical merge happens. [\#6723](https://github.com/ClickHouse/ClickHouse/pull/6723) ([alesapin](https://github.com/alesapin)) +- Fix per-column TTL with non-trivial column defaults. Previously, in case of a force TTL merge with the `OPTIMIZE ... FINAL` query, expired values were replaced by type defaults instead of user-specified column defaults. [\#6796](https://github.com/ClickHouse/ClickHouse/pull/6796) ([Anton Popov](https://github.com/CurtizJ)) +- Fix Kafka messages duplication problem on normal server restart. [\#6597](https://github.com/ClickHouse/ClickHouse/pull/6597) ([Ivan](https://github.com/abyss7)) +- Fixed infinite loop when reading Kafka messages. Do not pause/resume consumer on subscription at all - otherwise it may get paused indefinitely in some scenarios. [\#6354](https://github.com/ClickHouse/ClickHouse/pull/6354) ([Ivan](https://github.com/abyss7)) +- Fix `Key expression contains comparison between inconvertible types` exception in `bitmapContains` function. [\#6136](https://github.com/ClickHouse/ClickHouse/issues/6136) [\#6146](https://github.com/ClickHouse/ClickHouse/issues/6146) [\#6156](https://github.com/ClickHouse/ClickHouse/pull/6156) ([dimarub2000](https://github.com/dimarub2000)) +- Fix segfault with enabled `optimize_skip_unused_shards` and missing sharding key. [\#6384](https://github.com/ClickHouse/ClickHouse/pull/6384) ([Anton Popov](https://github.com/CurtizJ)) +- Fixed wrong code in mutations that may lead to memory corruption.
Fixed segfault with read of address `0x14c0` that may happen due to concurrent `DROP TABLE` and `SELECT` from `system.parts` or `system.parts_columns`. Fixed race condition in preparation of mutation queries. Fixed deadlock caused by `OPTIMIZE` of Replicated tables and concurrent modification operations like ALTERs. [\#6514](https://github.com/ClickHouse/ClickHouse/pull/6514) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Removed extra verbose logging in MySQL interface [\#6389](https://github.com/ClickHouse/ClickHouse/pull/6389) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Return the ability to parse boolean settings from ‘true’ and ‘false’ in the configuration file. [\#6278](https://github.com/ClickHouse/ClickHouse/pull/6278) ([alesapin](https://github.com/alesapin)) +- Fix crash in `quantile` and `median` function over `Nullable(Decimal128)`. [\#6378](https://github.com/ClickHouse/ClickHouse/pull/6378) ([Artem Zuikov](https://github.com/4ertus2)) +- Fixed possible incomplete result returned by a `SELECT` query with a `WHERE` condition on a primary key that contained a conversion to Float type. It was caused by incorrect checking of monotonicity in the `toFloat` function. [\#6248](https://github.com/ClickHouse/ClickHouse/issues/6248) [\#6374](https://github.com/ClickHouse/ClickHouse/pull/6374) ([dimarub2000](https://github.com/dimarub2000)) +- Check `max_expanded_ast_elements` setting for mutations. Clear mutations after `TRUNCATE TABLE`. [\#6205](https://github.com/ClickHouse/ClickHouse/pull/6205) ([Winter Zhang](https://github.com/zhang2014)) +- Fix JOIN results for key columns when used with `join_use_nulls`. Attach Nulls instead of column defaults. [\#6249](https://github.com/ClickHouse/ClickHouse/pull/6249) ([Artem Zuikov](https://github.com/4ertus2)) +- Fix for skip indices with vertical merge and alter. Fix for `Bad size of marks file` exception. [\#6594](https://github.com/ClickHouse/ClickHouse/issues/6594) [\#6713](https://github.com/ClickHouse/ClickHouse/pull/6713) ([alesapin](https://github.com/alesapin)) +- Fix rare crash in `ALTER MODIFY COLUMN` and vertical merge when one of merged/altered parts is empty (0 rows) [\#6746](https://github.com/ClickHouse/ClickHouse/issues/6746) [\#6780](https://github.com/ClickHouse/ClickHouse/pull/6780) ([alesapin](https://github.com/alesapin)) +- Fixed bug in conversion of `LowCardinality` types in `AggregateFunctionFactory`. This fixes [\#6257](https://github.com/ClickHouse/ClickHouse/issues/6257). [\#6281](https://github.com/ClickHouse/ClickHouse/pull/6281) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fix wrong behavior and possible segfaults in `topK` and `topKWeighted` aggregate functions. [\#6404](https://github.com/ClickHouse/ClickHouse/pull/6404) ([Anton Popov](https://github.com/CurtizJ)) +- Fixed unsafe code around `getIdentifier` function. [\#6401](https://github.com/ClickHouse/ClickHouse/issues/6401) [\#6409](https://github.com/ClickHouse/ClickHouse/pull/6409) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed bug in MySQL wire protocol (used while connecting to ClickHouse from a MySQL client). Caused by heap buffer overflow in `PacketPayloadWriteBuffer`. [\#6212](https://github.com/ClickHouse/ClickHouse/pull/6212) ([Yuriy Baranov](https://github.com/yurriy)) +- Fixed memory leak in `bitmapSubsetInRange` function. [\#6819](https://github.com/ClickHouse/ClickHouse/pull/6819) ([Zhichang Yu](https://github.com/yuzhichang)) +- Fix rare bug when mutation is executed after granularity change.
[\#6816](https://github.com/ClickHouse/ClickHouse/pull/6816) ([alesapin](https://github.com/alesapin)) +- Allow protobuf message with all fields by default. [\#6132](https://github.com/ClickHouse/ClickHouse/pull/6132) ([Vitaly Baranov](https://github.com/vitlibar)) +- Resolve a bug with `nullIf` function when we send a `NULL` argument on the second argument. [\#6446](https://github.com/ClickHouse/ClickHouse/pull/6446) ([Guillaume Tassery](https://github.com/YiuRULE)) +- Fix rare bug with wrong memory allocation/deallocation in complex key cache dictionaries with string fields which leads to infinite memory consumption (looks like memory leak). Bug reproduces when string size was a power of two starting from eight (8, 16, 32, etc). [\#6447](https://github.com/ClickHouse/ClickHouse/pull/6447) ([alesapin](https://github.com/alesapin)) +- Fixed Gorilla encoding on small sequences which caused exception `Cannot write after end of buffer`. [\#6398](https://github.com/ClickHouse/ClickHouse/issues/6398) [\#6444](https://github.com/ClickHouse/ClickHouse/pull/6444) ([Vasily Nemkov](https://github.com/Enmk)) +- Allow to use not nullable types in JOINs with `join_use_nulls` enabled. [\#6705](https://github.com/ClickHouse/ClickHouse/pull/6705) ([Artem Zuikov](https://github.com/4ertus2)) +- Disable `Poco::AbstractConfiguration` substitutions in query in `clickhouse-client`. [\#6706](https://github.com/ClickHouse/ClickHouse/pull/6706) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Avoid deadlock in `REPLACE PARTITION`. [\#6677](https://github.com/ClickHouse/ClickHouse/pull/6677) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Using `arrayReduce` for constant arguments may lead to segfault. [\#6242](https://github.com/ClickHouse/ClickHouse/issues/6242) [\#6326](https://github.com/ClickHouse/ClickHouse/pull/6326) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix inconsistent parts which can appear if replica was restored after `DROP PARTITION`. [\#6522](https://github.com/ClickHouse/ClickHouse/issues/6522) [\#6523](https://github.com/ClickHouse/ClickHouse/pull/6523) ([tavplubix](https://github.com/tavplubix)) +- Fixed hang in `JSONExtractRaw` function. [\#6195](https://github.com/ClickHouse/ClickHouse/issues/6195) [\#6198](https://github.com/ClickHouse/ClickHouse/pull/6198) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix bug with incorrect skip indices serialization and aggregation with adaptive granularity. [\#6594](https://github.com/ClickHouse/ClickHouse/issues/6594). [\#6748](https://github.com/ClickHouse/ClickHouse/pull/6748) ([alesapin](https://github.com/alesapin)) +- Fix `WITH ROLLUP` and `WITH CUBE` modifiers of `GROUP BY` with two-level aggregation. [\#6225](https://github.com/ClickHouse/ClickHouse/pull/6225) ([Anton Popov](https://github.com/CurtizJ)) +- Fix bug with writing secondary indices marks with adaptive granularity. [\#6126](https://github.com/ClickHouse/ClickHouse/pull/6126) ([alesapin](https://github.com/alesapin)) +- Fix initialization order while server startup. Since `StorageMergeTree::background_task_handle` is initialized in `startup()` the `MergeTreeBlockOutputStream::write()` may try to use it before initialization. Just check if it is initialized. [\#6080](https://github.com/ClickHouse/ClickHouse/pull/6080) ([Ivan](https://github.com/abyss7)) +- Clearing the data buffer from the previous read operation that was completed with an error. 
[\#6026](https://github.com/ClickHouse/ClickHouse/pull/6026) ([Nikolay](https://github.com/bopohaa)) +- Fix bug with enabling adaptive granularity when creating a new replica for Replicated\*MergeTree table. [\#6394](https://github.com/ClickHouse/ClickHouse/issues/6394) [\#6452](https://github.com/ClickHouse/ClickHouse/pull/6452) ([alesapin](https://github.com/alesapin)) +- Fixed possible crash during server startup in case an exception happened in `libunwind` during an exception at access to an uninitialized `ThreadStatus` structure. [\#6456](https://github.com/ClickHouse/ClickHouse/pull/6456) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) +- Fix crash in `yandexConsistentHash` function. Found by fuzz test. [\#6304](https://github.com/ClickHouse/ClickHouse/issues/6304) [\#6305](https://github.com/ClickHouse/ClickHouse/pull/6305) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed the possibility of hanging queries when server is overloaded and the global thread pool becomes nearly full. This has a higher chance to happen on clusters with a large number of shards (hundreds), because distributed queries allocate a thread per connection to each shard. For example, this issue may reproduce if a cluster of 330 shards is processing 30 concurrent distributed queries. This issue affects all versions starting from 19.2. [\#6301](https://github.com/ClickHouse/ClickHouse/pull/6301) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed logic of `arrayEnumerateUniqRanked` function. [\#6423](https://github.com/ClickHouse/ClickHouse/pull/6423) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix segfault when decoding symbol table. [\#6603](https://github.com/ClickHouse/ClickHouse/pull/6603) ([Amos Bird](https://github.com/amosbird)) +- Fixed irrelevant exception in cast of `LowCardinality(Nullable)` to not-Nullable column in case it doesn’t contain Nulls (e.g. in a query like `SELECT CAST(CAST('Hello' AS LowCardinality(Nullable(String))) AS String)`). [\#6094](https://github.com/ClickHouse/ClickHouse/issues/6094) [\#6119](https://github.com/ClickHouse/ClickHouse/pull/6119) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Removed extra quoting of description in `system.settings` table. [\#6696](https://github.com/ClickHouse/ClickHouse/issues/6696) [\#6699](https://github.com/ClickHouse/ClickHouse/pull/6699) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Avoid possible deadlock in `TRUNCATE` of Replicated table. [\#6695](https://github.com/ClickHouse/ClickHouse/pull/6695) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix reading in order of sorting key. [\#6189](https://github.com/ClickHouse/ClickHouse/pull/6189) ([Anton Popov](https://github.com/CurtizJ)) +- Fix `ALTER TABLE ... UPDATE` query for tables with `enable_mixed_granularity_parts=1`. [\#6543](https://github.com/ClickHouse/ClickHouse/pull/6543) ([alesapin](https://github.com/alesapin)) +- Fix bug opened by [\#4405](https://github.com/ClickHouse/ClickHouse/pull/4405) (since 19.4.0). Reproduces in queries to Distributed tables over MergeTree tables when we don’t query any columns (`SELECT 1`). [\#6236](https://github.com/ClickHouse/ClickHouse/pull/6236) ([alesapin](https://github.com/alesapin)) +- Fixed overflow in integer division of a signed type by an unsigned type. The behaviour was exactly as in the C or C++ language (integer promotion rules), which may be surprising.
Please note that the overflow is still possible when dividing large signed number to large unsigned number or vice-versa (but that case is less usual). The issue existed in all server versions. [\#6214](https://github.com/ClickHouse/ClickHouse/issues/6214) [\#6233](https://github.com/ClickHouse/ClickHouse/pull/6233) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Limit maximum sleep time for throttling when `max_execution_speed` or `max_execution_speed_bytes` is set. Fixed false errors like `Estimated query execution time (inf seconds) is too long`. [\#5547](https://github.com/ClickHouse/ClickHouse/issues/5547) [\#6232](https://github.com/ClickHouse/ClickHouse/pull/6232) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed issues about using `MATERIALIZED` columns and aliases in `MaterializedView`. [\#448](https://github.com/ClickHouse/ClickHouse/issues/448) [\#3484](https://github.com/ClickHouse/ClickHouse/issues/3484) [\#3450](https://github.com/ClickHouse/ClickHouse/issues/3450) [\#2878](https://github.com/ClickHouse/ClickHouse/issues/2878) [\#2285](https://github.com/ClickHouse/ClickHouse/issues/2285) [\#3796](https://github.com/ClickHouse/ClickHouse/pull/3796) ([Amos Bird](https://github.com/amosbird)) [\#6316](https://github.com/ClickHouse/ClickHouse/pull/6316) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix `FormatFactory` behaviour for input streams which are not implemented as processor. [\#6495](https://github.com/ClickHouse/ClickHouse/pull/6495) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fixed typo. [\#6631](https://github.com/ClickHouse/ClickHouse/pull/6631) ([Alex Ryndin](https://github.com/alexryndin)) +- Typo in the error message ( is -\> are ). [\#6839](https://github.com/ClickHouse/ClickHouse/pull/6839) ([Denis Zhuravlev](https://github.com/den-crane)) +- Fixed error while parsing of columns list from string if type contained a comma (this issue was relevant for `File`, `URL`, `HDFS` storages) [\#6217](https://github.com/ClickHouse/ClickHouse/issues/6217). [\#6209](https://github.com/ClickHouse/ClickHouse/pull/6209) ([dimarub2000](https://github.com/dimarub2000)) + +#### Security Fix {#security-fix} + +- This release also contains all bug security fixes from 19.13 and 19.11. +- Fixed the possibility of a fabricated query to cause server crash due to stack overflow in SQL parser. Fixed the possibility of stack overflow in Merge and Distributed tables, materialized views and conditions for row-level security that involve subqueries. [\#6433](https://github.com/ClickHouse/ClickHouse/pull/6433) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +#### Improvement {#improvement-3} + +- Correct implementation of ternary logic for `AND/OR`. [\#6048](https://github.com/ClickHouse/ClickHouse/pull/6048) ([Alexander Kazakov](https://github.com/Akazz)) +- Now values and rows with expired TTL will be removed after `OPTIMIZE ... FINAL` query from old parts without TTL infos or with outdated TTL infos, e.g. after `ALTER ... MODIFY TTL` query. Added queries `SYSTEM STOP/START TTL MERGES` to disallow/allow assign merges with TTL and filter expired values in all merges. [\#6274](https://github.com/ClickHouse/ClickHouse/pull/6274) ([Anton Popov](https://github.com/CurtizJ)) +- Possibility to change the location of ClickHouse history file for client using `CLICKHOUSE_HISTORY_FILE` env. 
[\#6840](https://github.com/ClickHouse/ClickHouse/pull/6840) ([filimonov](https://github.com/filimonov)) +- Remove `dry_run` flag from `InterpreterSelectQuery`. … [\#6375](https://github.com/ClickHouse/ClickHouse/pull/6375) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Support `ASOF JOIN` with `ON` section. [\#6211](https://github.com/ClickHouse/ClickHouse/pull/6211) ([Artem Zuikov](https://github.com/4ertus2)) +- Better support of skip indexes for mutations and replication. Support for `MATERIALIZE/CLEAR INDEX ... IN PARTITION` query. `UPDATE x = x` recalculates all indices that use column `x`. [\#5053](https://github.com/ClickHouse/ClickHouse/pull/5053) ([Nikita Vasilev](https://github.com/nikvas0)) +- Allow to `ATTACH` live views (for example, at the server startup) regardless of the `allow_experimental_live_view` setting. [\#6754](https://github.com/ClickHouse/ClickHouse/pull/6754) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- For stack traces gathered by query profiler, do not include stack frames generated by the query profiler itself. [\#6250](https://github.com/ClickHouse/ClickHouse/pull/6250) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Now table functions `values`, `file`, `url`, `hdfs` have support for ALIAS columns. [\#6255](https://github.com/ClickHouse/ClickHouse/pull/6255) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Throw an exception if the `config.d` file doesn’t have the same root element as the config file. [\#6123](https://github.com/ClickHouse/ClickHouse/pull/6123) ([dimarub2000](https://github.com/dimarub2000)) +- Print extra info in exception message for `no space left on device`. [\#6182](https://github.com/ClickHouse/ClickHouse/issues/6182), [\#6252](https://github.com/ClickHouse/ClickHouse/issues/6252) [\#6352](https://github.com/ClickHouse/ClickHouse/pull/6352) ([tavplubix](https://github.com/tavplubix)) +- When determining shards of a `Distributed` table to be covered by a read query (for `optimize_skip_unused_shards` = 1) ClickHouse now checks conditions from both `prewhere` and `where` clauses of the select statement. [\#6521](https://github.com/ClickHouse/ClickHouse/pull/6521) ([Alexander Kazakov](https://github.com/Akazz)) +- Enabled `SIMDJSON` for machines without AVX2 but with SSE 4.2 and PCLMUL instruction set. [\#6285](https://github.com/ClickHouse/ClickHouse/issues/6285) [\#6320](https://github.com/ClickHouse/ClickHouse/pull/6320) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- ClickHouse can work on filesystems without `O_DIRECT` support (such as ZFS and BtrFS) without additional tuning. [\#4449](https://github.com/ClickHouse/ClickHouse/issues/4449) [\#6730](https://github.com/ClickHouse/ClickHouse/pull/6730) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Support predicate push-down for the final subquery. [\#6120](https://github.com/ClickHouse/ClickHouse/pull/6120) ([TCeason](https://github.com/TCeason)) [\#6162](https://github.com/ClickHouse/ClickHouse/pull/6162) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Better `JOIN ON` keys extraction. [\#6131](https://github.com/ClickHouse/ClickHouse/pull/6131) ([Artem Zuikov](https://github.com/4ertus2)) +- Updated `SIMDJSON`. [\#6285](https://github.com/ClickHouse/ClickHouse/issues/6285). [\#6306](https://github.com/ClickHouse/ClickHouse/pull/6306) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Optimize selection of the smallest column for a `SELECT count()` query.
[\#6344](https://github.com/ClickHouse/ClickHouse/pull/6344) ([Amos Bird](https://github.com/amosbird)) +- Added `strict` parameter in `windowFunnel()`. When `strict` is set, `windowFunnel()` applies conditions only to unique values. [\#6548](https://github.com/ClickHouse/ClickHouse/pull/6548) ([achimbab](https://github.com/achimbab)) +- Safer interface of `mysqlxx::Pool`. [\#6150](https://github.com/ClickHouse/ClickHouse/pull/6150) ([avasiliev](https://github.com/avasiliev)) +- The width of option lines when executing with the `--help` option now corresponds to the terminal size. [\#6590](https://github.com/ClickHouse/ClickHouse/pull/6590) ([dimarub2000](https://github.com/dimarub2000)) +- Disable “read in order” optimization for aggregation without keys. [\#6599](https://github.com/ClickHouse/ClickHouse/pull/6599) ([Anton Popov](https://github.com/CurtizJ)) +- HTTP status code for `INCORRECT_DATA` and `TYPE_MISMATCH` error codes was changed from the default `500 Internal Server Error` to `400 Bad Request`. [\#6271](https://github.com/ClickHouse/ClickHouse/pull/6271) ([Alexander Rodin](https://github.com/a-rodin)) +- Move Join object from `ExpressionAction` into `AnalyzedJoin`. `ExpressionAnalyzer` and `ExpressionAction` do not know about the `Join` class anymore. Its logic is hidden behind the `AnalyzedJoin` interface. [\#6801](https://github.com/ClickHouse/ClickHouse/pull/6801) ([Artem Zuikov](https://github.com/4ertus2)) +- Fixed possible deadlock of distributed queries when one of the shards is localhost but the query is sent via a network connection. [\#6759](https://github.com/ClickHouse/ClickHouse/pull/6759) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Changed the semantics of multiple-table `RENAME` to avoid possible deadlocks. [\#6757](https://github.com/ClickHouse/ClickHouse/issues/6757). [\#6756](https://github.com/ClickHouse/ClickHouse/pull/6756) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Rewritten MySQL compatibility server to prevent loading the full packet payload in memory. Decreased memory consumption for each connection to approximately `2 * DBMS_DEFAULT_BUFFER_SIZE` (read/write buffers). [\#5811](https://github.com/ClickHouse/ClickHouse/pull/5811) ([Yuriy Baranov](https://github.com/yurriy)) +- Move AST alias interpreting logic out of the parser, which doesn’t have to know anything about query semantics. [\#6108](https://github.com/ClickHouse/ClickHouse/pull/6108) ([Artem Zuikov](https://github.com/4ertus2)) +- Slightly safer parsing of `NamesAndTypesList`. [\#6408](https://github.com/ClickHouse/ClickHouse/issues/6408). [\#6410](https://github.com/ClickHouse/ClickHouse/pull/6410) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- `clickhouse-copier`: Allow using `where_condition` from config with the `partition_key` alias in the query for checking partition existence (earlier it was used only in queries that read data). [\#6577](https://github.com/ClickHouse/ClickHouse/pull/6577) ([proller](https://github.com/proller)) +- Added optional message argument in `throwIf`. ([\#5772](https://github.com/ClickHouse/ClickHouse/issues/5772)) [\#6329](https://github.com/ClickHouse/ClickHouse/pull/6329) ([Vdimir](https://github.com/Vdimir)) +- A server exception received while sending insertion data is now processed in the client as well.
[\#5891](https://github.com/ClickHouse/ClickHouse/issues/5891) [\#6711](https://github.com/ClickHouse/ClickHouse/pull/6711) ([dimarub2000](https://github.com/dimarub2000)) +- Added a metric `DistributedFilesToInsert` that shows the total number of files in the filesystem that are selected to send to remote servers by Distributed tables. The number is summed across all shards. [\#6600](https://github.com/ClickHouse/ClickHouse/pull/6600) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Move most of the JOINs prepare logic from `ExpressionAction/ExpressionAnalyzer` to `AnalyzedJoin`. [\#6785](https://github.com/ClickHouse/ClickHouse/pull/6785) ([Artem Zuikov](https://github.com/4ertus2)) +- Fix TSan [warning](https://clickhouse-test-reports.s3.yandex.net/6399/c1c1d1daa98e199e620766f1bd06a5921050a00d/functional_stateful_tests_(thread).html) ‘lock-order-inversion’. [\#6740](https://github.com/ClickHouse/ClickHouse/pull/6740) ([Vasily Nemkov](https://github.com/Enmk)) +- Better information messages about lack of Linux capabilities. Logging fatal errors with the “fatal” level, which will make them easier to find in `system.text_log`. [\#6441](https://github.com/ClickHouse/ClickHouse/pull/6441) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- When dumping temporary data to the disk to restrict memory usage during `GROUP BY`, `ORDER BY` was enabled, the free disk space was not checked. The fix adds a new setting `min_free_disk_space`: when the free disk space is smaller than the threshold, the query will stop and throw `ErrorCodes::NOT_ENOUGH_SPACE`. [\#6678](https://github.com/ClickHouse/ClickHouse/pull/6678) ([Weiqing Xu](https://github.com/weiqxu)) [\#6691](https://github.com/ClickHouse/ClickHouse/pull/6691) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Removed recursive rwlock by thread. It made no sense, because threads are reused between queries. A `SELECT` query may acquire a lock in one thread, hold a lock from another thread and exit from the first thread. At the same time, the first thread can be reused by a `DROP` query. This led to false “Attempt to acquire exclusive lock recursively” messages. [\#6771](https://github.com/ClickHouse/ClickHouse/pull/6771) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Split `ExpressionAnalyzer.appendJoin()`. Prepare a place in `ExpressionAnalyzer` for `MergeJoin`. [\#6524](https://github.com/ClickHouse/ClickHouse/pull/6524) ([Artem Zuikov](https://github.com/4ertus2)) +- Added `mysql_native_password` authentication plugin to the MySQL compatibility server. [\#6194](https://github.com/ClickHouse/ClickHouse/pull/6194) ([Yuriy Baranov](https://github.com/yurriy)) +- Fewer `clock_gettime` calls; fixed ABI compatibility between debug/release in `Allocator` (insignificant issue). [\#6197](https://github.com/ClickHouse/ClickHouse/pull/6197) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Move `collectUsedColumns` from `ExpressionAnalyzer` to `SyntaxAnalyzer`. `SyntaxAnalyzer` makes `required_source_columns` itself now. [\#6416](https://github.com/ClickHouse/ClickHouse/pull/6416) ([Artem Zuikov](https://github.com/4ertus2)) +- Add setting `joined_subquery_requires_alias` to require aliases for subselects and table functions in `FROM` when more than one table is present (i.e. queries with JOINs). [\#6733](https://github.com/ClickHouse/ClickHouse/pull/6733) ([Artem Zuikov](https://github.com/4ertus2)) +- Extract `GetAggregatesVisitor` class from `ExpressionAnalyzer`.
[\#6458](https://github.com/ClickHouse/ClickHouse/pull/6458) ([Artem Zuikov](https://github.com/4ertus2)) +- `system.query_log`: change data type of `type` column to `Enum`. [\#6265](https://github.com/ClickHouse/ClickHouse/pull/6265) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) +- Static linking of `sha256_password` authentication plugin. [\#6512](https://github.com/ClickHouse/ClickHouse/pull/6512) ([Yuriy Baranov](https://github.com/yurriy)) +- Avoid extra dependency for the setting `compile` to work. In previous versions, the user may get errors like `cannot open crti.o`, `unable to find library -lc` etc. [\#6309](https://github.com/ClickHouse/ClickHouse/pull/6309) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- More validation of the input that may come from a malicious replica. [\#6303](https://github.com/ClickHouse/ClickHouse/pull/6303) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Now the `clickhouse-obfuscator` file is available in the `clickhouse-client` package. In previous versions it was available as `clickhouse obfuscator` (with whitespace). [\#5816](https://github.com/ClickHouse/ClickHouse/issues/5816) [\#6609](https://github.com/ClickHouse/ClickHouse/pull/6609) ([dimarub2000](https://github.com/dimarub2000)) +- Fixed deadlock when we have at least two queries that read at least two tables in different order and another query that performs a DDL operation on one of the tables. Fixed another very rare deadlock. [\#6764](https://github.com/ClickHouse/ClickHouse/pull/6764) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added `os_thread_ids` column to `system.processes` and `system.query_log` for better debugging possibilities. [\#6763](https://github.com/ClickHouse/ClickHouse/pull/6763) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- A workaround for PHP mysqlnd extension bugs which occur when `sha256_password` is used as a default authentication plugin (described in [\#6031](https://github.com/ClickHouse/ClickHouse/issues/6031)). [\#6113](https://github.com/ClickHouse/ClickHouse/pull/6113) ([Yuriy Baranov](https://github.com/yurriy)) +- Remove unneeded place with changed nullability columns. [\#6693](https://github.com/ClickHouse/ClickHouse/pull/6693) ([Artem Zuikov](https://github.com/4ertus2)) +- Set default value of `queue_max_wait_ms` to zero, because the current value (five seconds) makes no sense. There are rare circumstances when this setting has any use. Added settings `replace_running_query_max_wait_ms`, `kafka_max_wait_ms` and `connection_pool_max_wait_ms` for disambiguation. [\#6692](https://github.com/ClickHouse/ClickHouse/pull/6692) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Extract `SelectQueryExpressionAnalyzer` from `ExpressionAnalyzer`. Keep the latter for non-select queries. [\#6499](https://github.com/ClickHouse/ClickHouse/pull/6499) ([Artem Zuikov](https://github.com/4ertus2)) +- Removed duplicating input and output formats. [\#6239](https://github.com/ClickHouse/ClickHouse/pull/6239) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Allow user to override `poll_interval` and `idle_connection_timeout` settings on connection. [\#6230](https://github.com/ClickHouse/ClickHouse/pull/6230) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- `MergeTree` now has an additional option `ttl_only_drop_parts` (disabled by default) to avoid partial pruning of parts, so that they are dropped completely when all the rows in a part are expired.
[\#6191](https://github.com/ClickHouse/ClickHouse/pull/6191) ([Sergi Vladykin](https://github.com/svladykin)) +- Type checks for set index functions. Throw an exception if a function got a wrong type. This fixes the fuzz test with UBSan. [\#6511](https://github.com/ClickHouse/ClickHouse/pull/6511) ([Nikita Vasilev](https://github.com/nikvas0)) + +#### Performance Improvement {#performance-improvement-2} + +- Optimize queries with `ORDER BY expressions` clause, where `expressions` have a coinciding prefix with the sorting key in `MergeTree` tables. This optimization is controlled by the `optimize_read_in_order` setting. [\#6054](https://github.com/ClickHouse/ClickHouse/pull/6054) [\#6629](https://github.com/ClickHouse/ClickHouse/pull/6629) ([Anton Popov](https://github.com/CurtizJ)) +- Allow to use multiple threads during parts loading and removal. [\#6372](https://github.com/ClickHouse/ClickHouse/issues/6372) [\#6074](https://github.com/ClickHouse/ClickHouse/issues/6074) [\#6438](https://github.com/ClickHouse/ClickHouse/pull/6438) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Implemented a batch variant of updating aggregate function states. It may lead to performance benefits. [\#6435](https://github.com/ClickHouse/ClickHouse/pull/6435) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Using `FastOps` library for functions `exp`, `log`, `sigmoid`, `tanh`. FastOps is a fast vector math library from Michael Parakhin (Yandex CTO). Improved performance of `exp` and `log` functions more than 6 times. The functions `exp` and `log` from a `Float32` argument will return `Float32` (in previous versions they always returned `Float64`). Now `exp(nan)` may return `inf`. The result of `exp` and `log` functions may not be the nearest machine-representable number to the true answer. [\#6254](https://github.com/ClickHouse/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov)) Using Danila Kutenin’s variant to make fastops work [\#6317](https://github.com/ClickHouse/ClickHouse/pull/6317) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Disable consecutive key optimization for `UInt8/16`. [\#6298](https://github.com/ClickHouse/ClickHouse/pull/6298) [\#6701](https://github.com/ClickHouse/ClickHouse/pull/6701) ([akuzm](https://github.com/akuzm)) +- Improved performance of `simdjson` library by getting rid of dynamic allocation in `ParsedJson::Iterator`. [\#6479](https://github.com/ClickHouse/ClickHouse/pull/6479) ([Vitaly Baranov](https://github.com/vitlibar)) +- Pre-fault pages when allocating memory with `mmap()`. [\#6667](https://github.com/ClickHouse/ClickHouse/pull/6667) ([akuzm](https://github.com/akuzm)) +- Fix performance bug in `Decimal` comparison. [\#6380](https://github.com/ClickHouse/ClickHouse/pull/6380) ([Artem Zuikov](https://github.com/4ertus2)) + +#### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-4} + +- Remove Compiler (runtime template instantiation) because we’ve won over its performance. [\#6646](https://github.com/ClickHouse/ClickHouse/pull/6646) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added a performance test to show degradation of performance in gcc-9 in a more isolated way. [\#6302](https://github.com/ClickHouse/ClickHouse/pull/6302) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added table function `numbers_mt`, which is a multithreaded version of `numbers`. Updated performance tests with hash functions.
[\#6554](https://github.com/ClickHouse/ClickHouse/pull/6554) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Comparison mode in `clickhouse-benchmark`. [\#6220](https://github.com/ClickHouse/ClickHouse/issues/6220) [\#6343](https://github.com/ClickHouse/ClickHouse/pull/6343) ([dimarub2000](https://github.com/dimarub2000)) +- Best effort for printing stack traces. Also added `SIGPROF` as a debugging signal to print a stack trace of a running thread. [\#6529](https://github.com/ClickHouse/ClickHouse/pull/6529) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Every function in its own file, part 10. [\#6321](https://github.com/ClickHouse/ClickHouse/pull/6321) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Remove duplicated const `TABLE_IS_READ_ONLY`. [\#6566](https://github.com/ClickHouse/ClickHouse/pull/6566) ([filimonov](https://github.com/filimonov)) +- Formatting changes for `StringHashMap` PR [\#5417](https://github.com/ClickHouse/ClickHouse/issues/5417). [\#6700](https://github.com/ClickHouse/ClickHouse/pull/6700) ([akuzm](https://github.com/akuzm)) +- Better subquery for join creation in `ExpressionAnalyzer`. [\#6824](https://github.com/ClickHouse/ClickHouse/pull/6824) ([Artem Zuikov](https://github.com/4ertus2)) +- Remove a redundant condition (found by PVS-Studio). [\#6775](https://github.com/ClickHouse/ClickHouse/pull/6775) ([akuzm](https://github.com/akuzm)) +- Separate the hash table interface for `ReverseIndex`. [\#6672](https://github.com/ClickHouse/ClickHouse/pull/6672) ([akuzm](https://github.com/akuzm)) +- Refactoring of settings. [\#6689](https://github.com/ClickHouse/ClickHouse/pull/6689) ([alesapin](https://github.com/alesapin)) +- Add comments for `set` index functions. [\#6319](https://github.com/ClickHouse/ClickHouse/pull/6319) ([Nikita Vasilev](https://github.com/nikvas0)) +- Increase OOM score in debug version on Linux. [\#6152](https://github.com/ClickHouse/ClickHouse/pull/6152) ([akuzm](https://github.com/akuzm)) +- HDFS HA now works in debug build. [\#6650](https://github.com/ClickHouse/ClickHouse/pull/6650) ([Weiqing Xu](https://github.com/weiqxu)) +- Added a test to `transform_query_for_external_database`. [\#6388](https://github.com/ClickHouse/ClickHouse/pull/6388) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Add a test for multiple materialized views for a Kafka table. [\#6509](https://github.com/ClickHouse/ClickHouse/pull/6509) ([Ivan](https://github.com/abyss7)) +- Make a better build scheme. [\#6500](https://github.com/ClickHouse/ClickHouse/pull/6500) ([Ivan](https://github.com/abyss7)) +- Fixed the `test_external_dictionaries` integration test when it was executed under a non-root user. [\#6507](https://github.com/ClickHouse/ClickHouse/pull/6507) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- The bug reproduces when the total size of written packets exceeds `DBMS_DEFAULT_BUFFER_SIZE`. [\#6204](https://github.com/ClickHouse/ClickHouse/pull/6204) ([Yuriy Baranov](https://github.com/yurriy)) +- Added a test for the `RENAME` table race condition. [\#6752](https://github.com/ClickHouse/ClickHouse/pull/6752) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Avoid data race on Settings in `KILL QUERY`. [\#6753](https://github.com/ClickHouse/ClickHouse/pull/6753) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Add an integration test for handling errors by a cache dictionary.
[\#6755](https://github.com/ClickHouse/ClickHouse/pull/6755) ([Vitaly Baranov](https://github.com/vitlibar)) +- Disable parsing of ELF object files on Mac OS, because it makes no sense. [\#6578](https://github.com/ClickHouse/ClickHouse/pull/6578) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Attempt to make the changelog generator better. [\#6327](https://github.com/ClickHouse/ClickHouse/pull/6327) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added the `-Wshadow` switch to GCC. [\#6325](https://github.com/ClickHouse/ClickHouse/pull/6325) ([kreuzerkrieg](https://github.com/kreuzerkrieg)) +- Removed obsolete code for `mimalloc` support. [\#6715](https://github.com/ClickHouse/ClickHouse/pull/6715) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- `zlib-ng` determines x86 capabilities and saves this info to global variables. This is done in the deflateInit call, which may be made by different threads simultaneously. To avoid multithreaded writes, do it on library startup. [\#6141](https://github.com/ClickHouse/ClickHouse/pull/6141) ([akuzm](https://github.com/akuzm)) +- Regression test for a bug in join which was fixed in [\#5192](https://github.com/ClickHouse/ClickHouse/issues/5192). [\#6147](https://github.com/ClickHouse/ClickHouse/pull/6147) ([Bakhtiyor Ruziev](https://github.com/theruziev)) +- Fixed MSan report. [\#6144](https://github.com/ClickHouse/ClickHouse/pull/6144) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix flapping TTL test. [\#6782](https://github.com/ClickHouse/ClickHouse/pull/6782) ([Anton Popov](https://github.com/CurtizJ)) +- Fixed false data race in `MergeTreeDataPart::is_frozen` field. [\#6583](https://github.com/ClickHouse/ClickHouse/pull/6583) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed timeouts in fuzz test. In the previous version, it managed to find a false hangup in the query `SELECT * FROM numbers_mt(gccMurmurHash(''))`. [\#6582](https://github.com/ClickHouse/ClickHouse/pull/6582) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added debug checks to `static_cast` of columns. [\#6581](https://github.com/ClickHouse/ClickHouse/pull/6581) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Support for Oracle Linux in official RPM packages. [\#6356](https://github.com/ClickHouse/ClickHouse/issues/6356) [\#6585](https://github.com/ClickHouse/ClickHouse/pull/6585) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Changed json perftests from `once` to `loop` type. [\#6536](https://github.com/ClickHouse/ClickHouse/pull/6536) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- `odbc-bridge.cpp` defines `main()` so it should not be included in `clickhouse-lib`. [\#6538](https://github.com/ClickHouse/ClickHouse/pull/6538) ([Orivej Desh](https://github.com/orivej)) +- Test for crash in `FULL|RIGHT JOIN` with nulls in the right table’s keys. [\#6362](https://github.com/ClickHouse/ClickHouse/pull/6362) ([Artem Zuikov](https://github.com/4ertus2)) +- Added a test for the limit on expansion of aliases, just in case. [\#6442](https://github.com/ClickHouse/ClickHouse/pull/6442) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Switched from `boost::filesystem` to `std::filesystem` where appropriate. [\#6253](https://github.com/ClickHouse/ClickHouse/pull/6253) [\#6385](https://github.com/ClickHouse/ClickHouse/pull/6385) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added RPM packages to the website.
[\#6251](https://github.com/ClickHouse/ClickHouse/pull/6251) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Add a test for the fixed `Unknown identifier` exception in the `IN` section. [\#6708](https://github.com/ClickHouse/ClickHouse/pull/6708) ([Artem Zuikov](https://github.com/4ertus2)) +- Simplify `shared_ptr_helper` because people face difficulties understanding it. [\#6675](https://github.com/ClickHouse/ClickHouse/pull/6675) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added performance tests for the fixed Gorilla and DoubleDelta codecs. [\#6179](https://github.com/ClickHouse/ClickHouse/pull/6179) ([Vasily Nemkov](https://github.com/Enmk)) +- Split the integration test `test_dictionaries` into 4 separate tests. [\#6776](https://github.com/ClickHouse/ClickHouse/pull/6776) ([Vitaly Baranov](https://github.com/vitlibar)) +- Fix PVS-Studio warning in `PipelineExecutor`. [\#6777](https://github.com/ClickHouse/ClickHouse/pull/6777) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Allow using the `library` dictionary source with ASan. [\#6482](https://github.com/ClickHouse/ClickHouse/pull/6482) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added an option to generate the changelog from a list of PRs. [\#6350](https://github.com/ClickHouse/ClickHouse/pull/6350) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Lock the `TinyLog` storage when reading. [\#6226](https://github.com/ClickHouse/ClickHouse/pull/6226) ([akuzm](https://github.com/akuzm)) +- Check for broken symlinks in CI. [\#6634](https://github.com/ClickHouse/ClickHouse/pull/6634) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Increase timeout for the “stack overflow” test because it may take a long time in debug build. [\#6637](https://github.com/ClickHouse/ClickHouse/pull/6637) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added a check for double whitespaces. [\#6643](https://github.com/ClickHouse/ClickHouse/pull/6643) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix `new/delete` memory tracking when built with sanitizers. Tracking is not clear. It only prevents memory limit exceptions in tests. [\#6450](https://github.com/ClickHouse/ClickHouse/pull/6450) ([Artem Zuikov](https://github.com/4ertus2)) +- Re-enable the check for undefined symbols while linking. [\#6453](https://github.com/ClickHouse/ClickHouse/pull/6453) ([Ivan](https://github.com/abyss7)) +- Avoid rebuilding `hyperscan` every day. [\#6307](https://github.com/ClickHouse/ClickHouse/pull/6307) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed UBSan report in `ProtobufWriter`. [\#6163](https://github.com/ClickHouse/ClickHouse/pull/6163) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Don’t allow using the query profiler with sanitizers because it is not compatible. [\#6769](https://github.com/ClickHouse/ClickHouse/pull/6769) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Add a test for reloading a dictionary by timer after a failure. [\#6114](https://github.com/ClickHouse/ClickHouse/pull/6114) ([Vitaly Baranov](https://github.com/vitlibar)) +- Fix inconsistency in `PipelineExecutor::prepareProcessor` argument type. [\#6494](https://github.com/ClickHouse/ClickHouse/pull/6494) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Added a test for bad URIs. [\#6493](https://github.com/ClickHouse/ClickHouse/pull/6493) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added more checks to the `CAST` function.
This should give more information about the segmentation fault in the fuzz test. [\#6346](https://github.com/ClickHouse/ClickHouse/pull/6346) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Added `gcc-9` support to the `docker/builder` container that builds the image locally. [\#6333](https://github.com/ClickHouse/ClickHouse/pull/6333) ([Gleb Novikov](https://github.com/NanoBjorn)) +- Test for primary key with `LowCardinality(String)`. [\#5044](https://github.com/ClickHouse/ClickHouse/issues/5044) [\#6219](https://github.com/ClickHouse/ClickHouse/pull/6219) ([dimarub2000](https://github.com/dimarub2000)) +- Fixed tests affected by slow stack trace printing. [\#6315](https://github.com/ClickHouse/ClickHouse/pull/6315) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Add a test case for the crash in `groupUniqArray` fixed in [\#6029](https://github.com/ClickHouse/ClickHouse/pull/6029). [\#4402](https://github.com/ClickHouse/ClickHouse/issues/4402) [\#6129](https://github.com/ClickHouse/ClickHouse/pull/6129) ([akuzm](https://github.com/akuzm)) +- Fixed indices mutations tests. [\#6645](https://github.com/ClickHouse/ClickHouse/pull/6645) ([Nikita Vasilev](https://github.com/nikvas0)) +- In performance test, do not read the query log for queries we didn’t run. [\#6427](https://github.com/ClickHouse/ClickHouse/pull/6427) ([akuzm](https://github.com/akuzm)) +- Materialized views now can be created with any low cardinality types regardless of the setting about suspicious low cardinality types. [\#6428](https://github.com/ClickHouse/ClickHouse/pull/6428) ([Olga Khvostikova](https://github.com/stavrolia)) +- Updated tests for the `send_logs_level` setting. [\#6207](https://github.com/ClickHouse/ClickHouse/pull/6207) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fix build under gcc-8.2. [\#6196](https://github.com/ClickHouse/ClickHouse/pull/6196) ([Max Akhmedov](https://github.com/zlobober)) +- Fix build with internal libc++. [\#6724](https://github.com/ClickHouse/ClickHouse/pull/6724) ([Ivan](https://github.com/abyss7)) +- Fix shared build with the `rdkafka` library. [\#6101](https://github.com/ClickHouse/ClickHouse/pull/6101) ([Ivan](https://github.com/abyss7)) +- Fixes for Mac OS build (incomplete). [\#6390](https://github.com/ClickHouse/ClickHouse/pull/6390) ([alexey-milovidov](https://github.com/alexey-milovidov)) [\#6429](https://github.com/ClickHouse/ClickHouse/pull/6429) ([alex-zaitsev](https://github.com/alex-zaitsev)) +- Fix “splitted” build. [\#6618](https://github.com/ClickHouse/ClickHouse/pull/6618) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Other build fixes: [\#6186](https://github.com/ClickHouse/ClickHouse/pull/6186) ([Amos Bird](https://github.com/amosbird)) [\#6486](https://github.com/ClickHouse/ClickHouse/pull/6486) [\#6348](https://github.com/ClickHouse/ClickHouse/pull/6348) ([vxider](https://github.com/Vxider)) [\#6744](https://github.com/ClickHouse/ClickHouse/pull/6744) ([Ivan](https://github.com/abyss7)) [\#6016](https://github.com/ClickHouse/ClickHouse/pull/6016) [\#6421](https://github.com/ClickHouse/ClickHouse/pull/6421) [\#6491](https://github.com/ClickHouse/ClickHouse/pull/6491) ([proller](https://github.com/proller)) + +#### Backward Incompatible Change {#backward-incompatible-change-3} + +- Removed the rarely used table function `catBoostPool` and storage `CatBoostPool`. If you have used this table function, please write an email to `clickhouse-feedback@yandex-team.com`. Note that CatBoost integration remains and will be supported.
[\#6279](https://github.com/ClickHouse/ClickHouse/pull/6279) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Disable `ANY RIGHT JOIN` and `ANY FULL JOIN` by default. Set the `any_join_distinct_right_table_keys` setting to enable them. [\#5126](https://github.com/ClickHouse/ClickHouse/issues/5126) [\#6351](https://github.com/ClickHouse/ClickHouse/pull/6351) ([Artem Zuikov](https://github.com/4ertus2)) + +## ClickHouse release 19.13 {#clickhouse-release-19-13} + +### ClickHouse release 19.13.6.51, 2019-10-02 {#clickhouse-release-19-13-6-51-2019-10-02} + +#### Bug Fix {#bug-fix-9} + +- This release also contains all bug fixes from 19.11.12.69. + +### ClickHouse release 19.13.5.44, 2019-09-20 {#clickhouse-release-19-13-5-44-2019-09-20} + +#### Bug Fix {#bug-fix-10} + +- This release also contains all bug fixes from 19.14.6.12. +- Fixed possible inconsistent state of a table while executing a `DROP` query for a replicated table while ZooKeeper is not accessible. [\#6045](https://github.com/ClickHouse/ClickHouse/issues/6045) [\#6413](https://github.com/ClickHouse/ClickHouse/pull/6413) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) +- Fix for data race in StorageMerge. [\#6717](https://github.com/ClickHouse/ClickHouse/pull/6717) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix bug introduced in the query profiler which leads to endless `recv` from a socket. [\#6386](https://github.com/ClickHouse/ClickHouse/pull/6386) ([alesapin](https://github.com/alesapin)) +- Fix excessive CPU usage while executing the `JSONExtractRaw` function over a boolean value. [\#6208](https://github.com/ClickHouse/ClickHouse/pull/6208) ([Vitaly Baranov](https://github.com/vitlibar)) +- Fixes the regression while pushing to a materialized view. [\#6415](https://github.com/ClickHouse/ClickHouse/pull/6415) ([Ivan](https://github.com/abyss7)) +- Table function `url` had a vulnerability that allowed the attacker to inject arbitrary HTTP headers into the request. This issue was found by [Nikita Tikhomirov](https://github.com/NSTikhomirov). [\#6466](https://github.com/ClickHouse/ClickHouse/pull/6466) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix useless `AST` check in Set index. [\#6510](https://github.com/ClickHouse/ClickHouse/issues/6510) [\#6651](https://github.com/ClickHouse/ClickHouse/pull/6651) ([Nikita Vasilev](https://github.com/nikvas0)) +- Fixed parsing of `AggregateFunction` values embedded in a query. [\#6575](https://github.com/ClickHouse/ClickHouse/issues/6575) [\#6773](https://github.com/ClickHouse/ClickHouse/pull/6773) ([Zhichang Yu](https://github.com/yuzhichang)) +- Fixed wrong behaviour of the `trim` family of functions. [\#6647](https://github.com/ClickHouse/ClickHouse/pull/6647) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +### ClickHouse release 19.13.4.32, 2019-09-10 {#clickhouse-release-19-13-4-32-2019-09-10} + +#### Bug Fix {#bug-fix-11} + +- This release also contains all security bug fixes from 19.11.9.52 and 19.11.10.54. +- Fixed data race in `system.parts` table and `ALTER` query. [\#6245](https://github.com/ClickHouse/ClickHouse/issues/6245) [\#6513](https://github.com/ClickHouse/ClickHouse/pull/6513) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed mismatched headers in streams that happened when reading from an empty distributed table with sample and prewhere.
[\#6167](https://github.com/ClickHouse/ClickHouse/issues/6167) ([Lixiang Qian](https://github.com/fancyqlx)) [\#6823](https://github.com/ClickHouse/ClickHouse/pull/6823) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fixed crash when using `IN` clause with a subquery with a tuple. [\#6125](https://github.com/ClickHouse/ClickHouse/issues/6125) [\#6550](https://github.com/ClickHouse/ClickHouse/pull/6550) ([tavplubix](https://github.com/tavplubix)) +- Fix case with same column names in `GLOBAL JOIN ON` section. [\#6181](https://github.com/ClickHouse/ClickHouse/pull/6181) ([Artem Zuikov](https://github.com/4ertus2)) +- Fix crash when casting types to `Decimal` that do not support it. Throw an exception instead. [\#6297](https://github.com/ClickHouse/ClickHouse/pull/6297) ([Artem Zuikov](https://github.com/4ertus2)) +- Fixed crash in `extractAll()` function. [\#6644](https://github.com/ClickHouse/ClickHouse/pull/6644) ([Artem Zuikov](https://github.com/4ertus2)) +- Query transformation for `MySQL`, `ODBC`, `JDBC` table functions now works properly for `SELECT WHERE` queries with multiple `AND` expressions. [\#6381](https://github.com/ClickHouse/ClickHouse/issues/6381) [\#6676](https://github.com/ClickHouse/ClickHouse/pull/6676) ([dimarub2000](https://github.com/dimarub2000)) +- Added previous declaration checks for MySQL 8 integration. [\#6569](https://github.com/ClickHouse/ClickHouse/pull/6569) ([Rafael David Tinoco](https://github.com/rafaeldtinoco)) + +#### Security Fix {#security-fix-1} + +- Fix two vulnerabilities in codecs in the decompression phase (a malicious user can fabricate compressed data that will lead to a buffer overflow in decompression). [\#6670](https://github.com/ClickHouse/ClickHouse/pull/6670) ([Artem Zuikov](https://github.com/4ertus2)) + +### ClickHouse release 19.13.3.26, 2019-08-22 {#clickhouse-release-19-13-3-26-2019-08-22} + +#### Bug Fix {#bug-fix-12} + +- Fix `ALTER TABLE ... UPDATE` query for tables with `enable_mixed_granularity_parts=1`. [\#6543](https://github.com/ClickHouse/ClickHouse/pull/6543) ([alesapin](https://github.com/alesapin)) +- Fix NPE when using IN clause with a subquery with a tuple. [\#6125](https://github.com/ClickHouse/ClickHouse/issues/6125) [\#6550](https://github.com/ClickHouse/ClickHouse/pull/6550) ([tavplubix](https://github.com/tavplubix)) +- Fixed an issue that if a stale replica becomes alive, it may still have data parts that were removed by DROP PARTITION. [\#6522](https://github.com/ClickHouse/ClickHouse/issues/6522) [\#6523](https://github.com/ClickHouse/ClickHouse/pull/6523) ([tavplubix](https://github.com/tavplubix)) +- Fixed an issue with parsing CSV [\#6426](https://github.com/ClickHouse/ClickHouse/issues/6426) [\#6559](https://github.com/ClickHouse/ClickHouse/pull/6559) ([tavplubix](https://github.com/tavplubix)) +- Fixed data race in system.parts table and ALTER query. This fixes [\#6245](https://github.com/ClickHouse/ClickHouse/issues/6245). [\#6513](https://github.com/ClickHouse/ClickHouse/pull/6513) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed wrong code in mutations that may lead to memory corruption. Fixed segfault with read of address `0x14c0` that may happen due to concurrent `DROP TABLE` and `SELECT` from `system.parts` or `system.parts_columns`. Fixed race condition in preparation of mutation queries. Fixed deadlock caused by `OPTIMIZE` of Replicated tables and concurrent modification operations like ALTERs.
[\#6514](https://github.com/ClickHouse/ClickHouse/pull/6514) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed possible data loss after `ALTER DELETE` query on a table with a skipping index. [\#6224](https://github.com/ClickHouse/ClickHouse/issues/6224) [\#6282](https://github.com/ClickHouse/ClickHouse/pull/6282) ([Nikita Vasilev](https://github.com/nikvas0)) + +#### Security Fix {#security-fix-2} + +- If the attacker has write access to ZooKeeper and is able to run a custom server available from the network where ClickHouse runs, they can create a custom-built malicious server that will act as a ClickHouse replica and register it in ZooKeeper. When another replica fetches a data part from the malicious replica, it can force clickhouse-server to write to an arbitrary path on the filesystem. Found by Eldar Zaitov, information security team at Yandex. [\#6247](https://github.com/ClickHouse/ClickHouse/pull/6247) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +### ClickHouse release 19.13.2.19, 2019-08-14 {#clickhouse-release-19-13-2-19-2019-08-14} + +#### New Feature {#new-feature-5} + +- Sampling profiler on query level. [Example](https://gist.github.com/alexey-milovidov/92758583dd41c24c360fdb8d6a4da194). [\#4247](https://github.com/ClickHouse/ClickHouse/issues/4247) ([laplab](https://github.com/laplab)) [\#6124](https://github.com/ClickHouse/ClickHouse/pull/6124) ([alexey-milovidov](https://github.com/alexey-milovidov)) [\#6250](https://github.com/ClickHouse/ClickHouse/pull/6250) [\#6283](https://github.com/ClickHouse/ClickHouse/pull/6283) [\#6386](https://github.com/ClickHouse/ClickHouse/pull/6386) +- Allow to specify a list of columns with the `COLUMNS('regexp')` expression that works like a more sophisticated variant of the `*` asterisk. [\#5951](https://github.com/ClickHouse/ClickHouse/pull/5951) ([mfridental](https://github.com/mfridental)), ([alexey-milovidov](https://github.com/alexey-milovidov)) +- `CREATE TABLE AS table_function()` is now possible. [\#6057](https://github.com/ClickHouse/ClickHouse/pull/6057) ([dimarub2000](https://github.com/dimarub2000)) +- Adam optimizer for stochastic gradient descent is used by default in the `stochasticLinearRegression()` and `stochasticLogisticRegression()` aggregate functions, because it shows good quality with almost no tuning. [\#6000](https://github.com/ClickHouse/ClickHouse/pull/6000) ([Quid37](https://github.com/Quid37)) +- Added functions for working with a custom week number. [\#5212](https://github.com/ClickHouse/ClickHouse/pull/5212) ([Andy Yang](https://github.com/andyyzh)) +- `RENAME` queries now work with all storages. [\#5953](https://github.com/ClickHouse/ClickHouse/pull/5953) ([Ivan](https://github.com/abyss7)) +- Now the client receives logs from the server with any desired level by setting `send_logs_level`, regardless of the log level specified in the server settings. [\#5964](https://github.com/ClickHouse/ClickHouse/pull/5964) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) + +#### Backward Incompatible Change {#backward-incompatible-change-4} + +- The setting `input_format_defaults_for_omitted_fields` is enabled by default. Inserts in Distributed tables need this setting to be the same across the cluster (you need to set it before a rolling update). It enables calculation of complex default expressions for omitted fields in `JSONEachRow` and `CSV*` formats. It should be the expected behavior but may lead to a negligible performance difference.
[\#6043](https://github.com/ClickHouse/ClickHouse/pull/6043) ([Artem Zuikov](https://github.com/4ertus2)), [\#5625](https://github.com/ClickHouse/ClickHouse/pull/5625) ([akuzm](https://github.com/akuzm)) + +#### Experimental features {#experimental-features} + +- New query processing pipeline. Use the `experimental_use_processors=1` option to enable it. Use it at your own risk. [\#4914](https://github.com/ClickHouse/ClickHouse/pull/4914) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) + +#### Bug Fix {#bug-fix-13} + +- Kafka integration has been fixed in this version. +- Fixed `DoubleDelta` encoding of `Int64` for large `DoubleDelta` values, improved `DoubleDelta` encoding for random data for `Int32`. [\#5998](https://github.com/ClickHouse/ClickHouse/pull/5998) ([Vasily Nemkov](https://github.com/Enmk)) +- Fixed overestimation of `max_rows_to_read` if the setting `merge_tree_uniform_read_distribution` is set to 0. [\#6019](https://github.com/ClickHouse/ClickHouse/pull/6019) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +#### Improvement {#improvement-4} + +- Throws an exception if a `config.d` file doesn’t have the same root element as the config file. [\#6123](https://github.com/ClickHouse/ClickHouse/pull/6123) ([dimarub2000](https://github.com/dimarub2000)) + +#### Performance Improvement {#performance-improvement-3} + +- Optimize `count()`. Now it uses the smallest column (if possible). [\#6028](https://github.com/ClickHouse/ClickHouse/pull/6028) ([Amos Bird](https://github.com/amosbird)) + +#### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-5} + +- Report memory usage in performance tests. [\#5899](https://github.com/ClickHouse/ClickHouse/pull/5899) ([akuzm](https://github.com/akuzm)) +- Fix build with external `libcxx`. [\#6010](https://github.com/ClickHouse/ClickHouse/pull/6010) ([Ivan](https://github.com/abyss7)) +- Fix shared build with the `rdkafka` library. [\#6101](https://github.com/ClickHouse/ClickHouse/pull/6101) ([Ivan](https://github.com/abyss7)) + +## ClickHouse release 19.11 {#clickhouse-release-19-11} + +### ClickHouse release 19.11.13.74, 2019-11-01 {#clickhouse-release-19-11-13-74-2019-11-01} + +#### Bug Fix {#bug-fix-14} + +- Fixed rare crash in `ALTER MODIFY COLUMN` and vertical merge when one of the merged/altered parts is empty (0 rows). [\#6780](https://github.com/ClickHouse/ClickHouse/pull/6780) ([alesapin](https://github.com/alesapin)) +- Manual update of `SIMDJSON`. This fixes possible flooding of stderr files with bogus json diagnostic messages. [\#7548](https://github.com/ClickHouse/ClickHouse/pull/7548) ([Alexander Kazakov](https://github.com/Akazz)) +- Fixed bug with the `mrk` file extension for mutations ([alesapin](https://github.com/alesapin)) + +### ClickHouse release 19.11.12.69, 2019-10-02 {#clickhouse-release-19-11-12-69-2019-10-02} + +#### Bug Fix {#bug-fix-15} + +- Fixed performance degradation of index analysis on complex keys on large tables. This fixes [\#6924](https://github.com/ClickHouse/ClickHouse/issues/6924). [\#7075](https://github.com/ClickHouse/ClickHouse/pull/7075) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Avoid rare SIGSEGV while sending data in tables with Distributed engine (`Failed to send batch: file with index XXXXX is absent`). [\#7032](https://github.com/ClickHouse/ClickHouse/pull/7032) ([Azat Khuzhin](https://github.com/azat)) +- Fix `Unknown identifier` with multiple joins. This fixes [\#5254](https://github.com/ClickHouse/ClickHouse/issues/5254).
[\#7022](https://github.com/ClickHouse/ClickHouse/pull/7022) ([Artem Zuikov](https://github.com/4ertus2)) + +### ClickHouse release 19.11.11.57, 2019-09-13 {#clickhouse-release-19-11-11-57-2019-09-13} + +- Fix logical error causing segfaults when selecting from an empty Kafka topic. [\#6902](https://github.com/ClickHouse/ClickHouse/issues/6902) [\#6909](https://github.com/ClickHouse/ClickHouse/pull/6909) ([Ivan](https://github.com/abyss7)) +- Fix for function `arrayEnumerateUniqRanked` with empty arrays in params. [\#6928](https://github.com/ClickHouse/ClickHouse/pull/6928) ([proller](https://github.com/proller)) + +### ClickHouse release 19.11.10.54, 2019-09-10 {#clickhouse-release-19-11-10-54-2019-09-10} + +#### Bug Fix {#bug-fix-16} + +- Store offsets for Kafka messages manually to be able to commit them all at once for all partitions. Fixes potential duplication in the “one consumer - many partitions” scenario. [\#6872](https://github.com/ClickHouse/ClickHouse/pull/6872) ([Ivan](https://github.com/abyss7)) + +### ClickHouse release 19.11.9.52, 2019-09-6 {#clickhouse-release-19-11-9-52-2019-09-6} + +- Improve error handling in cache dictionaries. [\#6737](https://github.com/ClickHouse/ClickHouse/pull/6737) ([Vitaly Baranov](https://github.com/vitlibar)) +- Fixed bug in function `arrayEnumerateUniqRanked`. [\#6779](https://github.com/ClickHouse/ClickHouse/pull/6779) ([proller](https://github.com/proller)) +- Fix `JSONExtract` function while extracting a `Tuple` from JSON. [\#6718](https://github.com/ClickHouse/ClickHouse/pull/6718) ([Vitaly Baranov](https://github.com/vitlibar)) +- Fixed possible data loss after `ALTER DELETE` query on a table with a skipping index. [\#6224](https://github.com/ClickHouse/ClickHouse/issues/6224) [\#6282](https://github.com/ClickHouse/ClickHouse/pull/6282) ([Nikita Vasilev](https://github.com/nikvas0)) +- Fixed performance test. [\#6392](https://github.com/ClickHouse/ClickHouse/pull/6392) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Parquet: Fix reading boolean columns. [\#6579](https://github.com/ClickHouse/ClickHouse/pull/6579) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed wrong behaviour of the `nullIf` function for constant arguments. [\#6518](https://github.com/ClickHouse/ClickHouse/pull/6518) ([Guillaume Tassery](https://github.com/YiuRULE)) [\#6580](https://github.com/ClickHouse/ClickHouse/pull/6580) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix Kafka messages duplication problem on normal server restart. [\#6597](https://github.com/ClickHouse/ClickHouse/pull/6597) ([Ivan](https://github.com/abyss7)) +- Fixed an issue when a long `ALTER UPDATE` or `ALTER DELETE` may prevent regular merges from running. Prevent mutations from executing if there are not enough free threads available. [\#6502](https://github.com/ClickHouse/ClickHouse/issues/6502) [\#6617](https://github.com/ClickHouse/ClickHouse/pull/6617) ([tavplubix](https://github.com/tavplubix)) +- Fixed error with processing “timezone” in the server configuration file. [\#6709](https://github.com/ClickHouse/ClickHouse/pull/6709) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix Kafka tests.
[\#6805](https://github.com/ClickHouse/ClickHouse/pull/6805) ([Ivan](https://github.com/abyss7)) + +#### Security Fix {#security-fix-3} + +- If the attacker has write access to ZooKeeper and is able to run a custom server available from the network where ClickHouse runs, they can create a custom-built malicious server that will act as a ClickHouse replica and register it in ZooKeeper. When another replica fetches a data part from the malicious replica, it can force clickhouse-server to write to an arbitrary path on the filesystem. Found by Eldar Zaitov, information security team at Yandex. [\#6247](https://github.com/ClickHouse/ClickHouse/pull/6247) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +### ClickHouse release 19.11.8.46, 2019-08-22 {#clickhouse-release-19-11-8-46-2019-08-22} + +#### Bug Fix {#bug-fix-17} + +- Fix `ALTER TABLE ... UPDATE` query for tables with `enable_mixed_granularity_parts=1`. [\#6543](https://github.com/ClickHouse/ClickHouse/pull/6543) ([alesapin](https://github.com/alesapin)) +- Fix NPE when using IN clause with a subquery with a tuple. [\#6125](https://github.com/ClickHouse/ClickHouse/issues/6125) [\#6550](https://github.com/ClickHouse/ClickHouse/pull/6550) ([tavplubix](https://github.com/tavplubix)) +- Fixed an issue that if a stale replica becomes alive, it may still have data parts that were removed by DROP PARTITION. [\#6522](https://github.com/ClickHouse/ClickHouse/issues/6522) [\#6523](https://github.com/ClickHouse/ClickHouse/pull/6523) ([tavplubix](https://github.com/tavplubix)) +- Fixed an issue with parsing CSV [\#6426](https://github.com/ClickHouse/ClickHouse/issues/6426) [\#6559](https://github.com/ClickHouse/ClickHouse/pull/6559) ([tavplubix](https://github.com/tavplubix)) +- Fixed data race in system.parts table and ALTER query. This fixes [\#6245](https://github.com/ClickHouse/ClickHouse/issues/6245). [\#6513](https://github.com/ClickHouse/ClickHouse/pull/6513) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed wrong code in mutations that may lead to memory corruption. Fixed segfault with read of address `0x14c0` that may happen due to concurrent `DROP TABLE` and `SELECT` from `system.parts` or `system.parts_columns`. Fixed race condition in preparation of mutation queries. Fixed deadlock caused by `OPTIMIZE` of Replicated tables and concurrent modification operations like ALTERs. [\#6514](https://github.com/ClickHouse/ClickHouse/pull/6514) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +### ClickHouse release 19.11.7.40, 2019-08-14 {#clickhouse-release-19-11-7-40-2019-08-14} + +#### Bug fix {#bug-fix-18} + +- Kafka integration has been fixed in this version. +- Fix segfault when using `arrayReduce` for constant arguments. [\#6326](https://github.com/ClickHouse/ClickHouse/pull/6326) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed `toFloat()` monotonicity. [\#6374](https://github.com/ClickHouse/ClickHouse/pull/6374) ([dimarub2000](https://github.com/dimarub2000)) +- Fix segfault with enabled `optimize_skip_unused_shards` and a missing sharding key. [\#6384](https://github.com/ClickHouse/ClickHouse/pull/6384) ([CurtizJ](https://github.com/CurtizJ)) +- Fixed logic of `arrayEnumerateUniqRanked` function. [\#6423](https://github.com/ClickHouse/ClickHouse/pull/6423) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Removed extra verbose logging from the MySQL handler.
[\#6389](https://github.com/ClickHouse/ClickHouse/pull/6389) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix wrong behavior and possible segfaults in `topK` and `topKWeighted` aggregate functions. [\#6404](https://github.com/ClickHouse/ClickHouse/pull/6404) ([CurtizJ](https://github.com/CurtizJ)) +- Do not expose virtual columns in `system.columns` table. This is required for backward compatibility. [\#6406](https://github.com/ClickHouse/ClickHouse/pull/6406) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix bug with memory allocation for string fields in complex key cache dictionary. [\#6447](https://github.com/ClickHouse/ClickHouse/pull/6447) ([alesapin](https://github.com/alesapin)) +- Fix bug with enabling adaptive granularity when creating a new replica for `Replicated*MergeTree` table. [\#6452](https://github.com/ClickHouse/ClickHouse/pull/6452) ([alesapin](https://github.com/alesapin)) +- Fix infinite loop when reading Kafka messages. [\#6354](https://github.com/ClickHouse/ClickHouse/pull/6354) ([abyss7](https://github.com/abyss7)) +- Fixed the possibility of a fabricated query causing a server crash due to stack overflow in the SQL parser and the possibility of stack overflow in `Merge` and `Distributed` tables. [\#6433](https://github.com/ClickHouse/ClickHouse/pull/6433) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed Gorilla encoding error on small sequences. [\#6444](https://github.com/ClickHouse/ClickHouse/pull/6444) ([Enmk](https://github.com/Enmk)) + +#### Improvement {#improvement-5} + +- Allow user to override `poll_interval` and `idle_connection_timeout` settings on connection. [\#6230](https://github.com/ClickHouse/ClickHouse/pull/6230) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +### ClickHouse release 19.11.5.28, 2019-08-05 {#clickhouse-release-19-11-5-28-2019-08-05} + +#### Bug fix {#bug-fix-19} + +- Fixed the possibility of hanging queries when the server is overloaded. [\#6301](https://github.com/ClickHouse/ClickHouse/pull/6301) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix FPE in yandexConsistentHash function. This fixes [\#6304](https://github.com/ClickHouse/ClickHouse/issues/6304). [\#6126](https://github.com/ClickHouse/ClickHouse/pull/6126) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed bug in conversion of `LowCardinality` types in `AggregateFunctionFactory`. This fixes [\#6257](https://github.com/ClickHouse/ClickHouse/issues/6257). [\#6281](https://github.com/ClickHouse/ClickHouse/pull/6281) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fix parsing of `bool` settings from `true` and `false` strings in configuration files. [\#6278](https://github.com/ClickHouse/ClickHouse/pull/6278) ([alesapin](https://github.com/alesapin)) +- Fix rare bug with incompatible stream headers in queries to `Distributed` table over `MergeTree` table when part of `WHERE` moves to `PREWHERE`. [\#6236](https://github.com/ClickHouse/ClickHouse/pull/6236) ([alesapin](https://github.com/alesapin)) +- Fixed overflow in integer division of a signed type by an unsigned type (see the illustrative sketch after this release’s notes). This fixes [\#6214](https://github.com/ClickHouse/ClickHouse/issues/6214). [\#6233](https://github.com/ClickHouse/ClickHouse/pull/6233) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +#### Backward Incompatible Change {#backward-incompatible-change-5} + +- `Kafka` is still broken.
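+
+To illustrate why the behaviour fixed in the signed/unsigned division entry above was surprising, here is a minimal stand-alone C++ sketch of the C/C++ integer promotion rules it refers to. It is illustrative only (not ClickHouse code), and the variable names are arbitrary:
+
+```cpp
+#include <cstdint>
+#include <iostream>
+
+int main() {
+    int32_t dividend = -1;  // signed operand
+    uint32_t divisor = 2;   // unsigned operand
+    // The usual arithmetic conversions turn the signed operand into an unsigned one,
+    // so -1 becomes 4294967295 and the division yields a huge positive number
+    // instead of the intuitively expected 0.
+    std::cout << dividend / divisor << '\n';  // prints 2147483647
+    return 0;
+}
+```
+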
+ +### ClickHouse release 19.11.4.24, 2019-08-01 {#clickhouse-release-19-11-4-24-2019-08-01} + +#### Bug Fix {#bug-fix-20} + +- Fix bug with writing secondary indices marks with adaptive granularity. [\#6126](https://github.com/ClickHouse/ClickHouse/pull/6126) ([alesapin](https://github.com/alesapin)) +- Fix `WITH ROLLUP` and `WITH CUBE` modifiers of `GROUP BY` with two-level aggregation. [\#6225](https://github.com/ClickHouse/ClickHouse/pull/6225) ([Anton Popov](https://github.com/CurtizJ)) +- Fixed hang in `JSONExtractRaw` function. Fixed [\#6195](https://github.com/ClickHouse/ClickHouse/issues/6195) [\#6198](https://github.com/ClickHouse/ClickHouse/pull/6198) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix segfault in ExternalLoader::reloadOutdated(). [\#6082](https://github.com/ClickHouse/ClickHouse/pull/6082) ([Vitaly Baranov](https://github.com/vitlibar)) +- Fixed the case when the server may close listening sockets but not shut down and continues serving remaining queries. You may end up with two running clickhouse-server processes. Sometimes, the server may return an error `bad_function_call` for remaining queries. [\#6231](https://github.com/ClickHouse/ClickHouse/pull/6231) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed useless and incorrect condition on the update field for initial loading of external dictionaries via ODBC, MySQL, ClickHouse and HTTP. This fixes [\#6069](https://github.com/ClickHouse/ClickHouse/issues/6069) [\#6083](https://github.com/ClickHouse/ClickHouse/pull/6083) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed irrelevant exception in cast of `LowCardinality(Nullable)` to a not-Nullable column if it doesn’t contain Nulls (e.g. in a query like `SELECT CAST(CAST('Hello' AS LowCardinality(Nullable(String))) AS String)`). [\#6094](https://github.com/ClickHouse/ClickHouse/issues/6094) [\#6119](https://github.com/ClickHouse/ClickHouse/pull/6119) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fix non-deterministic result of the “uniq” aggregate function in extremely rare cases. The bug was present in all ClickHouse versions. [\#6058](https://github.com/ClickHouse/ClickHouse/pull/6058) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed segfault when we set a little bit too high CIDR on the function `IPv6CIDRToRange`. [\#6068](https://github.com/ClickHouse/ClickHouse/pull/6068) ([Guillaume Tassery](https://github.com/YiuRULE)) +- Fixed small memory leak when the server throws many exceptions from many different contexts. [\#6144](https://github.com/ClickHouse/ClickHouse/pull/6144) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix the situation when the consumer got paused before subscription and was not resumed afterwards. [\#6075](https://github.com/ClickHouse/ClickHouse/pull/6075) ([Ivan](https://github.com/abyss7)) Note that Kafka is broken in this version. +- Clearing the Kafka data buffer from the previous read operation that was completed with an error [\#6026](https://github.com/ClickHouse/ClickHouse/pull/6026) ([Nikolay](https://github.com/bopohaa)) Note that Kafka is broken in this version. +- Since `StorageMergeTree::background_task_handle` is initialized in `startup()`, `MergeTreeBlockOutputStream::write()` may try to use it before initialization. Just check if it is initialized.
+ +#### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-6} + +- Added official `rpm` packages. [\#5740](https://github.com/ClickHouse/ClickHouse/pull/5740) ([proller](https://github.com/proller)) ([alesapin](https://github.com/alesapin)) +- Add an ability to build `.rpm` and `.tgz` packages with the `packager` script. [\#5769](https://github.com/ClickHouse/ClickHouse/pull/5769) ([alesapin](https://github.com/alesapin)) +- Fixes for the “Arcadia” build system. [\#6223](https://github.com/ClickHouse/ClickHouse/pull/6223) ([proller](https://github.com/proller)) + +#### Backward Incompatible Change {#backward-incompatible-change-6} + +- `Kafka` is broken in this version. + +### ClickHouse release 19.11.3.11, 2019-07-18 {#clickhouse-release-19-11-3-11-2019-07-18} + +#### New Feature {#new-feature-6} + +- Added support for prepared statements. [\#5331](https://github.com/ClickHouse/ClickHouse/pull/5331/) ([Alexander](https://github.com/sanych73)) [\#5630](https://github.com/ClickHouse/ClickHouse/pull/5630) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- `DoubleDelta` and `Gorilla` column codecs (see the sketch below). [\#5600](https://github.com/ClickHouse/ClickHouse/pull/5600) ([Vasily Nemkov](https://github.com/Enmk)) +- Added `os_thread_priority` setting that allows controlling the “nice” value of query processing threads, which is used by the OS to adjust dynamic scheduling priority. It requires `CAP_SYS_NICE` capabilities to work. This implements [\#5858](https://github.com/ClickHouse/ClickHouse/issues/5858) [\#5909](https://github.com/ClickHouse/ClickHouse/pull/5909) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Implement `_topic`, `_offset`, `_key` columns for the Kafka engine [\#5382](https://github.com/ClickHouse/ClickHouse/pull/5382) ([Ivan](https://github.com/abyss7)) Note that Kafka is broken in this version. +- Add aggregate function combinator `-Resample` [\#5590](https://github.com/ClickHouse/ClickHouse/pull/5590) ([hcz](https://github.com/hczhcz)) +- Aggregate functions `groupArrayMovingSum(win_size)(x)` and `groupArrayMovingAvg(win_size)(x)`, which calculate a moving sum/avg with or without window-size limitation. [\#5595](https://github.com/ClickHouse/ClickHouse/pull/5595) ([inv2004](https://github.com/inv2004)) +- Add synonym `arrayFlatten` \<-\> `flatten` [\#5764](https://github.com/ClickHouse/ClickHouse/pull/5764) ([hcz](https://github.com/hczhcz)) +- Integrate the H3 function `geoToH3` from Uber. [\#4724](https://github.com/ClickHouse/ClickHouse/pull/4724) ([Remen Ivan](https://github.com/BHYCHIK)) [\#5805](https://github.com/ClickHouse/ClickHouse/pull/5805) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +#### Bug Fix {#bug-fix-21} + +- Implement a DNS cache with asynchronous updates. A separate thread resolves all hosts and updates the DNS cache periodically (setting `dns_cache_update_period`). It should help when the IP of hosts changes frequently. [\#5857](https://github.com/ClickHouse/ClickHouse/pull/5857) ([Anton Popov](https://github.com/CurtizJ)) +- Fix segfault in the `Delta` codec which affects columns with values of less than 32-bit size. The bug led to random memory corruption. [\#5786](https://github.com/ClickHouse/ClickHouse/pull/5786) ([alesapin](https://github.com/alesapin)) +- Fix segfault in TTL merge with non-physical columns in block. [\#5819](https://github.com/ClickHouse/ClickHouse/pull/5819) ([Anton Popov](https://github.com/CurtizJ))
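As a minimal sketch (the table is hypothetical) of how the `DoubleDelta` and `Gorilla` codecs from this release can be declared:

```sql
CREATE TABLE sensor_readings
(
    ts    DateTime CODEC(DoubleDelta),  -- suits monotonically increasing timestamps
    value Float64  CODEC(Gorilla)       -- suits slowly changing floating-point series
)
ENGINE = MergeTree
ORDER BY ts;
```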
+- Fix a rare bug in checking of a part with a `LowCardinality` column. Previously `checkDataPart` always failed for a part with a `LowCardinality` column. [\#5832](https://github.com/ClickHouse/ClickHouse/pull/5832) ([alesapin](https://github.com/alesapin)) +- Avoid hanging connections when the server thread pool is full. It is important for connections from the `remote` table function or connections to a shard without replicas when there is a long connection timeout. This fixes [\#5878](https://github.com/ClickHouse/ClickHouse/issues/5878) [\#5881](https://github.com/ClickHouse/ClickHouse/pull/5881) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Support for constant arguments to the `evalMLModel` function. This fixes [\#5817](https://github.com/ClickHouse/ClickHouse/issues/5817) [\#5820](https://github.com/ClickHouse/ClickHouse/pull/5820) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed the issue when ClickHouse determines the default time zone as `UCT` instead of `UTC`. This fixes [\#5804](https://github.com/ClickHouse/ClickHouse/issues/5804). [\#5828](https://github.com/ClickHouse/ClickHouse/pull/5828) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed buffer underflow in `visitParamExtractRaw`. This fixes [\#5901](https://github.com/ClickHouse/ClickHouse/issues/5901) [\#5902](https://github.com/ClickHouse/ClickHouse/pull/5902) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Now distributed `DROP/ALTER/TRUNCATE/OPTIMIZE ON CLUSTER` queries will be executed directly on the leader replica. [\#5757](https://github.com/ClickHouse/ClickHouse/pull/5757) ([alesapin](https://github.com/alesapin)) +- Fix `coalesce` for `ColumnConst` with `ColumnNullable` + related changes. [\#5755](https://github.com/ClickHouse/ClickHouse/pull/5755) ([Artem Zuikov](https://github.com/4ertus2)) +- Fix the `ReadBufferFromKafkaConsumer` so that it keeps reading new messages after `commit()` even if it was stalled before [\#5852](https://github.com/ClickHouse/ClickHouse/pull/5852) ([Ivan](https://github.com/abyss7)) +- Fix `FULL` and `RIGHT` JOIN results when joining on `Nullable` keys in the right table (illustrated below). [\#5859](https://github.com/ClickHouse/ClickHouse/pull/5859) ([Artem Zuikov](https://github.com/4ertus2)) +- Possible fix of infinite sleeping of low-priority queries. [\#5842](https://github.com/ClickHouse/ClickHouse/pull/5842) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix a race condition which caused some queries to not appear in query\_log after a `SYSTEM FLUSH LOGS` query. [\#5456](https://github.com/ClickHouse/ClickHouse/issues/5456) [\#5685](https://github.com/ClickHouse/ClickHouse/pull/5685) ([Anton Popov](https://github.com/CurtizJ)) +- Fixed `heap-use-after-free` ASan warning in ClusterCopier caused by a watch which tried to use an already removed copier object. [\#5871](https://github.com/ClickHouse/ClickHouse/pull/5871) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fixed wrong `StringRef` pointer returned by some implementations of `IColumn::deserializeAndInsertFromArena`. This bug affected only unit tests. [\#5973](https://github.com/ClickHouse/ClickHouse/pull/5973) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Prevent source and intermediate array join columns from masking same-name columns. [\#5941](https://github.com/ClickHouse/ClickHouse/pull/5941) ([Artem Zuikov](https://github.com/4ertus2))
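To illustrate the `FULL`/`RIGHT` JOIN fix above, here is a minimal, hypothetical query of the affected shape, where the right-hand key is `Nullable`:

```sql
-- r.id is Nullable(UInt64) in this sketch; unmatched rows from both sides are kept.
SELECT l.id, r.name
FROM left_table AS l
FULL JOIN right_table AS r ON l.id = r.id;
```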
+- Fix insert and select queries to the MySQL engine with MySQL-style identifier quoting. [\#5704](https://github.com/ClickHouse/ClickHouse/pull/5704) ([Winter Zhang](https://github.com/zhang2014)) +- Now the `CHECK TABLE` query can work with the MergeTree engine family (see the example below). It returns a check status and message, if any, for each part (or file in the case of simpler engines). Also, fix a bug in the fetch of a broken part. [\#5865](https://github.com/ClickHouse/ClickHouse/pull/5865) ([alesapin](https://github.com/alesapin)) +- Fix SPLIT\_SHARED\_LIBRARIES runtime [\#5793](https://github.com/ClickHouse/ClickHouse/pull/5793) ([Danila Kutenin](https://github.com/danlark1)) +- Fixed time zone initialization when `/etc/localtime` is a relative symlink like `../usr/share/zoneinfo/Europe/Moscow` [\#5922](https://github.com/ClickHouse/ClickHouse/pull/5922) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- clickhouse-copier: Fix use-after-free on shutdown [\#5752](https://github.com/ClickHouse/ClickHouse/pull/5752) ([proller](https://github.com/proller)) +- Updated `simdjson`. Fixed the issue that some invalid JSONs with zero bytes parsed successfully. [\#5938](https://github.com/ClickHouse/ClickHouse/pull/5938) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix shutdown of SystemLogs [\#5802](https://github.com/ClickHouse/ClickHouse/pull/5802) ([Anton Popov](https://github.com/CurtizJ)) +- Fix hanging when the condition in invalidate\_query depends on a dictionary. [\#6011](https://github.com/ClickHouse/ClickHouse/pull/6011) ([Vitaly Baranov](https://github.com/vitlibar)) + +#### Improvement {#improvement-6} + +- Allow unresolvable addresses in the cluster configuration. They will be considered unavailable, and resolution will be retried at every connection attempt. This is especially useful for Kubernetes. This fixes [\#5714](https://github.com/ClickHouse/ClickHouse/issues/5714) [\#5924](https://github.com/ClickHouse/ClickHouse/pull/5924) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Close idle TCP connections (with a one hour timeout by default). This is especially important for large clusters with multiple distributed tables on every server, because every server can possibly keep a connection pool to every other server, and after peak query concurrency, connections will stall. This fixes [\#5879](https://github.com/ClickHouse/ClickHouse/issues/5879) [\#5880](https://github.com/ClickHouse/ClickHouse/pull/5880) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Better quality of the `topK` function. Changed the SpaceSaving set behavior to remove the last element if the new element has a bigger weight. [\#5833](https://github.com/ClickHouse/ClickHouse/issues/5833) [\#5850](https://github.com/ClickHouse/ClickHouse/pull/5850) ([Guillaume Tassery](https://github.com/YiuRULE)) +- URL functions that work with domains can now handle incomplete URLs without a scheme [\#5725](https://github.com/ClickHouse/ClickHouse/pull/5725) ([alesapin](https://github.com/alesapin)) +- Checksums added to the `system.parts_columns` table. [\#5874](https://github.com/ClickHouse/ClickHouse/pull/5874) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) +- Added `Enum` data type as a synonym for `Enum8` or `Enum16`. [\#5886](https://github.com/ClickHouse/ClickHouse/pull/5886) ([dimarub2000](https://github.com/dimarub2000)) +- Full bit transpose variant for the `T64` codec. Could lead to better compression with `zstd`. [\#5742](https://github.com/ClickHouse/ClickHouse/pull/5742) ([Artem Zuikov](https://github.com/4ertus2))
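A short illustration of the `CHECK TABLE` support for the MergeTree family mentioned above (the table name is hypothetical); per the entry, it reports a check status and message for each part:

```sql
-- Returns the check result for the data parts of the table.
CHECK TABLE sensor_readings;
```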
+- A condition on the `startsWith` function can now use the primary key. This fixes [\#5310](https://github.com/ClickHouse/ClickHouse/issues/5310) and [\#5882](https://github.com/ClickHouse/ClickHouse/issues/5882) [\#5919](https://github.com/ClickHouse/ClickHouse/pull/5919) ([dimarub2000](https://github.com/dimarub2000)) +- Allow using `clickhouse-copier` with a cross-replication cluster topology by permitting an empty database name. [\#5745](https://github.com/ClickHouse/ClickHouse/pull/5745) ([nvartolomei](https://github.com/nvartolomei)) +- Use `UTC` as the default timezone on a system without `tzdata` (e.g. a bare Docker container). Before this patch, the error message `Could not determine local time zone` was printed and the server or client refused to start. [\#5827](https://github.com/ClickHouse/ClickHouse/pull/5827) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Returned support for a floating point argument in the function `quantileTiming` for backward compatibility. [\#5911](https://github.com/ClickHouse/ClickHouse/pull/5911) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Show which table is missing a column in error messages. [\#5768](https://github.com/ClickHouse/ClickHouse/pull/5768) ([Ivan](https://github.com/abyss7)) +- Disallow running queries with the same query\_id by different users [\#5430](https://github.com/ClickHouse/ClickHouse/pull/5430) ([proller](https://github.com/proller)) +- More robust code for sending metrics to Graphite. It will work even during long multiple `RENAME TABLE` operations. [\#5875](https://github.com/ClickHouse/ClickHouse/pull/5875) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- More informative error messages will be displayed when ThreadPool cannot schedule a task for execution. This fixes [\#5305](https://github.com/ClickHouse/ClickHouse/issues/5305) [\#5801](https://github.com/ClickHouse/ClickHouse/pull/5801) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Invert `ngramSearch` to be more intuitive [\#5807](https://github.com/ClickHouse/ClickHouse/pull/5807) ([Danila Kutenin](https://github.com/danlark1)) +- Add user parsing in the HDFS engine builder [\#5946](https://github.com/ClickHouse/ClickHouse/pull/5946) ([akonyaev90](https://github.com/akonyaev90)) +- Update the default value of the `max_ast_elements` parameter [\#5933](https://github.com/ClickHouse/ClickHouse/pull/5933) ([Artem Konovalov](https://github.com/izebit)) +- Added a notion of obsolete settings. The obsolete setting `allow_experimental_low_cardinality_type` can be used with no effect. [0f15c01c6802f7ce1a1494c12c846be8c98944cd](https://github.com/ClickHouse/ClickHouse/commit/0f15c01c6802f7ce1a1494c12c846be8c98944cd) [Alexey Milovidov](https://github.com/alexey-milovidov) + +#### Performance Improvement {#performance-improvement-4} + +- Increase the number of streams to SELECT from a Merge table for a more uniform distribution of threads. Added setting `max_streams_multiplier_for_merge_tables`. This fixes [\#5797](https://github.com/ClickHouse/ClickHouse/issues/5797) [\#5915](https://github.com/ClickHouse/ClickHouse/pull/5915) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +#### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-7} + +- Add a backward compatibility test for client-server interaction with different versions of ClickHouse.
[\#5868](https://github.com/ClickHouse/ClickHouse/pull/5868) ([alesapin](https://github.com/alesapin)) +- Test coverage information in every commit and pull request. [\#5896](https://github.com/ClickHouse/ClickHouse/pull/5896) ([alesapin](https://github.com/alesapin)) +- Cooperate with address sanitizer to support our custom allocators (`Arena` and `ArenaWithFreeLists`) for better debugging of “use-after-free” errors. [\#5728](https://github.com/ClickHouse/ClickHouse/pull/5728) ([akuzm](https://github.com/akuzm)) +- Switch to [LLVM libunwind implementation](https://github.com/llvm-mirror/libunwind) for C++ exception handling and for stack traces printing [\#4828](https://github.com/ClickHouse/ClickHouse/pull/4828) ([Nikita Lapkov](https://github.com/laplab)) +- Add two more warnings from -Weverything [\#5923](https://github.com/ClickHouse/ClickHouse/pull/5923) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Allow to build ClickHouse with Memory Sanitizer. [\#3949](https://github.com/ClickHouse/ClickHouse/pull/3949) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed ubsan report about `bitTest` function in fuzz test. [\#5943](https://github.com/ClickHouse/ClickHouse/pull/5943) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Docker: added possibility to init a ClickHouse instance which requires authentication. [\#5727](https://github.com/ClickHouse/ClickHouse/pull/5727) ([Korviakov Andrey](https://github.com/shurshun)) +- Update librdkafka to version 1.1.0 [\#5872](https://github.com/ClickHouse/ClickHouse/pull/5872) ([Ivan](https://github.com/abyss7)) +- Add global timeout for integration tests and disable some of them in tests code. [\#5741](https://github.com/ClickHouse/ClickHouse/pull/5741) ([alesapin](https://github.com/alesapin)) +- Fix some ThreadSanitizer failures. [\#5854](https://github.com/ClickHouse/ClickHouse/pull/5854) ([akuzm](https://github.com/akuzm)) +- The `--no-undefined` option forces the linker to check all external names for existence while linking. It’s very useful to track real dependencies between libraries in the split build mode. [\#5855](https://github.com/ClickHouse/ClickHouse/pull/5855) ([Ivan](https://github.com/abyss7)) +- Added performance test for [\#5797](https://github.com/ClickHouse/ClickHouse/issues/5797) [\#5914](https://github.com/ClickHouse/ClickHouse/pull/5914) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed compatibility with gcc-7. [\#5840](https://github.com/ClickHouse/ClickHouse/pull/5840) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added support for gcc-9. This fixes [\#5717](https://github.com/ClickHouse/ClickHouse/issues/5717) [\#5774](https://github.com/ClickHouse/ClickHouse/pull/5774) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed error when libunwind can be linked incorrectly. [\#5948](https://github.com/ClickHouse/ClickHouse/pull/5948) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed a few warnings found by PVS-Studio. [\#5921](https://github.com/ClickHouse/ClickHouse/pull/5921) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added initial support for `clang-tidy` static analyzer. 
[\#5806](https://github.com/ClickHouse/ClickHouse/pull/5806) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Convert BSD/Linux endian macros( ‘be64toh’ and ‘htobe64’) to the Mac OS X equivalents [\#5785](https://github.com/ClickHouse/ClickHouse/pull/5785) ([Fu Chen](https://github.com/fredchenbj)) +- Improved integration tests guide. [\#5796](https://github.com/ClickHouse/ClickHouse/pull/5796) ([Vladimir Chebotarev](https://github.com/excitoon)) +- Fixing build at macosx + gcc9 [\#5822](https://github.com/ClickHouse/ClickHouse/pull/5822) ([filimonov](https://github.com/filimonov)) +- Fix a hard-to-spot typo: aggreAGte -\> aggregate. [\#5753](https://github.com/ClickHouse/ClickHouse/pull/5753) ([akuzm](https://github.com/akuzm)) +- Fix freebsd build [\#5760](https://github.com/ClickHouse/ClickHouse/pull/5760) ([proller](https://github.com/proller)) +- Add link to experimental YouTube channel to website [\#5845](https://github.com/ClickHouse/ClickHouse/pull/5845) ([Ivan Blinkov](https://github.com/blinkov)) +- CMake: add option for coverage flags: WITH\_COVERAGE [\#5776](https://github.com/ClickHouse/ClickHouse/pull/5776) ([proller](https://github.com/proller)) +- Fix initial size of some inline PODArray’s. [\#5787](https://github.com/ClickHouse/ClickHouse/pull/5787) ([akuzm](https://github.com/akuzm)) +- clickhouse-server.postinst: fix os detection for centos 6 [\#5788](https://github.com/ClickHouse/ClickHouse/pull/5788) ([proller](https://github.com/proller)) +- Added Arch linux package generation. [\#5719](https://github.com/ClickHouse/ClickHouse/pull/5719) ([Vladimir Chebotarev](https://github.com/excitoon)) +- Split Common/config.h by libs (dbms) [\#5715](https://github.com/ClickHouse/ClickHouse/pull/5715) ([proller](https://github.com/proller)) +- Fixes for “Arcadia” build platform [\#5795](https://github.com/ClickHouse/ClickHouse/pull/5795) ([proller](https://github.com/proller)) +- Fixes for unconventional build (gcc9, no submodules) [\#5792](https://github.com/ClickHouse/ClickHouse/pull/5792) ([proller](https://github.com/proller)) +- Require explicit type in unalignedStore because it was proven to be bug-prone [\#5791](https://github.com/ClickHouse/ClickHouse/pull/5791) ([akuzm](https://github.com/akuzm)) +- Fixes MacOS build [\#5830](https://github.com/ClickHouse/ClickHouse/pull/5830) ([filimonov](https://github.com/filimonov)) +- Performance test concerning the new JIT feature with bigger dataset, as requested here [\#5263](https://github.com/ClickHouse/ClickHouse/issues/5263) [\#5887](https://github.com/ClickHouse/ClickHouse/pull/5887) ([Guillaume Tassery](https://github.com/YiuRULE)) +- Run stateful tests in stress test [12693e568722f11e19859742f56428455501fd2a](https://github.com/ClickHouse/ClickHouse/commit/12693e568722f11e19859742f56428455501fd2a) ([alesapin](https://github.com/alesapin)) + +#### Backward Incompatible Change {#backward-incompatible-change-7} + +- `Kafka` is broken in this version. +- Enable `adaptive_index_granularity` = 10MB by default for new `MergeTree` tables. If you created new MergeTree tables on version 19.11+, downgrade to versions prior to 19.6 will be impossible. [\#5628](https://github.com/ClickHouse/ClickHouse/pull/5628) ([alesapin](https://github.com/alesapin)) +- Removed obsolete undocumented embedded dictionaries that were used by Yandex.Metrica. The functions `OSIn`, `SEIn`, `OSToRoot`, `SEToRoot`, `OSHierarchy`, `SEHierarchy` are no longer available. 
If you are using these functions, write email to clickhouse-feedback@yandex-team.com. Note: at the last moment we decided to keep these functions for a while. [\#5780](https://github.com/ClickHouse/ClickHouse/pull/5780) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +## ClickHouse release 19.10 {#clickhouse-release-19-10} + +### ClickHouse release 19.10.1.5, 2019-07-12 {#clickhouse-release-19-10-1-5-2019-07-12} + +#### New Feature {#new-feature-7} + +- Add new column codec: `T64`. Made for (U)IntX/EnumX/Data(Time)/DecimalX columns. It should be good for columns with constant or small range values. Codec itself allows enlarge or shrink data type without re-compression. [\#5557](https://github.com/ClickHouse/ClickHouse/pull/5557) ([Artem Zuikov](https://github.com/4ertus2)) +- Add database engine `MySQL` that allow to view all the tables in remote MySQL server [\#5599](https://github.com/ClickHouse/ClickHouse/pull/5599) ([Winter Zhang](https://github.com/zhang2014)) +- `bitmapContains` implementation. It’s 2x faster than `bitmapHasAny` if the second bitmap contains one element. [\#5535](https://github.com/ClickHouse/ClickHouse/pull/5535) ([Zhichang Yu](https://github.com/yuzhichang)) +- Support for `crc32` function (with behaviour exactly as in MySQL or PHP). Do not use it if you need a hash function. [\#5661](https://github.com/ClickHouse/ClickHouse/pull/5661) ([Remen Ivan](https://github.com/BHYCHIK)) +- Implemented `SYSTEM START/STOP DISTRIBUTED SENDS` queries to control asynchronous inserts into `Distributed` tables. [\#4935](https://github.com/ClickHouse/ClickHouse/pull/4935) ([Winter Zhang](https://github.com/zhang2014)) + +#### Bug Fix {#bug-fix-22} + +- Ignore query execution limits and max parts size for merge limits while executing mutations. [\#5659](https://github.com/ClickHouse/ClickHouse/pull/5659) ([Anton Popov](https://github.com/CurtizJ)) +- Fix bug which may lead to deduplication of normal blocks (extremely rare) and insertion of duplicate blocks (more often). [\#5549](https://github.com/ClickHouse/ClickHouse/pull/5549) ([alesapin](https://github.com/alesapin)) +- Fix of function `arrayEnumerateUniqRanked` for arguments with empty arrays [\#5559](https://github.com/ClickHouse/ClickHouse/pull/5559) ([proller](https://github.com/proller)) +- Don’t subscribe to Kafka topics without intent to poll any messages. [\#5698](https://github.com/ClickHouse/ClickHouse/pull/5698) ([Ivan](https://github.com/abyss7)) +- Make setting `join_use_nulls` get no effect for types that cannot be inside Nullable [\#5700](https://github.com/ClickHouse/ClickHouse/pull/5700) ([Olga Khvostikova](https://github.com/stavrolia)) +- Fixed `Incorrect size of index granularity` errors [\#5720](https://github.com/ClickHouse/ClickHouse/pull/5720) ([coraxster](https://github.com/coraxster)) +- Fix Float to Decimal convert overflow [\#5607](https://github.com/ClickHouse/ClickHouse/pull/5607) ([coraxster](https://github.com/coraxster)) +- Flush buffer when `WriteBufferFromHDFS`’s destructor is called. This fixes writing into `HDFS`. [\#5684](https://github.com/ClickHouse/ClickHouse/pull/5684) ([Xindong Peng](https://github.com/eejoin)) + +#### Improvement {#improvement-7} + +- Treat empty cells in `CSV` as default values when the setting `input_format_defaults_for_omitted_fields` is enabled. [\#5625](https://github.com/ClickHouse/ClickHouse/pull/5625) ([akuzm](https://github.com/akuzm)) +- Non-blocking loading of external dictionaries. 
[\#5567](https://github.com/ClickHouse/ClickHouse/pull/5567) ([Vitaly Baranov](https://github.com/vitlibar)) +- Network timeouts can be dynamically changed for already established connections according to the settings. [\#4558](https://github.com/ClickHouse/ClickHouse/pull/4558) ([Konstantin Podshumok](https://github.com/podshumok)) +- Using “public\_suffix\_list” for functions `firstSignificantSubdomain`, `cutToFirstSignificantSubdomain`. It’s using a perfect hash table generated by `gperf` with a list generated from the file: https://publicsuffix.org/list/public\_suffix\_list.dat. (for example, now we recognize the domain `ac.uk` as non-significant). [\#5030](https://github.com/ClickHouse/ClickHouse/pull/5030) ([Guillaume Tassery](https://github.com/YiuRULE)) +- Adopted `IPv6` data type in system tables; unified client info columns in `system.processes` and `system.query_log` [\#5640](https://github.com/ClickHouse/ClickHouse/pull/5640) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Using sessions for connections with MySQL compatibility protocol. \#5476 [\#5646](https://github.com/ClickHouse/ClickHouse/pull/5646) ([Yuriy Baranov](https://github.com/yurriy)) +- Support more `ALTER` queries `ON CLUSTER`. [\#5593](https://github.com/ClickHouse/ClickHouse/pull/5593) [\#5613](https://github.com/ClickHouse/ClickHouse/pull/5613) ([sundyli](https://github.com/sundy-li)) +- Support `` section in `clickhouse-local` config file. [\#5540](https://github.com/ClickHouse/ClickHouse/pull/5540) ([proller](https://github.com/proller)) +- Allow run query with `remote` table function in `clickhouse-local` [\#5627](https://github.com/ClickHouse/ClickHouse/pull/5627) ([proller](https://github.com/proller)) + +#### Performance Improvement {#performance-improvement-5} + +- Add the possibility to write the final mark at the end of MergeTree columns. It allows to avoid useless reads for keys that are out of table data range. It is enabled only if adaptive index granularity is in use. [\#5624](https://github.com/ClickHouse/ClickHouse/pull/5624) ([alesapin](https://github.com/alesapin)) +- Improved performance of MergeTree tables on very slow filesystems by reducing number of `stat` syscalls. [\#5648](https://github.com/ClickHouse/ClickHouse/pull/5648) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed performance degradation in reading from MergeTree tables that was introduced in version 19.6. Fixes \#5631. [\#5633](https://github.com/ClickHouse/ClickHouse/pull/5633) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +#### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-8} + +- Implemented `TestKeeper` as an implementation of ZooKeeper interface used for testing [\#5643](https://github.com/ClickHouse/ClickHouse/pull/5643) ([alexey-milovidov](https://github.com/alexey-milovidov)) ([levushkin aleksej](https://github.com/alexey-milovidov)) +- From now on `.sql` tests can be run isolated by server, in parallel, with random database. It allows to run them faster, add new tests with custom server configurations, and be sure that different tests doesn’t affect each other. 
[\#5554](https://github.com/ClickHouse/ClickHouse/pull/5554) ([Ivan](https://github.com/abyss7)) +- Remove `` and `` from performance tests [\#5672](https://github.com/ClickHouse/ClickHouse/pull/5672) ([Olga Khvostikova](https://github.com/stavrolia)) +- Fixed “select\_format” performance test for `Pretty` formats [\#5642](https://github.com/ClickHouse/ClickHouse/pull/5642) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +## ClickHouse release 19.9 {#clickhouse-release-19-9} + +### ClickHouse release 19.9.3.31, 2019-07-05 {#clickhouse-release-19-9-3-31-2019-07-05} + +#### Bug Fix {#bug-fix-23} + +- Fix segfault in Delta codec which affects columns with values less than 32 bits size. The bug led to random memory corruption. [\#5786](https://github.com/ClickHouse/ClickHouse/pull/5786) ([alesapin](https://github.com/alesapin)) +- Fix rare bug in checking of part with LowCardinality column. [\#5832](https://github.com/ClickHouse/ClickHouse/pull/5832) ([alesapin](https://github.com/alesapin)) +- Fix segfault in TTL merge with non-physical columns in block. [\#5819](https://github.com/ClickHouse/ClickHouse/pull/5819) ([Anton Popov](https://github.com/CurtizJ)) +- Fix potential infinite sleeping of low-priority queries. [\#5842](https://github.com/ClickHouse/ClickHouse/pull/5842) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix how ClickHouse determines default time zone as UCT instead of UTC. [\#5828](https://github.com/ClickHouse/ClickHouse/pull/5828) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix bug about executing distributed DROP/ALTER/TRUNCATE/OPTIMIZE ON CLUSTER queries on follower replica before leader replica. Now they will be executed directly on leader replica. [\#5757](https://github.com/ClickHouse/ClickHouse/pull/5757) ([alesapin](https://github.com/alesapin)) +- Fix race condition, which cause that some queries may not appear in query\_log instantly after SYSTEM FLUSH LOGS query. [\#5685](https://github.com/ClickHouse/ClickHouse/pull/5685) ([Anton Popov](https://github.com/CurtizJ)) +- Added missing support for constant arguments to `evalMLModel` function. [\#5820](https://github.com/ClickHouse/ClickHouse/pull/5820) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +### ClickHouse release 19.9.2.4, 2019-06-24 {#clickhouse-release-19-9-2-4-2019-06-24} + +#### New Feature {#new-feature-8} + +- Print information about frozen parts in `system.parts` table. [\#5471](https://github.com/ClickHouse/ClickHouse/pull/5471) ([proller](https://github.com/proller)) +- Ask client password on clickhouse-client start on tty if not set in arguments [\#5092](https://github.com/ClickHouse/ClickHouse/pull/5092) ([proller](https://github.com/proller)) +- Implement `dictGet` and `dictGetOrDefault` functions for Decimal types. [\#5394](https://github.com/ClickHouse/ClickHouse/pull/5394) ([Artem Zuikov](https://github.com/4ertus2)) + +#### Improvement {#improvement-8} + +- Debian init: Add service stop timeout [\#5522](https://github.com/ClickHouse/ClickHouse/pull/5522) ([proller](https://github.com/proller)) +- Add setting forbidden by default to create table with suspicious types for LowCardinality [\#5448](https://github.com/ClickHouse/ClickHouse/pull/5448) ([Olga Khvostikova](https://github.com/stavrolia)) +- Regression functions return model weights when not used as State in function `evalMLMethod`. [\#5411](https://github.com/ClickHouse/ClickHouse/pull/5411) ([Quid37](https://github.com/Quid37)) +- Rename and improve regression methods. 
[\#5492](https://github.com/ClickHouse/ClickHouse/pull/5492) ([Quid37](https://github.com/Quid37)) +- Clearer interfaces of string searchers. [\#5586](https://github.com/ClickHouse/ClickHouse/pull/5586) ([Danila Kutenin](https://github.com/danlark1)) + +#### Bug Fix {#bug-fix-24} + +- Fix potential data loss in Kafka [\#5445](https://github.com/ClickHouse/ClickHouse/pull/5445) ([Ivan](https://github.com/abyss7)) +- Fix potential infinite loop in `PrettySpace` format when called with zero columns [\#5560](https://github.com/ClickHouse/ClickHouse/pull/5560) ([Olga Khvostikova](https://github.com/stavrolia)) +- Fixed UInt32 overflow bug in linear models. Allow eval ML model for non-const model argument. [\#5516](https://github.com/ClickHouse/ClickHouse/pull/5516) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- `ALTER TABLE ... DROP INDEX IF EXISTS ...` should not raise an exception if provided index does not exist [\#5524](https://github.com/ClickHouse/ClickHouse/pull/5524) ([Gleb Novikov](https://github.com/NanoBjorn)) +- Fix segfault with `bitmapHasAny` in scalar subquery [\#5528](https://github.com/ClickHouse/ClickHouse/pull/5528) ([Zhichang Yu](https://github.com/yuzhichang)) +- Fixed error when replication connection pool doesn’t retry to resolve host, even when DNS cache was dropped. [\#5534](https://github.com/ClickHouse/ClickHouse/pull/5534) ([alesapin](https://github.com/alesapin)) +- Fixed `ALTER ... MODIFY TTL` on ReplicatedMergeTree. [\#5539](https://github.com/ClickHouse/ClickHouse/pull/5539) ([Anton Popov](https://github.com/CurtizJ)) +- Fix INSERT into Distributed table with MATERIALIZED column [\#5429](https://github.com/ClickHouse/ClickHouse/pull/5429) ([Azat Khuzhin](https://github.com/azat)) +- Fix bad alloc when truncate Join storage [\#5437](https://github.com/ClickHouse/ClickHouse/pull/5437) ([TCeason](https://github.com/TCeason)) +- In recent versions of package tzdata some of files are symlinks now. The current mechanism for detecting default timezone gets broken and gives wrong names for some timezones. Now at least we force the timezone name to the contents of TZ if provided. [\#5443](https://github.com/ClickHouse/ClickHouse/pull/5443) ([Ivan](https://github.com/abyss7)) +- Fix some extremely rare cases with MultiVolnitsky searcher when the constant needles in sum are at least 16KB long. The algorithm missed or overwrote the previous results which can lead to the incorrect result of `multiSearchAny`. [\#5588](https://github.com/ClickHouse/ClickHouse/pull/5588) ([Danila Kutenin](https://github.com/danlark1)) +- Fix the issue when settings for ExternalData requests couldn’t use ClickHouse settings. Also, for now, settings `date_time_input_format` and `low_cardinality_allow_in_native_format` cannot be used because of the ambiguity of names (in external data it can be interpreted as table format and in the query it can be a setting). [\#5455](https://github.com/ClickHouse/ClickHouse/pull/5455) ([Danila Kutenin](https://github.com/danlark1)) +- Fix bug when parts were removed only from FS without dropping them from Zookeeper. 
[\#5520](https://github.com/ClickHouse/ClickHouse/pull/5520) ([alesapin](https://github.com/alesapin)) +- Remove debug logging from MySQL protocol [\#5478](https://github.com/ClickHouse/ClickHouse/pull/5478) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Skip ZNONODE during DDL query processing [\#5489](https://github.com/ClickHouse/ClickHouse/pull/5489) ([Azat Khuzhin](https://github.com/azat)) +- Fix mix `UNION ALL` result column type. There were cases with inconsistent data and column types of resulting columns. [\#5503](https://github.com/ClickHouse/ClickHouse/pull/5503) ([Artem Zuikov](https://github.com/4ertus2)) +- Throw an exception on wrong integers in `dictGetT` functions instead of crash. [\#5446](https://github.com/ClickHouse/ClickHouse/pull/5446) ([Artem Zuikov](https://github.com/4ertus2)) +- Fix wrong element\_count and load\_factor for hashed dictionary in `system.dictionaries` table. [\#5440](https://github.com/ClickHouse/ClickHouse/pull/5440) ([Azat Khuzhin](https://github.com/azat)) + +#### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-9} + +- Fixed build without `Brotli` HTTP compression support (`ENABLE_BROTLI=OFF` cmake variable). [\#5521](https://github.com/ClickHouse/ClickHouse/pull/5521) ([Anton Yuzhaninov](https://github.com/citrin)) +- Include roaring.h as roaring/roaring.h [\#5523](https://github.com/ClickHouse/ClickHouse/pull/5523) ([Orivej Desh](https://github.com/orivej)) +- Fix gcc9 warnings in hyperscan (\#line directive is evil!) [\#5546](https://github.com/ClickHouse/ClickHouse/pull/5546) ([Danila Kutenin](https://github.com/danlark1)) +- Fix all warnings when compiling with gcc-9. Fix some contrib issues. Fix gcc9 ICE and submit it to bugzilla. [\#5498](https://github.com/ClickHouse/ClickHouse/pull/5498) ([Danila Kutenin](https://github.com/danlark1)) +- Fixed linking with lld [\#5477](https://github.com/ClickHouse/ClickHouse/pull/5477) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Remove unused specializations in dictionaries [\#5452](https://github.com/ClickHouse/ClickHouse/pull/5452) ([Artem Zuikov](https://github.com/4ertus2)) +- Improvement performance tests for formatting and parsing tables for different types of files [\#5497](https://github.com/ClickHouse/ClickHouse/pull/5497) ([Olga Khvostikova](https://github.com/stavrolia)) +- Fixes for parallel test run [\#5506](https://github.com/ClickHouse/ClickHouse/pull/5506) ([proller](https://github.com/proller)) +- Docker: use configs from clickhouse-test [\#5531](https://github.com/ClickHouse/ClickHouse/pull/5531) ([proller](https://github.com/proller)) +- Fix compile for FreeBSD [\#5447](https://github.com/ClickHouse/ClickHouse/pull/5447) ([proller](https://github.com/proller)) +- Upgrade boost to 1.70 [\#5570](https://github.com/ClickHouse/ClickHouse/pull/5570) ([proller](https://github.com/proller)) +- Fix build clickhouse as submodule [\#5574](https://github.com/ClickHouse/ClickHouse/pull/5574) ([proller](https://github.com/proller)) +- Improve JSONExtract performance tests [\#5444](https://github.com/ClickHouse/ClickHouse/pull/5444) ([Vitaly Baranov](https://github.com/vitlibar)) + +## ClickHouse release 19.8 {#clickhouse-release-19-8} + +### ClickHouse release 19.8.3.8, 2019-06-11 {#clickhouse-release-19-8-3-8-2019-06-11} + +#### New Features {#new-features} + +- Added functions to work with JSON [\#4686](https://github.com/ClickHouse/ClickHouse/pull/4686) ([hcz](https://github.com/hczhcz)) 
[\#5124](https://github.com/ClickHouse/ClickHouse/pull/5124). ([Vitaly Baranov](https://github.com/vitlibar)) +- Add a function basename, with a similar behaviour to a basename function, which exists in a lot of languages (`os.path.basename` in python, `basename` in PHP, etc…). Work with both an UNIX-like path or a Windows path. [\#5136](https://github.com/ClickHouse/ClickHouse/pull/5136) ([Guillaume Tassery](https://github.com/YiuRULE)) +- Added `LIMIT n, m BY` or `LIMIT m OFFSET n BY` syntax to set offset of n for LIMIT BY clause. [\#5138](https://github.com/ClickHouse/ClickHouse/pull/5138) ([Anton Popov](https://github.com/CurtizJ)) +- Added new data type `SimpleAggregateFunction`, which allows to have columns with light aggregation in an `AggregatingMergeTree`. This can only be used with simple functions like `any`, `anyLast`, `sum`, `min`, `max`. [\#4629](https://github.com/ClickHouse/ClickHouse/pull/4629) ([Boris Granveaud](https://github.com/bgranvea)) +- Added support for non-constant arguments in function `ngramDistance` [\#5198](https://github.com/ClickHouse/ClickHouse/pull/5198) ([Danila Kutenin](https://github.com/danlark1)) +- Added functions `skewPop`, `skewSamp`, `kurtPop` and `kurtSamp` to compute for sequence skewness, sample skewness, kurtosis and sample kurtosis respectively. [\#5200](https://github.com/ClickHouse/ClickHouse/pull/5200) ([hcz](https://github.com/hczhcz)) +- Support rename operation for `MaterializeView` storage. [\#5209](https://github.com/ClickHouse/ClickHouse/pull/5209) ([Guillaume Tassery](https://github.com/YiuRULE)) +- Added server which allows connecting to ClickHouse using MySQL client. [\#4715](https://github.com/ClickHouse/ClickHouse/pull/4715) ([Yuriy Baranov](https://github.com/yurriy)) +- Add `toDecimal*OrZero` and `toDecimal*OrNull` functions. [\#5291](https://github.com/ClickHouse/ClickHouse/pull/5291) ([Artem Zuikov](https://github.com/4ertus2)) +- Support Decimal types in functions: `quantile`, `quantiles`, `median`, `quantileExactWeighted`, `quantilesExactWeighted`, medianExactWeighted. [\#5304](https://github.com/ClickHouse/ClickHouse/pull/5304) ([Artem Zuikov](https://github.com/4ertus2)) +- Added `toValidUTF8` function, which replaces all invalid UTF-8 characters by replacement character � (U+FFFD). [\#5322](https://github.com/ClickHouse/ClickHouse/pull/5322) ([Danila Kutenin](https://github.com/danlark1)) +- Added `format` function. Formatting constant pattern (simplified Python format pattern) with the strings listed in the arguments. [\#5330](https://github.com/ClickHouse/ClickHouse/pull/5330) ([Danila Kutenin](https://github.com/danlark1)) +- Added `system.detached_parts` table containing information about detached parts of `MergeTree` tables. [\#5353](https://github.com/ClickHouse/ClickHouse/pull/5353) ([akuzm](https://github.com/akuzm)) +- Added `ngramSearch` function to calculate the non-symmetric difference between needle and haystack. [\#5418](https://github.com/ClickHouse/ClickHouse/pull/5418)[\#5422](https://github.com/ClickHouse/ClickHouse/pull/5422) ([Danila Kutenin](https://github.com/danlark1)) +- Implementation of basic machine learning methods (stochastic linear regression and logistic regression) using aggregate functions interface. Has different strategies for updating model weights (simple gradient descent, momentum method, Nesterov method). Also supports mini-batches of custom size. 
[\#4943](https://github.com/ClickHouse/ClickHouse/pull/4943) ([Quid37](https://github.com/Quid37)) +- Implementation of `geohashEncode` and `geohashDecode` functions. [\#5003](https://github.com/ClickHouse/ClickHouse/pull/5003) ([Vasily Nemkov](https://github.com/Enmk)) +- Added aggregate function `timeSeriesGroupSum`, which can aggregate different time series whose sample timestamps are not aligned. It uses linear interpolation between two sample timestamps and then sums the time series together. Added aggregate function `timeSeriesGroupRateSum`, which calculates the rate of the time series and then sums the rates together. [\#4542](https://github.com/ClickHouse/ClickHouse/pull/4542) ([Yangkuan Liu](https://github.com/LiuYangkuan)) +- Added functions `IPv4CIDRtoIPv4Range` and `IPv6CIDRtoIPv6Range` to calculate the lower and higher bounds for an IP in the subnet using a CIDR. [\#5095](https://github.com/ClickHouse/ClickHouse/pull/5095) ([Guillaume Tassery](https://github.com/YiuRULE)) +- Add an X-ClickHouse-Summary header when a query is sent over HTTP with the setting `send_progress_in_http_headers` enabled. It returns the usual information of X-ClickHouse-Progress, with additional information like how many rows and bytes were inserted by the query. [\#5116](https://github.com/ClickHouse/ClickHouse/pull/5116) ([Guillaume Tassery](https://github.com/YiuRULE)) + +#### Improvements {#improvements} + +- Added `max_parts_in_total` setting for the MergeTree family of tables (default: 100 000) that prevents unsafe specification of the partition key \#5166. [\#5171](https://github.com/ClickHouse/ClickHouse/pull/5171) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- `clickhouse-obfuscator`: derive the seed for individual columns by combining the initial seed with the column name, not the column position. This is intended to transform datasets with multiple related tables, so that tables will remain JOINable after transformation. [\#5178](https://github.com/ClickHouse/ClickHouse/pull/5178) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added functions `JSONExtractRaw`, `JSONExtractKeyAndValues`. Renamed function `jsonExtract` to `JSONExtract`. When something goes wrong these functions return the corresponding values, not `NULL`. Modified function `JSONExtract`: now it gets the return type from its last parameter and doesn’t inject nullables (see the example below). Implemented fallback to RapidJSON in case AVX2 instructions are not available. The simdjson library was updated to a new version. [\#5235](https://github.com/ClickHouse/ClickHouse/pull/5235) ([Vitaly Baranov](https://github.com/vitlibar)) +- Now `if` and `multiIf` functions don’t rely on the condition’s `Nullable`, but rely on the branches for SQL compatibility. [\#5238](https://github.com/ClickHouse/ClickHouse/pull/5238) ([Jian Wu](https://github.com/janplus)) +- The `In` predicate now generates a `Null` result from `Null` input like the `Equal` function. [\#5152](https://github.com/ClickHouse/ClickHouse/pull/5152) ([Jian Wu](https://github.com/janplus)) +- Check the time limit every (flush\_interval / poll\_timeout) number of rows from Kafka. This allows breaking the reading from the Kafka consumer more frequently and checking the time limits for the top-level streams [\#5249](https://github.com/ClickHouse/ClickHouse/pull/5249) ([Ivan](https://github.com/abyss7)) +- Link rdkafka with bundled SASL. It should allow using SASL SCRAM authentication [\#5253](https://github.com/ClickHouse/ClickHouse/pull/5253) ([Ivan](https://github.com/abyss7)) +- Batched version of RowRefList for ALL JOINS. [\#5267](https://github.com/ClickHouse/ClickHouse/pull/5267) ([Artem Zuikov](https://github.com/4ertus2))
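To illustrate the `JSONExtract` behavior described above (the return type is taken from the last argument), here is a small, hypothetical example:

```sql
SELECT
    JSONExtract('{"a": 1, "b": "x"}', 'a', 'UInt8') AS a,    -- the last argument names the return type
    JSONExtractRaw('{"a": {"b": 2}}', 'a')          AS raw;  -- returns the unparsed fragment '{"b":2}'
```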
+- clickhouse-server: more informative listen error messages. [\#5268](https://github.com/ClickHouse/ClickHouse/pull/5268) ([proller](https://github.com/proller)) +- Support dictionaries in clickhouse-copier for functions in `` [\#5270](https://github.com/ClickHouse/ClickHouse/pull/5270) ([proller](https://github.com/proller)) +- Add new setting `kafka_commit_every_batch` to regulate the Kafka committing policy. + It allows setting the commit mode: after every batch of messages is handled, or after the whole block is written to the storage. It’s a trade-off between losing some messages or reading them twice in some extreme situations. [\#5308](https://github.com/ClickHouse/ClickHouse/pull/5308) ([Ivan](https://github.com/abyss7)) +- Make `windowFunnel` support other unsigned integer types. [\#5320](https://github.com/ClickHouse/ClickHouse/pull/5320) ([sundyli](https://github.com/sundy-li)) +- Allow shadowing the virtual column `_table` in the Merge engine. [\#5325](https://github.com/ClickHouse/ClickHouse/pull/5325) ([Ivan](https://github.com/abyss7)) +- Make `sequenceMatch` aggregate functions support other unsigned integer types [\#5339](https://github.com/ClickHouse/ClickHouse/pull/5339) ([sundyli](https://github.com/sundy-li)) +- Better error messages if a checksum mismatch is most likely caused by hardware failures. [\#5355](https://github.com/ClickHouse/ClickHouse/pull/5355) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Check that underlying tables support sampling for `StorageMerge` [\#5366](https://github.com/ClickHouse/ClickHouse/pull/5366) ([Ivan](https://github.com/abyss7)) +- Close MySQL connections after their usage in external dictionaries. It is related to issue \#893. [\#5395](https://github.com/ClickHouse/ClickHouse/pull/5395) ([Clément Rodriguez](https://github.com/clemrodriguez)) +- Improvements to the MySQL wire protocol. Changed the name of the format to MySQLWire. Use RAII for calling RSA\_free. Disable SSL if the context cannot be created. [\#5419](https://github.com/ClickHouse/ClickHouse/pull/5419) ([Yuriy Baranov](https://github.com/yurriy)) +- clickhouse-client: allow running with an inaccessible history file (read-only, no disk space, file is a directory, …). [\#5431](https://github.com/ClickHouse/ClickHouse/pull/5431) ([proller](https://github.com/proller)) +- Respect query settings in asynchronous INSERTs into Distributed tables. [\#4936](https://github.com/ClickHouse/ClickHouse/pull/4936) ([TCeason](https://github.com/TCeason)) +- Renamed functions `leastSqr` to `simpleLinearRegression`, `LinearRegression` to `linearRegression`, `LogisticRegression` to `logisticRegression` (see the example below). [\#5391](https://github.com/ClickHouse/ClickHouse/pull/5391) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) + +#### Performance Improvements {#performance-improvements} + +- Parallelize processing of parts of non-replicated MergeTree tables in the ALTER MODIFY query. [\#4639](https://github.com/ClickHouse/ClickHouse/pull/4639) ([Ivan Kush](https://github.com/IvanKush)) +- Optimizations in regular expressions extraction. [\#5193](https://github.com/ClickHouse/ClickHouse/pull/5193) [\#5191](https://github.com/ClickHouse/ClickHouse/pull/5191) ([Danila Kutenin](https://github.com/danlark1)) +- Do not add the right join key column to the join result if it is used only in the join ON section. [\#5260](https://github.com/ClickHouse/ClickHouse/pull/5260) ([Artem Zuikov](https://github.com/4ertus2))
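As a small, hypothetical example of the renamed regression function `simpleLinearRegression` mentioned above (it returns the `(k, b)` tuple of the fitted line y = k * x + b):

```sql
-- Over the generated points y = 2 * x + 1, the result is (2, 1).
SELECT simpleLinearRegression(x, y)
FROM (SELECT number AS x, 2 * number + 1 AS y FROM numbers(10));
```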
+- Freeze the Kafka buffer after the first empty response. It avoids multiple invocations of `ReadBuffer::next()` for an empty result in some row-parsing streams. [\#5283](https://github.com/ClickHouse/ClickHouse/pull/5283) ([Ivan](https://github.com/abyss7)) +- `concat` function optimization for multiple arguments. [\#5357](https://github.com/ClickHouse/ClickHouse/pull/5357) ([Danila Kutenin](https://github.com/danlark1)) +- Query optimization. Allow pushing down an IN statement while rewriting a comma/cross join into an inner one. [\#5396](https://github.com/ClickHouse/ClickHouse/pull/5396) ([Artem Zuikov](https://github.com/4ertus2)) +- Upgrade our LZ4 implementation with the reference one to have faster decompression. [\#5070](https://github.com/ClickHouse/ClickHouse/pull/5070) ([Danila Kutenin](https://github.com/danlark1)) +- Implemented MSD radix sort (based on kxsort), and partial sorting. [\#5129](https://github.com/ClickHouse/ClickHouse/pull/5129) ([Evgenii Pravda](https://github.com/kvinty)) + +#### Bug Fixes {#bug-fixes} + +- Fix push require columns with join [\#5192](https://github.com/ClickHouse/ClickHouse/pull/5192) ([Winter Zhang](https://github.com/zhang2014)) +- Fixed a bug where, when ClickHouse is run by systemd, the command `sudo service clickhouse-server forcerestart` was not working as expected. [\#5204](https://github.com/ClickHouse/ClickHouse/pull/5204) ([proller](https://github.com/proller)) +- Fix HTTP error codes in DataPartsExchange (the interserver HTTP server on port 9009 always returned code 200, even on errors). [\#5216](https://github.com/ClickHouse/ClickHouse/pull/5216) ([proller](https://github.com/proller)) +- Fix SimpleAggregateFunction for Strings longer than MAX\_SMALL\_STRING\_SIZE [\#5311](https://github.com/ClickHouse/ClickHouse/pull/5311) ([Azat Khuzhin](https://github.com/azat)) +- Fix error for `Decimal` to `Nullable(Decimal)` conversion in IN. Support other Decimal to Decimal conversions (including different scales). [\#5350](https://github.com/ClickHouse/ClickHouse/pull/5350) ([Artem Zuikov](https://github.com/4ertus2)) +- Fixed FPU clobbering in the simdjson library that led to wrong calculation of the `uniqHLL` and `uniqCombined` aggregate functions and math functions such as `log`. [\#5354](https://github.com/ClickHouse/ClickHouse/pull/5354) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed handling of mixed const/nonconst cases in JSON functions. [\#5435](https://github.com/ClickHouse/ClickHouse/pull/5435) ([Vitaly Baranov](https://github.com/vitlibar)) +- Fix `retention` function. Now all conditions that are satisfied in a row of data are added to the data state. [\#5119](https://github.com/ClickHouse/ClickHouse/pull/5119) ([小路](https://github.com/nicelulu)) +- Fix result type for `quantileExact` with Decimals. [\#5304](https://github.com/ClickHouse/ClickHouse/pull/5304) ([Artem Zuikov](https://github.com/4ertus2)) + +#### Documentation {#documentation} + +- Translate documentation for `CollapsingMergeTree` to Chinese. [\#5168](https://github.com/ClickHouse/ClickHouse/pull/5168) ([张风啸](https://github.com/AlexZFX)) +- Translate some documentation about table engines to Chinese.
+ [\#5134](https://github.com/ClickHouse/ClickHouse/pull/5134) + [\#5328](https://github.com/ClickHouse/ClickHouse/pull/5328) ([never lee](https://github.com/neverlee)) +#### Build/Testing/Packaging Improvements {#buildtestingpackaging-improvements} + +- Fix some sanitizer reports that show probable use-after-free.[\#5139](https://github.com/ClickHouse/ClickHouse/pull/5139) [\#5143](https://github.com/ClickHouse/ClickHouse/pull/5143) [\#5393](https://github.com/ClickHouse/ClickHouse/pull/5393) ([Ivan](https://github.com/abyss7)) +- Move performance tests out of separate directories for convenience. [\#5158](https://github.com/ClickHouse/ClickHouse/pull/5158) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix incorrect performance tests. [\#5255](https://github.com/ClickHouse/ClickHouse/pull/5255) ([alesapin](https://github.com/alesapin)) +- Added a tool to calculate checksums caused by bit flips to debug hardware issues. [\#5334](https://github.com/ClickHouse/ClickHouse/pull/5334) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Make runner script more usable. [\#5340](https://github.com/ClickHouse/ClickHouse/pull/5340)[\#5360](https://github.com/ClickHouse/ClickHouse/pull/5360) ([filimonov](https://github.com/filimonov)) +- Add small instruction how to write performance tests. [\#5408](https://github.com/ClickHouse/ClickHouse/pull/5408) ([alesapin](https://github.com/alesapin)) +- Add ability to make substitutions in create, fill and drop query in performance tests [\#5367](https://github.com/ClickHouse/ClickHouse/pull/5367) ([Olga Khvostikova](https://github.com/stavrolia)) + +## ClickHouse release 19.7 {#clickhouse-release-19-7} + +### ClickHouse release 19.7.5.29, 2019-07-05 {#clickhouse-release-19-7-5-29-2019-07-05} + +#### Bug Fix {#bug-fix-25} + +- Fix performance regression in some queries with JOIN. [\#5192](https://github.com/ClickHouse/ClickHouse/pull/5192) ([Winter Zhang](https://github.com/zhang2014)) + +### ClickHouse release 19.7.5.27, 2019-06-09 {#clickhouse-release-19-7-5-27-2019-06-09} + +#### New features {#new-features-1} + +- Added bitmap related functions `bitmapHasAny` and `bitmapHasAll` analogous to `hasAny` and `hasAll` functions for arrays. [\#5279](https://github.com/ClickHouse/ClickHouse/pull/5279) ([Sergi Vladykin](https://github.com/svladykin)) + +#### Bug Fixes {#bug-fixes-1} + +- Fix segfault on `minmax` INDEX with Null value. [\#5246](https://github.com/ClickHouse/ClickHouse/pull/5246) ([Nikita Vasilev](https://github.com/nikvas0)) +- Mark all input columns in LIMIT BY as required output. It fixes ‘Not found column’ error in some distributed queries. [\#5407](https://github.com/ClickHouse/ClickHouse/pull/5407) ([Constantin S. Pan](https://github.com/kvap)) +- Fix “Column ‘0’ already exists” error in `SELECT .. PREWHERE` on column with DEFAULT [\#5397](https://github.com/ClickHouse/ClickHouse/pull/5397) ([proller](https://github.com/proller)) +- Fix `ALTER MODIFY TTL` query on `ReplicatedMergeTree`. [\#5539](https://github.com/ClickHouse/ClickHouse/pull/5539/commits) ([Anton Popov](https://github.com/CurtizJ)) +- Don’t crash the server when Kafka consumers have failed to start. [\#5285](https://github.com/ClickHouse/ClickHouse/pull/5285) ([Ivan](https://github.com/abyss7)) +- Fixed bitmap functions produce wrong result. 
[\#5359](https://github.com/ClickHouse/ClickHouse/pull/5359) ([Andy Yang](https://github.com/andyyzh)) +- Fix element\_count for hashed dictionary (do not include duplicates) [\#5440](https://github.com/ClickHouse/ClickHouse/pull/5440) ([Azat Khuzhin](https://github.com/azat)) +- Use contents of environment variable TZ as the name for timezone. It helps to correctly detect default timezone in some cases.[\#5443](https://github.com/ClickHouse/ClickHouse/pull/5443) ([Ivan](https://github.com/abyss7)) +- Do not try to convert integers in `dictGetT` functions, because it doesn’t work correctly. Throw an exception instead. [\#5446](https://github.com/ClickHouse/ClickHouse/pull/5446) ([Artem Zuikov](https://github.com/4ertus2)) +- Fix settings in ExternalData HTTP request. [\#5455](https://github.com/ClickHouse/ClickHouse/pull/5455) ([Danila + Kutenin](https://github.com/danlark1)) +- Fix bug when parts were removed only from FS without dropping them from Zookeeper. [\#5520](https://github.com/ClickHouse/ClickHouse/pull/5520) ([alesapin](https://github.com/alesapin)) +- Fix segmentation fault in `bitmapHasAny` function. [\#5528](https://github.com/ClickHouse/ClickHouse/pull/5528) ([Zhichang Yu](https://github.com/yuzhichang)) +- Fixed error when replication connection pool doesn’t retry to resolve host, even when DNS cache was dropped. [\#5534](https://github.com/ClickHouse/ClickHouse/pull/5534) ([alesapin](https://github.com/alesapin)) +- Fixed `DROP INDEX IF EXISTS` query. Now `ALTER TABLE ... DROP INDEX IF EXISTS ...` query doesn’t raise an exception if provided index does not exist. [\#5524](https://github.com/ClickHouse/ClickHouse/pull/5524) ([Gleb Novikov](https://github.com/NanoBjorn)) +- Fix union all supertype column. There were cases with inconsistent data and column types of resulting columns. [\#5503](https://github.com/ClickHouse/ClickHouse/pull/5503) ([Artem Zuikov](https://github.com/4ertus2)) +- Skip ZNONODE during DDL query processing. Before if another node removes the znode in task queue, the one that + did not process it, but already get list of children, will terminate the DDLWorker thread. [\#5489](https://github.com/ClickHouse/ClickHouse/pull/5489) ([Azat Khuzhin](https://github.com/azat)) +- Fix INSERT into Distributed() table with MATERIALIZED column. [\#5429](https://github.com/ClickHouse/ClickHouse/pull/5429) ([Azat Khuzhin](https://github.com/azat)) + +### ClickHouse release 19.7.3.9, 2019-05-30 {#clickhouse-release-19-7-3-9-2019-05-30} + +#### New Features {#new-features-2} + +- Allow to limit the range of a setting that can be specified by user. + These constraints can be set up in user settings profile. + [\#4931](https://github.com/ClickHouse/ClickHouse/pull/4931) ([Vitaly + Baranov](https://github.com/vitlibar)) +- Add a second version of the function `groupUniqArray` with an optional + `max_size` parameter that limits the size of the resulting array. This + behavior is similar to `groupArray(max_size)(x)` function. + [\#5026](https://github.com/ClickHouse/ClickHouse/pull/5026) ([Guillaume + Tassery](https://github.com/YiuRULE)) +- For TSVWithNames/CSVWithNames input file formats, column order can now be + determined from file header. This is controlled by + `input_format_with_names_use_header` parameter. 
+ [\#5081](https://github.com/ClickHouse/ClickHouse/pull/5081) + ([Alexander](https://github.com/Akazz)) + +#### Bug Fixes {#bug-fixes-2} + +- Crash with uncompressed\_cache + JOIN during merge (\#5197) + [\#5133](https://github.com/ClickHouse/ClickHouse/pull/5133) ([Danila + Kutenin](https://github.com/danlark1)) +- Segmentation fault on a clickhouse-client query to system tables. \#5066 + [\#5127](https://github.com/ClickHouse/ClickHouse/pull/5127) + ([Ivan](https://github.com/abyss7)) +- Data loss on heavy load via KafkaEngine (\#4736) + [\#5080](https://github.com/ClickHouse/ClickHouse/pull/5080) + ([Ivan](https://github.com/abyss7)) +- Fixed very rare data race condition that could happen when executing a query with UNION ALL involving at least two SELECTs from system.columns, system.tables, system.parts, system.parts\_tables or tables of Merge family and performing ALTER of columns of the related tables concurrently. [\#5189](https://github.com/ClickHouse/ClickHouse/pull/5189) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +#### Performance Improvements {#performance-improvements-1} + +- Use radix sort for sorting by single numeric column in `ORDER BY` without + `LIMIT`. [\#5106](https://github.com/ClickHouse/ClickHouse/pull/5106), + [\#4439](https://github.com/ClickHouse/ClickHouse/pull/4439) + ([Evgenii Pravda](https://github.com/kvinty), + [alexey-milovidov](https://github.com/alexey-milovidov)) + +#### Documentation {#documentation-1} + +- Translate documentation for some table engines to Chinese. + [\#5107](https://github.com/ClickHouse/ClickHouse/pull/5107), + [\#5094](https://github.com/ClickHouse/ClickHouse/pull/5094), + [\#5087](https://github.com/ClickHouse/ClickHouse/pull/5087) + ([张风啸](https://github.com/AlexZFX)), + [\#5068](https://github.com/ClickHouse/ClickHouse/pull/5068) ([never + lee](https://github.com/neverlee)) -#### Build/Testing/Packaging Improvements -* Fix some sanitizer reports that show probable use-after-free.[#5139](https://github.com/ClickHouse/ClickHouse/pull/5139) [#5143](https://github.com/ClickHouse/ClickHouse/pull/5143) [#5393](https://github.com/ClickHouse/ClickHouse/pull/5393) ([Ivan](https://github.com/abyss7)) -* Move performance tests out of separate directories for convenience. [#5158](https://github.com/ClickHouse/ClickHouse/pull/5158) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix incorrect performance tests. [#5255](https://github.com/ClickHouse/ClickHouse/pull/5255) ([alesapin](https://github.com/alesapin)) -* Added a tool to calculate checksums caused by bit flips to debug hardware issues. [#5334](https://github.com/ClickHouse/ClickHouse/pull/5334) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Make runner script more usable. [#5340](https://github.com/ClickHouse/ClickHouse/pull/5340)[#5360](https://github.com/ClickHouse/ClickHouse/pull/5360) ([filimonov](https://github.com/filimonov)) -* Add small instruction how to write performance tests. [#5408](https://github.com/ClickHouse/ClickHouse/pull/5408) ([alesapin](https://github.com/alesapin)) -* Add ability to make substitutions in create, fill and drop query in performance tests [#5367](https://github.com/ClickHouse/ClickHouse/pull/5367) ([Olga Khvostikova](https://github.com/stavrolia)) - -## ClickHouse release 19.7 - -### ClickHouse release 19.7.5.29, 2019-07-05 - -#### Bug Fix -* Fix performance regression in some queries with JOIN. 
[#5192](https://github.com/ClickHouse/ClickHouse/pull/5192) ([Winter Zhang](https://github.com/zhang2014)) - -### ClickHouse release 19.7.5.27, 2019-06-09 - -#### New features -* Added bitmap related functions `bitmapHasAny` and `bitmapHasAll` analogous to `hasAny` and `hasAll` functions for arrays. [#5279](https://github.com/ClickHouse/ClickHouse/pull/5279) ([Sergi Vladykin](https://github.com/svladykin)) - -#### Bug Fixes -* Fix segfault on `minmax` INDEX with Null value. [#5246](https://github.com/ClickHouse/ClickHouse/pull/5246) ([Nikita Vasilev](https://github.com/nikvas0)) -* Mark all input columns in LIMIT BY as required output. It fixes 'Not found column' error in some distributed queries. [#5407](https://github.com/ClickHouse/ClickHouse/pull/5407) ([Constantin S. Pan](https://github.com/kvap)) -* Fix "Column '0' already exists" error in `SELECT .. PREWHERE` on column with DEFAULT [#5397](https://github.com/ClickHouse/ClickHouse/pull/5397) ([proller](https://github.com/proller)) -* Fix `ALTER MODIFY TTL` query on `ReplicatedMergeTree`. [#5539](https://github.com/ClickHouse/ClickHouse/pull/5539/commits) ([Anton Popov](https://github.com/CurtizJ)) -* Don't crash the server when Kafka consumers have failed to start. [#5285](https://github.com/ClickHouse/ClickHouse/pull/5285) ([Ivan](https://github.com/abyss7)) -* Fixed bitmap functions produce wrong result. [#5359](https://github.com/ClickHouse/ClickHouse/pull/5359) ([Andy Yang](https://github.com/andyyzh)) -* Fix element_count for hashed dictionary (do not include duplicates) [#5440](https://github.com/ClickHouse/ClickHouse/pull/5440) ([Azat Khuzhin](https://github.com/azat)) -* Use contents of environment variable TZ as the name for timezone. It helps to correctly detect default timezone in some cases.[#5443](https://github.com/ClickHouse/ClickHouse/pull/5443) ([Ivan](https://github.com/abyss7)) -* Do not try to convert integers in `dictGetT` functions, because it doesn't work correctly. Throw an exception instead. [#5446](https://github.com/ClickHouse/ClickHouse/pull/5446) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix settings in ExternalData HTTP request. [#5455](https://github.com/ClickHouse/ClickHouse/pull/5455) ([Danila - Kutenin](https://github.com/danlark1)) -* Fix bug when parts were removed only from FS without dropping them from Zookeeper. [#5520](https://github.com/ClickHouse/ClickHouse/pull/5520) ([alesapin](https://github.com/alesapin)) -* Fix segmentation fault in `bitmapHasAny` function. [#5528](https://github.com/ClickHouse/ClickHouse/pull/5528) ([Zhichang Yu](https://github.com/yuzhichang)) -* Fixed error when replication connection pool doesn't retry to resolve host, even when DNS cache was dropped. [#5534](https://github.com/ClickHouse/ClickHouse/pull/5534) ([alesapin](https://github.com/alesapin)) -* Fixed `DROP INDEX IF EXISTS` query. Now `ALTER TABLE ... DROP INDEX IF EXISTS ...` query doesn't raise an exception if provided index does not exist. [#5524](https://github.com/ClickHouse/ClickHouse/pull/5524) ([Gleb Novikov](https://github.com/NanoBjorn)) -* Fix union all supertype column. There were cases with inconsistent data and column types of resulting columns. [#5503](https://github.com/ClickHouse/ClickHouse/pull/5503) ([Artem Zuikov](https://github.com/4ertus2)) -* Skip ZNONODE during DDL query processing. Before if another node removes the znode in task queue, the one that -did not process it, but already get list of children, will terminate the DDLWorker thread. 
[#5489](https://github.com/ClickHouse/ClickHouse/pull/5489) ([Azat Khuzhin](https://github.com/azat)) -* Fix INSERT into Distributed() table with MATERIALIZED column. [#5429](https://github.com/ClickHouse/ClickHouse/pull/5429) ([Azat Khuzhin](https://github.com/azat)) - -### ClickHouse release 19.7.3.9, 2019-05-30 - -#### New Features -* Allow to limit the range of a setting that can be specified by user. - These constraints can be set up in user settings profile. -[#4931](https://github.com/ClickHouse/ClickHouse/pull/4931) ([Vitaly -Baranov](https://github.com/vitlibar)) -* Add a second version of the function `groupUniqArray` with an optional - `max_size` parameter that limits the size of the resulting array. This -behavior is similar to `groupArray(max_size)(x)` function. -[#5026](https://github.com/ClickHouse/ClickHouse/pull/5026) ([Guillaume -Tassery](https://github.com/YiuRULE)) -* For TSVWithNames/CSVWithNames input file formats, column order can now be - determined from file header. This is controlled by -`input_format_with_names_use_header` parameter. -[#5081](https://github.com/ClickHouse/ClickHouse/pull/5081) -([Alexander](https://github.com/Akazz)) - -#### Bug Fixes -* Crash with uncompressed_cache + JOIN during merge (#5197) -[#5133](https://github.com/ClickHouse/ClickHouse/pull/5133) ([Danila -Kutenin](https://github.com/danlark1)) -* Segmentation fault on a clickhouse-client query to system tables. #5066 -[#5127](https://github.com/ClickHouse/ClickHouse/pull/5127) -([Ivan](https://github.com/abyss7)) -* Data loss on heavy load via KafkaEngine (#4736) -[#5080](https://github.com/ClickHouse/ClickHouse/pull/5080) -([Ivan](https://github.com/abyss7)) -* Fixed very rare data race condition that could happen when executing a query with UNION ALL involving at least two SELECTs from system.columns, system.tables, system.parts, system.parts_tables or tables of Merge family and performing ALTER of columns of the related tables concurrently. [#5189](https://github.com/ClickHouse/ClickHouse/pull/5189) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -#### Performance Improvements -* Use radix sort for sorting by single numeric column in `ORDER BY` without - `LIMIT`. [#5106](https://github.com/ClickHouse/ClickHouse/pull/5106), -[#4439](https://github.com/ClickHouse/ClickHouse/pull/4439) -([Evgenii Pravda](https://github.com/kvinty), -[alexey-milovidov](https://github.com/alexey-milovidov)) - -#### Documentation -* Translate documentation for some table engines to Chinese. - [#5107](https://github.com/ClickHouse/ClickHouse/pull/5107), -[#5094](https://github.com/ClickHouse/ClickHouse/pull/5094), -[#5087](https://github.com/ClickHouse/ClickHouse/pull/5087) -([张风啸](https://github.com/AlexZFX)), -[#5068](https://github.com/ClickHouse/ClickHouse/pull/5068) ([never -lee](https://github.com/neverlee)) - -#### Build/Testing/Packaging Improvements -* Print UTF-8 characters properly in `clickhouse-test`. - [#5084](https://github.com/ClickHouse/ClickHouse/pull/5084) -([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add command line parameter for clickhouse-client to always load suggestion - data. [#5102](https://github.com/ClickHouse/ClickHouse/pull/5102) -([alexey-milovidov](https://github.com/alexey-milovidov)) -* Resolve some of PVS-Studio warnings. 
- [#5082](https://github.com/ClickHouse/ClickHouse/pull/5082) -([alexey-milovidov](https://github.com/alexey-milovidov)) -* Update LZ4 [#5040](https://github.com/ClickHouse/ClickHouse/pull/5040) ([Danila - Kutenin](https://github.com/danlark1)) -* Add gperf to build requirements for upcoming pull request #5030. - [#5110](https://github.com/ClickHouse/ClickHouse/pull/5110) -([proller](https://github.com/proller)) - -## ClickHouse release 19.6 -### ClickHouse release 19.6.3.18, 2019-06-13 - -#### Bug Fixes -* Fixed IN condition pushdown for queries from table functions `mysql` and `odbc` and corresponding table engines. This fixes #3540 and #2384. [#5313](https://github.com/ClickHouse/ClickHouse/pull/5313) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix deadlock in Zookeeper. [#5297](https://github.com/ClickHouse/ClickHouse/pull/5297) ([github1youlc](https://github.com/github1youlc)) -* Allow quoted decimals in CSV. [#5284](https://github.com/ClickHouse/ClickHouse/pull/5284) ([Artem Zuikov](https://github.com/4ertus2) -* Disallow conversion from float Inf/NaN into Decimals (throw exception). [#5282](https://github.com/ClickHouse/ClickHouse/pull/5282) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix data race in rename query. [#5247](https://github.com/ClickHouse/ClickHouse/pull/5247) ([Winter Zhang](https://github.com/zhang2014)) -* Temporarily disable LFAlloc. Usage of LFAlloc might lead to a lot of MAP_FAILED in allocating UncompressedCache and in a result to crashes of queries at high loaded servers. [cfdba93](https://github.com/ClickHouse/ClickHouse/commit/cfdba938ce22f16efeec504f7f90206a515b1280)([Danila Kutenin](https://github.com/danlark1)) - -### ClickHouse release 19.6.2.11, 2019-05-13 - -#### New Features -* TTL expressions for columns and tables. [#4212](https://github.com/ClickHouse/ClickHouse/pull/4212) ([Anton Popov](https://github.com/CurtizJ)) -* Added support for `brotli` compression for HTTP responses (Accept-Encoding: br) [#4388](https://github.com/ClickHouse/ClickHouse/pull/4388) ([Mikhail](https://github.com/fandyushin)) -* Added new function `isValidUTF8` for checking whether a set of bytes is correctly utf-8 encoded. [#4934](https://github.com/ClickHouse/ClickHouse/pull/4934) ([Danila Kutenin](https://github.com/danlark1)) -* Add new load balancing policy `first_or_random` which sends queries to the first specified host and if it's inaccessible send queries to random hosts of shard. Useful for cross-replication topology setups. [#5012](https://github.com/ClickHouse/ClickHouse/pull/5012) ([nvartolomei](https://github.com/nvartolomei)) - -#### Experimental Features -* Add setting `index_granularity_bytes` (adaptive index granularity) for MergeTree* tables family. [#4826](https://github.com/ClickHouse/ClickHouse/pull/4826) ([alesapin](https://github.com/alesapin)) - -#### Improvements -* Added support for non-constant and negative size and length arguments for function `substringUTF8`. [#4989](https://github.com/ClickHouse/ClickHouse/pull/4989) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Disable push-down to right table in left join, left table in right join, and both tables in full join. This fixes wrong JOIN results in some cases. 
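For context on the TTL expressions listed under the 19.6.2.11 new features above, a minimal hedged sketch; the table and column names are made up for illustration:

```sql
CREATE TABLE ttl_example
(
    d Date,
    key UInt64,
    value String TTL d + INTERVAL 1 MONTH   -- column-level TTL: `value` is cleared a month after `d`
)
ENGINE = MergeTree
ORDER BY key
TTL d + INTERVAL 1 YEAR;                     -- table-level TTL: whole rows expire a year after `d`
```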
[#4846](https://github.com/ClickHouse/ClickHouse/pull/4846) ([Ivan](https://github.com/abyss7)) -* `clickhouse-copier`: auto upload task configuration from `--task-file` option [#4876](https://github.com/ClickHouse/ClickHouse/pull/4876) ([proller](https://github.com/proller)) -* Added typos handler for storage factory and table functions factory. [#4891](https://github.com/ClickHouse/ClickHouse/pull/4891) ([Danila Kutenin](https://github.com/danlark1)) -* Support asterisks and qualified asterisks for multiple joins without subqueries [#4898](https://github.com/ClickHouse/ClickHouse/pull/4898) ([Artem Zuikov](https://github.com/4ertus2)) -* Make missing column error message more user friendly. [#4915](https://github.com/ClickHouse/ClickHouse/pull/4915) ([Artem Zuikov](https://github.com/4ertus2)) - -#### Performance Improvements -* Significant speedup of ASOF JOIN [#4924](https://github.com/ClickHouse/ClickHouse/pull/4924) ([Martijn Bakker](https://github.com/Gladdy)) - -#### Backward Incompatible Changes -* HTTP header `Query-Id` was renamed to `X-ClickHouse-Query-Id` for consistency. [#4972](https://github.com/ClickHouse/ClickHouse/pull/4972) ([Mikhail](https://github.com/fandyushin)) - -#### Bug Fixes -* Fixed potential null pointer dereference in `clickhouse-copier`. [#4900](https://github.com/ClickHouse/ClickHouse/pull/4900) ([proller](https://github.com/proller)) -* Fixed error on query with JOIN + ARRAY JOIN [#4938](https://github.com/ClickHouse/ClickHouse/pull/4938) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed hanging on start of the server when a dictionary depends on another dictionary via a database with engine=Dictionary. [#4962](https://github.com/ClickHouse/ClickHouse/pull/4962) ([Vitaly Baranov](https://github.com/vitlibar)) -* Partially fix distributed_product_mode = local. It's possible to allow columns of local tables in where/having/order by/... via table aliases. Throw exception if table does not have alias. There's not possible to access to the columns without table aliases yet. [#4986](https://github.com/ClickHouse/ClickHouse/pull/4986) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix potentially wrong result for `SELECT DISTINCT` with `JOIN` [#5001](https://github.com/ClickHouse/ClickHouse/pull/5001) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed very rare data race condition that could happen when executing a query with UNION ALL involving at least two SELECTs from system.columns, system.tables, system.parts, system.parts_tables or tables of Merge family and performing ALTER of columns of the related tables concurrently. [#5189](https://github.com/ClickHouse/ClickHouse/pull/5189) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -#### Build/Testing/Packaging Improvements -* Fixed test failures when running clickhouse-server on different host [#4713](https://github.com/ClickHouse/ClickHouse/pull/4713) ([Vasily Nemkov](https://github.com/Enmk)) -* clickhouse-test: Disable color control sequences in non tty environment. 
[#4937](https://github.com/ClickHouse/ClickHouse/pull/4937) ([alesapin](https://github.com/alesapin)) -* clickhouse-test: Allow use any test database (remove `test.` qualification where it possible) [#5008](https://github.com/ClickHouse/ClickHouse/pull/5008) ([proller](https://github.com/proller)) -* Fix ubsan errors [#5037](https://github.com/ClickHouse/ClickHouse/pull/5037) ([Vitaly Baranov](https://github.com/vitlibar)) -* Yandex LFAlloc was added to ClickHouse to allocate MarkCache and UncompressedCache data in different ways to catch segfaults more reliable [#4995](https://github.com/ClickHouse/ClickHouse/pull/4995) ([Danila Kutenin](https://github.com/danlark1)) -* Python util to help with backports and changelogs. [#4949](https://github.com/ClickHouse/ClickHouse/pull/4949) ([Ivan](https://github.com/abyss7)) - - -## ClickHouse release 19.5 -### ClickHouse release 19.5.4.22, 2019-05-13 - -#### Bug fixes -* Fixed possible crash in bitmap* functions [#5220](https://github.com/ClickHouse/ClickHouse/pull/5220) [#5228](https://github.com/ClickHouse/ClickHouse/pull/5228) ([Andy Yang](https://github.com/andyyzh)) -* Fixed very rare data race condition that could happen when executing a query with UNION ALL involving at least two SELECTs from system.columns, system.tables, system.parts, system.parts_tables or tables of Merge family and performing ALTER of columns of the related tables concurrently. [#5189](https://github.com/ClickHouse/ClickHouse/pull/5189) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed error `Set for IN is not created yet in case of using single LowCardinality column in the left part of IN`. This error happened if LowCardinality column was the part of primary key. #5031 [#5154](https://github.com/ClickHouse/ClickHouse/pull/5154) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Modification of retention function: If a row satisfies both the first and NTH condition, only the first satisfied condition is added to the data state. Now all conditions that satisfy in a row of data are added to the data state. [#5119](https://github.com/ClickHouse/ClickHouse/pull/5119) ([小路](https://github.com/nicelulu)) - - -### ClickHouse release 19.5.3.8, 2019-04-18 - -#### Bug fixes -* Fixed type of setting `max_partitions_per_insert_block` from boolean to UInt64. [#5028](https://github.com/ClickHouse/ClickHouse/pull/5028) ([Mohammad Hossein Sekhavat](https://github.com/mhsekhavat)) - - -### ClickHouse release 19.5.2.6, 2019-04-15 - -#### New Features - -* [Hyperscan](https://github.com/intel/hyperscan) multiple regular expression matching was added (functions `multiMatchAny`, `multiMatchAnyIndex`, `multiFuzzyMatchAny`, `multiFuzzyMatchAnyIndex`). [#4780](https://github.com/ClickHouse/ClickHouse/pull/4780), [#4841](https://github.com/ClickHouse/ClickHouse/pull/4841) ([Danila Kutenin](https://github.com/danlark1)) -* `multiSearchFirstPosition` function was added. [#4780](https://github.com/ClickHouse/ClickHouse/pull/4780) ([Danila Kutenin](https://github.com/danlark1)) -* Implement the predefined expression filter per row for tables. [#4792](https://github.com/ClickHouse/ClickHouse/pull/4792) ([Ivan](https://github.com/abyss7)) -* A new type of data skipping indices based on bloom filters (can be used for `equal`, `in` and `like` functions). [#4499](https://github.com/ClickHouse/ClickHouse/pull/4499) ([Nikita Vasilev](https://github.com/nikvas0)) -* Added `ASOF JOIN` which allows to run queries that join to the most recent value known. 
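As a hedged illustration of the `ASOF JOIN` mentioned above, using the `USING` form in which the last listed column is matched inexactly; the `trades`/`quotes` tables and their columns are hypothetical:

```sql
-- For every trade, pick the latest quote with quotes.ts <= trades.ts for the same symbol.
SELECT t.symbol, t.price, q.bid
FROM trades AS t
ASOF JOIN quotes AS q USING (symbol, ts)
```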
[#4774](https://github.com/ClickHouse/ClickHouse/pull/4774) [#4867](https://github.com/ClickHouse/ClickHouse/pull/4867) [#4863](https://github.com/ClickHouse/ClickHouse/pull/4863) [#4875](https://github.com/ClickHouse/ClickHouse/pull/4875) ([Martijn Bakker](https://github.com/Gladdy), [Artem Zuikov](https://github.com/4ertus2)) -* Rewrite multiple `COMMA JOIN` to `CROSS JOIN`. Then rewrite them to `INNER JOIN` if possible. [#4661](https://github.com/ClickHouse/ClickHouse/pull/4661) ([Artem Zuikov](https://github.com/4ertus2)) - -#### Improvement - -* `topK` and `topKWeighted` now supports custom `loadFactor` (fixes issue [#4252](https://github.com/ClickHouse/ClickHouse/issues/4252)). [#4634](https://github.com/ClickHouse/ClickHouse/pull/4634) ([Kirill Danshin](https://github.com/kirillDanshin)) -* Allow to use `parallel_replicas_count > 1` even for tables without sampling (the setting is simply ignored for them). In previous versions it was lead to exception. [#4637](https://github.com/ClickHouse/ClickHouse/pull/4637) ([Alexey Elymanov](https://github.com/digitalist)) -* Support for `CREATE OR REPLACE VIEW`. Allow to create a view or set a new definition in a single statement. [#4654](https://github.com/ClickHouse/ClickHouse/pull/4654) ([Boris Granveaud](https://github.com/bgranvea)) -* `Buffer` table engine now supports `PREWHERE`. [#4671](https://github.com/ClickHouse/ClickHouse/pull/4671) ([Yangkuan Liu](https://github.com/LiuYangkuan)) -* Add ability to start replicated table without metadata in zookeeper in `readonly` mode. [#4691](https://github.com/ClickHouse/ClickHouse/pull/4691) ([alesapin](https://github.com/alesapin)) -* Fixed flicker of progress bar in clickhouse-client. The issue was most noticeable when using `FORMAT Null` with streaming queries. [#4811](https://github.com/ClickHouse/ClickHouse/pull/4811) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Allow to disable functions with `hyperscan` library on per user basis to limit potentially excessive and uncontrolled resource usage. [#4816](https://github.com/ClickHouse/ClickHouse/pull/4816) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add version number logging in all errors. [#4824](https://github.com/ClickHouse/ClickHouse/pull/4824) ([proller](https://github.com/proller)) -* Added restriction to the `multiMatch` functions which requires string size to fit into `unsigned int`. Also added the number of arguments limit to the `multiSearch` functions. [#4834](https://github.com/ClickHouse/ClickHouse/pull/4834) ([Danila Kutenin](https://github.com/danlark1)) -* Improved usage of scratch space and error handling in Hyperscan. [#4866](https://github.com/ClickHouse/ClickHouse/pull/4866) ([Danila Kutenin](https://github.com/danlark1)) -* Fill `system.graphite_detentions` from a table config of `*GraphiteMergeTree` engine tables. [#4584](https://github.com/ClickHouse/ClickHouse/pull/4584) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) -* Rename `trigramDistance` function to `ngramDistance` and add more functions with `CaseInsensitive` and `UTF`. [#4602](https://github.com/ClickHouse/ClickHouse/pull/4602) ([Danila Kutenin](https://github.com/danlark1)) -* Improved data skipping indices calculation. [#4640](https://github.com/ClickHouse/ClickHouse/pull/4640) ([Nikita Vasilev](https://github.com/nikvas0)) -* Keep ordinary, `DEFAULT`, `MATERIALIZED` and `ALIAS` columns in a single list (fixes issue [#2867](https://github.com/ClickHouse/ClickHouse/issues/2867)). 
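A brief hedged example of the `CREATE OR REPLACE VIEW` support noted in the improvements above; the view, table, and column names are illustrative:

```sql
-- Creates the view, or swaps in a new definition if it already exists, in a single statement.
CREATE OR REPLACE VIEW daily_totals AS
SELECT toDate(event_time) AS day, count() AS hits
FROM events
GROUP BY day;
```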
[#4707](https://github.com/ClickHouse/ClickHouse/pull/4707) ([Alex Zatelepin](https://github.com/ztlpn)) - -#### Bug Fix - -* Avoid `std::terminate` in case of memory allocation failure. Now `std::bad_alloc` exception is thrown as expected. [#4665](https://github.com/ClickHouse/ClickHouse/pull/4665) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixes capnproto reading from buffer. Sometimes files wasn't loaded successfully by HTTP. [#4674](https://github.com/ClickHouse/ClickHouse/pull/4674) ([Vladislav](https://github.com/smirnov-vs)) -* Fix error `Unknown log entry type: 0` after `OPTIMIZE TABLE FINAL` query. [#4683](https://github.com/ClickHouse/ClickHouse/pull/4683) ([Amos Bird](https://github.com/amosbird)) -* Wrong arguments to `hasAny` or `hasAll` functions may lead to segfault. [#4698](https://github.com/ClickHouse/ClickHouse/pull/4698) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Deadlock may happen while executing `DROP DATABASE dictionary` query. [#4701](https://github.com/ClickHouse/ClickHouse/pull/4701) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix undefined behavior in `median` and `quantile` functions. [#4702](https://github.com/ClickHouse/ClickHouse/pull/4702) ([hcz](https://github.com/hczhcz)) -* Fix compression level detection when `network_compression_method` in lowercase. Broken in v19.1. [#4706](https://github.com/ClickHouse/ClickHouse/pull/4706) ([proller](https://github.com/proller)) -* Fixed ignorance of `UTC` setting (fixes issue [#4658](https://github.com/ClickHouse/ClickHouse/issues/4658)). [#4718](https://github.com/ClickHouse/ClickHouse/pull/4718) ([proller](https://github.com/proller)) -* Fix `histogram` function behaviour with `Distributed` tables. [#4741](https://github.com/ClickHouse/ClickHouse/pull/4741) ([olegkv](https://github.com/olegkv)) -* Fixed tsan report `destroy of a locked mutex`. [#4742](https://github.com/ClickHouse/ClickHouse/pull/4742) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed TSan report on shutdown due to race condition in system logs usage. Fixed potential use-after-free on shutdown when part_log is enabled. [#4758](https://github.com/ClickHouse/ClickHouse/pull/4758) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix recheck parts in `ReplicatedMergeTreeAlterThread` in case of error. [#4772](https://github.com/ClickHouse/ClickHouse/pull/4772) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Arithmetic operations on intermediate aggregate function states were not working for constant arguments (such as subquery results). [#4776](https://github.com/ClickHouse/ClickHouse/pull/4776) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Always backquote column names in metadata. Otherwise it's impossible to create a table with column named `index` (server won't restart due to malformed `ATTACH` query in metadata). [#4782](https://github.com/ClickHouse/ClickHouse/pull/4782) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix crash in `ALTER ... MODIFY ORDER BY` on `Distributed` table. [#4790](https://github.com/ClickHouse/ClickHouse/pull/4790) ([TCeason](https://github.com/TCeason)) -* Fix segfault in `JOIN ON` with enabled `enable_optimize_predicate_expression`. [#4794](https://github.com/ClickHouse/ClickHouse/pull/4794) ([Winter Zhang](https://github.com/zhang2014)) -* Fix bug with adding an extraneous row after consuming a protobuf message from Kafka. 
[#4808](https://github.com/ClickHouse/ClickHouse/pull/4808) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fix crash of `JOIN` on not-nullable vs nullable column. Fix `NULLs` in right keys in `ANY JOIN` + `join_use_nulls`. [#4815](https://github.com/ClickHouse/ClickHouse/pull/4815) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix segmentation fault in `clickhouse-copier`. [#4835](https://github.com/ClickHouse/ClickHouse/pull/4835) ([proller](https://github.com/proller)) -* Fixed race condition in `SELECT` from `system.tables` if the table is renamed or altered concurrently. [#4836](https://github.com/ClickHouse/ClickHouse/pull/4836) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed data race when fetching data part that is already obsolete. [#4839](https://github.com/ClickHouse/ClickHouse/pull/4839) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed rare data race that can happen during `RENAME` table of MergeTree family. [#4844](https://github.com/ClickHouse/ClickHouse/pull/4844) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed segmentation fault in function `arrayIntersect`. Segmentation fault could happen if function was called with mixed constant and ordinary arguments. [#4847](https://github.com/ClickHouse/ClickHouse/pull/4847) ([Lixiang Qian](https://github.com/fancyqlx)) -* Fixed reading from `Array(LowCardinality)` column in rare case when column contained a long sequence of empty arrays. [#4850](https://github.com/ClickHouse/ClickHouse/pull/4850) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix crash in `FULL/RIGHT JOIN` when we joining on nullable vs not nullable. [#4855](https://github.com/ClickHouse/ClickHouse/pull/4855) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix `No message received` exception while fetching parts between replicas. [#4856](https://github.com/ClickHouse/ClickHouse/pull/4856) ([alesapin](https://github.com/alesapin)) -* Fixed `arrayIntersect` function wrong result in case of several repeated values in single array. [#4871](https://github.com/ClickHouse/ClickHouse/pull/4871) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix a race condition during concurrent `ALTER COLUMN` queries that could lead to a server crash (fixes issue [#3421](https://github.com/ClickHouse/ClickHouse/issues/3421)). [#4592](https://github.com/ClickHouse/ClickHouse/pull/4592) ([Alex Zatelepin](https://github.com/ztlpn)) -* Fix incorrect result in `FULL/RIGHT JOIN` with const column. [#4723](https://github.com/ClickHouse/ClickHouse/pull/4723) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix duplicates in `GLOBAL JOIN` with asterisk. [#4705](https://github.com/ClickHouse/ClickHouse/pull/4705) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix parameter deduction in `ALTER MODIFY` of column `CODEC` when column type is not specified. [#4883](https://github.com/ClickHouse/ClickHouse/pull/4883) ([alesapin](https://github.com/alesapin)) -* Functions `cutQueryStringAndFragment()` and `queryStringAndFragment()` now works correctly when `URL` contains a fragment and no query. [#4894](https://github.com/ClickHouse/ClickHouse/pull/4894) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fix rare bug when setting `min_bytes_to_use_direct_io` is greater than zero, which occures when thread have to seek backward in column file. 
[#4897](https://github.com/ClickHouse/ClickHouse/pull/4897) ([alesapin](https://github.com/alesapin)) -* Fix wrong argument types for aggregate functions with `LowCardinality` arguments (fixes issue [#4919](https://github.com/ClickHouse/ClickHouse/issues/4919)). [#4922](https://github.com/ClickHouse/ClickHouse/pull/4922) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix wrong name qualification in `GLOBAL JOIN`. [#4969](https://github.com/ClickHouse/ClickHouse/pull/4969) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix function `toISOWeek` result for year 1970. [#4988](https://github.com/ClickHouse/ClickHouse/pull/4988) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix `DROP`, `TRUNCATE` and `OPTIMIZE` queries duplication, when executed on `ON CLUSTER` for `ReplicatedMergeTree*` tables family. [#4991](https://github.com/ClickHouse/ClickHouse/pull/4991) ([alesapin](https://github.com/alesapin)) - -#### Backward Incompatible Change - -* Rename setting `insert_sample_with_metadata` to setting `input_format_defaults_for_omitted_fields`. [#4771](https://github.com/ClickHouse/ClickHouse/pull/4771) ([Artem Zuikov](https://github.com/4ertus2)) -* Added setting `max_partitions_per_insert_block` (with value 100 by default). If inserted block contains larger number of partitions, an exception is thrown. Set it to 0 if you want to remove the limit (not recommended). [#4845](https://github.com/ClickHouse/ClickHouse/pull/4845) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Multi-search functions were renamed (`multiPosition` to `multiSearchAllPositions`, `multiSearch` to `multiSearchAny`, `firstMatch` to `multiSearchFirstIndex`). [#4780](https://github.com/ClickHouse/ClickHouse/pull/4780) ([Danila Kutenin](https://github.com/danlark1)) - -#### Performance Improvement - -* Optimize Volnitsky searcher by inlining, giving about 5-10% search improvement for queries with many needles or many similar bigrams. [#4862](https://github.com/ClickHouse/ClickHouse/pull/4862) ([Danila Kutenin](https://github.com/danlark1)) -* Fix performance issue when setting `use_uncompressed_cache` is greater than zero, which appeared when all read data contained in cache. [#4913](https://github.com/ClickHouse/ClickHouse/pull/4913) ([alesapin](https://github.com/alesapin)) - - -#### Build/Testing/Packaging Improvement - -* Hardening debug build: more granular memory mappings and ASLR; add memory protection for mark cache and index. This allows to find more memory stomping bugs in case when ASan and MSan cannot do it. [#4632](https://github.com/ClickHouse/ClickHouse/pull/4632) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add support for cmake variables `ENABLE_PROTOBUF`, `ENABLE_PARQUET` and `ENABLE_BROTLI` which allows to enable/disable the above features (same as we can do for librdkafka, mysql, etc). [#4669](https://github.com/ClickHouse/ClickHouse/pull/4669) ([Silviu Caragea](https://github.com/silviucpp)) -* Add ability to print process list and stacktraces of all threads if some queries are hung after test run. [#4675](https://github.com/ClickHouse/ClickHouse/pull/4675) ([alesapin](https://github.com/alesapin)) -* Add retries on `Connection loss` error in `clickhouse-test`. [#4682](https://github.com/ClickHouse/ClickHouse/pull/4682) ([alesapin](https://github.com/alesapin)) -* Add freebsd build with vagrant and build with thread sanitizer to packager script. 
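To make the multi-search renames in the backward incompatible changes above concrete, a small hedged example using the new names (the string literals are made up):

```sql
SELECT
    multiSearchAllPositions('Hello, world!', ['Hello', 'world']) AS positions,   -- [1, 8] (was multiPosition)
    multiSearchAny('Hello, world!', ['foo', 'world'])            AS any_found,   -- 1      (was multiSearch)
    multiSearchFirstIndex('Hello, world!', ['foo', 'world'])     AS first_index  -- 2      (was firstMatch)
```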
[#4712](https://github.com/ClickHouse/ClickHouse/pull/4712) [#4748](https://github.com/ClickHouse/ClickHouse/pull/4748) ([alesapin](https://github.com/alesapin)) -* Now user asked for password for user `'default'` during installation. [#4725](https://github.com/ClickHouse/ClickHouse/pull/4725) ([proller](https://github.com/proller)) -* Suppress warning in `rdkafka` library. [#4740](https://github.com/ClickHouse/ClickHouse/pull/4740) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Allow ability to build without ssl. [#4750](https://github.com/ClickHouse/ClickHouse/pull/4750) ([proller](https://github.com/proller)) -* Add a way to launch clickhouse-server image from a custom user. [#4753](https://github.com/ClickHouse/ClickHouse/pull/4753) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) -* Upgrade contrib boost to 1.69. [#4793](https://github.com/ClickHouse/ClickHouse/pull/4793) ([proller](https://github.com/proller)) -* Disable usage of `mremap` when compiled with Thread Sanitizer. Surprisingly enough, TSan does not intercept `mremap` (though it does intercept `mmap`, `munmap`) that leads to false positives. Fixed TSan report in stateful tests. [#4859](https://github.com/ClickHouse/ClickHouse/pull/4859) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add test checking using format schema via HTTP interface. [#4864](https://github.com/ClickHouse/ClickHouse/pull/4864) ([Vitaly Baranov](https://github.com/vitlibar)) - -## ClickHouse release 19.4 -### ClickHouse release 19.4.4.33, 2019-04-17 - -#### Bug Fixes - -* Avoid `std::terminate` in case of memory allocation failure. Now `std::bad_alloc` exception is thrown as expected. [#4665](https://github.com/ClickHouse/ClickHouse/pull/4665) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixes capnproto reading from buffer. Sometimes files wasn't loaded successfully by HTTP. [#4674](https://github.com/ClickHouse/ClickHouse/pull/4674) ([Vladislav](https://github.com/smirnov-vs)) -* Fix error `Unknown log entry type: 0` after `OPTIMIZE TABLE FINAL` query. [#4683](https://github.com/ClickHouse/ClickHouse/pull/4683) ([Amos Bird](https://github.com/amosbird)) -* Wrong arguments to `hasAny` or `hasAll` functions may lead to segfault. [#4698](https://github.com/ClickHouse/ClickHouse/pull/4698) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Deadlock may happen while executing `DROP DATABASE dictionary` query. [#4701](https://github.com/ClickHouse/ClickHouse/pull/4701) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix undefined behavior in `median` and `quantile` functions. [#4702](https://github.com/ClickHouse/ClickHouse/pull/4702) ([hcz](https://github.com/hczhcz)) -* Fix compression level detection when `network_compression_method` in lowercase. Broken in v19.1. [#4706](https://github.com/ClickHouse/ClickHouse/pull/4706) ([proller](https://github.com/proller)) -* Fixed ignorance of `UTC` setting (fixes issue [#4658](https://github.com/ClickHouse/ClickHouse/issues/4658)). [#4718](https://github.com/ClickHouse/ClickHouse/pull/4718) ([proller](https://github.com/proller)) -* Fix `histogram` function behaviour with `Distributed` tables. [#4741](https://github.com/ClickHouse/ClickHouse/pull/4741) ([olegkv](https://github.com/olegkv)) -* Fixed tsan report `destroy of a locked mutex`. [#4742](https://github.com/ClickHouse/ClickHouse/pull/4742) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed TSan report on shutdown due to race condition in system logs usage. 
Fixed potential use-after-free on shutdown when part_log is enabled. [#4758](https://github.com/ClickHouse/ClickHouse/pull/4758) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix recheck parts in `ReplicatedMergeTreeAlterThread` in case of error. [#4772](https://github.com/ClickHouse/ClickHouse/pull/4772) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Arithmetic operations on intermediate aggregate function states were not working for constant arguments (such as subquery results). [#4776](https://github.com/ClickHouse/ClickHouse/pull/4776) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Always backquote column names in metadata. Otherwise it's impossible to create a table with column named `index` (server won't restart due to malformed `ATTACH` query in metadata). [#4782](https://github.com/ClickHouse/ClickHouse/pull/4782) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix crash in `ALTER ... MODIFY ORDER BY` on `Distributed` table. [#4790](https://github.com/ClickHouse/ClickHouse/pull/4790) ([TCeason](https://github.com/TCeason)) -* Fix segfault in `JOIN ON` with enabled `enable_optimize_predicate_expression`. [#4794](https://github.com/ClickHouse/ClickHouse/pull/4794) ([Winter Zhang](https://github.com/zhang2014)) -* Fix bug with adding an extraneous row after consuming a protobuf message from Kafka. [#4808](https://github.com/ClickHouse/ClickHouse/pull/4808) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fix segmentation fault in `clickhouse-copier`. [#4835](https://github.com/ClickHouse/ClickHouse/pull/4835) ([proller](https://github.com/proller)) -* Fixed race condition in `SELECT` from `system.tables` if the table is renamed or altered concurrently. [#4836](https://github.com/ClickHouse/ClickHouse/pull/4836) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed data race when fetching data part that is already obsolete. [#4839](https://github.com/ClickHouse/ClickHouse/pull/4839) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed rare data race that can happen during `RENAME` table of MergeTree family. [#4844](https://github.com/ClickHouse/ClickHouse/pull/4844) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed segmentation fault in function `arrayIntersect`. Segmentation fault could happen if function was called with mixed constant and ordinary arguments. [#4847](https://github.com/ClickHouse/ClickHouse/pull/4847) ([Lixiang Qian](https://github.com/fancyqlx)) -* Fixed reading from `Array(LowCardinality)` column in rare case when column contained a long sequence of empty arrays. [#4850](https://github.com/ClickHouse/ClickHouse/pull/4850) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix `No message received` exception while fetching parts between replicas. [#4856](https://github.com/ClickHouse/ClickHouse/pull/4856) ([alesapin](https://github.com/alesapin)) -* Fixed `arrayIntersect` function wrong result in case of several repeated values in single array. [#4871](https://github.com/ClickHouse/ClickHouse/pull/4871) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix a race condition during concurrent `ALTER COLUMN` queries that could lead to a server crash (fixes issue [#3421](https://github.com/ClickHouse/ClickHouse/issues/3421)). [#4592](https://github.com/ClickHouse/ClickHouse/pull/4592) ([Alex Zatelepin](https://github.com/ztlpn)) -* Fix parameter deduction in `ALTER MODIFY` of column `CODEC` when column type is not specified. 
[#4883](https://github.com/ClickHouse/ClickHouse/pull/4883) ([alesapin](https://github.com/alesapin)) -* Functions `cutQueryStringAndFragment()` and `queryStringAndFragment()` now works correctly when `URL` contains a fragment and no query. [#4894](https://github.com/ClickHouse/ClickHouse/pull/4894) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fix rare bug when setting `min_bytes_to_use_direct_io` is greater than zero, which occures when thread have to seek backward in column file. [#4897](https://github.com/ClickHouse/ClickHouse/pull/4897) ([alesapin](https://github.com/alesapin)) -* Fix wrong argument types for aggregate functions with `LowCardinality` arguments (fixes issue [#4919](https://github.com/ClickHouse/ClickHouse/issues/4919)). [#4922](https://github.com/ClickHouse/ClickHouse/pull/4922) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix function `toISOWeek` result for year 1970. [#4988](https://github.com/ClickHouse/ClickHouse/pull/4988) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix `DROP`, `TRUNCATE` and `OPTIMIZE` queries duplication, when executed on `ON CLUSTER` for `ReplicatedMergeTree*` tables family. [#4991](https://github.com/ClickHouse/ClickHouse/pull/4991) ([alesapin](https://github.com/alesapin)) - -#### Improvements - -* Keep ordinary, `DEFAULT`, `MATERIALIZED` and `ALIAS` columns in a single list (fixes issue [#2867](https://github.com/ClickHouse/ClickHouse/issues/2867)). [#4707](https://github.com/ClickHouse/ClickHouse/pull/4707) ([Alex Zatelepin](https://github.com/ztlpn)) - -### ClickHouse release 19.4.3.11, 2019-04-02 - -#### Bug Fixes - -* Fix crash in `FULL/RIGHT JOIN` when we joining on nullable vs not nullable. [#4855](https://github.com/ClickHouse/ClickHouse/pull/4855) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix segmentation fault in `clickhouse-copier`. [#4835](https://github.com/ClickHouse/ClickHouse/pull/4835) ([proller](https://github.com/proller)) - -#### Build/Testing/Packaging Improvement - -* Add a way to launch clickhouse-server image from a custom user. [#4753](https://github.com/ClickHouse/ClickHouse/pull/4753) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) - -### ClickHouse release 19.4.2.7, 2019-03-30 - -#### Bug Fixes -* Fixed reading from `Array(LowCardinality)` column in rare case when column contained a long sequence of empty arrays. [#4850](https://github.com/ClickHouse/ClickHouse/pull/4850) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) - -### ClickHouse release 19.4.1.3, 2019-03-19 - -#### Bug Fixes -* Fixed remote queries which contain both `LIMIT BY` and `LIMIT`. Previously, if `LIMIT BY` and `LIMIT` were used for remote query, `LIMIT` could happen before `LIMIT BY`, which led to too filtered result. [#4708](https://github.com/ClickHouse/ClickHouse/pull/4708) ([Constantin S. Pan](https://github.com/kvap)) - -### ClickHouse release 19.4.0.49, 2019-03-09 - -#### New Features -* Added full support for `Protobuf` format (input and output, nested data structures). [#4174](https://github.com/ClickHouse/ClickHouse/pull/4174) [#4493](https://github.com/ClickHouse/ClickHouse/pull/4493) ([Vitaly Baranov](https://github.com/vitlibar)) -* Added bitmap functions with Roaring Bitmaps. [#4207](https://github.com/ClickHouse/ClickHouse/pull/4207) ([Andy Yang](https://github.com/andyyzh)) [#4568](https://github.com/ClickHouse/ClickHouse/pull/4568) ([Vitaly Baranov](https://github.com/vitlibar)) -* Parquet format support. 
[#4448](https://github.com/ClickHouse/ClickHouse/pull/4448) ([proller](https://github.com/proller)) -* N-gram distance was added for fuzzy string comparison. It is similar to q-gram metrics in R language. [#4466](https://github.com/ClickHouse/ClickHouse/pull/4466) ([Danila Kutenin](https://github.com/danlark1)) -* Combine rules for graphite rollup from dedicated aggregation and retention patterns. [#4426](https://github.com/ClickHouse/ClickHouse/pull/4426) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) -* Added `max_execution_speed` and `max_execution_speed_bytes` to limit resource usage. Added `min_execution_speed_bytes` setting to complement the `min_execution_speed`. [#4430](https://github.com/ClickHouse/ClickHouse/pull/4430) ([Winter Zhang](https://github.com/zhang2014)) -* Implemented function `flatten`. [#4555](https://github.com/ClickHouse/ClickHouse/pull/4555) [#4409](https://github.com/ClickHouse/ClickHouse/pull/4409) ([alexey-milovidov](https://github.com/alexey-milovidov), [kzon](https://github.com/kzon)) -* Added functions `arrayEnumerateDenseRanked` and `arrayEnumerateUniqRanked` (it's like `arrayEnumerateUniq` but allows to fine tune array depth to look inside multidimensional arrays). [#4475](https://github.com/ClickHouse/ClickHouse/pull/4475) ([proller](https://github.com/proller)) [#4601](https://github.com/ClickHouse/ClickHouse/pull/4601) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Multiple JOINS with some restrictions: no asterisks, no complex aliases in ON/WHERE/GROUP BY/... [#4462](https://github.com/ClickHouse/ClickHouse/pull/4462) ([Artem Zuikov](https://github.com/4ertus2)) - -#### Bug Fixes -* This release also contains all bug fixes from 19.3 and 19.1. -* Fixed bug in data skipping indices: order of granules after INSERT was incorrect. [#4407](https://github.com/ClickHouse/ClickHouse/pull/4407) ([Nikita Vasilev](https://github.com/nikvas0)) -* Fixed `set` index for `Nullable` and `LowCardinality` columns. Before it, `set` index with `Nullable` or `LowCardinality` column led to error `Data type must be deserialized with multiple streams` while selecting. [#4594](https://github.com/ClickHouse/ClickHouse/pull/4594) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Correctly set update_time on full `executable` dictionary update. [#4551](https://github.com/ClickHouse/ClickHouse/pull/4551) ([Tema Novikov](https://github.com/temoon)) -* Fix broken progress bar in 19.3. [#4627](https://github.com/ClickHouse/ClickHouse/pull/4627) ([filimonov](https://github.com/filimonov)) -* Fixed inconsistent values of MemoryTracker when memory region was shrinked, in certain cases. [#4619](https://github.com/ClickHouse/ClickHouse/pull/4619) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed undefined behaviour in ThreadPool. [#4612](https://github.com/ClickHouse/ClickHouse/pull/4612) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed a very rare crash with the message `mutex lock failed: Invalid argument` that could happen when a MergeTree table was dropped concurrently with a SELECT. [#4608](https://github.com/ClickHouse/ClickHouse/pull/4608) ([Alex Zatelepin](https://github.com/ztlpn)) -* ODBC driver compatibility with `LowCardinality` data type. [#4381](https://github.com/ClickHouse/ClickHouse/pull/4381) ([proller](https://github.com/proller)) -* FreeBSD: Fixup for `AIOcontextPool: Found io_event with unknown id 0` error. 
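A one-line hedged illustration of the `flatten` function added in 19.4.0.49 (the nested array literal is illustrative):

```sql
SELECT flatten([[[1]], [[2], [3, 4]]]) AS flat  -- [1, 2, 3, 4]
```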
[#4438](https://github.com/ClickHouse/ClickHouse/pull/4438) ([urgordeadbeef](https://github.com/urgordeadbeef)) -* `system.part_log` table was created regardless to configuration. [#4483](https://github.com/ClickHouse/ClickHouse/pull/4483) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix undefined behaviour in `dictIsIn` function for cache dictionaries. [#4515](https://github.com/ClickHouse/ClickHouse/pull/4515) ([alesapin](https://github.com/alesapin)) -* Fixed a deadlock when a SELECT query locks the same table multiple times (e.g. from different threads or when executing multiple subqueries) and there is a concurrent DDL query. [#4535](https://github.com/ClickHouse/ClickHouse/pull/4535) ([Alex Zatelepin](https://github.com/ztlpn)) -* Disable compile_expressions by default until we get own `llvm` contrib and can test it with `clang` and `asan`. [#4579](https://github.com/ClickHouse/ClickHouse/pull/4579) ([alesapin](https://github.com/alesapin)) -* Prevent `std::terminate` when `invalidate_query` for `clickhouse` external dictionary source has returned wrong resultset (empty or more than one row or more than one column). Fixed issue when the `invalidate_query` was performed every five seconds regardless to the `lifetime`. [#4583](https://github.com/ClickHouse/ClickHouse/pull/4583) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Avoid deadlock when the `invalidate_query` for a dictionary with `clickhouse` source was involving `system.dictionaries` table or `Dictionaries` database (rare case). [#4599](https://github.com/ClickHouse/ClickHouse/pull/4599) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixes for CROSS JOIN with empty WHERE. [#4598](https://github.com/ClickHouse/ClickHouse/pull/4598) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed segfault in function "replicate" when constant argument is passed. [#4603](https://github.com/ClickHouse/ClickHouse/pull/4603) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix lambda function with predicate optimizer. [#4408](https://github.com/ClickHouse/ClickHouse/pull/4408) ([Winter Zhang](https://github.com/zhang2014)) -* Multiple JOINs multiple fixes. [#4595](https://github.com/ClickHouse/ClickHouse/pull/4595) ([Artem Zuikov](https://github.com/4ertus2)) - -#### Improvements -* Support aliases in JOIN ON section for right table columns. [#4412](https://github.com/ClickHouse/ClickHouse/pull/4412) ([Artem Zuikov](https://github.com/4ertus2)) -* Result of multiple JOINs need correct result names to be used in subselects. Replace flat aliases with source names in result. [#4474](https://github.com/ClickHouse/ClickHouse/pull/4474) ([Artem Zuikov](https://github.com/4ertus2)) -* Improve push-down logic for joined statements. [#4387](https://github.com/ClickHouse/ClickHouse/pull/4387) ([Ivan](https://github.com/abyss7)) - -#### Performance Improvements -* Improved heuristics of "move to PREWHERE" optimization. [#4405](https://github.com/ClickHouse/ClickHouse/pull/4405) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Use proper lookup tables that uses HashTable's API for 8-bit and 16-bit keys. [#4536](https://github.com/ClickHouse/ClickHouse/pull/4536) ([Amos Bird](https://github.com/amosbird)) -* Improved performance of string comparison. 
[#4564](https://github.com/ClickHouse/ClickHouse/pull/4564) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Cleanup distributed DDL queue in a separate thread so that it doesn't slow down the main loop that processes distributed DDL tasks. [#4502](https://github.com/ClickHouse/ClickHouse/pull/4502) ([Alex Zatelepin](https://github.com/ztlpn)) -* When `min_bytes_to_use_direct_io` is set to 1, not every file was opened with O_DIRECT mode because the data size to read was sometimes underestimated by the size of one compressed block. [#4526](https://github.com/ClickHouse/ClickHouse/pull/4526) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -#### Build/Testing/Packaging Improvement -* Added support for clang-9 [#4604](https://github.com/ClickHouse/ClickHouse/pull/4604) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix wrong `__asm__` instructions (again) [#4621](https://github.com/ClickHouse/ClickHouse/pull/4621) ([Konstantin Podshumok](https://github.com/podshumok)) -* Add ability to specify settings for `clickhouse-performance-test` from command line. [#4437](https://github.com/ClickHouse/ClickHouse/pull/4437) ([alesapin](https://github.com/alesapin)) -* Add dictionaries tests to integration tests. [#4477](https://github.com/ClickHouse/ClickHouse/pull/4477) ([alesapin](https://github.com/alesapin)) -* Added queries from the benchmark on the website to automated performance tests. [#4496](https://github.com/ClickHouse/ClickHouse/pull/4496) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* `xxhash.h` does not exist in external lz4 because it is an implementation detail and its symbols are namespaced with `XXH_NAMESPACE` macro. When lz4 is external, xxHash has to be external too, and the dependents have to link to it. [#4495](https://github.com/ClickHouse/ClickHouse/pull/4495) ([Orivej Desh](https://github.com/orivej)) -* Fixed a case when `quantileTiming` aggregate function can be called with negative or floating point argument (this fixes fuzz test with undefined behaviour sanitizer). [#4506](https://github.com/ClickHouse/ClickHouse/pull/4506) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Spelling error correction. [#4531](https://github.com/ClickHouse/ClickHouse/pull/4531) ([sdk2](https://github.com/sdk2)) -* Fix compilation on Mac. [#4371](https://github.com/ClickHouse/ClickHouse/pull/4371) ([Vitaly Baranov](https://github.com/vitlibar)) -* Build fixes for FreeBSD and various unusual build configurations. [#4444](https://github.com/ClickHouse/ClickHouse/pull/4444) ([proller](https://github.com/proller)) - -## ClickHouse release 19.3 -### ClickHouse release 19.3.9.1, 2019-04-02 - -#### Bug Fixes - -* Fix crash in `FULL/RIGHT JOIN` when we joining on nullable vs not nullable. [#4855](https://github.com/ClickHouse/ClickHouse/pull/4855) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix segmentation fault in `clickhouse-copier`. [#4835](https://github.com/ClickHouse/ClickHouse/pull/4835) ([proller](https://github.com/proller)) -* Fixed reading from `Array(LowCardinality)` column in rare case when column contained a long sequence of empty arrays. [#4850](https://github.com/ClickHouse/ClickHouse/pull/4850) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) - -#### Build/Testing/Packaging Improvement - -* Add a way to launch clickhouse-server image from a custom user [#4753](https://github.com/ClickHouse/ClickHouse/pull/4753) ([Mikhail f. 
Shiryaev](https://github.com/Felixoid)) - - -### ClickHouse release 19.3.7, 2019-03-12 - -#### Bug fixes - -* Fixed error in #3920. This error manifests itself as random cache corruption (messages `Unknown codec family code`, `Cannot seek through file`) and segfaults. This bug first appeared in version 19.1 and is present in versions up to 19.1.10 and 19.3.6. [#4623](https://github.com/ClickHouse/ClickHouse/pull/4623) ([alexey-milovidov](https://github.com/alexey-milovidov)) - - -### ClickHouse release 19.3.6, 2019-03-02 - -#### Bug fixes - -* When there are more than 1000 threads in a thread pool, `std::terminate` may happen on thread exit. [Azat Khuzhin](https://github.com/azat) [#4485](https://github.com/ClickHouse/ClickHouse/pull/4485) [#4505](https://github.com/ClickHouse/ClickHouse/pull/4505) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Now it's possible to create `ReplicatedMergeTree*` tables with comments on columns without defaults and tables with columns codecs without comments and defaults. Also fix comparison of codecs. [#4523](https://github.com/ClickHouse/ClickHouse/pull/4523) ([alesapin](https://github.com/alesapin)) -* Fixed crash on JOIN with array or tuple. [#4552](https://github.com/ClickHouse/ClickHouse/pull/4552) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed crash in clickhouse-copier with the message `ThreadStatus not created`. [#4540](https://github.com/ClickHouse/ClickHouse/pull/4540) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed hangup on server shutdown if distributed DDLs were used. [#4472](https://github.com/ClickHouse/ClickHouse/pull/4472) ([Alex Zatelepin](https://github.com/ztlpn)) -* Incorrect column numbers were printed in error message about text format parsing for columns with number greater than 10. [#4484](https://github.com/ClickHouse/ClickHouse/pull/4484) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -#### Build/Testing/Packaging Improvements - -* Fixed build with AVX enabled. [#4527](https://github.com/ClickHouse/ClickHouse/pull/4527) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Enable extended accounting and IO accounting based on good known version instead of kernel under which it is compiled. [#4541](https://github.com/ClickHouse/ClickHouse/pull/4541) ([nvartolomei](https://github.com/nvartolomei)) -* Allow to skip setting of core_dump.size_limit, warning instead of throw if limit set fail. [#4473](https://github.com/ClickHouse/ClickHouse/pull/4473) ([proller](https://github.com/proller)) -* Removed the `inline` tags of `void readBinary(...)` in `Field.cpp`. Also merged redundant `namespace DB` blocks. [#4530](https://github.com/ClickHouse/ClickHouse/pull/4530) ([hcz](https://github.com/hczhcz)) - - -### ClickHouse release 19.3.5, 2019-02-21 - -#### Bug fixes -* Fixed bug with large http insert queries processing. [#4454](https://github.com/ClickHouse/ClickHouse/pull/4454) ([alesapin](https://github.com/alesapin)) -* Fixed backward incompatibility with old versions due to wrong implementation of `send_logs_level` setting. [#4445](https://github.com/ClickHouse/ClickHouse/pull/4445) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed backward incompatibility of table function `remote` introduced with column comments. 
[#4446](https://github.com/ClickHouse/ClickHouse/pull/4446) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -### ClickHouse release 19.3.4, 2019-02-16 - -#### Improvements -* Table index size is not accounted for memory limits when doing `ATTACH TABLE` query. Avoided the possibility that a table cannot be attached after being detached. [#4396](https://github.com/ClickHouse/ClickHouse/pull/4396) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Slightly raised up the limit on max string and array size received from ZooKeeper. It allows to continue to work with increased size of `CLIENT_JVMFLAGS=-Djute.maxbuffer=...` on ZooKeeper. [#4398](https://github.com/ClickHouse/ClickHouse/pull/4398) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Allow to repair abandoned replica even if it already has huge number of nodes in its queue. [#4399](https://github.com/ClickHouse/ClickHouse/pull/4399) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add one required argument to `SET` index (max stored rows number). [#4386](https://github.com/ClickHouse/ClickHouse/pull/4386) ([Nikita Vasilev](https://github.com/nikvas0)) - -#### Bug Fixes -* Fixed `WITH ROLLUP` result for group by single `LowCardinality` key. [#4384](https://github.com/ClickHouse/ClickHouse/pull/4384) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fixed bug in the set index (dropping a granule if it contains more than `max_rows` rows). [#4386](https://github.com/ClickHouse/ClickHouse/pull/4386) ([Nikita Vasilev](https://github.com/nikvas0)) -* A lot of FreeBSD build fixes. [#4397](https://github.com/ClickHouse/ClickHouse/pull/4397) ([proller](https://github.com/proller)) -* Fixed aliases substitution in queries with subquery containing same alias (issue [#4110](https://github.com/ClickHouse/ClickHouse/issues/4110)). [#4351](https://github.com/ClickHouse/ClickHouse/pull/4351) ([Artem Zuikov](https://github.com/4ertus2)) - -#### Build/Testing/Packaging Improvements -* Add ability to run `clickhouse-server` for stateless tests in docker image. [#4347](https://github.com/ClickHouse/ClickHouse/pull/4347) ([Vasily Nemkov](https://github.com/Enmk)) - -### ClickHouse release 19.3.3, 2019-02-13 - -#### New Features -* Added the `KILL MUTATION` statement that allows removing mutations that are for some reasons stuck. Added `latest_failed_part`, `latest_fail_time`, `latest_fail_reason` fields to the `system.mutations` table for easier troubleshooting. [#4287](https://github.com/ClickHouse/ClickHouse/pull/4287) ([Alex Zatelepin](https://github.com/ztlpn)) -* Added aggregate function `entropy` which computes Shannon entropy. [#4238](https://github.com/ClickHouse/ClickHouse/pull/4238) ([Quid37](https://github.com/Quid37)) -* Added ability to send queries `INSERT INTO tbl VALUES (....` to server without splitting on `query` and `data` parts. [#4301](https://github.com/ClickHouse/ClickHouse/pull/4301) ([alesapin](https://github.com/alesapin)) -* Generic implementation of `arrayWithConstant` function was added. [#4322](https://github.com/ClickHouse/ClickHouse/pull/4322) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Implemented `NOT BETWEEN` comparison operator. [#4228](https://github.com/ClickHouse/ClickHouse/pull/4228) ([Dmitry Naumov](https://github.com/nezed)) -* Implement `sumMapFiltered` in order to be able to limit the number of keys for which values will be summed by `sumMap`. 
[#4129](https://github.com/ClickHouse/ClickHouse/pull/4129) ([Léo Ercolanelli](https://github.com/ercolanelli-leo)) -* Added support of `Nullable` types in `mysql` table function. [#4198](https://github.com/ClickHouse/ClickHouse/pull/4198) ([Emmanuel Donin de Rosière](https://github.com/edonin)) -* Support for arbitrary constant expressions in `LIMIT` clause. [#4246](https://github.com/ClickHouse/ClickHouse/pull/4246) ([k3box](https://github.com/k3box)) -* Added `topKWeighted` aggregate function that takes additional argument with (unsigned integer) weight. [#4245](https://github.com/ClickHouse/ClickHouse/pull/4245) ([Andrew Golman](https://github.com/andrewgolman)) -* `StorageJoin` now supports `join_any_take_last_row` setting that allows overwriting existing values of the same key. [#3973](https://github.com/ClickHouse/ClickHouse/pull/3973) ([Amos Bird](https://github.com/amosbird) -* Added function `toStartOfInterval`. [#4304](https://github.com/ClickHouse/ClickHouse/pull/4304) ([Vitaly Baranov](https://github.com/vitlibar)) -* Added `RowBinaryWithNamesAndTypes` format. [#4200](https://github.com/ClickHouse/ClickHouse/pull/4200) ([Oleg V. Kozlyuk](https://github.com/DarkWanderer)) -* Added `IPv4` and `IPv6` data types. More effective implementations of `IPv*` functions. [#3669](https://github.com/ClickHouse/ClickHouse/pull/3669) ([Vasily Nemkov](https://github.com/Enmk)) -* Added function `toStartOfTenMinutes()`. [#4298](https://github.com/ClickHouse/ClickHouse/pull/4298) ([Vitaly Baranov](https://github.com/vitlibar)) -* Added `Protobuf` output format. [#4005](https://github.com/ClickHouse/ClickHouse/pull/4005) [#4158](https://github.com/ClickHouse/ClickHouse/pull/4158) ([Vitaly Baranov](https://github.com/vitlibar)) -* Added brotli support for HTTP interface for data import (INSERTs). [#4235](https://github.com/ClickHouse/ClickHouse/pull/4235) ([Mikhail ](https://github.com/fandyushin)) -* Added hints while user make typo in function name or type in command line client. [#4239](https://github.com/ClickHouse/ClickHouse/pull/4239) ([Danila Kutenin](https://github.com/danlark1)) -* Added `Query-Id` to Server's HTTP Response header. [#4231](https://github.com/ClickHouse/ClickHouse/pull/4231) ([Mikhail ](https://github.com/fandyushin)) - -#### Experimental features -* Added `minmax` and `set` data skipping indices for MergeTree table engines family. [#4143](https://github.com/ClickHouse/ClickHouse/pull/4143) ([Nikita Vasilev](https://github.com/nikvas0)) -* Added conversion of `CROSS JOIN` to `INNER JOIN` if possible. [#4221](https://github.com/ClickHouse/ClickHouse/pull/4221) [#4266](https://github.com/ClickHouse/ClickHouse/pull/4266) ([Artem Zuikov](https://github.com/4ertus2)) - -#### Bug Fixes -* Fixed `Not found column` for duplicate columns in `JOIN ON` section. [#4279](https://github.com/ClickHouse/ClickHouse/pull/4279) ([Artem Zuikov](https://github.com/4ertus2)) -* Make `START REPLICATED SENDS` command start replicated sends. [#4229](https://github.com/ClickHouse/ClickHouse/pull/4229) ([nvartolomei](https://github.com/nvartolomei)) -* Fixed aggregate functions execution with `Array(LowCardinality)` arguments. [#4055](https://github.com/ClickHouse/ClickHouse/pull/4055) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Fixed wrong behaviour when doing `INSERT ... SELECT ... FROM file(...)` query and file has `CSVWithNames` or `TSVWIthNames` format and the first data row is missing. 
[#4297](https://github.com/ClickHouse/ClickHouse/pull/4297) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed crash on dictionary reload if dictionary not available. This bug was appeared in 19.1.6. [#4188](https://github.com/ClickHouse/ClickHouse/pull/4188) ([proller](https://github.com/proller)) -* Fixed `ALL JOIN` with duplicates in right table. [#4184](https://github.com/ClickHouse/ClickHouse/pull/4184) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed segmentation fault with `use_uncompressed_cache=1` and exception with wrong uncompressed size. This bug was appeared in 19.1.6. [#4186](https://github.com/ClickHouse/ClickHouse/pull/4186) ([alesapin](https://github.com/alesapin)) -* Fixed `compile_expressions` bug with comparison of big (more than int16) dates. [#4341](https://github.com/ClickHouse/ClickHouse/pull/4341) ([alesapin](https://github.com/alesapin)) -* Fixed infinite loop when selecting from table function `numbers(0)`. [#4280](https://github.com/ClickHouse/ClickHouse/pull/4280) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Temporarily disable predicate optimization for `ORDER BY`. [#3890](https://github.com/ClickHouse/ClickHouse/pull/3890) ([Winter Zhang](https://github.com/zhang2014)) -* Fixed `Illegal instruction` error when using base64 functions on old CPUs. This error has been reproduced only when ClickHouse was compiled with gcc-8. [#4275](https://github.com/ClickHouse/ClickHouse/pull/4275) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed `No message received` error when interacting with PostgreSQL ODBC Driver through TLS connection. Also fixes segfault when using MySQL ODBC Driver. [#4170](https://github.com/ClickHouse/ClickHouse/pull/4170) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed incorrect result when `Date` and `DateTime` arguments are used in branches of conditional operator (function `if`). Added generic case for function `if`. [#4243](https://github.com/ClickHouse/ClickHouse/pull/4243) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* ClickHouse dictionaries now load within `clickhouse` process. [#4166](https://github.com/ClickHouse/ClickHouse/pull/4166) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed deadlock when `SELECT` from a table with `File` engine was retried after `No such file or directory` error. [#4161](https://github.com/ClickHouse/ClickHouse/pull/4161) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed race condition when selecting from `system.tables` may give `table doesn't exist` error. [#4313](https://github.com/ClickHouse/ClickHouse/pull/4313) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* `clickhouse-client` can segfault on exit while loading data for command line suggestions if it was run in interactive mode. [#4317](https://github.com/ClickHouse/ClickHouse/pull/4317) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed a bug when the execution of mutations containing `IN` operators was producing incorrect results. [#4099](https://github.com/ClickHouse/ClickHouse/pull/4099) ([Alex Zatelepin](https://github.com/ztlpn)) -* Fixed error: if there is a database with `Dictionary` engine, all dictionaries forced to load at server startup, and if there is a dictionary with ClickHouse source from localhost, the dictionary cannot load. 
[#4255](https://github.com/ClickHouse/ClickHouse/pull/4255) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed error when system logs are tried to create again at server shutdown. [#4254](https://github.com/ClickHouse/ClickHouse/pull/4254) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Correctly return the right type and properly handle locks in `joinGet` function. [#4153](https://github.com/ClickHouse/ClickHouse/pull/4153) ([Amos Bird](https://github.com/amosbird)) -* Added `sumMapWithOverflow` function. [#4151](https://github.com/ClickHouse/ClickHouse/pull/4151) ([Léo Ercolanelli](https://github.com/ercolanelli-leo)) -* Fixed segfault with `allow_experimental_multiple_joins_emulation`. [52de2c](https://github.com/ClickHouse/ClickHouse/commit/52de2cd927f7b5257dd67e175f0a5560a48840d0) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed bug with incorrect `Date` and `DateTime` comparison. [#4237](https://github.com/ClickHouse/ClickHouse/pull/4237) ([valexey](https://github.com/valexey)) -* Fixed fuzz test under undefined behavior sanitizer: added parameter type check for `quantile*Weighted` family of functions. [#4145](https://github.com/ClickHouse/ClickHouse/pull/4145) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed rare race condition when removing of old data parts can fail with `File not found` error. [#4378](https://github.com/ClickHouse/ClickHouse/pull/4378) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix install package with missing /etc/clickhouse-server/config.xml. [#4343](https://github.com/ClickHouse/ClickHouse/pull/4343) ([proller](https://github.com/proller)) - - -#### Build/Testing/Packaging Improvements -* Debian package: correct /etc/clickhouse-server/preprocessed link according to config. [#4205](https://github.com/ClickHouse/ClickHouse/pull/4205) ([proller](https://github.com/proller)) -* Various build fixes for FreeBSD. [#4225](https://github.com/ClickHouse/ClickHouse/pull/4225) ([proller](https://github.com/proller)) -* Added ability to create, fill and drop tables in perftest. [#4220](https://github.com/ClickHouse/ClickHouse/pull/4220) ([alesapin](https://github.com/alesapin)) -* Added a script to check for duplicate includes. [#4326](https://github.com/ClickHouse/ClickHouse/pull/4326) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added ability to run queries by index in performance test. [#4264](https://github.com/ClickHouse/ClickHouse/pull/4264) ([alesapin](https://github.com/alesapin)) -* Package with debug symbols is suggested to be installed. [#4274](https://github.com/ClickHouse/ClickHouse/pull/4274) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Refactoring of performance-test. Better logging and signals handling. [#4171](https://github.com/ClickHouse/ClickHouse/pull/4171) ([alesapin](https://github.com/alesapin)) -* Added docs to anonymized Yandex.Metrika datasets. [#4164](https://github.com/ClickHouse/ClickHouse/pull/4164) ([alesapin](https://github.com/alesapin)) -* Аdded tool for converting an old month-partitioned part to the custom-partitioned format. [#4195](https://github.com/ClickHouse/ClickHouse/pull/4195) ([Alex Zatelepin](https://github.com/ztlpn)) -* Added docs about two datasets in s3. [#4144](https://github.com/ClickHouse/ClickHouse/pull/4144) ([alesapin](https://github.com/alesapin)) -* Added script which creates changelog from pull requests description. 
[#4169](https://github.com/ClickHouse/ClickHouse/pull/4169) [#4173](https://github.com/ClickHouse/ClickHouse/pull/4173) ([KochetovNicolai](https://github.com/KochetovNicolai)) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Added puppet module for Clickhouse. [#4182](https://github.com/ClickHouse/ClickHouse/pull/4182) ([Maxim Fedotov](https://github.com/MaxFedotov)) -* Added docs for a group of undocumented functions. [#4168](https://github.com/ClickHouse/ClickHouse/pull/4168) ([Winter Zhang](https://github.com/zhang2014)) -* ARM build fixes. [#4210](https://github.com/ClickHouse/ClickHouse/pull/4210)[#4306](https://github.com/ClickHouse/ClickHouse/pull/4306) [#4291](https://github.com/ClickHouse/ClickHouse/pull/4291) ([proller](https://github.com/proller)) ([proller](https://github.com/proller)) -* Dictionary tests now able to run from `ctest`. [#4189](https://github.com/ClickHouse/ClickHouse/pull/4189) ([proller](https://github.com/proller)) -* Now `/etc/ssl` is used as default directory with SSL certificates. [#4167](https://github.com/ClickHouse/ClickHouse/pull/4167) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added checking SSE and AVX instruction at start. [#4234](https://github.com/ClickHouse/ClickHouse/pull/4234) ([Igr](https://github.com/igron99)) -* Init script will wait server until start. [#4281](https://github.com/ClickHouse/ClickHouse/pull/4281) ([proller](https://github.com/proller)) - -#### Backward Incompatible Changes -* Removed `allow_experimental_low_cardinality_type` setting. `LowCardinality` data types are production ready. [#4323](https://github.com/ClickHouse/ClickHouse/pull/4323) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Reduce mark cache size and uncompressed cache size accordingly to available memory amount. [#4240](https://github.com/ClickHouse/ClickHouse/pull/4240) ([Lopatin Konstantin](https://github.com/k-lopatin) -* Added keyword `INDEX` in `CREATE TABLE` query. A column with name `index` must be quoted with backticks or double quotes: `` `index` ``. [#4143](https://github.com/ClickHouse/ClickHouse/pull/4143) ([Nikita Vasilev](https://github.com/nikvas0)) -* `sumMap` now promote result type instead of overflow. The old `sumMap` behavior can be obtained by using `sumMapWithOverflow` function. [#4151](https://github.com/ClickHouse/ClickHouse/pull/4151) ([Léo Ercolanelli](https://github.com/ercolanelli-leo)) - -#### Performance Improvements -* `std::sort` replaced by `pdqsort` for queries without `LIMIT`. [#4236](https://github.com/ClickHouse/ClickHouse/pull/4236) ([Evgenii Pravda](https://github.com/kvinty)) -* Now server reuse threads from global thread pool. This affects performance in some corner cases. [#4150](https://github.com/ClickHouse/ClickHouse/pull/4150) ([alexey-milovidov](https://github.com/alexey-milovidov)) - -#### Improvements -* Implemented AIO support for FreeBSD. [#4305](https://github.com/ClickHouse/ClickHouse/pull/4305) ([urgordeadbeef](https://github.com/urgordeadbeef)) -* `SELECT * FROM a JOIN b USING a, b` now return `a` and `b` columns only from the left table. [#4141](https://github.com/ClickHouse/ClickHouse/pull/4141) ([Artem Zuikov](https://github.com/4ertus2)) -* Allow `-C` option of client to work as `-c` option. [#4232](https://github.com/ClickHouse/ClickHouse/pull/4232) ([syominsergey](https://github.com/syominsergey)) -* Now option `--password` used without value requires password from stdin. 
[#4230](https://github.com/ClickHouse/ClickHouse/pull/4230) ([BSD_Conqueror](https://github.com/bsd-conqueror)) -* Added highlighting of unescaped metacharacters in string literals that contain `LIKE` expressions or regexps. [#4327](https://github.com/ClickHouse/ClickHouse/pull/4327) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added cancelling of HTTP read only queries if client socket goes away. [#4213](https://github.com/ClickHouse/ClickHouse/pull/4213) ([nvartolomei](https://github.com/nvartolomei)) -* Now server reports progress to keep client connections alive. [#4215](https://github.com/ClickHouse/ClickHouse/pull/4215) ([Ivan](https://github.com/abyss7)) -* Slightly better message with reason for OPTIMIZE query with `optimize_throw_if_noop` setting enabled. [#4294](https://github.com/ClickHouse/ClickHouse/pull/4294) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added support of `--version` option for clickhouse server. [#4251](https://github.com/ClickHouse/ClickHouse/pull/4251) ([Lopatin Konstantin](https://github.com/k-lopatin)) -* Added `--help/-h` option to `clickhouse-server`. [#4233](https://github.com/ClickHouse/ClickHouse/pull/4233) ([Yuriy Baranov](https://github.com/yurriy)) -* Added support for scalar subqueries with aggregate function state result. [#4348](https://github.com/ClickHouse/ClickHouse/pull/4348) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Improved server shutdown time and ALTERs waiting time. [#4372](https://github.com/ClickHouse/ClickHouse/pull/4372) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added info about the replicated_can_become_leader setting to system.replicas and add logging if the replica won't try to become leader. [#4379](https://github.com/ClickHouse/ClickHouse/pull/4379) ([Alex Zatelepin](https://github.com/ztlpn)) - - -## ClickHouse release 19.1 -### ClickHouse release 19.1.14, 2019-03-14 - -* Fixed error `Column ... queried more than once` that may happen if the setting `asterisk_left_columns_only` is set to 1 in case of using `GLOBAL JOIN` with `SELECT *` (rare case). The issue does not exist in 19.3 and newer. [6bac7d8d](https://github.com/ClickHouse/ClickHouse/pull/4692/commits/6bac7d8d11a9b0d6de0b32b53c47eb2f6f8e7062) ([Artem Zuikov](https://github.com/4ertus2)) - -### ClickHouse release 19.1.13, 2019-03-12 +#### Build/Testing/Packaging Improvements {#buildtestingpackaging-improvements-1} + +- Print UTF-8 characters properly in `clickhouse-test`. + [\#5084](https://github.com/ClickHouse/ClickHouse/pull/5084) + ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Add command line parameter for clickhouse-client to always load suggestion + data. [\#5102](https://github.com/ClickHouse/ClickHouse/pull/5102) + ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Resolve some of PVS-Studio warnings. + [\#5082](https://github.com/ClickHouse/ClickHouse/pull/5082) + ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Update LZ4 [\#5040](https://github.com/ClickHouse/ClickHouse/pull/5040) ([Danila + Kutenin](https://github.com/danlark1)) +- Add gperf to build requirements for upcoming pull request \#5030. 
+ [\#5110](https://github.com/ClickHouse/ClickHouse/pull/5110)
+ ([proller](https://github.com/proller))
+
+## ClickHouse release 19.6 {#clickhouse-release-19-6}
+
+### ClickHouse release 19.6.3.18, 2019-06-13 {#clickhouse-release-19-6-3-18-2019-06-13}
+
+#### Bug Fixes {#bug-fixes-3}
+
+- Fixed IN condition pushdown for queries from table functions `mysql` and `odbc` and corresponding table engines. This fixes \#3540 and \#2384. [\#5313](https://github.com/ClickHouse/ClickHouse/pull/5313) ([alexey-milovidov](https://github.com/alexey-milovidov))
+- Fix deadlock in ZooKeeper. [\#5297](https://github.com/ClickHouse/ClickHouse/pull/5297) ([github1youlc](https://github.com/github1youlc))
+- Allow quoted decimals in CSV. [\#5284](https://github.com/ClickHouse/ClickHouse/pull/5284) ([Artem Zuikov](https://github.com/4ertus2))
+- Disallow conversion from float Inf/NaN into Decimals (throw exception). [\#5282](https://github.com/ClickHouse/ClickHouse/pull/5282) ([Artem Zuikov](https://github.com/4ertus2))
+- Fix data race in rename query. [\#5247](https://github.com/ClickHouse/ClickHouse/pull/5247) ([Winter Zhang](https://github.com/zhang2014))
+- Temporarily disable LFAlloc. Usage of LFAlloc might lead to a lot of MAP\_FAILED when allocating UncompressedCache and, as a result, to crashes of queries on highly loaded servers. [cfdba93](https://github.com/ClickHouse/ClickHouse/commit/cfdba938ce22f16efeec504f7f90206a515b1280) ([Danila Kutenin](https://github.com/danlark1))
+
+### ClickHouse release 19.6.2.11, 2019-05-13 {#clickhouse-release-19-6-2-11-2019-05-13}
+
+#### New Features {#new-features-3}
+
+- TTL expressions for columns and tables (a usage sketch is shown at the end of this release's notes). [\#4212](https://github.com/ClickHouse/ClickHouse/pull/4212) ([Anton Popov](https://github.com/CurtizJ))
+- Added support for `brotli` compression for HTTP responses (Accept-Encoding: br). [\#4388](https://github.com/ClickHouse/ClickHouse/pull/4388) ([Mikhail](https://github.com/fandyushin))
+- Added new function `isValidUTF8` for checking whether a set of bytes is correctly UTF-8 encoded. [\#4934](https://github.com/ClickHouse/ClickHouse/pull/4934) ([Danila Kutenin](https://github.com/danlark1))
+- Add new load balancing policy `first_or_random` which sends queries to the first specified host and, if it is inaccessible, sends queries to random hosts of the shard. Useful for cross-replication topology setups. [\#5012](https://github.com/ClickHouse/ClickHouse/pull/5012) ([nvartolomei](https://github.com/nvartolomei))
+
+#### Experimental Features {#experimental-features-1}
+
+- Add setting `index_granularity_bytes` (adaptive index granularity) for the MergeTree\* table family. [\#4826](https://github.com/ClickHouse/ClickHouse/pull/4826) ([alesapin](https://github.com/alesapin))
+
+#### Improvements {#improvements-1}
+
+- Added support for non-constant and negative size and length arguments for function `substringUTF8`. [\#4989](https://github.com/ClickHouse/ClickHouse/pull/4989) ([alexey-milovidov](https://github.com/alexey-milovidov))
+- Disable push-down to the right table in left join, to the left table in right join, and to both tables in full join. This fixes wrong JOIN results in some cases. [\#4846](https://github.com/ClickHouse/ClickHouse/pull/4846) ([Ivan](https://github.com/abyss7))
+- `clickhouse-copier`: auto-upload task configuration from the `--task-file` option. [\#4876](https://github.com/ClickHouse/ClickHouse/pull/4876) ([proller](https://github.com/proller))
+- Added a typos handler for the storage factory and the table functions factory. [\#4891](https://github.com/ClickHouse/ClickHouse/pull/4891) ([Danila Kutenin](https://github.com/danlark1))
+- Support asterisks and qualified asterisks for multiple joins without subqueries. [\#4898](https://github.com/ClickHouse/ClickHouse/pull/4898) ([Artem Zuikov](https://github.com/4ertus2))
+- Make the missing column error message more user friendly. [\#4915](https://github.com/ClickHouse/ClickHouse/pull/4915) ([Artem Zuikov](https://github.com/4ertus2))
+
+#### Performance Improvements {#performance-improvements-2}
+
+- Significant speedup of ASOF JOIN. [\#4924](https://github.com/ClickHouse/ClickHouse/pull/4924) ([Martijn Bakker](https://github.com/Gladdy))
+
+#### Backward Incompatible Changes {#backward-incompatible-changes}
+
+- HTTP header `Query-Id` was renamed to `X-ClickHouse-Query-Id` for consistency. [\#4972](https://github.com/ClickHouse/ClickHouse/pull/4972) ([Mikhail](https://github.com/fandyushin))
+
+#### Bug Fixes {#bug-fixes-4}
+
+- Fixed potential null pointer dereference in `clickhouse-copier`. [\#4900](https://github.com/ClickHouse/ClickHouse/pull/4900) ([proller](https://github.com/proller))
+- Fixed error on query with JOIN + ARRAY JOIN. [\#4938](https://github.com/ClickHouse/ClickHouse/pull/4938) ([Artem Zuikov](https://github.com/4ertus2))
+- Fixed hanging on start of the server when a dictionary depends on another dictionary via a database with engine=Dictionary. [\#4962](https://github.com/ClickHouse/ClickHouse/pull/4962) ([Vitaly Baranov](https://github.com/vitlibar))
+- Partially fix distributed\_product\_mode = local. It is now possible to allow columns of local tables in where/having/order by/… via table aliases. An exception is thrown if the table does not have an alias. Accessing the columns without table aliases is not possible yet. [\#4986](https://github.com/ClickHouse/ClickHouse/pull/4986) ([Artem Zuikov](https://github.com/4ertus2))
+- Fix potentially wrong result for `SELECT DISTINCT` with `JOIN`. [\#5001](https://github.com/ClickHouse/ClickHouse/pull/5001) ([Artem Zuikov](https://github.com/4ertus2))
+- Fixed very rare data race condition that could happen when executing a query with UNION ALL involving at least two SELECTs from system.columns, system.tables, system.parts, system.parts\_tables or tables of Merge family and performing ALTER of columns of the related tables concurrently. [\#5189](https://github.com/ClickHouse/ClickHouse/pull/5189) ([alexey-milovidov](https://github.com/alexey-milovidov))
+
+#### Build/Testing/Packaging Improvements {#buildtestingpackaging-improvements-2}
+
+- Fixed test failures when running clickhouse-server on a different host. [\#4713](https://github.com/ClickHouse/ClickHouse/pull/4713) ([Vasily Nemkov](https://github.com/Enmk))
+- clickhouse-test: Disable color control sequences in a non-tty environment. [\#4937](https://github.com/ClickHouse/ClickHouse/pull/4937) ([alesapin](https://github.com/alesapin))
+- clickhouse-test: Allow using any test database (remove the `test.` qualification where possible). [\#5008](https://github.com/ClickHouse/ClickHouse/pull/5008) ([proller](https://github.com/proller))
+- Fix UBSan errors. [\#5037](https://github.com/ClickHouse/ClickHouse/pull/5037) ([Vitaly Baranov](https://github.com/vitlibar))
+- Yandex LFAlloc was added to ClickHouse to allocate MarkCache and UncompressedCache data in different ways to catch segfaults more reliably. [\#4995](https://github.com/ClickHouse/ClickHouse/pull/4995) ([Danila Kutenin](https://github.com/danlark1))
+- Python util to help with backports and changelogs. [\#4949](https://github.com/ClickHouse/ClickHouse/pull/4949) ([Ivan](https://github.com/abyss7))
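As a companion to the 19.6.2.11 notes above, here is a minimal sketch of the TTL syntax introduced in that release. The table and column names (`ttl_example`, `d`, `key`, `value`) are invented for illustration, and the exact grammar should be checked against the documentation for your version:

```sql
-- Hypothetical table demonstrating both column-level and table-level TTL.
-- The `value` column is reset to its default one week after `d`,
-- and whole rows are removed one month after `d` (applied during merges).
CREATE TABLE ttl_example
(
    d DateTime,
    key UInt64,
    value String TTL d + INTERVAL 1 WEEK
)
ENGINE = MergeTree
ORDER BY (key, d)
TTL d + INTERVAL 1 MONTH;
```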
+
+## ClickHouse release 19.5 {#clickhouse-release-19-5}
+
+### ClickHouse release 19.5.4.22, 2019-05-13 {#clickhouse-release-19-5-4-22-2019-05-13}
+
+#### Bug fixes {#bug-fixes-5}
+
+- Fixed possible crash in bitmap\* functions. [\#5220](https://github.com/ClickHouse/ClickHouse/pull/5220) [\#5228](https://github.com/ClickHouse/ClickHouse/pull/5228) ([Andy Yang](https://github.com/andyyzh))
+- Fixed very rare data race condition that could happen when executing a query with UNION ALL involving at least two SELECTs from system.columns, system.tables, system.parts, system.parts\_tables or tables of Merge family and performing ALTER of columns of the related tables concurrently. [\#5189](https://github.com/ClickHouse/ClickHouse/pull/5189) ([alexey-milovidov](https://github.com/alexey-milovidov))
+- Fixed error `Set for IN is not created yet in case of using single LowCardinality column in the left part of IN`. This error happened if the LowCardinality column was part of the primary key. \#5031 [\#5154](https://github.com/ClickHouse/ClickHouse/pull/5154) ([Nikolai Kochetov](https://github.com/KochetovNicolai))
+- Modification of the retention function: previously, if a row satisfied both the first and the Nth condition, only the first satisfied condition was added to the data state. Now all conditions that are satisfied in a row of data are added to the data state. [\#5119](https://github.com/ClickHouse/ClickHouse/pull/5119) ([小路](https://github.com/nicelulu))
+
+### ClickHouse release 19.5.3.8, 2019-04-18 {#clickhouse-release-19-5-3-8-2019-04-18}
+
+#### Bug fixes {#bug-fixes-6}
+
+- Fixed type of setting `max_partitions_per_insert_block` from boolean to UInt64. [\#5028](https://github.com/ClickHouse/ClickHouse/pull/5028) ([Mohammad Hossein Sekhavat](https://github.com/mhsekhavat))
+
+### ClickHouse release 19.5.2.6, 2019-04-15 {#clickhouse-release-19-5-2-6-2019-04-15}
+
+#### New Features {#new-features-4}
+
+- [Hyperscan](https://github.com/intel/hyperscan) multiple regular expression matching was added (functions `multiMatchAny`, `multiMatchAnyIndex`, `multiFuzzyMatchAny`, `multiFuzzyMatchAnyIndex`). [\#4780](https://github.com/ClickHouse/ClickHouse/pull/4780), [\#4841](https://github.com/ClickHouse/ClickHouse/pull/4841) ([Danila Kutenin](https://github.com/danlark1))
+- `multiSearchFirstPosition` function was added. [\#4780](https://github.com/ClickHouse/ClickHouse/pull/4780) ([Danila Kutenin](https://github.com/danlark1))
+- Implement the predefined per-row expression filter for tables. [\#4792](https://github.com/ClickHouse/ClickHouse/pull/4792) ([Ivan](https://github.com/abyss7))
+- A new type of data skipping indices based on bloom filters (can be used for `equal`, `in` and `like` functions). [\#4499](https://github.com/ClickHouse/ClickHouse/pull/4499) ([Nikita Vasilev](https://github.com/nikvas0))
+- Added `ASOF JOIN`, which allows running queries that join to the most recent value known (see the sketch after this list). [\#4774](https://github.com/ClickHouse/ClickHouse/pull/4774) [\#4867](https://github.com/ClickHouse/ClickHouse/pull/4867) [\#4863](https://github.com/ClickHouse/ClickHouse/pull/4863) [\#4875](https://github.com/ClickHouse/ClickHouse/pull/4875) ([Martijn Bakker](https://github.com/Gladdy), [Artem Zuikov](https://github.com/4ertus2))
+- Rewrite multiple `COMMA JOIN` to `CROSS JOIN`. Then rewrite them to `INNER JOIN` if possible. [\#4661](https://github.com/ClickHouse/ClickHouse/pull/4661) ([Artem Zuikov](https://github.com/4ertus2))
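To illustrate the `ASOF JOIN` feature listed above, here is a minimal sketch with invented table names (`trades` and `quotes`). It assumes the `USING` form in which the last listed column supplies the closest-match condition; the exact syntax supported by a given release should be verified against the documentation:

```sql
-- Hypothetical tables: trades(symbol, ts, volume) and quotes(symbol, ts, price).
-- For every trade, pick the quote with the same symbol and the closest ts
-- that is not greater than the trade's ts.
SELECT symbol, ts, volume, price
FROM trades
ASOF JOIN quotes USING (symbol, ts);
```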
+
+#### Improvement {#improvement-9}
+
+- `topK` and `topKWeighted` now support a custom `loadFactor` (fixes issue [\#4252](https://github.com/ClickHouse/ClickHouse/issues/4252)). [\#4634](https://github.com/ClickHouse/ClickHouse/pull/4634) ([Kirill Danshin](https://github.com/kirillDanshin))
+- Allow to use `parallel_replicas_count > 1` even for tables without sampling (the setting is simply ignored for them). In previous versions this led to an exception. [\#4637](https://github.com/ClickHouse/ClickHouse/pull/4637) ([Alexey Elymanov](https://github.com/digitalist))
+- Support for `CREATE OR REPLACE VIEW`. Allows creating a view or setting a new definition in a single statement. [\#4654](https://github.com/ClickHouse/ClickHouse/pull/4654) ([Boris Granveaud](https://github.com/bgranvea))
+- `Buffer` table engine now supports `PREWHERE`. [\#4671](https://github.com/ClickHouse/ClickHouse/pull/4671) ([Yangkuan Liu](https://github.com/LiuYangkuan))
+- Add ability to start a replicated table without metadata in ZooKeeper, in `readonly` mode. [\#4691](https://github.com/ClickHouse/ClickHouse/pull/4691) ([alesapin](https://github.com/alesapin))
+- Fixed flicker of the progress bar in clickhouse-client. The issue was most noticeable when using `FORMAT Null` with streaming queries. [\#4811](https://github.com/ClickHouse/ClickHouse/pull/4811) ([alexey-milovidov](https://github.com/alexey-milovidov))
+- Allow disabling functions with the `hyperscan` library on a per-user basis to limit potentially excessive and uncontrolled resource usage. [\#4816](https://github.com/ClickHouse/ClickHouse/pull/4816) ([alexey-milovidov](https://github.com/alexey-milovidov))
+- Add version number logging in all errors. [\#4824](https://github.com/ClickHouse/ClickHouse/pull/4824) ([proller](https://github.com/proller))
+- Added a restriction to the `multiMatch` functions which requires the string size to fit into `unsigned int`. Also added a limit on the number of arguments to the `multiSearch` functions. [\#4834](https://github.com/ClickHouse/ClickHouse/pull/4834) ([Danila Kutenin](https://github.com/danlark1))
+- Improved usage of scratch space and error handling in Hyperscan. [\#4866](https://github.com/ClickHouse/ClickHouse/pull/4866) ([Danila Kutenin](https://github.com/danlark1))
+- Fill `system.graphite_detentions` from a table config of `*GraphiteMergeTree` engine tables. [\#4584](https://github.com/ClickHouse/ClickHouse/pull/4584) ([Mikhail f. Shiryaev](https://github.com/Felixoid))
+- Rename `trigramDistance` function to `ngramDistance` and add more functions with `CaseInsensitive` and `UTF`. [\#4602](https://github.com/ClickHouse/ClickHouse/pull/4602) ([Danila Kutenin](https://github.com/danlark1))
+- Improved data skipping indices calculation. [\#4640](https://github.com/ClickHouse/ClickHouse/pull/4640) ([Nikita Vasilev](https://github.com/nikvas0))
+- Keep ordinary, `DEFAULT`, `MATERIALIZED` and `ALIAS` columns in a single list (fixes issue [\#2867](https://github.com/ClickHouse/ClickHouse/issues/2867)). [\#4707](https://github.com/ClickHouse/ClickHouse/pull/4707) ([Alex Zatelepin](https://github.com/ztlpn))
+
+#### Bug Fix {#bug-fix-26}
+
+- Avoid `std::terminate` in case of memory allocation failure. Now the `std::bad_alloc` exception is thrown as expected. [\#4665](https://github.com/ClickHouse/ClickHouse/pull/4665) ([alexey-milovidov](https://github.com/alexey-milovidov))
+- Fixes capnproto reading from buffer.
Sometimes files wasn’t loaded successfully by HTTP. [\#4674](https://github.com/ClickHouse/ClickHouse/pull/4674) ([Vladislav](https://github.com/smirnov-vs)) +- Fix error `Unknown log entry type: 0` after `OPTIMIZE TABLE FINAL` query. [\#4683](https://github.com/ClickHouse/ClickHouse/pull/4683) ([Amos Bird](https://github.com/amosbird)) +- Wrong arguments to `hasAny` or `hasAll` functions may lead to segfault. [\#4698](https://github.com/ClickHouse/ClickHouse/pull/4698) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Deadlock may happen while executing `DROP DATABASE dictionary` query. [\#4701](https://github.com/ClickHouse/ClickHouse/pull/4701) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix undefined behavior in `median` and `quantile` functions. [\#4702](https://github.com/ClickHouse/ClickHouse/pull/4702) ([hcz](https://github.com/hczhcz)) +- Fix compression level detection when `network_compression_method` in lowercase. Broken in v19.1. [\#4706](https://github.com/ClickHouse/ClickHouse/pull/4706) ([proller](https://github.com/proller)) +- Fixed ignorance of `UTC` setting (fixes issue [\#4658](https://github.com/ClickHouse/ClickHouse/issues/4658)). [\#4718](https://github.com/ClickHouse/ClickHouse/pull/4718) ([proller](https://github.com/proller)) +- Fix `histogram` function behaviour with `Distributed` tables. [\#4741](https://github.com/ClickHouse/ClickHouse/pull/4741) ([olegkv](https://github.com/olegkv)) +- Fixed tsan report `destroy of a locked mutex`. [\#4742](https://github.com/ClickHouse/ClickHouse/pull/4742) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed TSan report on shutdown due to race condition in system logs usage. Fixed potential use-after-free on shutdown when part\_log is enabled. [\#4758](https://github.com/ClickHouse/ClickHouse/pull/4758) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix recheck parts in `ReplicatedMergeTreeAlterThread` in case of error. [\#4772](https://github.com/ClickHouse/ClickHouse/pull/4772) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Arithmetic operations on intermediate aggregate function states were not working for constant arguments (such as subquery results). [\#4776](https://github.com/ClickHouse/ClickHouse/pull/4776) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Always backquote column names in metadata. Otherwise it’s impossible to create a table with column named `index` (server won’t restart due to malformed `ATTACH` query in metadata). [\#4782](https://github.com/ClickHouse/ClickHouse/pull/4782) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix crash in `ALTER ... MODIFY ORDER BY` on `Distributed` table. [\#4790](https://github.com/ClickHouse/ClickHouse/pull/4790) ([TCeason](https://github.com/TCeason)) +- Fix segfault in `JOIN ON` with enabled `enable_optimize_predicate_expression`. [\#4794](https://github.com/ClickHouse/ClickHouse/pull/4794) ([Winter Zhang](https://github.com/zhang2014)) +- Fix bug with adding an extraneous row after consuming a protobuf message from Kafka. [\#4808](https://github.com/ClickHouse/ClickHouse/pull/4808) ([Vitaly Baranov](https://github.com/vitlibar)) +- Fix crash of `JOIN` on not-nullable vs nullable column. Fix `NULLs` in right keys in `ANY JOIN` + `join_use_nulls`. [\#4815](https://github.com/ClickHouse/ClickHouse/pull/4815) ([Artem Zuikov](https://github.com/4ertus2)) +- Fix segmentation fault in `clickhouse-copier`. 
[\#4835](https://github.com/ClickHouse/ClickHouse/pull/4835) ([proller](https://github.com/proller)) +- Fixed race condition in `SELECT` from `system.tables` if the table is renamed or altered concurrently. [\#4836](https://github.com/ClickHouse/ClickHouse/pull/4836) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed data race when fetching data part that is already obsolete. [\#4839](https://github.com/ClickHouse/ClickHouse/pull/4839) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed rare data race that can happen during `RENAME` table of MergeTree family. [\#4844](https://github.com/ClickHouse/ClickHouse/pull/4844) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed segmentation fault in function `arrayIntersect`. Segmentation fault could happen if function was called with mixed constant and ordinary arguments. [\#4847](https://github.com/ClickHouse/ClickHouse/pull/4847) ([Lixiang Qian](https://github.com/fancyqlx)) +- Fixed reading from `Array(LowCardinality)` column in rare case when column contained a long sequence of empty arrays. [\#4850](https://github.com/ClickHouse/ClickHouse/pull/4850) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fix crash in `FULL/RIGHT JOIN` when we joining on nullable vs not nullable. [\#4855](https://github.com/ClickHouse/ClickHouse/pull/4855) ([Artem Zuikov](https://github.com/4ertus2)) +- Fix `No message received` exception while fetching parts between replicas. [\#4856](https://github.com/ClickHouse/ClickHouse/pull/4856) ([alesapin](https://github.com/alesapin)) +- Fixed `arrayIntersect` function wrong result in case of several repeated values in single array. [\#4871](https://github.com/ClickHouse/ClickHouse/pull/4871) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fix a race condition during concurrent `ALTER COLUMN` queries that could lead to a server crash (fixes issue [\#3421](https://github.com/ClickHouse/ClickHouse/issues/3421)). [\#4592](https://github.com/ClickHouse/ClickHouse/pull/4592) ([Alex Zatelepin](https://github.com/ztlpn)) +- Fix incorrect result in `FULL/RIGHT JOIN` with const column. [\#4723](https://github.com/ClickHouse/ClickHouse/pull/4723) ([Artem Zuikov](https://github.com/4ertus2)) +- Fix duplicates in `GLOBAL JOIN` with asterisk. [\#4705](https://github.com/ClickHouse/ClickHouse/pull/4705) ([Artem Zuikov](https://github.com/4ertus2)) +- Fix parameter deduction in `ALTER MODIFY` of column `CODEC` when column type is not specified. [\#4883](https://github.com/ClickHouse/ClickHouse/pull/4883) ([alesapin](https://github.com/alesapin)) +- Functions `cutQueryStringAndFragment()` and `queryStringAndFragment()` now works correctly when `URL` contains a fragment and no query. [\#4894](https://github.com/ClickHouse/ClickHouse/pull/4894) ([Vitaly Baranov](https://github.com/vitlibar)) +- Fix rare bug when setting `min_bytes_to_use_direct_io` is greater than zero, which occures when thread have to seek backward in column file. [\#4897](https://github.com/ClickHouse/ClickHouse/pull/4897) ([alesapin](https://github.com/alesapin)) +- Fix wrong argument types for aggregate functions with `LowCardinality` arguments (fixes issue [\#4919](https://github.com/ClickHouse/ClickHouse/issues/4919)). [\#4922](https://github.com/ClickHouse/ClickHouse/pull/4922) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fix wrong name qualification in `GLOBAL JOIN`. 
[\#4969](https://github.com/ClickHouse/ClickHouse/pull/4969) ([Artem Zuikov](https://github.com/4ertus2)) +- Fix function `toISOWeek` result for year 1970. [\#4988](https://github.com/ClickHouse/ClickHouse/pull/4988) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix `DROP`, `TRUNCATE` and `OPTIMIZE` queries duplication, when executed on `ON CLUSTER` for `ReplicatedMergeTree*` tables family. [\#4991](https://github.com/ClickHouse/ClickHouse/pull/4991) ([alesapin](https://github.com/alesapin)) + +#### Backward Incompatible Change {#backward-incompatible-change-8} + +- Rename setting `insert_sample_with_metadata` to setting `input_format_defaults_for_omitted_fields`. [\#4771](https://github.com/ClickHouse/ClickHouse/pull/4771) ([Artem Zuikov](https://github.com/4ertus2)) +- Added setting `max_partitions_per_insert_block` (with value 100 by default). If inserted block contains larger number of partitions, an exception is thrown. Set it to 0 if you want to remove the limit (not recommended). [\#4845](https://github.com/ClickHouse/ClickHouse/pull/4845) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Multi-search functions were renamed (`multiPosition` to `multiSearchAllPositions`, `multiSearch` to `multiSearchAny`, `firstMatch` to `multiSearchFirstIndex`). [\#4780](https://github.com/ClickHouse/ClickHouse/pull/4780) ([Danila Kutenin](https://github.com/danlark1)) + +#### Performance Improvement {#performance-improvement-6} + +- Optimize Volnitsky searcher by inlining, giving about 5-10% search improvement for queries with many needles or many similar bigrams. [\#4862](https://github.com/ClickHouse/ClickHouse/pull/4862) ([Danila Kutenin](https://github.com/danlark1)) +- Fix performance issue when setting `use_uncompressed_cache` is greater than zero, which appeared when all read data contained in cache. [\#4913](https://github.com/ClickHouse/ClickHouse/pull/4913) ([alesapin](https://github.com/alesapin)) + +#### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-10} + +- Hardening debug build: more granular memory mappings and ASLR; add memory protection for mark cache and index. This allows to find more memory stomping bugs in case when ASan and MSan cannot do it. [\#4632](https://github.com/ClickHouse/ClickHouse/pull/4632) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Add support for cmake variables `ENABLE_PROTOBUF`, `ENABLE_PARQUET` and `ENABLE_BROTLI` which allows to enable/disable the above features (same as we can do for librdkafka, mysql, etc). [\#4669](https://github.com/ClickHouse/ClickHouse/pull/4669) ([Silviu Caragea](https://github.com/silviucpp)) +- Add ability to print process list and stacktraces of all threads if some queries are hung after test run. [\#4675](https://github.com/ClickHouse/ClickHouse/pull/4675) ([alesapin](https://github.com/alesapin)) +- Add retries on `Connection loss` error in `clickhouse-test`. [\#4682](https://github.com/ClickHouse/ClickHouse/pull/4682) ([alesapin](https://github.com/alesapin)) +- Add freebsd build with vagrant and build with thread sanitizer to packager script. [\#4712](https://github.com/ClickHouse/ClickHouse/pull/4712) [\#4748](https://github.com/ClickHouse/ClickHouse/pull/4748) ([alesapin](https://github.com/alesapin)) +- Now user asked for password for user `'default'` during installation. [\#4725](https://github.com/ClickHouse/ClickHouse/pull/4725) ([proller](https://github.com/proller)) +- Suppress warning in `rdkafka` library. 
[\#4740](https://github.com/ClickHouse/ClickHouse/pull/4740) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Allow ability to build without ssl. [\#4750](https://github.com/ClickHouse/ClickHouse/pull/4750) ([proller](https://github.com/proller)) +- Add a way to launch clickhouse-server image from a custom user. [\#4753](https://github.com/ClickHouse/ClickHouse/pull/4753) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) +- Upgrade contrib boost to 1.69. [\#4793](https://github.com/ClickHouse/ClickHouse/pull/4793) ([proller](https://github.com/proller)) +- Disable usage of `mremap` when compiled with Thread Sanitizer. Surprisingly enough, TSan does not intercept `mremap` (though it does intercept `mmap`, `munmap`) that leads to false positives. Fixed TSan report in stateful tests. [\#4859](https://github.com/ClickHouse/ClickHouse/pull/4859) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Add test checking using format schema via HTTP interface. [\#4864](https://github.com/ClickHouse/ClickHouse/pull/4864) ([Vitaly Baranov](https://github.com/vitlibar)) + +## ClickHouse release 19.4 {#clickhouse-release-19-4} + +### ClickHouse release 19.4.4.33, 2019-04-17 {#clickhouse-release-19-4-4-33-2019-04-17} + +#### Bug Fixes {#bug-fixes-7} + +- Avoid `std::terminate` in case of memory allocation failure. Now `std::bad_alloc` exception is thrown as expected. [\#4665](https://github.com/ClickHouse/ClickHouse/pull/4665) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixes capnproto reading from buffer. Sometimes files wasn’t loaded successfully by HTTP. [\#4674](https://github.com/ClickHouse/ClickHouse/pull/4674) ([Vladislav](https://github.com/smirnov-vs)) +- Fix error `Unknown log entry type: 0` after `OPTIMIZE TABLE FINAL` query. [\#4683](https://github.com/ClickHouse/ClickHouse/pull/4683) ([Amos Bird](https://github.com/amosbird)) +- Wrong arguments to `hasAny` or `hasAll` functions may lead to segfault. [\#4698](https://github.com/ClickHouse/ClickHouse/pull/4698) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Deadlock may happen while executing `DROP DATABASE dictionary` query. [\#4701](https://github.com/ClickHouse/ClickHouse/pull/4701) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix undefined behavior in `median` and `quantile` functions. [\#4702](https://github.com/ClickHouse/ClickHouse/pull/4702) ([hcz](https://github.com/hczhcz)) +- Fix compression level detection when `network_compression_method` in lowercase. Broken in v19.1. [\#4706](https://github.com/ClickHouse/ClickHouse/pull/4706) ([proller](https://github.com/proller)) +- Fixed ignorance of `UTC` setting (fixes issue [\#4658](https://github.com/ClickHouse/ClickHouse/issues/4658)). [\#4718](https://github.com/ClickHouse/ClickHouse/pull/4718) ([proller](https://github.com/proller)) +- Fix `histogram` function behaviour with `Distributed` tables. [\#4741](https://github.com/ClickHouse/ClickHouse/pull/4741) ([olegkv](https://github.com/olegkv)) +- Fixed tsan report `destroy of a locked mutex`. [\#4742](https://github.com/ClickHouse/ClickHouse/pull/4742) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed TSan report on shutdown due to race condition in system logs usage. Fixed potential use-after-free on shutdown when part\_log is enabled. [\#4758](https://github.com/ClickHouse/ClickHouse/pull/4758) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix recheck parts in `ReplicatedMergeTreeAlterThread` in case of error. 
[\#4772](https://github.com/ClickHouse/ClickHouse/pull/4772) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Arithmetic operations on intermediate aggregate function states were not working for constant arguments (such as subquery results). [\#4776](https://github.com/ClickHouse/ClickHouse/pull/4776) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Always backquote column names in metadata. Otherwise it’s impossible to create a table with column named `index` (server won’t restart due to malformed `ATTACH` query in metadata). [\#4782](https://github.com/ClickHouse/ClickHouse/pull/4782) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix crash in `ALTER ... MODIFY ORDER BY` on `Distributed` table. [\#4790](https://github.com/ClickHouse/ClickHouse/pull/4790) ([TCeason](https://github.com/TCeason)) +- Fix segfault in `JOIN ON` with enabled `enable_optimize_predicate_expression`. [\#4794](https://github.com/ClickHouse/ClickHouse/pull/4794) ([Winter Zhang](https://github.com/zhang2014)) +- Fix bug with adding an extraneous row after consuming a protobuf message from Kafka. [\#4808](https://github.com/ClickHouse/ClickHouse/pull/4808) ([Vitaly Baranov](https://github.com/vitlibar)) +- Fix segmentation fault in `clickhouse-copier`. [\#4835](https://github.com/ClickHouse/ClickHouse/pull/4835) ([proller](https://github.com/proller)) +- Fixed race condition in `SELECT` from `system.tables` if the table is renamed or altered concurrently. [\#4836](https://github.com/ClickHouse/ClickHouse/pull/4836) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed data race when fetching data part that is already obsolete. [\#4839](https://github.com/ClickHouse/ClickHouse/pull/4839) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed rare data race that can happen during `RENAME` table of MergeTree family. [\#4844](https://github.com/ClickHouse/ClickHouse/pull/4844) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed segmentation fault in function `arrayIntersect`. Segmentation fault could happen if function was called with mixed constant and ordinary arguments. [\#4847](https://github.com/ClickHouse/ClickHouse/pull/4847) ([Lixiang Qian](https://github.com/fancyqlx)) +- Fixed reading from `Array(LowCardinality)` column in rare case when column contained a long sequence of empty arrays. [\#4850](https://github.com/ClickHouse/ClickHouse/pull/4850) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fix `No message received` exception while fetching parts between replicas. [\#4856](https://github.com/ClickHouse/ClickHouse/pull/4856) ([alesapin](https://github.com/alesapin)) +- Fixed `arrayIntersect` function wrong result in case of several repeated values in single array. [\#4871](https://github.com/ClickHouse/ClickHouse/pull/4871) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fix a race condition during concurrent `ALTER COLUMN` queries that could lead to a server crash (fixes issue [\#3421](https://github.com/ClickHouse/ClickHouse/issues/3421)). [\#4592](https://github.com/ClickHouse/ClickHouse/pull/4592) ([Alex Zatelepin](https://github.com/ztlpn)) +- Fix parameter deduction in `ALTER MODIFY` of column `CODEC` when column type is not specified. [\#4883](https://github.com/ClickHouse/ClickHouse/pull/4883) ([alesapin](https://github.com/alesapin)) +- Functions `cutQueryStringAndFragment()` and `queryStringAndFragment()` now works correctly when `URL` contains a fragment and no query. 
[\#4894](https://github.com/ClickHouse/ClickHouse/pull/4894) ([Vitaly Baranov](https://github.com/vitlibar)) +- Fix rare bug when setting `min_bytes_to_use_direct_io` is greater than zero, which occures when thread have to seek backward in column file. [\#4897](https://github.com/ClickHouse/ClickHouse/pull/4897) ([alesapin](https://github.com/alesapin)) +- Fix wrong argument types for aggregate functions with `LowCardinality` arguments (fixes issue [\#4919](https://github.com/ClickHouse/ClickHouse/issues/4919)). [\#4922](https://github.com/ClickHouse/ClickHouse/pull/4922) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fix function `toISOWeek` result for year 1970. [\#4988](https://github.com/ClickHouse/ClickHouse/pull/4988) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix `DROP`, `TRUNCATE` and `OPTIMIZE` queries duplication, when executed on `ON CLUSTER` for `ReplicatedMergeTree*` tables family. [\#4991](https://github.com/ClickHouse/ClickHouse/pull/4991) ([alesapin](https://github.com/alesapin)) + +#### Improvements {#improvements-2} + +- Keep ordinary, `DEFAULT`, `MATERIALIZED` and `ALIAS` columns in a single list (fixes issue [\#2867](https://github.com/ClickHouse/ClickHouse/issues/2867)). [\#4707](https://github.com/ClickHouse/ClickHouse/pull/4707) ([Alex Zatelepin](https://github.com/ztlpn)) + +### ClickHouse release 19.4.3.11, 2019-04-02 {#clickhouse-release-19-4-3-11-2019-04-02} + +#### Bug Fixes {#bug-fixes-8} + +- Fix crash in `FULL/RIGHT JOIN` when we joining on nullable vs not nullable. [\#4855](https://github.com/ClickHouse/ClickHouse/pull/4855) ([Artem Zuikov](https://github.com/4ertus2)) +- Fix segmentation fault in `clickhouse-copier`. [\#4835](https://github.com/ClickHouse/ClickHouse/pull/4835) ([proller](https://github.com/proller)) + +#### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-11} + +- Add a way to launch clickhouse-server image from a custom user. [\#4753](https://github.com/ClickHouse/ClickHouse/pull/4753) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) + +### ClickHouse release 19.4.2.7, 2019-03-30 {#clickhouse-release-19-4-2-7-2019-03-30} + +#### Bug Fixes {#bug-fixes-9} + +- Fixed reading from `Array(LowCardinality)` column in rare case when column contained a long sequence of empty arrays. [\#4850](https://github.com/ClickHouse/ClickHouse/pull/4850) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) + +### ClickHouse release 19.4.1.3, 2019-03-19 {#clickhouse-release-19-4-1-3-2019-03-19} + +#### Bug Fixes {#bug-fixes-10} + +- Fixed remote queries which contain both `LIMIT BY` and `LIMIT`. Previously, if `LIMIT BY` and `LIMIT` were used for remote query, `LIMIT` could happen before `LIMIT BY`, which led to too filtered result. [\#4708](https://github.com/ClickHouse/ClickHouse/pull/4708) ([Constantin S. Pan](https://github.com/kvap)) + +### ClickHouse release 19.4.0.49, 2019-03-09 {#clickhouse-release-19-4-0-49-2019-03-09} + +#### New Features {#new-features-5} + +- Added full support for `Protobuf` format (input and output, nested data structures). [\#4174](https://github.com/ClickHouse/ClickHouse/pull/4174) [\#4493](https://github.com/ClickHouse/ClickHouse/pull/4493) ([Vitaly Baranov](https://github.com/vitlibar)) +- Added bitmap functions with Roaring Bitmaps. 
[\#4207](https://github.com/ClickHouse/ClickHouse/pull/4207) ([Andy Yang](https://github.com/andyyzh)) [\#4568](https://github.com/ClickHouse/ClickHouse/pull/4568) ([Vitaly Baranov](https://github.com/vitlibar)) +- Parquet format support. [\#4448](https://github.com/ClickHouse/ClickHouse/pull/4448) ([proller](https://github.com/proller)) +- N-gram distance was added for fuzzy string comparison. It is similar to q-gram metrics in R language. [\#4466](https://github.com/ClickHouse/ClickHouse/pull/4466) ([Danila Kutenin](https://github.com/danlark1)) +- Combine rules for graphite rollup from dedicated aggregation and retention patterns. [\#4426](https://github.com/ClickHouse/ClickHouse/pull/4426) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) +- Added `max_execution_speed` and `max_execution_speed_bytes` to limit resource usage. Added `min_execution_speed_bytes` setting to complement the `min_execution_speed`. [\#4430](https://github.com/ClickHouse/ClickHouse/pull/4430) ([Winter Zhang](https://github.com/zhang2014)) +- Implemented function `flatten`. [\#4555](https://github.com/ClickHouse/ClickHouse/pull/4555) [\#4409](https://github.com/ClickHouse/ClickHouse/pull/4409) ([alexey-milovidov](https://github.com/alexey-milovidov), [kzon](https://github.com/kzon)) +- Added functions `arrayEnumerateDenseRanked` and `arrayEnumerateUniqRanked` (it’s like `arrayEnumerateUniq` but allows to fine tune array depth to look inside multidimensional arrays). [\#4475](https://github.com/ClickHouse/ClickHouse/pull/4475) ([proller](https://github.com/proller)) [\#4601](https://github.com/ClickHouse/ClickHouse/pull/4601) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Multiple JOINS with some restrictions: no asterisks, no complex aliases in ON/WHERE/GROUP BY/… [\#4462](https://github.com/ClickHouse/ClickHouse/pull/4462) ([Artem Zuikov](https://github.com/4ertus2)) + +#### Bug Fixes {#bug-fixes-11} + +- This release also contains all bug fixes from 19.3 and 19.1. +- Fixed bug in data skipping indices: order of granules after INSERT was incorrect. [\#4407](https://github.com/ClickHouse/ClickHouse/pull/4407) ([Nikita Vasilev](https://github.com/nikvas0)) +- Fixed `set` index for `Nullable` and `LowCardinality` columns. Before it, `set` index with `Nullable` or `LowCardinality` column led to error `Data type must be deserialized with multiple streams` while selecting. [\#4594](https://github.com/ClickHouse/ClickHouse/pull/4594) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Correctly set update\_time on full `executable` dictionary update. [\#4551](https://github.com/ClickHouse/ClickHouse/pull/4551) ([Tema Novikov](https://github.com/temoon)) +- Fix broken progress bar in 19.3. [\#4627](https://github.com/ClickHouse/ClickHouse/pull/4627) ([filimonov](https://github.com/filimonov)) +- Fixed inconsistent values of MemoryTracker when memory region was shrinked, in certain cases. [\#4619](https://github.com/ClickHouse/ClickHouse/pull/4619) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed undefined behaviour in ThreadPool. [\#4612](https://github.com/ClickHouse/ClickHouse/pull/4612) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed a very rare crash with the message `mutex lock failed: Invalid argument` that could happen when a MergeTree table was dropped concurrently with a SELECT. [\#4608](https://github.com/ClickHouse/ClickHouse/pull/4608) ([Alex Zatelepin](https://github.com/ztlpn)) +- ODBC driver compatibility with `LowCardinality` data type. 
[\#4381](https://github.com/ClickHouse/ClickHouse/pull/4381) ([proller](https://github.com/proller)) +- FreeBSD: Fixup for `AIOcontextPool: Found io_event with unknown id 0` error. [\#4438](https://github.com/ClickHouse/ClickHouse/pull/4438) ([urgordeadbeef](https://github.com/urgordeadbeef)) +- `system.part_log` table was created regardless to configuration. [\#4483](https://github.com/ClickHouse/ClickHouse/pull/4483) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix undefined behaviour in `dictIsIn` function for cache dictionaries. [\#4515](https://github.com/ClickHouse/ClickHouse/pull/4515) ([alesapin](https://github.com/alesapin)) +- Fixed a deadlock when a SELECT query locks the same table multiple times (e.g. from different threads or when executing multiple subqueries) and there is a concurrent DDL query. [\#4535](https://github.com/ClickHouse/ClickHouse/pull/4535) ([Alex Zatelepin](https://github.com/ztlpn)) +- Disable compile\_expressions by default until we get own `llvm` contrib and can test it with `clang` and `asan`. [\#4579](https://github.com/ClickHouse/ClickHouse/pull/4579) ([alesapin](https://github.com/alesapin)) +- Prevent `std::terminate` when `invalidate_query` for `clickhouse` external dictionary source has returned wrong resultset (empty or more than one row or more than one column). Fixed issue when the `invalidate_query` was performed every five seconds regardless to the `lifetime`. [\#4583](https://github.com/ClickHouse/ClickHouse/pull/4583) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Avoid deadlock when the `invalidate_query` for a dictionary with `clickhouse` source was involving `system.dictionaries` table or `Dictionaries` database (rare case). [\#4599](https://github.com/ClickHouse/ClickHouse/pull/4599) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixes for CROSS JOIN with empty WHERE. [\#4598](https://github.com/ClickHouse/ClickHouse/pull/4598) ([Artem Zuikov](https://github.com/4ertus2)) +- Fixed segfault in function “replicate” when constant argument is passed. [\#4603](https://github.com/ClickHouse/ClickHouse/pull/4603) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix lambda function with predicate optimizer. [\#4408](https://github.com/ClickHouse/ClickHouse/pull/4408) ([Winter Zhang](https://github.com/zhang2014)) +- Multiple JOINs multiple fixes. [\#4595](https://github.com/ClickHouse/ClickHouse/pull/4595) ([Artem Zuikov](https://github.com/4ertus2)) + +#### Improvements {#improvements-3} + +- Support aliases in JOIN ON section for right table columns. [\#4412](https://github.com/ClickHouse/ClickHouse/pull/4412) ([Artem Zuikov](https://github.com/4ertus2)) +- Result of multiple JOINs need correct result names to be used in subselects. Replace flat aliases with source names in result. [\#4474](https://github.com/ClickHouse/ClickHouse/pull/4474) ([Artem Zuikov](https://github.com/4ertus2)) +- Improve push-down logic for joined statements. [\#4387](https://github.com/ClickHouse/ClickHouse/pull/4387) ([Ivan](https://github.com/abyss7)) + +#### Performance Improvements {#performance-improvements-3} + +- Improved heuristics of “move to PREWHERE” optimization. [\#4405](https://github.com/ClickHouse/ClickHouse/pull/4405) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Use proper lookup tables that uses HashTable’s API for 8-bit and 16-bit keys. 
[\#4536](https://github.com/ClickHouse/ClickHouse/pull/4536) ([Amos Bird](https://github.com/amosbird)) +- Improved performance of string comparison. [\#4564](https://github.com/ClickHouse/ClickHouse/pull/4564) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Cleanup distributed DDL queue in a separate thread so that it doesn’t slow down the main loop that processes distributed DDL tasks. [\#4502](https://github.com/ClickHouse/ClickHouse/pull/4502) ([Alex Zatelepin](https://github.com/ztlpn)) +- When `min_bytes_to_use_direct_io` is set to 1, not every file was opened with O\_DIRECT mode because the data size to read was sometimes underestimated by the size of one compressed block. [\#4526](https://github.com/ClickHouse/ClickHouse/pull/4526) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +#### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-12} + +- Added support for clang-9 [\#4604](https://github.com/ClickHouse/ClickHouse/pull/4604) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix wrong `__asm__` instructions (again) [\#4621](https://github.com/ClickHouse/ClickHouse/pull/4621) ([Konstantin Podshumok](https://github.com/podshumok)) +- Add ability to specify settings for `clickhouse-performance-test` from command line. [\#4437](https://github.com/ClickHouse/ClickHouse/pull/4437) ([alesapin](https://github.com/alesapin)) +- Add dictionaries tests to integration tests. [\#4477](https://github.com/ClickHouse/ClickHouse/pull/4477) ([alesapin](https://github.com/alesapin)) +- Added queries from the benchmark on the website to automated performance tests. [\#4496](https://github.com/ClickHouse/ClickHouse/pull/4496) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- `xxhash.h` does not exist in external lz4 because it is an implementation detail and its symbols are namespaced with `XXH_NAMESPACE` macro. When lz4 is external, xxHash has to be external too, and the dependents have to link to it. [\#4495](https://github.com/ClickHouse/ClickHouse/pull/4495) ([Orivej Desh](https://github.com/orivej)) +- Fixed a case when `quantileTiming` aggregate function can be called with negative or floating point argument (this fixes fuzz test with undefined behaviour sanitizer). [\#4506](https://github.com/ClickHouse/ClickHouse/pull/4506) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Spelling error correction. [\#4531](https://github.com/ClickHouse/ClickHouse/pull/4531) ([sdk2](https://github.com/sdk2)) +- Fix compilation on Mac. [\#4371](https://github.com/ClickHouse/ClickHouse/pull/4371) ([Vitaly Baranov](https://github.com/vitlibar)) +- Build fixes for FreeBSD and various unusual build configurations. [\#4444](https://github.com/ClickHouse/ClickHouse/pull/4444) ([proller](https://github.com/proller)) + +## ClickHouse release 19.3 {#clickhouse-release-19-3} + +### ClickHouse release 19.3.9.1, 2019-04-02 {#clickhouse-release-19-3-9-1-2019-04-02} + +#### Bug Fixes {#bug-fixes-12} + +- Fix crash in `FULL/RIGHT JOIN` when we joining on nullable vs not nullable. [\#4855](https://github.com/ClickHouse/ClickHouse/pull/4855) ([Artem Zuikov](https://github.com/4ertus2)) +- Fix segmentation fault in `clickhouse-copier`. [\#4835](https://github.com/ClickHouse/ClickHouse/pull/4835) ([proller](https://github.com/proller)) +- Fixed reading from `Array(LowCardinality)` column in rare case when column contained a long sequence of empty arrays. 
[\#4850](https://github.com/ClickHouse/ClickHouse/pull/4850) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) + +#### Build/Testing/Packaging Improvement {#buildtestingpackaging-improvement-13} + +- Add a way to launch clickhouse-server image from a custom user [\#4753](https://github.com/ClickHouse/ClickHouse/pull/4753) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) + +### ClickHouse release 19.3.7, 2019-03-12 {#clickhouse-release-19-3-7-2019-03-12} + +#### Bug fixes {#bug-fixes-13} + +- Fixed error in \#3920. This error manifests itself as random cache corruption (messages `Unknown codec family code`, `Cannot seek through file`) and segfaults. This bug first appeared in version 19.1 and is present in versions up to 19.1.10 and 19.3.6. [\#4623](https://github.com/ClickHouse/ClickHouse/pull/4623) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +### ClickHouse release 19.3.6, 2019-03-02 {#clickhouse-release-19-3-6-2019-03-02} + +#### Bug fixes {#bug-fixes-14} + +- When there are more than 1000 threads in a thread pool, `std::terminate` may happen on thread exit. [Azat Khuzhin](https://github.com/azat) [\#4485](https://github.com/ClickHouse/ClickHouse/pull/4485) [\#4505](https://github.com/ClickHouse/ClickHouse/pull/4505) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Now it’s possible to create `ReplicatedMergeTree*` tables with comments on columns without defaults and tables with columns codecs without comments and defaults. Also fix comparison of codecs. [\#4523](https://github.com/ClickHouse/ClickHouse/pull/4523) ([alesapin](https://github.com/alesapin)) +- Fixed crash on JOIN with array or tuple. [\#4552](https://github.com/ClickHouse/ClickHouse/pull/4552) ([Artem Zuikov](https://github.com/4ertus2)) +- Fixed crash in clickhouse-copier with the message `ThreadStatus not created`. [\#4540](https://github.com/ClickHouse/ClickHouse/pull/4540) ([Artem Zuikov](https://github.com/4ertus2)) +- Fixed hangup on server shutdown if distributed DDLs were used. [\#4472](https://github.com/ClickHouse/ClickHouse/pull/4472) ([Alex Zatelepin](https://github.com/ztlpn)) +- Incorrect column numbers were printed in error message about text format parsing for columns with number greater than 10. [\#4484](https://github.com/ClickHouse/ClickHouse/pull/4484) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +#### Build/Testing/Packaging Improvements {#buildtestingpackaging-improvements-3} + +- Fixed build with AVX enabled. [\#4527](https://github.com/ClickHouse/ClickHouse/pull/4527) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Enable extended accounting and IO accounting based on good known version instead of kernel under which it is compiled. [\#4541](https://github.com/ClickHouse/ClickHouse/pull/4541) ([nvartolomei](https://github.com/nvartolomei)) +- Allow to skip setting of core\_dump.size\_limit, warning instead of throw if limit set fail. [\#4473](https://github.com/ClickHouse/ClickHouse/pull/4473) ([proller](https://github.com/proller)) +- Removed the `inline` tags of `void readBinary(...)` in `Field.cpp`. Also merged redundant `namespace DB` blocks. [\#4530](https://github.com/ClickHouse/ClickHouse/pull/4530) ([hcz](https://github.com/hczhcz)) + +### ClickHouse release 19.3.5, 2019-02-21 {#clickhouse-release-19-3-5-2019-02-21} + +#### Bug fixes {#bug-fixes-15} + +- Fixed bug with large http insert queries processing. 
[\#4454](https://github.com/ClickHouse/ClickHouse/pull/4454) ([alesapin](https://github.com/alesapin)) +- Fixed backward incompatibility with old versions due to wrong implementation of `send_logs_level` setting. [\#4445](https://github.com/ClickHouse/ClickHouse/pull/4445) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed backward incompatibility of table function `remote` introduced with column comments. [\#4446](https://github.com/ClickHouse/ClickHouse/pull/4446) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +### ClickHouse release 19.3.4, 2019-02-16 {#clickhouse-release-19-3-4-2019-02-16} + +#### Improvements {#improvements-4} + +- Table index size is not accounted for memory limits when doing `ATTACH TABLE` query. Avoided the possibility that a table cannot be attached after being detached. [\#4396](https://github.com/ClickHouse/ClickHouse/pull/4396) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Slightly raised up the limit on max string and array size received from ZooKeeper. It allows to continue to work with increased size of `CLIENT_JVMFLAGS=-Djute.maxbuffer=...` on ZooKeeper. [\#4398](https://github.com/ClickHouse/ClickHouse/pull/4398) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Allow to repair abandoned replica even if it already has huge number of nodes in its queue. [\#4399](https://github.com/ClickHouse/ClickHouse/pull/4399) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Add one required argument to `SET` index (max stored rows number). [\#4386](https://github.com/ClickHouse/ClickHouse/pull/4386) ([Nikita Vasilev](https://github.com/nikvas0)) + +#### Bug Fixes {#bug-fixes-16} + +- Fixed `WITH ROLLUP` result for group by single `LowCardinality` key. [\#4384](https://github.com/ClickHouse/ClickHouse/pull/4384) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Fixed bug in the set index (dropping a granule if it contains more than `max_rows` rows). [\#4386](https://github.com/ClickHouse/ClickHouse/pull/4386) ([Nikita Vasilev](https://github.com/nikvas0)) +- A lot of FreeBSD build fixes. [\#4397](https://github.com/ClickHouse/ClickHouse/pull/4397) ([proller](https://github.com/proller)) +- Fixed aliases substitution in queries with subquery containing same alias (issue [\#4110](https://github.com/ClickHouse/ClickHouse/issues/4110)). [\#4351](https://github.com/ClickHouse/ClickHouse/pull/4351) ([Artem Zuikov](https://github.com/4ertus2)) + +#### Build/Testing/Packaging Improvements {#buildtestingpackaging-improvements-4} + +- Add ability to run `clickhouse-server` for stateless tests in docker image. [\#4347](https://github.com/ClickHouse/ClickHouse/pull/4347) ([Vasily Nemkov](https://github.com/Enmk)) + +### ClickHouse release 19.3.3, 2019-02-13 {#clickhouse-release-19-3-3-2019-02-13} + +#### New Features {#new-features-6} + +- Added the `KILL MUTATION` statement that allows removing mutations that are for some reasons stuck. Added `latest_failed_part`, `latest_fail_time`, `latest_fail_reason` fields to the `system.mutations` table for easier troubleshooting. [\#4287](https://github.com/ClickHouse/ClickHouse/pull/4287) ([Alex Zatelepin](https://github.com/ztlpn)) +- Added aggregate function `entropy` which computes Shannon entropy. [\#4238](https://github.com/ClickHouse/ClickHouse/pull/4238) ([Quid37](https://github.com/Quid37)) +- Added ability to send queries `INSERT INTO tbl VALUES (....` to server without splitting on `query` and `data` parts. 
[\#4301](https://github.com/ClickHouse/ClickHouse/pull/4301) ([alesapin](https://github.com/alesapin)) +- Generic implementation of the `arrayWithConstant` function was added. [\#4322](https://github.com/ClickHouse/ClickHouse/pull/4322) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Implemented the `NOT BETWEEN` comparison operator. [\#4228](https://github.com/ClickHouse/ClickHouse/pull/4228) ([Dmitry Naumov](https://github.com/nezed)) +- Implemented `sumMapFiltered` to allow limiting the number of keys whose values are summed by `sumMap`. [\#4129](https://github.com/ClickHouse/ClickHouse/pull/4129) ([Léo Ercolanelli](https://github.com/ercolanelli-leo)) +- Added support for `Nullable` types in the `mysql` table function. [\#4198](https://github.com/ClickHouse/ClickHouse/pull/4198) ([Emmanuel Donin de Rosière](https://github.com/edonin)) +- Support for arbitrary constant expressions in the `LIMIT` clause. [\#4246](https://github.com/ClickHouse/ClickHouse/pull/4246) ([k3box](https://github.com/k3box)) +- Added the `topKWeighted` aggregate function that takes an additional (unsigned integer) weight argument. [\#4245](https://github.com/ClickHouse/ClickHouse/pull/4245) ([Andrew Golman](https://github.com/andrewgolman)) +- `StorageJoin` now supports the `join_any_take_last_row` setting that allows overwriting existing values of the same key. [\#3973](https://github.com/ClickHouse/ClickHouse/pull/3973) ([Amos Bird](https://github.com/amosbird)) +- Added function `toStartOfInterval`. [\#4304](https://github.com/ClickHouse/ClickHouse/pull/4304) ([Vitaly Baranov](https://github.com/vitlibar)) +- Added `RowBinaryWithNamesAndTypes` format. [\#4200](https://github.com/ClickHouse/ClickHouse/pull/4200) ([Oleg V. Kozlyuk](https://github.com/DarkWanderer)) +- Added `IPv4` and `IPv6` data types. More efficient implementations of `IPv*` functions. [\#3669](https://github.com/ClickHouse/ClickHouse/pull/3669) ([Vasily Nemkov](https://github.com/Enmk)) +- Added function `toStartOfTenMinutes()`. [\#4298](https://github.com/ClickHouse/ClickHouse/pull/4298) ([Vitaly Baranov](https://github.com/vitlibar)) +- Added `Protobuf` output format. [\#4005](https://github.com/ClickHouse/ClickHouse/pull/4005) [\#4158](https://github.com/ClickHouse/ClickHouse/pull/4158) ([Vitaly Baranov](https://github.com/vitlibar)) +- Added brotli support in the HTTP interface for data import (INSERTs). [\#4235](https://github.com/ClickHouse/ClickHouse/pull/4235) ([Mikhail](https://github.com/fandyushin)) +- Added hints when the user makes a typo in a function name or data type in the command line client. [\#4239](https://github.com/ClickHouse/ClickHouse/pull/4239) ([Danila Kutenin](https://github.com/danlark1)) +- Added `Query-Id` to the server’s HTTP response header. [\#4231](https://github.com/ClickHouse/ClickHouse/pull/4231) ([Mikhail](https://github.com/fandyushin)) + +#### Experimental features {#experimental-features-2} + +- Added `minmax` and `set` data skipping indices for the MergeTree family of table engines. [\#4143](https://github.com/ClickHouse/ClickHouse/pull/4143) ([Nikita Vasilev](https://github.com/nikvas0)) +- Added conversion of `CROSS JOIN` to `INNER JOIN` if possible. [\#4221](https://github.com/ClickHouse/ClickHouse/pull/4221) [\#4266](https://github.com/ClickHouse/ClickHouse/pull/4266) ([Artem Zuikov](https://github.com/4ertus2)) + +#### Bug Fixes {#bug-fixes-17} + +- Fixed `Not found column` for duplicate columns in `JOIN ON` section.
[\#4279](https://github.com/ClickHouse/ClickHouse/pull/4279) ([Artem Zuikov](https://github.com/4ertus2)) +- Make `START REPLICATED SENDS` command start replicated sends. [\#4229](https://github.com/ClickHouse/ClickHouse/pull/4229) ([nvartolomei](https://github.com/nvartolomei)) +- Fixed aggregate functions execution with `Array(LowCardinality)` arguments. [\#4055](https://github.com/ClickHouse/ClickHouse/pull/4055) ([KochetovNicolai](https://github.com/KochetovNicolai)) +- Fixed wrong behaviour when doing an `INSERT ... SELECT ... FROM file(...)` query and the file has `CSVWithNames` or `TSVWithNames` format and the first data row is missing. [\#4297](https://github.com/ClickHouse/ClickHouse/pull/4297) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed crash on dictionary reload if the dictionary is not available. This bug appeared in 19.1.6. [\#4188](https://github.com/ClickHouse/ClickHouse/pull/4188) ([proller](https://github.com/proller)) +- Fixed `ALL JOIN` with duplicates in the right table. [\#4184](https://github.com/ClickHouse/ClickHouse/pull/4184) ([Artem Zuikov](https://github.com/4ertus2)) +- Fixed segmentation fault with `use_uncompressed_cache=1` and exception with wrong uncompressed size. This bug appeared in 19.1.6. [\#4186](https://github.com/ClickHouse/ClickHouse/pull/4186) ([alesapin](https://github.com/alesapin)) +- Fixed `compile_expressions` bug with comparison of big (more than int16) dates. [\#4341](https://github.com/ClickHouse/ClickHouse/pull/4341) ([alesapin](https://github.com/alesapin)) +- Fixed infinite loop when selecting from table function `numbers(0)`. [\#4280](https://github.com/ClickHouse/ClickHouse/pull/4280) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Temporarily disable predicate optimization for `ORDER BY`. [\#3890](https://github.com/ClickHouse/ClickHouse/pull/3890) ([Winter Zhang](https://github.com/zhang2014)) +- Fixed `Illegal instruction` error when using base64 functions on old CPUs. This error has been reproduced only when ClickHouse was compiled with gcc-8. [\#4275](https://github.com/ClickHouse/ClickHouse/pull/4275) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed `No message received` error when interacting with PostgreSQL ODBC Driver through a TLS connection. Also fixes a segfault when using MySQL ODBC Driver. [\#4170](https://github.com/ClickHouse/ClickHouse/pull/4170) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed incorrect result when `Date` and `DateTime` arguments are used in branches of the conditional operator (function `if`). Added generic case for function `if`. [\#4243](https://github.com/ClickHouse/ClickHouse/pull/4243) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- ClickHouse dictionaries now load within the `clickhouse` process. [\#4166](https://github.com/ClickHouse/ClickHouse/pull/4166) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed deadlock when a `SELECT` from a table with `File` engine was retried after a `No such file or directory` error. [\#4161](https://github.com/ClickHouse/ClickHouse/pull/4161) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed a race condition where selecting from `system.tables` may give a `table doesn't exist` error. [\#4313](https://github.com/ClickHouse/ClickHouse/pull/4313) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- `clickhouse-client` can segfault on exit while loading data for command line suggestions if it was run in interactive mode.
[\#4317](https://github.com/ClickHouse/ClickHouse/pull/4317) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed a bug when the execution of mutations containing `IN` operators was producing incorrect results. [\#4099](https://github.com/ClickHouse/ClickHouse/pull/4099) ([Alex Zatelepin](https://github.com/ztlpn)) +- Fixed error: if there is a database with `Dictionary` engine, all dictionaries forced to load at server startup, and if there is a dictionary with ClickHouse source from localhost, the dictionary cannot load. [\#4255](https://github.com/ClickHouse/ClickHouse/pull/4255) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed error when system logs are tried to create again at server shutdown. [\#4254](https://github.com/ClickHouse/ClickHouse/pull/4254) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Correctly return the right type and properly handle locks in `joinGet` function. [\#4153](https://github.com/ClickHouse/ClickHouse/pull/4153) ([Amos Bird](https://github.com/amosbird)) +- Added `sumMapWithOverflow` function. [\#4151](https://github.com/ClickHouse/ClickHouse/pull/4151) ([Léo Ercolanelli](https://github.com/ercolanelli-leo)) +- Fixed segfault with `allow_experimental_multiple_joins_emulation`. [52de2c](https://github.com/ClickHouse/ClickHouse/commit/52de2cd927f7b5257dd67e175f0a5560a48840d0) ([Artem Zuikov](https://github.com/4ertus2)) +- Fixed bug with incorrect `Date` and `DateTime` comparison. [\#4237](https://github.com/ClickHouse/ClickHouse/pull/4237) ([valexey](https://github.com/valexey)) +- Fixed fuzz test under undefined behavior sanitizer: added parameter type check for `quantile*Weighted` family of functions. [\#4145](https://github.com/ClickHouse/ClickHouse/pull/4145) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed rare race condition when removing of old data parts can fail with `File not found` error. [\#4378](https://github.com/ClickHouse/ClickHouse/pull/4378) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix install package with missing /etc/clickhouse-server/config.xml. [\#4343](https://github.com/ClickHouse/ClickHouse/pull/4343) ([proller](https://github.com/proller)) + +#### Build/Testing/Packaging Improvements {#buildtestingpackaging-improvements-5} + +- Debian package: correct /etc/clickhouse-server/preprocessed link according to config. [\#4205](https://github.com/ClickHouse/ClickHouse/pull/4205) ([proller](https://github.com/proller)) +- Various build fixes for FreeBSD. [\#4225](https://github.com/ClickHouse/ClickHouse/pull/4225) ([proller](https://github.com/proller)) +- Added ability to create, fill and drop tables in perftest. [\#4220](https://github.com/ClickHouse/ClickHouse/pull/4220) ([alesapin](https://github.com/alesapin)) +- Added a script to check for duplicate includes. [\#4326](https://github.com/ClickHouse/ClickHouse/pull/4326) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added ability to run queries by index in performance test. [\#4264](https://github.com/ClickHouse/ClickHouse/pull/4264) ([alesapin](https://github.com/alesapin)) +- Package with debug symbols is suggested to be installed. [\#4274](https://github.com/ClickHouse/ClickHouse/pull/4274) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Refactoring of performance-test. Better logging and signals handling. [\#4171](https://github.com/ClickHouse/ClickHouse/pull/4171) ([alesapin](https://github.com/alesapin)) +- Added docs to anonymized Yandex.Metrika datasets. 
[\#4164](https://github.com/ClickHouse/ClickHouse/pull/4164) ([alesapin](https://github.com/alesapin)) +- Added a tool for converting an old month-partitioned part to the custom-partitioned format. [\#4195](https://github.com/ClickHouse/ClickHouse/pull/4195) ([Alex Zatelepin](https://github.com/ztlpn)) +- Added docs about two datasets in S3. [\#4144](https://github.com/ClickHouse/ClickHouse/pull/4144) ([alesapin](https://github.com/alesapin)) +- Added a script that creates a changelog from pull request descriptions. [\#4169](https://github.com/ClickHouse/ClickHouse/pull/4169) [\#4173](https://github.com/ClickHouse/ClickHouse/pull/4173) ([KochetovNicolai](https://github.com/KochetovNicolai)) ([KochetovNicolai](https://github.com/KochetovNicolai)) +- Added a Puppet module for ClickHouse. [\#4182](https://github.com/ClickHouse/ClickHouse/pull/4182) ([Maxim Fedotov](https://github.com/MaxFedotov)) +- Added docs for a group of undocumented functions. [\#4168](https://github.com/ClickHouse/ClickHouse/pull/4168) ([Winter Zhang](https://github.com/zhang2014)) +- ARM build fixes. [\#4210](https://github.com/ClickHouse/ClickHouse/pull/4210) [\#4306](https://github.com/ClickHouse/ClickHouse/pull/4306) [\#4291](https://github.com/ClickHouse/ClickHouse/pull/4291) ([proller](https://github.com/proller)) ([proller](https://github.com/proller)) +- Dictionary tests can now be run from `ctest`. [\#4189](https://github.com/ClickHouse/ClickHouse/pull/4189) ([proller](https://github.com/proller)) +- Now `/etc/ssl` is used as the default directory with SSL certificates. [\#4167](https://github.com/ClickHouse/ClickHouse/pull/4167) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added a check for SSE and AVX instructions at startup. [\#4234](https://github.com/ClickHouse/ClickHouse/pull/4234) ([Igr](https://github.com/igron99)) +- The init script now waits for the server to start. [\#4281](https://github.com/ClickHouse/ClickHouse/pull/4281) ([proller](https://github.com/proller)) + +#### Backward Incompatible Changes {#backward-incompatible-changes-1} + +- Removed `allow_experimental_low_cardinality_type` setting. `LowCardinality` data types are production ready. [\#4323](https://github.com/ClickHouse/ClickHouse/pull/4323) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Reduce mark cache size and uncompressed cache size according to the available memory amount. [\#4240](https://github.com/ClickHouse/ClickHouse/pull/4240) ([Lopatin Konstantin](https://github.com/k-lopatin)) +- Added keyword `INDEX` in `CREATE TABLE` query. A column named `index` must be quoted with backticks or double quotes: `` `index` ``. [\#4143](https://github.com/ClickHouse/ClickHouse/pull/4143) ([Nikita Vasilev](https://github.com/nikvas0)) +- `sumMap` now promotes the result type instead of overflowing. The old `sumMap` behavior can be obtained by using the `sumMapWithOverflow` function. [\#4151](https://github.com/ClickHouse/ClickHouse/pull/4151) ([Léo Ercolanelli](https://github.com/ercolanelli-leo)) + +#### Performance Improvements {#performance-improvements-4} + +- `std::sort` replaced by `pdqsort` for queries without `LIMIT`. [\#4236](https://github.com/ClickHouse/ClickHouse/pull/4236) ([Evgenii Pravda](https://github.com/kvinty)) +- The server now reuses threads from a global thread pool. This affects performance in some corner cases. [\#4150](https://github.com/ClickHouse/ClickHouse/pull/4150) ([alexey-milovidov](https://github.com/alexey-milovidov)) + +#### Improvements {#improvements-5} + +- Implemented AIO support for FreeBSD.
[\#4305](https://github.com/ClickHouse/ClickHouse/pull/4305) ([urgordeadbeef](https://github.com/urgordeadbeef)) +- `SELECT * FROM a JOIN b USING a, b` now return `a` and `b` columns only from the left table. [\#4141](https://github.com/ClickHouse/ClickHouse/pull/4141) ([Artem Zuikov](https://github.com/4ertus2)) +- Allow `-C` option of client to work as `-c` option. [\#4232](https://github.com/ClickHouse/ClickHouse/pull/4232) ([syominsergey](https://github.com/syominsergey)) +- Now option `--password` used without value requires password from stdin. [\#4230](https://github.com/ClickHouse/ClickHouse/pull/4230) ([BSD\_Conqueror](https://github.com/bsd-conqueror)) +- Added highlighting of unescaped metacharacters in string literals that contain `LIKE` expressions or regexps. [\#4327](https://github.com/ClickHouse/ClickHouse/pull/4327) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added cancelling of HTTP read only queries if client socket goes away. [\#4213](https://github.com/ClickHouse/ClickHouse/pull/4213) ([nvartolomei](https://github.com/nvartolomei)) +- Now server reports progress to keep client connections alive. [\#4215](https://github.com/ClickHouse/ClickHouse/pull/4215) ([Ivan](https://github.com/abyss7)) +- Slightly better message with reason for OPTIMIZE query with `optimize_throw_if_noop` setting enabled. [\#4294](https://github.com/ClickHouse/ClickHouse/pull/4294) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added support of `--version` option for clickhouse server. [\#4251](https://github.com/ClickHouse/ClickHouse/pull/4251) ([Lopatin Konstantin](https://github.com/k-lopatin)) +- Added `--help/-h` option to `clickhouse-server`. [\#4233](https://github.com/ClickHouse/ClickHouse/pull/4233) ([Yuriy Baranov](https://github.com/yurriy)) +- Added support for scalar subqueries with aggregate function state result. [\#4348](https://github.com/ClickHouse/ClickHouse/pull/4348) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +- Improved server shutdown time and ALTERs waiting time. [\#4372](https://github.com/ClickHouse/ClickHouse/pull/4372) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added info about the replicated\_can\_become\_leader setting to system.replicas and add logging if the replica won’t try to become leader. [\#4379](https://github.com/ClickHouse/ClickHouse/pull/4379) ([Alex Zatelepin](https://github.com/ztlpn)) + +## ClickHouse release 19.1 {#clickhouse-release-19-1} + +### ClickHouse release 19.1.14, 2019-03-14 {#clickhouse-release-19-1-14-2019-03-14} + +- Fixed error `Column ... queried more than once` that may happen if the setting `asterisk_left_columns_only` is set to 1 in case of using `GLOBAL JOIN` with `SELECT *` (rare case). The issue does not exist in 19.3 and newer. [6bac7d8d](https://github.com/ClickHouse/ClickHouse/pull/4692/commits/6bac7d8d11a9b0d6de0b32b53c47eb2f6f8e7062) ([Artem Zuikov](https://github.com/4ertus2)) + +### ClickHouse release 19.1.13, 2019-03-12 {#clickhouse-release-19-1-13-2019-03-12} This release contains exactly the same set of patches as 19.3.7. -### ClickHouse release 19.1.10, 2019-03-03 +### ClickHouse release 19.1.10, 2019-03-03 {#clickhouse-release-19-1-10-2019-03-03} This release contains exactly the same set of patches as 19.3.6. 
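A minimal sketch (hypothetical table and column names) illustrating the `JOIN ... USING` change listed under the 19.3.3 improvements above: with that change, `SELECT *` returns each `USING` column once, taken from the left table.

```sql
-- Hypothetical tables, for illustration only.
CREATE TABLE t_left  (a UInt32, b UInt32, x String) ENGINE = Memory;
CREATE TABLE t_right (a UInt32, b UInt32, y String) ENGINE = Memory;

-- Per the changelog entry above, the join keys `a` and `b` appear once in the
-- result (taken from the left table), so the result columns are: a, b, x, y.
SELECT * FROM t_left JOIN t_right USING (a, b);
```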
+## ClickHouse release 19.1 {#clickhouse-release-19-1-1} -## ClickHouse release 19.1 -### ClickHouse release 19.1.9, 2019-02-21 +### ClickHouse release 19.1.9, 2019-02-21 {#clickhouse-release-19-1-9-2019-02-21} -#### Bug fixes -* Fixed backward incompatibility with old versions due to wrong implementation of `send_logs_level` setting. [#4445](https://github.com/ClickHouse/ClickHouse/pull/4445) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed backward incompatibility of table function `remote` introduced with column comments. [#4446](https://github.com/ClickHouse/ClickHouse/pull/4446) ([alexey-milovidov](https://github.com/alexey-milovidov)) +#### Bug fixes {#bug-fixes-18} -### ClickHouse release 19.1.8, 2019-02-16 +- Fixed backward incompatibility with old versions due to wrong implementation of `send_logs_level` setting. [\#4445](https://github.com/ClickHouse/ClickHouse/pull/4445) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed backward incompatibility of table function `remote` introduced with column comments. [\#4446](https://github.com/ClickHouse/ClickHouse/pull/4446) ([alexey-milovidov](https://github.com/alexey-milovidov)) -#### Bug Fixes -* Fix install package with missing /etc/clickhouse-server/config.xml. [#4343](https://github.com/ClickHouse/ClickHouse/pull/4343) ([proller](https://github.com/proller)) +### ClickHouse release 19.1.8, 2019-02-16 {#clickhouse-release-19-1-8-2019-02-16} +#### Bug Fixes {#bug-fixes-19} -## ClickHouse release 19.1 -### ClickHouse release 19.1.7, 2019-02-15 +- Fix install package with missing /etc/clickhouse-server/config.xml. [\#4343](https://github.com/ClickHouse/ClickHouse/pull/4343) ([proller](https://github.com/proller)) -#### Bug Fixes -* Correctly return the right type and properly handle locks in `joinGet` function. [#4153](https://github.com/ClickHouse/ClickHouse/pull/4153) ([Amos Bird](https://github.com/amosbird)) -* Fixed error when system logs are tried to create again at server shutdown. [#4254](https://github.com/ClickHouse/ClickHouse/pull/4254) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed error: if there is a database with `Dictionary` engine, all dictionaries forced to load at server startup, and if there is a dictionary with ClickHouse source from localhost, the dictionary cannot load. [#4255](https://github.com/ClickHouse/ClickHouse/pull/4255) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed a bug when the execution of mutations containing `IN` operators was producing incorrect results. [#4099](https://github.com/ClickHouse/ClickHouse/pull/4099) ([Alex Zatelepin](https://github.com/ztlpn)) -* `clickhouse-client` can segfault on exit while loading data for command line suggestions if it was run in interactive mode. [#4317](https://github.com/ClickHouse/ClickHouse/pull/4317) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed race condition when selecting from `system.tables` may give `table doesn't exist` error. [#4313](https://github.com/ClickHouse/ClickHouse/pull/4313) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed deadlock when `SELECT` from a table with `File` engine was retried after `No such file or directory` error. [#4161](https://github.com/ClickHouse/ClickHouse/pull/4161) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed an issue: local ClickHouse dictionaries are loaded via TCP, but should load within process. 
[#4166](https://github.com/ClickHouse/ClickHouse/pull/4166) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed `No message received` error when interacting with PostgreSQL ODBC Driver through TLS connection. Also fixes segfault when using MySQL ODBC Driver. [#4170](https://github.com/ClickHouse/ClickHouse/pull/4170) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Temporarily disable predicate optimization for `ORDER BY`. [#3890](https://github.com/ClickHouse/ClickHouse/pull/3890) ([Winter Zhang](https://github.com/zhang2014)) -* Fixed infinite loop when selecting from table function `numbers(0)`. [#4280](https://github.com/ClickHouse/ClickHouse/pull/4280) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed `compile_expressions` bug with comparison of big (more than int16) dates. [#4341](https://github.com/ClickHouse/ClickHouse/pull/4341) ([alesapin](https://github.com/alesapin)) -* Fixed segmentation fault with `uncompressed_cache=1` and exception with wrong uncompressed size. [#4186](https://github.com/ClickHouse/ClickHouse/pull/4186) ([alesapin](https://github.com/alesapin)) -* Fixed `ALL JOIN` with duplicates in right table. [#4184](https://github.com/ClickHouse/ClickHouse/pull/4184) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed wrong behaviour when doing `INSERT ... SELECT ... FROM file(...)` query and file has `CSVWithNames` or `TSVWIthNames` format and the first data row is missing. [#4297](https://github.com/ClickHouse/ClickHouse/pull/4297) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed aggregate functions execution with `Array(LowCardinality)` arguments. [#4055](https://github.com/ClickHouse/ClickHouse/pull/4055) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Debian package: correct /etc/clickhouse-server/preprocessed link according to config. [#4205](https://github.com/ClickHouse/ClickHouse/pull/4205) ([proller](https://github.com/proller)) -* Fixed fuzz test under undefined behavior sanitizer: added parameter type check for `quantile*Weighted` family of functions. [#4145](https://github.com/ClickHouse/ClickHouse/pull/4145) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Make `START REPLICATED SENDS` command start replicated sends. [#4229](https://github.com/ClickHouse/ClickHouse/pull/4229) ([nvartolomei](https://github.com/nvartolomei)) -* Fixed `Not found column` for duplicate columns in JOIN ON section. [#4279](https://github.com/ClickHouse/ClickHouse/pull/4279) ([Artem Zuikov](https://github.com/4ertus2)) -* Now `/etc/ssl` is used as default directory with SSL certificates. [#4167](https://github.com/ClickHouse/ClickHouse/pull/4167) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed crash on dictionary reload if dictionary not available. [#4188](https://github.com/ClickHouse/ClickHouse/pull/4188) ([proller](https://github.com/proller)) -* Fixed bug with incorrect `Date` and `DateTime` comparison. [#4237](https://github.com/ClickHouse/ClickHouse/pull/4237) ([valexey](https://github.com/valexey)) -* Fixed incorrect result when `Date` and `DateTime` arguments are used in branches of conditional operator (function `if`). Added generic case for function `if`. 
[#4243](https://github.com/ClickHouse/ClickHouse/pull/4243) ([alexey-milovidov](https://github.com/alexey-milovidov)) +## ClickHouse release 19.1 {#clickhouse-release-19-1-2} -### ClickHouse release 19.1.6, 2019-01-24 +### ClickHouse release 19.1.7, 2019-02-15 {#clickhouse-release-19-1-7-2019-02-15} -#### New Features +#### Bug Fixes {#bug-fixes-20} -* Custom per column compression codecs for tables. [#3899](https://github.com/ClickHouse/ClickHouse/pull/3899) [#4111](https://github.com/ClickHouse/ClickHouse/pull/4111) ([alesapin](https://github.com/alesapin), [Winter Zhang](https://github.com/zhang2014), [Anatoly](https://github.com/Sindbag)) -* Added compression codec `Delta`. [#4052](https://github.com/ClickHouse/ClickHouse/pull/4052) ([alesapin](https://github.com/alesapin)) -* Allow to `ALTER` compression codecs. [#4054](https://github.com/ClickHouse/ClickHouse/pull/4054) ([alesapin](https://github.com/alesapin)) -* Added functions `left`, `right`, `trim`, `ltrim`, `rtrim`, `timestampadd`, `timestampsub` for SQL standard compatibility. [#3826](https://github.com/ClickHouse/ClickHouse/pull/3826) ([Ivan Blinkov](https://github.com/blinkov)) -* Support for write in `HDFS` tables and `hdfs` table function. [#4084](https://github.com/ClickHouse/ClickHouse/pull/4084) ([alesapin](https://github.com/alesapin)) -* Added functions to search for multiple constant strings from big haystack: `multiPosition`, `multiSearch` ,`firstMatch` also with `-UTF8`, `-CaseInsensitive`, and `-CaseInsensitiveUTF8` variants. [#4053](https://github.com/ClickHouse/ClickHouse/pull/4053) ([Danila Kutenin](https://github.com/danlark1)) -* Pruning of unused shards if `SELECT` query filters by sharding key (setting `optimize_skip_unused_shards`). [#3851](https://github.com/ClickHouse/ClickHouse/pull/3851) ([Gleb Kanterov](https://github.com/kanterov), [Ivan](https://github.com/abyss7)) -* Allow `Kafka` engine to ignore some number of parsing errors per block. [#4094](https://github.com/ClickHouse/ClickHouse/pull/4094) ([Ivan](https://github.com/abyss7)) -* Added support for `CatBoost` multiclass models evaluation. Function `modelEvaluate` returns tuple with per-class raw predictions for multiclass models. `libcatboostmodel.so` should be built with [#607](https://github.com/catboost/catboost/pull/607). [#3959](https://github.com/ClickHouse/ClickHouse/pull/3959) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Added functions `filesystemAvailable`, `filesystemFree`, `filesystemCapacity`. [#4097](https://github.com/ClickHouse/ClickHouse/pull/4097) ([Boris Granveaud](https://github.com/bgranvea)) -* Added hashing functions `xxHash64` and `xxHash32`. [#3905](https://github.com/ClickHouse/ClickHouse/pull/3905) ([filimonov](https://github.com/filimonov)) -* Added `gccMurmurHash` hashing function (GCC flavoured Murmur hash) which uses the same hash seed as [gcc](https://github.com/gcc-mirror/gcc/blob/41d6b10e96a1de98e90a7c0378437c3255814b16/libstdc%2B%2B-v3/include/bits/functional_hash.h#L191) [#4000](https://github.com/ClickHouse/ClickHouse/pull/4000) ([sundyli](https://github.com/sundy-li)) -* Added hashing functions `javaHash`, `hiveHash`. [#3811](https://github.com/ClickHouse/ClickHouse/pull/3811) ([shangshujie365](https://github.com/shangshujie365)) -* Added table function `remoteSecure`. Function works as `remote`, but uses secure connection. 
[#4088](https://github.com/ClickHouse/ClickHouse/pull/4088) ([proller](https://github.com/proller)) +- Correctly return the right type and properly handle locks in `joinGet` function. [\#4153](https://github.com/ClickHouse/ClickHouse/pull/4153) ([Amos Bird](https://github.com/amosbird)) +- Fixed error when system logs are tried to create again at server shutdown. [\#4254](https://github.com/ClickHouse/ClickHouse/pull/4254) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed error: if there is a database with `Dictionary` engine, all dictionaries forced to load at server startup, and if there is a dictionary with ClickHouse source from localhost, the dictionary cannot load. [\#4255](https://github.com/ClickHouse/ClickHouse/pull/4255) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed a bug when the execution of mutations containing `IN` operators was producing incorrect results. [\#4099](https://github.com/ClickHouse/ClickHouse/pull/4099) ([Alex Zatelepin](https://github.com/ztlpn)) +- `clickhouse-client` can segfault on exit while loading data for command line suggestions if it was run in interactive mode. [\#4317](https://github.com/ClickHouse/ClickHouse/pull/4317) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed race condition when selecting from `system.tables` may give `table doesn't exist` error. [\#4313](https://github.com/ClickHouse/ClickHouse/pull/4313) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed deadlock when `SELECT` from a table with `File` engine was retried after `No such file or directory` error. [\#4161](https://github.com/ClickHouse/ClickHouse/pull/4161) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed an issue: local ClickHouse dictionaries are loaded via TCP, but should load within process. [\#4166](https://github.com/ClickHouse/ClickHouse/pull/4166) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed `No message received` error when interacting with PostgreSQL ODBC Driver through TLS connection. Also fixes segfault when using MySQL ODBC Driver. [\#4170](https://github.com/ClickHouse/ClickHouse/pull/4170) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Temporarily disable predicate optimization for `ORDER BY`. [\#3890](https://github.com/ClickHouse/ClickHouse/pull/3890) ([Winter Zhang](https://github.com/zhang2014)) +- Fixed infinite loop when selecting from table function `numbers(0)`. [\#4280](https://github.com/ClickHouse/ClickHouse/pull/4280) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed `compile_expressions` bug with comparison of big (more than int16) dates. [\#4341](https://github.com/ClickHouse/ClickHouse/pull/4341) ([alesapin](https://github.com/alesapin)) +- Fixed segmentation fault with `uncompressed_cache=1` and exception with wrong uncompressed size. [\#4186](https://github.com/ClickHouse/ClickHouse/pull/4186) ([alesapin](https://github.com/alesapin)) +- Fixed `ALL JOIN` with duplicates in right table. [\#4184](https://github.com/ClickHouse/ClickHouse/pull/4184) ([Artem Zuikov](https://github.com/4ertus2)) +- Fixed wrong behaviour when doing `INSERT ... SELECT ... FROM file(...)` query and file has `CSVWithNames` or `TSVWIthNames` format and the first data row is missing. [\#4297](https://github.com/ClickHouse/ClickHouse/pull/4297) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed aggregate functions execution with `Array(LowCardinality)` arguments. 
[\#4055](https://github.com/ClickHouse/ClickHouse/pull/4055) ([KochetovNicolai](https://github.com/KochetovNicolai)) +- Debian package: correct /etc/clickhouse-server/preprocessed link according to config. [\#4205](https://github.com/ClickHouse/ClickHouse/pull/4205) ([proller](https://github.com/proller)) +- Fixed fuzz test under undefined behavior sanitizer: added parameter type check for `quantile*Weighted` family of functions. [\#4145](https://github.com/ClickHouse/ClickHouse/pull/4145) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Make `START REPLICATED SENDS` command start replicated sends. [\#4229](https://github.com/ClickHouse/ClickHouse/pull/4229) ([nvartolomei](https://github.com/nvartolomei)) +- Fixed `Not found column` for duplicate columns in JOIN ON section. [\#4279](https://github.com/ClickHouse/ClickHouse/pull/4279) ([Artem Zuikov](https://github.com/4ertus2)) +- Now `/etc/ssl` is used as default directory with SSL certificates. [\#4167](https://github.com/ClickHouse/ClickHouse/pull/4167) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed crash on dictionary reload if dictionary not available. [\#4188](https://github.com/ClickHouse/ClickHouse/pull/4188) ([proller](https://github.com/proller)) +- Fixed bug with incorrect `Date` and `DateTime` comparison. [\#4237](https://github.com/ClickHouse/ClickHouse/pull/4237) ([valexey](https://github.com/valexey)) +- Fixed incorrect result when `Date` and `DateTime` arguments are used in branches of conditional operator (function `if`). Added generic case for function `if`. [\#4243](https://github.com/ClickHouse/ClickHouse/pull/4243) ([alexey-milovidov](https://github.com/alexey-milovidov)) +### ClickHouse release 19.1.6, 2019-01-24 {#clickhouse-release-19-1-6-2019-01-24} -#### Experimental features +#### New Features {#new-features-7} -* Added multiple JOINs emulation (`allow_experimental_multiple_joins_emulation` setting). [#3946](https://github.com/ClickHouse/ClickHouse/pull/3946) ([Artem Zuikov](https://github.com/4ertus2)) +- Custom per column compression codecs for tables. [\#3899](https://github.com/ClickHouse/ClickHouse/pull/3899) [\#4111](https://github.com/ClickHouse/ClickHouse/pull/4111) ([alesapin](https://github.com/alesapin), [Winter Zhang](https://github.com/zhang2014), [Anatoly](https://github.com/Sindbag)) +- Added compression codec `Delta`. [\#4052](https://github.com/ClickHouse/ClickHouse/pull/4052) ([alesapin](https://github.com/alesapin)) +- Allow to `ALTER` compression codecs. [\#4054](https://github.com/ClickHouse/ClickHouse/pull/4054) ([alesapin](https://github.com/alesapin)) +- Added functions `left`, `right`, `trim`, `ltrim`, `rtrim`, `timestampadd`, `timestampsub` for SQL standard compatibility. [\#3826](https://github.com/ClickHouse/ClickHouse/pull/3826) ([Ivan Blinkov](https://github.com/blinkov)) +- Support for write in `HDFS` tables and `hdfs` table function. [\#4084](https://github.com/ClickHouse/ClickHouse/pull/4084) ([alesapin](https://github.com/alesapin)) +- Added functions to search for multiple constant strings from big haystack: `multiPosition`, `multiSearch` ,`firstMatch` also with `-UTF8`, `-CaseInsensitive`, and `-CaseInsensitiveUTF8` variants. [\#4053](https://github.com/ClickHouse/ClickHouse/pull/4053) ([Danila Kutenin](https://github.com/danlark1)) +- Pruning of unused shards if `SELECT` query filters by sharding key (setting `optimize_skip_unused_shards`). 
[\#3851](https://github.com/ClickHouse/ClickHouse/pull/3851) ([Gleb Kanterov](https://github.com/kanterov), [Ivan](https://github.com/abyss7)) +- Allow `Kafka` engine to ignore some number of parsing errors per block. [\#4094](https://github.com/ClickHouse/ClickHouse/pull/4094) ([Ivan](https://github.com/abyss7)) +- Added support for `CatBoost` multiclass models evaluation. Function `modelEvaluate` returns tuple with per-class raw predictions for multiclass models. `libcatboostmodel.so` should be built with [\#607](https://github.com/catboost/catboost/pull/607). [\#3959](https://github.com/ClickHouse/ClickHouse/pull/3959) ([KochetovNicolai](https://github.com/KochetovNicolai)) +- Added functions `filesystemAvailable`, `filesystemFree`, `filesystemCapacity`. [\#4097](https://github.com/ClickHouse/ClickHouse/pull/4097) ([Boris Granveaud](https://github.com/bgranvea)) +- Added hashing functions `xxHash64` and `xxHash32`. [\#3905](https://github.com/ClickHouse/ClickHouse/pull/3905) ([filimonov](https://github.com/filimonov)) +- Added `gccMurmurHash` hashing function (GCC flavoured Murmur hash) which uses the same hash seed as [gcc](https://github.com/gcc-mirror/gcc/blob/41d6b10e96a1de98e90a7c0378437c3255814b16/libstdc%2B%2B-v3/include/bits/functional_hash.h#L191) [\#4000](https://github.com/ClickHouse/ClickHouse/pull/4000) ([sundyli](https://github.com/sundy-li)) +- Added hashing functions `javaHash`, `hiveHash`. [\#3811](https://github.com/ClickHouse/ClickHouse/pull/3811) ([shangshujie365](https://github.com/shangshujie365)) +- Added table function `remoteSecure`. Function works as `remote`, but uses secure connection. [\#4088](https://github.com/ClickHouse/ClickHouse/pull/4088) ([proller](https://github.com/proller)) +#### Experimental features {#experimental-features-3} -#### Bug Fixes +- Added multiple JOINs emulation (`allow_experimental_multiple_joins_emulation` setting). [\#3946](https://github.com/ClickHouse/ClickHouse/pull/3946) ([Artem Zuikov](https://github.com/4ertus2)) -* Make `compiled_expression_cache_size` setting limited by default to lower memory consumption. [#4041](https://github.com/ClickHouse/ClickHouse/pull/4041) ([alesapin](https://github.com/alesapin)) -* Fix a bug that led to hangups in threads that perform ALTERs of Replicated tables and in the thread that updates configuration from ZooKeeper. [#2947](https://github.com/ClickHouse/ClickHouse/issues/2947) [#3891](https://github.com/ClickHouse/ClickHouse/issues/3891) [#3934](https://github.com/ClickHouse/ClickHouse/pull/3934) ([Alex Zatelepin](https://github.com/ztlpn)) -* Fixed a race condition when executing a distributed ALTER task. The race condition led to more than one replica trying to execute the task and all replicas except one failing with a ZooKeeper error. [#3904](https://github.com/ClickHouse/ClickHouse/pull/3904) ([Alex Zatelepin](https://github.com/ztlpn)) -* Fix a bug when `from_zk` config elements weren't refreshed after a request to ZooKeeper timed out. [#2947](https://github.com/ClickHouse/ClickHouse/issues/2947) [#3947](https://github.com/ClickHouse/ClickHouse/pull/3947) ([Alex Zatelepin](https://github.com/ztlpn)) -* Fix bug with wrong prefix for IPv4 subnet masks. [#3945](https://github.com/ClickHouse/ClickHouse/pull/3945) ([alesapin](https://github.com/alesapin)) -* Fixed crash (`std::terminate`) in rare cases when a new thread cannot be created due to exhausted resources. 
[#3956](https://github.com/ClickHouse/ClickHouse/pull/3956) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix bug when in `remote` table function execution when wrong restrictions were used for in `getStructureOfRemoteTable`. [#4009](https://github.com/ClickHouse/ClickHouse/pull/4009) ([alesapin](https://github.com/alesapin)) -* Fix a leak of netlink sockets. They were placed in a pool where they were never deleted and new sockets were created at the start of a new thread when all current sockets were in use. [#4017](https://github.com/ClickHouse/ClickHouse/pull/4017) ([Alex Zatelepin](https://github.com/ztlpn)) -* Fix bug with closing `/proc/self/fd` directory earlier than all fds were read from `/proc` after forking `odbc-bridge` subprocess. [#4120](https://github.com/ClickHouse/ClickHouse/pull/4120) ([alesapin](https://github.com/alesapin)) -* Fixed String to UInt monotonic conversion in case of usage String in primary key. [#3870](https://github.com/ClickHouse/ClickHouse/pull/3870) ([Winter Zhang](https://github.com/zhang2014)) -* Fixed error in calculation of integer conversion function monotonicity. [#3921](https://github.com/ClickHouse/ClickHouse/pull/3921) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed segfault in `arrayEnumerateUniq`, `arrayEnumerateDense` functions in case of some invalid arguments. [#3909](https://github.com/ClickHouse/ClickHouse/pull/3909) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix UB in StorageMerge. [#3910](https://github.com/ClickHouse/ClickHouse/pull/3910) ([Amos Bird](https://github.com/amosbird)) -* Fixed segfault in functions `addDays`, `subtractDays`. [#3913](https://github.com/ClickHouse/ClickHouse/pull/3913) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed error: functions `round`, `floor`, `trunc`, `ceil` may return bogus result when executed on integer argument and large negative scale. [#3914](https://github.com/ClickHouse/ClickHouse/pull/3914) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed a bug induced by 'kill query sync' which leads to a core dump. [#3916](https://github.com/ClickHouse/ClickHouse/pull/3916) ([muVulDeePecker](https://github.com/fancyqlx)) -* Fix bug with long delay after empty replication queue. [#3928](https://github.com/ClickHouse/ClickHouse/pull/3928) [#3932](https://github.com/ClickHouse/ClickHouse/pull/3932) ([alesapin](https://github.com/alesapin)) -* Fixed excessive memory usage in case of inserting into table with `LowCardinality` primary key. [#3955](https://github.com/ClickHouse/ClickHouse/pull/3955) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Fixed `LowCardinality` serialization for `Native` format in case of empty arrays. [#3907](https://github.com/ClickHouse/ClickHouse/issues/3907) [#4011](https://github.com/ClickHouse/ClickHouse/pull/4011) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Fixed incorrect result while using distinct by single LowCardinality numeric column. [#3895](https://github.com/ClickHouse/ClickHouse/issues/3895) [#4012](https://github.com/ClickHouse/ClickHouse/pull/4012) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Fixed specialized aggregation with LowCardinality key (in case when `compile` setting is enabled). [#3886](https://github.com/ClickHouse/ClickHouse/pull/3886) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Fix user and password forwarding for replicated tables queries. 
[#3957](https://github.com/ClickHouse/ClickHouse/pull/3957) ([alesapin](https://github.com/alesapin)) ([小路](https://github.com/nicelulu)) -* Fixed very rare race condition that can happen when listing tables in Dictionary database while reloading dictionaries. [#3970](https://github.com/ClickHouse/ClickHouse/pull/3970) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed incorrect result when HAVING was used with ROLLUP or CUBE. [#3756](https://github.com/ClickHouse/ClickHouse/issues/3756) [#3837](https://github.com/ClickHouse/ClickHouse/pull/3837) ([Sam Chou](https://github.com/reflection)) -* Fixed column aliases for query with `JOIN ON` syntax and distributed tables. [#3980](https://github.com/ClickHouse/ClickHouse/pull/3980) ([Winter Zhang](https://github.com/zhang2014)) -* Fixed error in internal implementation of `quantileTDigest` (found by Artem Vakhrushev). This error never happens in ClickHouse and was relevant only for those who use ClickHouse codebase as a library directly. [#3935](https://github.com/ClickHouse/ClickHouse/pull/3935) ([alexey-milovidov](https://github.com/alexey-milovidov)) +#### Bug Fixes {#bug-fixes-21} -#### Improvements +- Make `compiled_expression_cache_size` setting limited by default to lower memory consumption. [\#4041](https://github.com/ClickHouse/ClickHouse/pull/4041) ([alesapin](https://github.com/alesapin)) +- Fix a bug that led to hangups in threads that perform ALTERs of Replicated tables and in the thread that updates configuration from ZooKeeper. [\#2947](https://github.com/ClickHouse/ClickHouse/issues/2947) [\#3891](https://github.com/ClickHouse/ClickHouse/issues/3891) [\#3934](https://github.com/ClickHouse/ClickHouse/pull/3934) ([Alex Zatelepin](https://github.com/ztlpn)) +- Fixed a race condition when executing a distributed ALTER task. The race condition led to more than one replica trying to execute the task and all replicas except one failing with a ZooKeeper error. [\#3904](https://github.com/ClickHouse/ClickHouse/pull/3904) ([Alex Zatelepin](https://github.com/ztlpn)) +- Fix a bug when `from_zk` config elements weren’t refreshed after a request to ZooKeeper timed out. [\#2947](https://github.com/ClickHouse/ClickHouse/issues/2947) [\#3947](https://github.com/ClickHouse/ClickHouse/pull/3947) ([Alex Zatelepin](https://github.com/ztlpn)) +- Fix bug with wrong prefix for IPv4 subnet masks. [\#3945](https://github.com/ClickHouse/ClickHouse/pull/3945) ([alesapin](https://github.com/alesapin)) +- Fixed crash (`std::terminate`) in rare cases when a new thread cannot be created due to exhausted resources. [\#3956](https://github.com/ClickHouse/ClickHouse/pull/3956) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix bug when in `remote` table function execution when wrong restrictions were used for in `getStructureOfRemoteTable`. [\#4009](https://github.com/ClickHouse/ClickHouse/pull/4009) ([alesapin](https://github.com/alesapin)) +- Fix a leak of netlink sockets. They were placed in a pool where they were never deleted and new sockets were created at the start of a new thread when all current sockets were in use. [\#4017](https://github.com/ClickHouse/ClickHouse/pull/4017) ([Alex Zatelepin](https://github.com/ztlpn)) +- Fix bug with closing `/proc/self/fd` directory earlier than all fds were read from `/proc` after forking `odbc-bridge` subprocess. 
[\#4120](https://github.com/ClickHouse/ClickHouse/pull/4120) ([alesapin](https://github.com/alesapin)) +- Fixed String to UInt monotonic conversion in case of usage String in primary key. [\#3870](https://github.com/ClickHouse/ClickHouse/pull/3870) ([Winter Zhang](https://github.com/zhang2014)) +- Fixed error in calculation of integer conversion function monotonicity. [\#3921](https://github.com/ClickHouse/ClickHouse/pull/3921) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed segfault in `arrayEnumerateUniq`, `arrayEnumerateDense` functions in case of some invalid arguments. [\#3909](https://github.com/ClickHouse/ClickHouse/pull/3909) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fix UB in StorageMerge. [\#3910](https://github.com/ClickHouse/ClickHouse/pull/3910) ([Amos Bird](https://github.com/amosbird)) +- Fixed segfault in functions `addDays`, `subtractDays`. [\#3913](https://github.com/ClickHouse/ClickHouse/pull/3913) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed error: functions `round`, `floor`, `trunc`, `ceil` may return bogus result when executed on integer argument and large negative scale. [\#3914](https://github.com/ClickHouse/ClickHouse/pull/3914) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed a bug induced by ‘kill query sync’ which leads to a core dump. [\#3916](https://github.com/ClickHouse/ClickHouse/pull/3916) ([muVulDeePecker](https://github.com/fancyqlx)) +- Fix bug with long delay after empty replication queue. [\#3928](https://github.com/ClickHouse/ClickHouse/pull/3928) [\#3932](https://github.com/ClickHouse/ClickHouse/pull/3932) ([alesapin](https://github.com/alesapin)) +- Fixed excessive memory usage in case of inserting into table with `LowCardinality` primary key. [\#3955](https://github.com/ClickHouse/ClickHouse/pull/3955) ([KochetovNicolai](https://github.com/KochetovNicolai)) +- Fixed `LowCardinality` serialization for `Native` format in case of empty arrays. [\#3907](https://github.com/ClickHouse/ClickHouse/issues/3907) [\#4011](https://github.com/ClickHouse/ClickHouse/pull/4011) ([KochetovNicolai](https://github.com/KochetovNicolai)) +- Fixed incorrect result while using distinct by single LowCardinality numeric column. [\#3895](https://github.com/ClickHouse/ClickHouse/issues/3895) [\#4012](https://github.com/ClickHouse/ClickHouse/pull/4012) ([KochetovNicolai](https://github.com/KochetovNicolai)) +- Fixed specialized aggregation with LowCardinality key (in case when `compile` setting is enabled). [\#3886](https://github.com/ClickHouse/ClickHouse/pull/3886) ([KochetovNicolai](https://github.com/KochetovNicolai)) +- Fix user and password forwarding for replicated tables queries. [\#3957](https://github.com/ClickHouse/ClickHouse/pull/3957) ([alesapin](https://github.com/alesapin)) ([小路](https://github.com/nicelulu)) +- Fixed very rare race condition that can happen when listing tables in Dictionary database while reloading dictionaries. [\#3970](https://github.com/ClickHouse/ClickHouse/pull/3970) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed incorrect result when HAVING was used with ROLLUP or CUBE. [\#3756](https://github.com/ClickHouse/ClickHouse/issues/3756) [\#3837](https://github.com/ClickHouse/ClickHouse/pull/3837) ([Sam Chou](https://github.com/reflection)) +- Fixed column aliases for query with `JOIN ON` syntax and distributed tables. 
[\#3980](https://github.com/ClickHouse/ClickHouse/pull/3980) ([Winter Zhang](https://github.com/zhang2014)) +- Fixed error in internal implementation of `quantileTDigest` (found by Artem Vakhrushev). This error never happens in ClickHouse and was relevant only for those who use ClickHouse codebase as a library directly. [\#3935](https://github.com/ClickHouse/ClickHouse/pull/3935) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Support for `IF NOT EXISTS` in `ALTER TABLE ADD COLUMN` statements along with `IF EXISTS` in `DROP/MODIFY/CLEAR/COMMENT COLUMN`. [#3900](https://github.com/ClickHouse/ClickHouse/pull/3900) ([Boris Granveaud](https://github.com/bgranvea)) -* Function `parseDateTimeBestEffort`: support for formats `DD.MM.YYYY`, `DD.MM.YY`, `DD-MM-YYYY`, `DD-Mon-YYYY`, `DD/Month/YYYY` and similar. [#3922](https://github.com/ClickHouse/ClickHouse/pull/3922) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* `CapnProtoInputStream` now support jagged structures. [#4063](https://github.com/ClickHouse/ClickHouse/pull/4063) ([Odin Hultgren Van Der Horst](https://github.com/Miniwoffer)) -* Usability improvement: added a check that server process is started from the data directory's owner. Do not allow to start server from root if the data belongs to non-root user. [#3785](https://github.com/ClickHouse/ClickHouse/pull/3785) ([sergey-v-galtsev](https://github.com/sergey-v-galtsev)) -* Better logic of checking required columns during analysis of queries with JOINs. [#3930](https://github.com/ClickHouse/ClickHouse/pull/3930) ([Artem Zuikov](https://github.com/4ertus2)) -* Decreased the number of connections in case of large number of Distributed tables in a single server. [#3726](https://github.com/ClickHouse/ClickHouse/pull/3726) ([Winter Zhang](https://github.com/zhang2014)) -* Supported totals row for `WITH TOTALS` query for ODBC driver. [#3836](https://github.com/ClickHouse/ClickHouse/pull/3836) ([Maksim Koritckiy](https://github.com/nightweb)) -* Allowed to use `Enum`s as integers inside if function. [#3875](https://github.com/ClickHouse/ClickHouse/pull/3875) ([Ivan](https://github.com/abyss7)) -* Added `low_cardinality_allow_in_native_format` setting. If disabled, do not use `LowCadrinality` type in `Native` format. [#3879](https://github.com/ClickHouse/ClickHouse/pull/3879) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Removed some redundant objects from compiled expressions cache to lower memory usage. [#4042](https://github.com/ClickHouse/ClickHouse/pull/4042) ([alesapin](https://github.com/alesapin)) -* Add check that `SET send_logs_level = 'value'` query accept appropriate value. [#3873](https://github.com/ClickHouse/ClickHouse/pull/3873) ([Sabyanin Maxim](https://github.com/s-mx)) -* Fixed data type check in type conversion functions. [#3896](https://github.com/ClickHouse/ClickHouse/pull/3896) ([Winter Zhang](https://github.com/zhang2014)) +#### Improvements {#improvements-6} -#### Performance Improvements +- Support for `IF NOT EXISTS` in `ALTER TABLE ADD COLUMN` statements along with `IF EXISTS` in `DROP/MODIFY/CLEAR/COMMENT COLUMN`. [\#3900](https://github.com/ClickHouse/ClickHouse/pull/3900) ([Boris Granveaud](https://github.com/bgranvea)) +- Function `parseDateTimeBestEffort`: support for formats `DD.MM.YYYY`, `DD.MM.YY`, `DD-MM-YYYY`, `DD-Mon-YYYY`, `DD/Month/YYYY` and similar. 
[\#3922](https://github.com/ClickHouse/ClickHouse/pull/3922) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- `CapnProtoInputStream` now supports jagged structures. [\#4063](https://github.com/ClickHouse/ClickHouse/pull/4063) ([Odin Hultgren Van Der Horst](https://github.com/Miniwoffer)) +- Usability improvement: added a check that server process is started from the data directory’s owner. Do not allow to start server from root if the data belongs to non-root user. [\#3785](https://github.com/ClickHouse/ClickHouse/pull/3785) ([sergey-v-galtsev](https://github.com/sergey-v-galtsev)) +- Better logic of checking required columns during analysis of queries with JOINs. [\#3930](https://github.com/ClickHouse/ClickHouse/pull/3930) ([Artem Zuikov](https://github.com/4ertus2)) +- Decreased the number of connections in case of large number of Distributed tables in a single server. [\#3726](https://github.com/ClickHouse/ClickHouse/pull/3726) ([Winter Zhang](https://github.com/zhang2014)) +- Supported totals row for `WITH TOTALS` query for ODBC driver. [\#3836](https://github.com/ClickHouse/ClickHouse/pull/3836) ([Maksim Koritckiy](https://github.com/nightweb)) +- Allowed to use `Enum`s as integers inside if function. [\#3875](https://github.com/ClickHouse/ClickHouse/pull/3875) ([Ivan](https://github.com/abyss7)) +- Added `low_cardinality_allow_in_native_format` setting. If disabled, do not use `LowCardinality` type in `Native` format. [\#3879](https://github.com/ClickHouse/ClickHouse/pull/3879) ([KochetovNicolai](https://github.com/KochetovNicolai)) +- Removed some redundant objects from compiled expressions cache to lower memory usage. [\#4042](https://github.com/ClickHouse/ClickHouse/pull/4042) ([alesapin](https://github.com/alesapin)) +- Add a check that the `SET send_logs_level = 'value'` query accepts an appropriate value. [\#3873](https://github.com/ClickHouse/ClickHouse/pull/3873) ([Sabyanin Maxim](https://github.com/s-mx)) +- Fixed data type check in type conversion functions. [\#3896](https://github.com/ClickHouse/ClickHouse/pull/3896) ([Winter Zhang](https://github.com/zhang2014)) -* Add a MergeTree setting `use_minimalistic_part_header_in_zookeeper`. If enabled, Replicated tables will store compact part metadata in a single part znode. This can dramatically reduce ZooKeeper snapshot size (especially if the tables have a lot of columns). Note that after enabling this setting you will not be able to downgrade to a version that doesn't support it. [#3960](https://github.com/ClickHouse/ClickHouse/pull/3960) ([Alex Zatelepin](https://github.com/ztlpn)) -* Add an DFA-based implementation for functions `sequenceMatch` and `sequenceCount` in case pattern doesn't contain time. [#4004](https://github.com/ClickHouse/ClickHouse/pull/4004) ([Léo Ercolanelli](https://github.com/ercolanelli-leo)) -* Performance improvement for integer numbers serialization. [#3968](https://github.com/ClickHouse/ClickHouse/pull/3968) ([Amos Bird](https://github.com/amosbird)) -* Zero left padding PODArray so that -1 element is always valid and zeroed. It's used for branchless calculation of offsets. [#3920](https://github.com/ClickHouse/ClickHouse/pull/3920) ([Amos Bird](https://github.com/amosbird)) -* Reverted `jemalloc` version which lead to performance degradation.
[#4018](https://github.com/ClickHouse/ClickHouse/pull/4018) ([alexey-milovidov](https://github.com/alexey-milovidov)) +#### Performance Improvements {#performance-improvements-5} -#### Backward Incompatible Changes +- Add a MergeTree setting `use_minimalistic_part_header_in_zookeeper`. If enabled, Replicated tables will store compact part metadata in a single part znode. This can dramatically reduce ZooKeeper snapshot size (especially if the tables have a lot of columns). Note that after enabling this setting you will not be able to downgrade to a version that doesn’t support it. [\#3960](https://github.com/ClickHouse/ClickHouse/pull/3960) ([Alex Zatelepin](https://github.com/ztlpn)) +- Add a DFA-based implementation for functions `sequenceMatch` and `sequenceCount` in case the pattern doesn’t contain time. [\#4004](https://github.com/ClickHouse/ClickHouse/pull/4004) ([Léo Ercolanelli](https://github.com/ercolanelli-leo)) +- Performance improvement for integer numbers serialization. [\#3968](https://github.com/ClickHouse/ClickHouse/pull/3968) ([Amos Bird](https://github.com/amosbird)) +- Zero left padding PODArray so that -1 element is always valid and zeroed. It’s used for branchless calculation of offsets. [\#3920](https://github.com/ClickHouse/ClickHouse/pull/3920) ([Amos Bird](https://github.com/amosbird)) +- Reverted `jemalloc` version which led to performance degradation. [\#4018](https://github.com/ClickHouse/ClickHouse/pull/4018) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Removed undocumented feature `ALTER MODIFY PRIMARY KEY` because it was superseded by the `ALTER MODIFY ORDER BY` command. [#3887](https://github.com/ClickHouse/ClickHouse/pull/3887) ([Alex Zatelepin](https://github.com/ztlpn)) -* Removed function `shardByHash`. [#3833](https://github.com/ClickHouse/ClickHouse/pull/3833) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Forbid using scalar subqueries with result of type `AggregateFunction`. [#3865](https://github.com/ClickHouse/ClickHouse/pull/3865) ([Ivan](https://github.com/abyss7)) +#### Backward Incompatible Changes {#backward-incompatible-changes-2} -#### Build/Testing/Packaging Improvements +- Removed undocumented feature `ALTER MODIFY PRIMARY KEY` because it was superseded by the `ALTER MODIFY ORDER BY` command. [\#3887](https://github.com/ClickHouse/ClickHouse/pull/3887) ([Alex Zatelepin](https://github.com/ztlpn)) +- Removed function `shardByHash`. [\#3833](https://github.com/ClickHouse/ClickHouse/pull/3833) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Forbid using scalar subqueries with result of type `AggregateFunction`. [\#3865](https://github.com/ClickHouse/ClickHouse/pull/3865) ([Ivan](https://github.com/abyss7)) -* Added support for PowerPC (`ppc64le`) build. [#4132](https://github.com/ClickHouse/ClickHouse/pull/4132) ([Danila Kutenin](https://github.com/danlark1)) -* Stateful functional tests are run on public available dataset. [#3969](https://github.com/ClickHouse/ClickHouse/pull/3969) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed error when the server cannot start with the `bash: /usr/bin/clickhouse-extract-from-config: Operation not permitted` message within Docker or systemd-nspawn. [#4136](https://github.com/ClickHouse/ClickHouse/pull/4136) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Updated `rdkafka` library to v1.0.0-RC5. Used cppkafka instead of raw C interface.
[#4025](https://github.com/ClickHouse/ClickHouse/pull/4025) ([Ivan](https://github.com/abyss7)) -* Updated `mariadb-client` library. Fixed one of issues found by UBSan. [#3924](https://github.com/ClickHouse/ClickHouse/pull/3924) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Some fixes for UBSan builds. [#3926](https://github.com/ClickHouse/ClickHouse/pull/3926) [#3021](https://github.com/ClickHouse/ClickHouse/pull/3021) [#3948](https://github.com/ClickHouse/ClickHouse/pull/3948) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added per-commit runs of tests with UBSan build. -* Added per-commit runs of PVS-Studio static analyzer. -* Fixed bugs found by PVS-Studio. [#4013](https://github.com/ClickHouse/ClickHouse/pull/4013) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed glibc compatibility issues. [#4100](https://github.com/ClickHouse/ClickHouse/pull/4100) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Move Docker images to 18.10 and add compatibility file for glibc >= 2.28 [#3965](https://github.com/ClickHouse/ClickHouse/pull/3965) ([alesapin](https://github.com/alesapin)) -* Add env variable if user don't want to chown directories in server Docker image. [#3967](https://github.com/ClickHouse/ClickHouse/pull/3967) ([alesapin](https://github.com/alesapin)) -* Enabled most of the warnings from `-Weverything` in clang. Enabled `-Wpedantic`. [#3986](https://github.com/ClickHouse/ClickHouse/pull/3986) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added a few more warnings that are available only in clang 8. [#3993](https://github.com/ClickHouse/ClickHouse/pull/3993) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Link to `libLLVM` rather than to individual LLVM libs when using shared linking. [#3989](https://github.com/ClickHouse/ClickHouse/pull/3989) ([Orivej Desh](https://github.com/orivej)) -* Added sanitizer variables for test images. [#4072](https://github.com/ClickHouse/ClickHouse/pull/4072) ([alesapin](https://github.com/alesapin)) -* `clickhouse-server` debian package will recommend `libcap2-bin` package to use `setcap` tool for setting capabilities. This is optional. [#4093](https://github.com/ClickHouse/ClickHouse/pull/4093) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Improved compilation time, fixed includes. [#3898](https://github.com/ClickHouse/ClickHouse/pull/3898) ([proller](https://github.com/proller)) -* Added performance tests for hash functions. [#3918](https://github.com/ClickHouse/ClickHouse/pull/3918) ([filimonov](https://github.com/filimonov)) -* Fixed cyclic library dependences. [#3958](https://github.com/ClickHouse/ClickHouse/pull/3958) ([proller](https://github.com/proller)) -* Improved compilation with low available memory. [#4030](https://github.com/ClickHouse/ClickHouse/pull/4030) ([proller](https://github.com/proller)) -* Added test script to reproduce performance degradation in `jemalloc`. [#4036](https://github.com/ClickHouse/ClickHouse/pull/4036) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed misspells in comments and string literals under `dbms`. [#4122](https://github.com/ClickHouse/ClickHouse/pull/4122) ([maiha](https://github.com/maiha)) -* Fixed typos in comments. [#4089](https://github.com/ClickHouse/ClickHouse/pull/4089) ([Evgenii Pravda](https://github.com/kvinty)) +#### Build/Testing/Packaging Improvements {#buildtestingpackaging-improvements-6} + +- Added support for PowerPC (`ppc64le`) build. 
[\#4132](https://github.com/ClickHouse/ClickHouse/pull/4132) ([Danila Kutenin](https://github.com/danlark1)) +- Stateful functional tests are run on a publicly available dataset. [\#3969](https://github.com/ClickHouse/ClickHouse/pull/3969) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed error when the server cannot start with the `bash: /usr/bin/clickhouse-extract-from-config: Operation not permitted` message within Docker or systemd-nspawn. [\#4136](https://github.com/ClickHouse/ClickHouse/pull/4136) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Updated `rdkafka` library to v1.0.0-RC5. Used cppkafka instead of raw C interface. [\#4025](https://github.com/ClickHouse/ClickHouse/pull/4025) ([Ivan](https://github.com/abyss7)) +- Updated `mariadb-client` library. Fixed one of the issues found by UBSan. [\#3924](https://github.com/ClickHouse/ClickHouse/pull/3924) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Some fixes for UBSan builds. [\#3926](https://github.com/ClickHouse/ClickHouse/pull/3926) [\#3021](https://github.com/ClickHouse/ClickHouse/pull/3021) [\#3948](https://github.com/ClickHouse/ClickHouse/pull/3948) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added per-commit runs of tests with UBSan build. +- Added per-commit runs of PVS-Studio static analyzer. +- Fixed bugs found by PVS-Studio. [\#4013](https://github.com/ClickHouse/ClickHouse/pull/4013) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed glibc compatibility issues. [\#4100](https://github.com/ClickHouse/ClickHouse/pull/4100) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Move Docker images to 18.10 and add compatibility file for glibc \>= 2.28. [\#3965](https://github.com/ClickHouse/ClickHouse/pull/3965) ([alesapin](https://github.com/alesapin)) +- Add an env variable if the user doesn’t want to chown directories in the server Docker image. [\#3967](https://github.com/ClickHouse/ClickHouse/pull/3967) ([alesapin](https://github.com/alesapin)) +- Enabled most of the warnings from `-Weverything` in clang. Enabled `-Wpedantic`. [\#3986](https://github.com/ClickHouse/ClickHouse/pull/3986) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Added a few more warnings that are available only in clang 8. [\#3993](https://github.com/ClickHouse/ClickHouse/pull/3993) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Link to `libLLVM` rather than to individual LLVM libs when using shared linking. [\#3989](https://github.com/ClickHouse/ClickHouse/pull/3989) ([Orivej Desh](https://github.com/orivej)) +- Added sanitizer variables for test images. [\#4072](https://github.com/ClickHouse/ClickHouse/pull/4072) ([alesapin](https://github.com/alesapin)) +- `clickhouse-server` debian package will recommend `libcap2-bin` package to use `setcap` tool for setting capabilities. This is optional. [\#4093](https://github.com/ClickHouse/ClickHouse/pull/4093) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Improved compilation time, fixed includes. [\#3898](https://github.com/ClickHouse/ClickHouse/pull/3898) ([proller](https://github.com/proller)) +- Added performance tests for hash functions. [\#3918](https://github.com/ClickHouse/ClickHouse/pull/3918) ([filimonov](https://github.com/filimonov)) +- Fixed cyclic library dependencies. [\#3958](https://github.com/ClickHouse/ClickHouse/pull/3958) ([proller](https://github.com/proller)) +- Improved compilation with low available memory.
[\#4030](https://github.com/ClickHouse/ClickHouse/pull/4030) ([proller](https://github.com/proller)) +- Added test script to reproduce performance degradation in `jemalloc`. [\#4036](https://github.com/ClickHouse/ClickHouse/pull/4036) ([alexey-milovidov](https://github.com/alexey-milovidov)) +- Fixed misspells in comments and string literals under `dbms`. [\#4122](https://github.com/ClickHouse/ClickHouse/pull/4122) ([maiha](https://github.com/maiha)) +- Fixed typos in comments. [\#4089](https://github.com/ClickHouse/ClickHouse/pull/4089) ([Evgenii Pravda](https://github.com/kvinty)) diff --git a/docs/en/commercial/cloud.md b/docs/en/commercial/cloud.md index 3ee7ea96a59..d8ddf2167d8 100644 --- a/docs/en/commercial/cloud.md +++ b/docs/en/commercial/cloud.md @@ -1,14 +1,16 @@ -# ClickHouse Cloud Service Providers +# ClickHouse Cloud Service Providers {#clickhouse-cloud-service-providers} !!! info "Info" If you have launched a public cloud with managed ClickHouse service, feel free to [open a pull-request](https://github.com/ClickHouse/ClickHouse/edit/master/docs/en/commercial/cloud.md) adding it to the following list. - -## Yandex Cloud + +## Yandex Cloud {#yandex-cloud} [Yandex Managed Service for ClickHouse](https://cloud.yandex.com/services/managed-clickhouse?utm_source=referrals&utm_medium=clickhouseofficialsite&utm_campaign=link3) provides the following key features: -* Fully managed ZooKeeper service for [ClickHouse replication](../operations/table_engines/replication.md) -* Multiple storage type choices -* Replicas in different availability zones -* Encryption and isolation -* Automated maintenance +- Fully managed ZooKeeper service for [ClickHouse replication](../operations/table_engines/replication.md) +- Multiple storage type choices +- Replicas in different availability zones +- Encryption and isolation +- Automated maintenance + +{## [Original article](https://clickhouse.tech/docs/en/commercial/cloud/) ##} diff --git a/docs/en/data_types/array.md b/docs/en/data_types/array.md index d23fe60c327..66f2e8b1b2a 100644 --- a/docs/en/data_types/array.md +++ b/docs/en/data_types/array.md @@ -1,55 +1,58 @@ -# Array(T) {#data_type-array} +# Array(T) {#data-type-array} Array of `T`-type items. `T` can be anything, including an array. -## Creating an array +## Creating an array {#creating-an-array} You can use a function to create an array: -```sql +``` sql array(T) ``` You can also use square brackets. -```sql +``` sql [] ``` Example of creating an array: -```sql +``` sql SELECT array(1, 2) AS x, toTypeName(x) ``` -```text + +``` text ┌─x─────┬─toTypeName(array(1, 2))─┐ │ [1,2] │ Array(UInt8) │ └───────┴─────────────────────────┘ ``` -```sql + +``` sql SELECT [1, 2] AS x, toTypeName(x) ``` -```text + +``` text ┌─x─────┬─toTypeName([1, 2])─┐ │ [1,2] │ Array(UInt8) │ └───────┴────────────────────┘ - ``` -## Working with data types +## Working with data types {#working-with-data-types} When creating an array on the fly, ClickHouse automatically defines the argument type as the narrowest data type that can store all the listed arguments. If there are any [NULL](../query_language/syntax.md#null-literal) or [Nullable](nullable.md#data_type-nullable) type arguments, the type of array elements is [Nullable](nullable.md). -If ClickHouse couldn't determine the data type, it will generate an exception. For instance, this will happen when trying to create an array with strings and numbers simultaneously (`SELECT array(1, 'a')`). 
+If ClickHouse couldn’t determine the data type, it will generate an exception. For instance, this will happen when trying to create an array with strings and numbers simultaneously (`SELECT array(1, 'a')`). Examples of automatic data type detection: -```sql +``` sql SELECT array(1, 2, NULL) AS x, toTypeName(x) -``` -```text +``` + +``` text ┌─x──────────┬─toTypeName(array(1, 2, NULL))─┐ │ [1,2,NULL] │ Array(Nullable(UInt8)) │ └────────────┴───────────────────────────────┘ @@ -57,13 +60,13 @@ SELECT array(1, 2, NULL) AS x, toTypeName(x) If you try to create an array of incompatible data types, ClickHouse throws an exception: -```sql +``` sql SELECT array(1, 'a') ``` -```text + +``` text Received exception from server (version 1.1.54388): Code: 386. DB::Exception: Received from localhost:9000, 127.0.0.1. DB::Exception: There is no supertype for types UInt8, String because some of them are String/FixedString and some of them are not. ``` - [Original article](https://clickhouse.tech/docs/en/data_types/array/) diff --git a/docs/en/data_types/boolean.md b/docs/en/data_types/boolean.md index d5fc88e45cc..1d3d0d1a6a4 100644 --- a/docs/en/data_types/boolean.md +++ b/docs/en/data_types/boolean.md @@ -1,6 +1,5 @@ -# Boolean Values - -There isn't a separate type for boolean values. They use the UInt8 type, restricted to the values 0 or 1. +# Boolean Values {#boolean-values} +There is no separate type for boolean values. Use UInt8 type, restricted to the values 0 or 1. [Original article](https://clickhouse.tech/docs/en/data_types/boolean/) diff --git a/docs/en/data_types/date.md b/docs/en/data_types/date.md index cb5f84d6545..359ef17f724 100644 --- a/docs/en/data_types/date.md +++ b/docs/en/data_types/date.md @@ -1,9 +1,8 @@ -# Date +# Date {#date} A date. Stored in two bytes as the number of days since 1970-01-01 (unsigned). Allows storing values from just after the beginning of the Unix Epoch to the upper threshold defined by a constant at the compilation stage (currently, this is until the year 2106, but the final fully-supported year is 2105). The minimum value is output as 0000-00-00. -The date is stored without the time zone. - +The date value is stored without the time zone. [Original article](https://clickhouse.tech/docs/en/data_types/date/) diff --git a/docs/en/data_types/datetime.md b/docs/en/data_types/datetime.md index fded3caa4e5..516709e6ef5 100644 --- a/docs/en/data_types/datetime.md +++ b/docs/en/data_types/datetime.md @@ -1,74 +1,81 @@ -# DateTime {#data_type-datetime} +# DateTime {#data-type-datetime} Allows to store an instant in time, that can be expressed as a calendar date and a time of a day. Syntax: -```sql +``` sql DateTime([timezone]) ``` -Supported range of values: [1970-01-01 00:00:00, 2105-12-31 23:59:59]. +Supported range of values: \[1970-01-01 00:00:00, 2105-12-31 23:59:59\]. Resolution: 1 second. -## Usage Remarks +## Usage Remarks {#usage-remarks} -The point in time is saved as a [Unix timestamp](https://en.wikipedia.org/wiki/Unix_time), regardless of the time zone or daylight saving time. Additionally, the `DateTime` type can store time zone that is the same for the entire column, that affects how the values of the `DateTime` type values are displayed in text format and how the values specified as strings are parsed ('2020-01-01 05:00:01'). The time zone is not stored in the rows of the table (or in resultset), but is stored in the column metadata. 
+The point in time is saved as a [Unix timestamp](https://en.wikipedia.org/wiki/Unix_time), regardless of the time zone or daylight saving time. Additionally, the `DateTime` type can store time zone that is the same for the entire column, that affects how the values of the `DateTime` type values are displayed in text format and how the values specified as strings are parsed (‘2020-01-01 05:00:01’). The time zone is not stored in the rows of the table (or in resultset), but is stored in the column metadata. A list of supported time zones can be found in the [IANA Time Zone Database](https://www.iana.org/time-zones). The `tzdata` package, containing [IANA Time Zone Database](https://www.iana.org/time-zones), should be installed in the system. Use the `timedatectl list-timezones` command to list timezones known by a local system. -You can explicitly set a time zone for `DateTime`-type columns when creating a table. If the time zone isn't set, ClickHouse uses the value of the [timezone](../operations/server_settings/settings.md#server_settings-timezone) parameter in the server settings or the operating system settings at the moment of the ClickHouse server start. +You can explicitly set a time zone for `DateTime`-type columns when creating a table. If the time zone isn’t set, ClickHouse uses the value of the [timezone](../operations/server_settings/settings.md#server_settings-timezone) parameter in the server settings or the operating system settings at the moment of the ClickHouse server start. -The [clickhouse-client](../interfaces/cli.md) applies the server time zone by default if a time zone isn't explicitly set when initializing the data type. To use the client time zone, run `clickhouse-client` with the `--use_client_time_zone` parameter. +The [clickhouse-client](../interfaces/cli.md) applies the server time zone by default if a time zone isn’t explicitly set when initializing the data type. To use the client time zone, run `clickhouse-client` with the `--use_client_time_zone` parameter. ClickHouse outputs values in `YYYY-MM-DD hh:mm:ss` text format by default. You can change the output with the [formatDateTime](../query_language/functions/date_time_functions.md#formatdatetime) function. -When inserting data into ClickHouse, you can use different formats of date and time strings, depending on the value of the [date_time_input_format](../operations/settings/settings.md#settings-date_time_input_format) setting. +When inserting data into ClickHouse, you can use different formats of date and time strings, depending on the value of the [date\_time\_input\_format](../operations/settings/settings.md#settings-date_time_input_format) setting. -## Examples +## Examples {#examples} **1.** Creating a table with a `DateTime`-type column and inserting data into it: -```sql +``` sql CREATE TABLE dt ( - `timestamp` DateTime('Europe/Moscow'), + `timestamp` DateTime('Europe/Moscow'), `event_id` UInt8 ) ENGINE = TinyLog; ``` -```sql + +``` sql INSERT INTO dt Values (1546300800, 1), ('2019-01-01 00:00:00', 2); ``` -```sql + +``` sql SELECT * FROM dt; ``` -```text + +``` text ┌───────────timestamp─┬─event_id─┐ │ 2019-01-01 03:00:00 │ 1 │ │ 2019-01-01 00:00:00 │ 2 │ └─────────────────────┴──────────┘ ``` -* When inserting datetime as an integer, it is treated as Unix Timestamp (UTC). `1546300800` represents `'2019-01-01 00:00:00'` UTC. 
However, as `timestamp` column has `Europe/Moscow` (UTC+3) timezone specified, when outputting as string the value will be shown as `'2019-01-01 03:00:00'` -* When inserting string value as datetime, it is treated as being in column timezone. `'2019-01-01 00:00:00'` will be treated as being in `Europe/Moscow` timezone and saved as `1546290000`. +- When inserting datetime as an integer, it is treated as Unix Timestamp (UTC). `1546300800` represents `'2019-01-01 00:00:00'` UTC. However, as `timestamp` column has `Europe/Moscow` (UTC+3) timezone specified, when outputting as string the value will be shown as `'2019-01-01 03:00:00'` +- When inserting string value as datetime, it is treated as being in column timezone. `'2019-01-01 00:00:00'` will be treated as being in `Europe/Moscow` timezone and saved as `1546290000`. **2.** Filtering on `DateTime` values -```sql +``` sql SELECT * FROM dt WHERE timestamp = toDateTime('2019-01-01 00:00:00', 'Europe/Moscow') ``` -```text + +``` text ┌───────────timestamp─┬─event_id─┐ │ 2019-01-01 00:00:00 │ 2 │ └─────────────────────┴──────────┘ ``` + `DateTime` column values can be filtered using a string value in `WHERE` predicate. It will be converted to `DateTime` automatically: -```sql + +``` sql SELECT * FROM dt WHERE timestamp = '2019-01-01 00:00:00' ``` -```text + +``` text ┌───────────timestamp─┬─event_id─┐ │ 2019-01-01 03:00:00 │ 1 │ └─────────────────────┴──────────┘ @@ -76,10 +83,11 @@ SELECT * FROM dt WHERE timestamp = '2019-01-01 00:00:00' **3.** Getting a time zone for a `DateTime`-type column: -```sql +``` sql SELECT toDateTime(now(), 'Europe/Moscow') AS column, toTypeName(column) AS x ``` -```text + +``` text ┌──────────────column─┬─x─────────────────────────┐ │ 2019-10-16 04:12:04 │ DateTime('Europe/Moscow') │ └─────────────────────┴───────────────────────────┘ @@ -87,27 +95,28 @@ SELECT toDateTime(now(), 'Europe/Moscow') AS column, toTypeName(column) AS x **4.** Timezone conversion -```sql -SELECT -toDateTime(timestamp, 'Europe/London') as lon_time, +``` sql +SELECT +toDateTime(timestamp, 'Europe/London') as lon_time, toDateTime(timestamp, 'Europe/Moscow') as mos_time FROM dt ``` -```text + +``` text ┌───────────lon_time──┬────────────mos_time─┐ │ 2019-01-01 00:00:00 │ 2019-01-01 03:00:00 │ │ 2018-12-31 21:00:00 │ 2019-01-01 00:00:00 │ └─────────────────────┴─────────────────────┘ ``` -## See Also +## See Also {#see-also} -- [Type conversion functions](../query_language/functions/type_conversion_functions.md) -- [Functions for working with dates and times](../query_language/functions/date_time_functions.md) -- [Functions for working with arrays](../query_language/functions/array_functions.md) -- [The `date_time_input_format` setting](../operations/settings/settings.md#settings-date_time_input_format) -- [The `timezone` server configuration parameter](../operations/server_settings/settings.md#server_settings-timezone) -- [Operators for working with dates and times](../query_language/operators.md#operators-datetime) -- [The `Date` data type](date.md) +- [Type conversion functions](../query_language/functions/type_conversion_functions.md) +- [Functions for working with dates and times](../query_language/functions/date_time_functions.md) +- [Functions for working with arrays](../query_language/functions/array_functions.md) +- [The `date_time_input_format` setting](../operations/settings/settings.md#settings-date_time_input_format) +- [The `timezone` server configuration 
parameter](../operations/server_settings/settings.md#server_settings-timezone) +- [Operators for working with dates and times](../query_language/operators.md#operators-datetime) +- [The `Date` data type](date.md) [Original article](https://clickhouse.tech/docs/en/data_types/datetime/) diff --git a/docs/en/data_types/datetime64.md b/docs/en/data_types/datetime64.md index f7feabbb996..c61bf0ecda0 100644 --- a/docs/en/data_types/datetime64.md +++ b/docs/en/data_types/datetime64.md @@ -1,89 +1,97 @@ -# DateTime64 {#data_type-datetime64} +# DateTime64 {#data-type-datetime64} Allows to store an instant in time, that can be expressed as a calendar date and a time of a day, with defined sub-second precision Tick size (precision): 10-precision seconds Syntax: -```sql + +``` sql DateTime64(precision, [timezone]) ``` -Internally, stores data as number of 'ticks' since epoch start (1970-01-01 00:00:00 UTC) as Int64. The tick resolution is determined by the precision parameter. Additionally, the `DateTime64` type can store time zone that is the same for the entire column, that affects how the values of the `DateTime64` type values are displayed in text format and how the values specified as strings are parsed ('2020-01-01 05:00:01.000'). The time zone is not stored in the rows of the table (or in resultset), but is stored in the column metadata. See details in [DateTime](datetime.md). +Internally, stores data as a number of ‘ticks’ since epoch start (1970-01-01 00:00:00 UTC) as Int64. The tick resolution is determined by the precision parameter. Additionally, the `DateTime64` type can store time zone that is the same for the entire column, that affects how the values of the `DateTime64` type values are displayed in text format and how the values specified as strings are parsed (‘2020-01-01 05:00:01.000’). The time zone is not stored in the rows of the table (or in resultset), but is stored in the column metadata. See details in [DateTime](datetime.md). -## Examples +## Examples {#examples} **1.** Creating a table with `DateTime64`-type column and inserting data into it: -```sql +``` sql CREATE TABLE dt ( - `timestamp` DateTime64(3, 'Europe/Moscow'), + `timestamp` DateTime64(3, 'Europe/Moscow'), `event_id` UInt8 ) ENGINE = TinyLog ``` -```sql + +``` sql INSERT INTO dt Values (1546300800000, 1), ('2019-01-01 00:00:00', 2) ``` -```sql + +``` sql SELECT * FROM dt ``` -```text + +``` text ┌───────────────timestamp─┬─event_id─┐ │ 2019-01-01 03:00:00.000 │ 1 │ │ 2019-01-01 00:00:00.000 │ 2 │ └─────────────────────────┴──────────┘ ``` -* When inserting datetime as an integer, it is treated as an appropriately scaled Unix Timestamp (UTC). `1546300800000` (with precision 3) represents `'2019-01-01 00:00:00'` UTC. However, as `timestamp` column has `Europe/Moscow` (UTC+3) timezone specified, when outputting as string the value will be shown as `'2019-01-01 03:00:00'` -* When inserting string value as datetime, it is treated as being in column timezone. `'2019-01-01 00:00:00'` will be treated as being in `Europe/Moscow` timezone and stored as `1546290000000`. +- When inserting datetime as an integer, it is treated as an appropriately scaled Unix Timestamp (UTC). `1546300800000` (with precision 3) represents `'2019-01-01 00:00:00'` UTC. However, as `timestamp` column has `Europe/Moscow` (UTC+3) timezone specified, when outputting as a string the value will be shown as `'2019-01-01 03:00:00'` +- When inserting string value as datetime, it is treated as being in column timezone. 
`'2019-01-01 00:00:00'` will be treated as being in `Europe/Moscow` timezone and stored as `1546290000000`. **2.** Filtering on `DateTime64` values -```sql +``` sql SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europe/Moscow') ``` -```text + +``` text ┌───────────────timestamp─┬─event_id─┐ │ 2019-01-01 00:00:00.000 │ 2 │ └─────────────────────────┴──────────┘ ``` + Unlike `DateTime`, `DateTime64` values are not converted from `String` automatically **3.** Getting a time zone for a `DateTime64`-type value: -```sql +``` sql SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS x ``` -```text + +``` text ┌──────────────────column─┬─x──────────────────────────────┐ │ 2019-10-16 04:12:04.000 │ DateTime64(3, 'Europe/Moscow') │ └─────────────────────────┴────────────────────────────────┘ ``` -**4.** Timezone conversion +**4.** Timezone conversion -```sql -SELECT -toDateTime64(timestamp, 3, 'Europe/London') as lon_time, +``` sql +SELECT +toDateTime64(timestamp, 3, 'Europe/London') as lon_time, toDateTime64(timestamp, 3, 'Europe/Moscow') as mos_time FROM dt ``` -```text + +``` text ┌───────────────lon_time──┬────────────────mos_time─┐ │ 2019-01-01 00:00:00.000 │ 2019-01-01 03:00:00.000 │ │ 2018-12-31 21:00:00.000 │ 2019-01-01 00:00:00.000 │ └─────────────────────────┴─────────────────────────┘ ``` -## See Also +## See Also {#see-also} -- [Type conversion functions](../query_language/functions/type_conversion_functions.md) -- [Functions for working with dates and times](../query_language/functions/date_time_functions.md) -- [Functions for working with arrays](../query_language/functions/array_functions.md) -- [The `date_time_input_format` setting](../operations/settings/settings.md#settings-date_time_input_format) -- [The `timezone` server configuration parameter](../operations/server_settings/settings.md#server_settings-timezone) -- [Operators for working with dates and times](../query_language/operators.md#operators-datetime) -- [`Date` data type](date.md) -- [`DateTime` data type](datetime.md) +- [Type conversion functions](../query_language/functions/type_conversion_functions.md) +- [Functions for working with dates and times](../query_language/functions/date_time_functions.md) +- [Functions for working with arrays](../query_language/functions/array_functions.md) +- [The `date_time_input_format` setting](../operations/settings/settings.md#settings-date_time_input_format) +- [The `timezone` server configuration parameter](../operations/server_settings/settings.md#server_settings-timezone) +- [Operators for working with dates and times](../query_language/operators.md#operators-datetime) +- [`Date` data type](date.md) +- [`DateTime` data type](datetime.md) diff --git a/docs/en/data_types/decimal.md b/docs/en/data_types/decimal.md index 8d3112c3dab..fb52fc433d9 100644 --- a/docs/en/data_types/decimal.md +++ b/docs/en/data_types/decimal.md @@ -1,97 +1,101 @@ +# Decimal(P, S), Decimal32(S), Decimal64(S), Decimal128(S) {#decimalp-s-decimal32s-decimal64s-decimal128s} -# Decimal(P, S), Decimal32(S), Decimal64(S), Decimal128(S) +Signed fixed-point numbers that keep precision during add, subtract and multiply operations. For division least significant digits are discarded (not rounded). -Signed fixed point numbers that keep precision during add, subtract and multiply operations. For division least significant digits are discarded (not rounded). +## Parameters {#parameters} -## Parameters - -- P - precision. Valid range: [ 1 : 38 ]. 
Determines how many decimal digits number can have (including fraction). -- S - scale. Valid range: [ 0 : P ]. Determines how many decimal digits fraction can have. +- P - precision. Valid range: \[ 1 : 38 \]. Determines how many decimal digits the number can have (including fraction). +- S - scale. Valid range: \[ 0 : P \]. Determines how many decimal digits the fraction can have. Depending on P parameter value Decimal(P, S) is a synonym for: -- P from [ 1 : 9 ] - for Decimal32(S) -- P from [ 10 : 18 ] - for Decimal64(S) -- P from [ 19 : 38 ] - for Decimal128(S) +- P from \[ 1 : 9 \] - for Decimal32(S) +- P from \[ 10 : 18 \] - for Decimal64(S) +- P from \[ 19 : 38 \] - for Decimal128(S) -## Decimal value ranges +## Decimal value ranges {#decimal-value-ranges} -- Decimal32(S) - ( -1 * 10^(9 - S), 1 * 10^(9 - S) ) -- Decimal64(S) - ( -1 * 10^(18 - S), 1 * 10^(18 - S) ) -- Decimal128(S) - ( -1 * 10^(38 - S), 1 * 10^(38 - S) ) +- Decimal32(S) - ( -1 \* 10^(9 - S), 1 \* 10^(9 - S) ) +- Decimal64(S) - ( -1 \* 10^(18 - S), 1 \* 10^(18 - S) ) +- Decimal128(S) - ( -1 \* 10^(38 - S), 1 \* 10^(38 - S) ) For example, Decimal32(4) can contain numbers from -99999.9999 to 99999.9999 with 0.0001 step. -## Internal representation +## Internal representation {#internal-representation} -Internally data is represented as normal signed integers with respective bit width. Real value ranges that can be stored in memory are a bit larger than specified above, which are checked only on convertion from string. +Internally data is represented as normal signed integers with respective bit width. Real value ranges that can be stored in memory are a bit larger than specified above, which are checked only on conversion from a string. -Because modern CPU's do not support 128 bit integers natively, operations on Decimal128 are emulated. Because of this Decimal128 works signigicantly slower than Decimal32/Decimal64. +Because modern CPUs do not support 128-bit integers natively, operations on Decimal128 are emulated. Because of this Decimal128 works significantly slower than Decimal32/Decimal64. -## Operations and result type +## Operations and result type {#operations-and-result-type} Binary operations on Decimal result in wider result type (with any order of arguments). -- Decimal64(S1) Decimal32(S2) -> Decimal64(S) -- Decimal128(S1) Decimal32(S2) -> Decimal128(S) -- Decimal128(S1) Decimal64(S2) -> Decimal128(S) +- Decimal64(S1) Decimal32(S2) -\> Decimal64(S) +- Decimal128(S1) Decimal32(S2) -\> Decimal128(S) +- Decimal128(S1) Decimal64(S2) -\> Decimal128(S) Rules for scale: -- add, subtract: S = max(S1, S2). -- multuply: S = S1 + S2. -- divide: S = S1. +- add, subtract: S = max(S1, S2). +- multiply: S = S1 + S2. +- divide: S = S1. -For similar operations between Decimal and integers, the result is Decimal of the same size as argument. +For similar operations between Decimal and integers, the result is Decimal of the same size as an argument. -Operations between Decimal and Float32/Float64 are not defined. If you really need them, you can explicitly cast one of argument using toDecimal32, toDecimal64, toDecimal128 or toFloat32, toFloat64 builtins. Keep in mind that the result will lose precision and type conversion is computationally expensive operation. +Operations between Decimal and Float32/Float64 are not defined. If you need them, you can explicitly cast one of the arguments using toDecimal32, toDecimal64, toDecimal128 or toFloat32, toFloat64 builtins.
Keep in mind that the result will lose precision and type conversion is a computationally expensive operation. -Some functions on Decimal return result as Float64 (for example, var or stddev). Intermediate calculations might still be performed in Decimal, which might lead to different results between Float64 and Decimal inputs with same values. +Some functions on Decimal return result as Float64 (for example, var or stddev). Intermediate calculations might still be performed in Decimal, which might lead to different results between Float64 and Decimal inputs with the same values. -## Overflow checks +## Overflow checks {#overflow-checks} -During calculations on Decimal, integer overflows might happen. Excessive digits in fraction are discarded (not rounded). Excessive digits in integer part will lead to exception. +During calculations on Decimal, integer overflows might happen. Excessive digits in a fraction are discarded (not rounded). Excessive digits in integer part will lead to an exception. -```sql +``` sql SELECT toDecimal32(2, 4) AS x, x / 3 ``` -```text + +``` text ┌──────x─┬─divide(toDecimal32(2, 4), 3)─┐ │ 2.0000 │ 0.6666 │ └────────┴──────────────────────────────┘ ``` -```sql +``` sql SELECT toDecimal32(4.2, 8) AS x, x * x ``` -```text + +``` text DB::Exception: Scale is out of bounds. ``` -```sql +``` sql SELECT toDecimal32(4.2, 8) AS x, 6 * x ``` -```text + +``` text DB::Exception: Decimal math overflow. ``` Overflow checks lead to operations slowdown. If it is known that overflows are not possible, it makes sense to disable checks using `decimal_check_overflow` setting. When checks are disabled and overflow happens, the result will be incorrect: -```sql +``` sql SET decimal_check_overflow = 0; SELECT toDecimal32(4.2, 8) AS x, 6 * x ``` -```text + +``` text ┌──────────x─┬─multiply(6, toDecimal32(4.2, 8))─┐ │ 4.20000000 │ -17.74967296 │ └────────────┴──────────────────────────────────┘ ``` -Overflow checks happen not only on arithmetic operations, but also on value comparison: +Overflow checks happen not only on arithmetic operations but also on value comparison: -```sql +``` sql SELECT toDecimal32(1, 8) < 100 ``` -```text + +``` text DB::Exception: Can't compare. ``` diff --git a/docs/en/data_types/domains/ipv4.md b/docs/en/data_types/domains/ipv4.md index f4c76a711c0..8b1b5ab4959 100644 --- a/docs/en/data_types/domains/ipv4.md +++ b/docs/en/data_types/domains/ipv4.md @@ -1,16 +1,16 @@ -## IPv4 +## IPv4 {#ipv4} `IPv4` is a domain based on `UInt32` type and serves as typed replacement for storing IPv4 values. It provides compact storage with human-friendly input-output format, and column type information on inspection. 
-### Basic Usage +### Basic Usage {#basic-usage} -```sql +``` sql CREATE TABLE hits (url String, from IPv4) ENGINE = MergeTree() ORDER BY url; DESCRIBE TABLE hits; ``` -```text +``` text ┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐ │ url │ String │ │ │ │ │ │ from │ IPv4 │ │ │ │ │ @@ -19,19 +19,19 @@ DESCRIBE TABLE hits; OR you can use IPv4 domain as a key: -```sql +``` sql CREATE TABLE hits (url String, from IPv4) ENGINE = MergeTree() ORDER BY from; ``` `IPv4` domain supports custom input format as IPv4-strings: -```sql +``` sql INSERT INTO hits (url, from) VALUES ('https://wikipedia.org', '116.253.40.133')('https://clickhouse.tech', '183.247.232.58')('https://clickhouse.yandex/docs/en/', '116.106.34.242'); SELECT * FROM hits; ``` -```text +``` text ┌─url────────────────────────────────┬───────────from─┐ │ https://clickhouse.tech/docs/en/ │ 116.106.34.242 │ │ https://wikipedia.org │ 116.253.40.133 │ @@ -41,11 +41,11 @@ SELECT * FROM hits; Values are stored in compact binary form: -```sql +``` sql SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; ``` -```text +``` text ┌─toTypeName(from)─┬─hex(from)─┐ │ IPv4 │ B7F7E83A │ └──────────────────┴───────────┘ @@ -54,23 +54,21 @@ SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; Domain values are not implicitly convertible to types other than `UInt32`. If you want to convert `IPv4` value to a string, you have to do that explicitly with `IPv4NumToString()` function: -```sql +``` sql SELECT toTypeName(s), IPv4NumToString(from) as s FROM hits LIMIT 1; ``` -``` -┌─toTypeName(IPv4NumToString(from))─┬─s──────────────┐ -│ String │ 183.247.232.58 │ -└───────────────────────────────────┴────────────────┘ -``` + ┌─toTypeName(IPv4NumToString(from))─┬─s──────────────┐ + │ String │ 183.247.232.58 │ + └───────────────────────────────────┴────────────────┘ Or cast to a `UInt32` value: -```sql +``` sql SELECT toTypeName(i), CAST(from as UInt32) as i FROM hits LIMIT 1; ``` -```text +``` text ┌─toTypeName(CAST(from, 'UInt32'))─┬──────────i─┐ │ UInt32 │ 3086477370 │ └──────────────────────────────────┴────────────┘ diff --git a/docs/en/data_types/domains/ipv6.md b/docs/en/data_types/domains/ipv6.md index 1d5ebf598b3..1006ecd1ef9 100644 --- a/docs/en/data_types/domains/ipv6.md +++ b/docs/en/data_types/domains/ipv6.md @@ -1,16 +1,16 @@ -## IPv6 +## IPv6 {#ipv6} -`IPv6` is a domain based on `FixedString(16)` type and serves as typed replacement for storing IPv6 values. It provides compact storage with human-friendly input-output format, and column type information on inspection. +`IPv6` is a domain based on `FixedString(16)` type and serves as typed replacement for storing IPv6 values. It provides compact storage with human-friendly input-output format, and column type information on inspection. 
-### Basic Usage +### Basic Usage {#basic-usage} -```sql +``` sql CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY url; DESCRIBE TABLE hits; ``` -```text +``` text ┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐ │ url │ String │ │ │ │ │ │ from │ IPv6 │ │ │ │ │ @@ -19,19 +19,19 @@ DESCRIBE TABLE hits; OR you can use `IPv6` domain as a key: -```sql +``` sql CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY from; ``` `IPv6` domain supports custom input as IPv6-strings: -```sql +``` sql INSERT INTO hits (url, from) VALUES ('https://wikipedia.org', '2a02:aa08:e000:3100::2')('https://clickhouse.tech', '2001:44c8:129:2632:33:0:252:2')('https://clickhouse.yandex/docs/en/', '2a02:e980:1e::1'); SELECT * FROM hits; ``` -```text +``` text ┌─url────────────────────────────────┬─from──────────────────────────┐ │ https://clickhouse.tech │ 2001:44c8:129:2632:33:0:252:2 │ │ https://clickhouse.tech/docs/en/ │ 2a02:e980:1e::1 │ @@ -41,11 +41,11 @@ SELECT * FROM hits; Values are stored in compact binary form: -```sql +``` sql SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; ``` -```text +``` text ┌─toTypeName(from)─┬─hex(from)────────────────────────┐ │ IPv6 │ 200144C8012926320033000002520002 │ └──────────────────┴──────────────────────────────────┘ @@ -54,11 +54,11 @@ SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; Domain values are not implicitly convertible to types other than `FixedString(16)`. If you want to convert `IPv6` value to a string, you have to do that explicitly with `IPv6NumToString()` function: -```sql +``` sql SELECT toTypeName(s), IPv6NumToString(from) as s FROM hits LIMIT 1; ``` -```text +``` text ┌─toTypeName(IPv6NumToString(from))─┬─s─────────────────────────────┐ │ String │ 2001:44c8:129:2632:33:0:252:2 │ └───────────────────────────────────┴───────────────────────────────┘ @@ -66,11 +66,11 @@ SELECT toTypeName(s), IPv6NumToString(from) as s FROM hits LIMIT 1; Or cast to a `FixedString(16)` value: -```sql +``` sql SELECT toTypeName(i), CAST(from as FixedString(16)) as i FROM hits LIMIT 1; ``` -```text +``` text ┌─toTypeName(CAST(from, 'FixedString(16)'))─┬─i───────┐ │ FixedString(16) │ ��� │ └───────────────────────────────────────────┴─────────┘ diff --git a/docs/en/data_types/domains/overview.md b/docs/en/data_types/domains/overview.md index 103a48653fc..47eeb644d1a 100644 --- a/docs/en/data_types/domains/overview.md +++ b/docs/en/data_types/domains/overview.md @@ -1,26 +1,26 @@ -# Domains +# Domains {#domains} Domains are special-purpose types, that add some extra features atop of existing base type, leaving on-wire and on-disc format of underlying table intact. At the moment, ClickHouse does not support user-defined domains. You can use domains anywhere corresponding base type can be used: -* Create a column of domain type -* Read/write values from/to domain column -* Use it as index if base type can be used as index -* Call functions with values of domain column -* etc. +- Create a column of domain type +- Read/write values from/to domain column +- Use it as index if base type can be used as index +- Call functions with values of domain column +- etc. 
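As a concrete, hedged illustration of the list above: the sketch below reuses the `IPv4` domain documented earlier in this section. The table and column names are invented for illustration only, and the `IPv4NumToString`/`CAST` calls mirror the examples shown for that domain.

``` sql
-- Hypothetical table: the IPv4 domain column is used exactly where its
-- UInt32 base type could be used, including as the sorting key.
CREATE TABLE visits (visitor IPv4, url String) ENGINE = MergeTree() ORDER BY visitor;

-- The domain accepts human-friendly input...
INSERT INTO visits (url, visitor) VALUES ('https://wikipedia.org', '116.253.40.133');

-- ...and the column can be passed to functions or cast back to its base type.
SELECT IPv4NumToString(visitor) AS readable, CAST(visitor AS UInt32) AS raw FROM visits;
```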
-### Extra Features of Domains +### Extra Features of Domains {#extra-features-of-domains} -* Explicit column type name in `SHOW CREATE TABLE` or `DESCRIBE TABLE` -* Input from human-friendly format with `INSERT INTO domain_table(domain_column) VALUES(...)` -* Output to human-friendly format for `SELECT domain_column FROM domain_table` -* Loading data from external source in human-friendly format: `INSERT INTO domain_table FORMAT CSV ...` +- Explicit column type name in `SHOW CREATE TABLE` or `DESCRIBE TABLE` +- Input from human-friendly format with `INSERT INTO domain_table(domain_column) VALUES(...)` +- Output to human-friendly format for `SELECT domain_column FROM domain_table` +- Loading data from external source in human-friendly format: `INSERT INTO domain_table FORMAT CSV ...` -### Limitations +### Limitations {#limitations} -* Can't convert index column of base type to domain type via `ALTER TABLE`. -* Can't implicitly convert string values into domain values when inserting data from another column or table. -* Domain adds no constrains on stored values. +- Can’t convert index column of base type to domain type via `ALTER TABLE`. +- Can’t implicitly convert string values into domain values when inserting data from another column or table. +- Domain adds no constrains on stored values. -[Original article](https://clickhouse.tech/docs/en/data_types/domains/overview) \ No newline at end of file +[Original article](https://clickhouse.tech/docs/en/data_types/domains/overview) diff --git a/docs/en/data_types/enum.md b/docs/en/data_types/enum.md index 6fee1602e38..200b8990937 100644 --- a/docs/en/data_types/enum.md +++ b/docs/en/data_types/enum.md @@ -1,21 +1,21 @@ -# Enum +# Enum {#enum} -Enumerated type consisting of named values. +Enumerated type consisting of named values. -Named values must be delcared as `'string' = integer` pairs. ClickHouse stores only numbers, but supports operations with the values through their names. +Named values must be declared as `'string' = integer` pairs. ClickHouse stores only numbers, but supports operations with the values through their names. ClickHouse supports: -- 8-bit `Enum`. It can contain up to 256 values enumerated in the `[-128, 127]` range. -- 16-bit `Enum`. It can contain up to 65536 values enumerated in the `[-32768, 32767]` range. +- 8-bit `Enum`. It can contain up to 256 values enumerated in the `[-128, 127]` range. +- 16-bit `Enum`. It can contain up to 65536 values enumerated in the `[-32768, 32767]` range. ClickHouse automatically chooses the type of `Enum` when data is inserted. You can also use `Enum8` or `Enum16` types to be sure in the size of storage. -## Usage examples +## Usage examples {#usage-examples} Here we create a table with an `Enum8('hello' = 1, 'world' = 2)` type column: -```sql +``` sql CREATE TABLE t_enum ( x Enum('hello' = 1, 'world' = 2) @@ -25,26 +25,30 @@ ENGINE = TinyLog Column `x` can only store values that are listed in the type definition: `'hello'` or `'world'`. If you try to save any other value, ClickHouse will raise an exception. 8-bit size for this `Enum` is chosen automatically. -```sql +``` sql INSERT INTO t_enum VALUES ('hello'), ('world'), ('hello') ``` -```text + +``` text Ok. ``` -```sql + +``` sql INSERT INTO t_enum values('a') ``` -```text + +``` text Exception on client: Code: 49. DB::Exception: Unknown element 'a' for type Enum('hello' = 1, 'world' = 2) ``` When you query data from the table, ClickHouse outputs the string values from `Enum`. 
-```sql +``` sql SELECT * FROM t_enum ``` -```text + +``` text ┌─x─────┐ │ hello │ │ world │ @@ -54,10 +58,11 @@ SELECT * FROM t_enum If you need to see the numeric equivalents of the rows, you must cast the `Enum` value to integer type. -```sql +``` sql SELECT CAST(x, 'Int8') FROM t_enum ``` -```text + +``` text ┌─CAST(x, 'Int8')─┐ │ 1 │ │ 2 │ @@ -67,16 +72,17 @@ SELECT CAST(x, 'Int8') FROM t_enum To create an Enum value in a query, you also need to use `CAST`. -```sql +``` sql SELECT toTypeName(CAST('a', 'Enum(\'a\' = 1, \'b\' = 2)')) ``` -```text + +``` text ┌─toTypeName(CAST('a', 'Enum(\'a\' = 1, \'b\' = 2)'))─┐ │ Enum8('a' = 1, 'b' = 2) │ └─────────────────────────────────────────────────────┘ ``` -## General rules and usage +## General rules and usage {#general-rules-and-usage} Each of the values is assigned a number in the range `-128 ... 127` for `Enum8` or in the range `-32768 ... 32767` for `Enum16`. All the strings and numbers must be different. An empty string is allowed. If this type is specified (in a table definition), numbers can be in an arbitrary order. However, the order does not matter. @@ -84,7 +90,7 @@ Neither the string nor the numeric value in an `Enum` can be [NULL](../query_lan An `Enum` can be contained in [Nullable](nullable.md) type. So if you create a table using the query -```sql +``` sql CREATE TABLE t_enum_nullable ( x Nullable( Enum8('hello' = 1, 'world' = 2) ) @@ -94,7 +100,7 @@ ENGINE = TinyLog it can store not only `'hello'` and `'world'`, but `NULL`, as well. -```sql +``` sql INSERT INTO t_enum_nullable Values('hello'),('world'),(NULL) ``` @@ -106,9 +112,9 @@ The implicit default value is the value with the lowest number. During `ORDER BY`, `GROUP BY`, `IN`, `DISTINCT` and so on, Enums behave the same way as the corresponding numbers. For example, ORDER BY sorts them numerically. Equality and comparison operators work the same way on Enums as they do on the underlying numeric values. -Enum values cannot be compared with numbers. Enums can be compared to a constant string. If the string compared to is not a valid value for the Enum, an exception will be thrown. The IN operator is supported with the Enum on the left hand side and a set of strings on the right hand side. The strings are the values of the corresponding Enum. +Enum values cannot be compared with numbers. Enums can be compared to a constant string. If the string compared to is not a valid value for the Enum, an exception will be thrown. The IN operator is supported with the Enum on the left-hand side and a set of strings on the right-hand side. The strings are the values of the corresponding Enum. -Most numeric and string operations are not defined for Enum values, e.g. adding a number to an Enum or concatenating a string to an Enum. +Most numeric and string operations are not defined for Enum values, e.g. adding a number to an Enum or concatenating a string to an Enum. However, the Enum has a natural `toString` function that returns its string value. Enum values are also convertible to numeric types using the `toT` function, where T is a numeric type. When T corresponds to the enum’s underlying numeric type, this conversion is zero-cost. @@ -116,5 +122,4 @@ The Enum type can be changed without cost using ALTER, if only the set of values Using ALTER, it is possible to change an Enum8 to an Enum16 or vice versa, just like changing an Int8 to Int16. 
- [Original article](https://clickhouse.tech/docs/en/data_types/enum/) diff --git a/docs/en/data_types/fixedstring.md b/docs/en/data_types/fixedstring.md index 9c40295bbb8..602741eac43 100644 --- a/docs/en/data_types/fixedstring.md +++ b/docs/en/data_types/fixedstring.md @@ -1,10 +1,10 @@ -# FixedString +# FixedString {#fixedstring} A fixed-length string of `N` bytes (neither characters nor code points). To declare a column of `FixedString` type, use the following syntax: -```sql +``` sql FixedString(N) ``` @@ -14,23 +14,23 @@ The `FixedString` type is efficient when data has the length of precisely `N` by Examples of the values that can be efficiently stored in `FixedString`-typed columns: -- Binary representation of IP addresses (`FixedString(16)` for IPv6). -- Language codes (ru_RU, en_US ... ). -- Currency codes (USD, RUB ... ). -- Binary representation of hashes (`FixedString(16)` for MD5, `FixedString(32)` for SHA256). +- The binary representation of IP addresses (`FixedString(16)` for IPv6). +- Language codes (ru\_RU, en\_US … ). +- Currency codes (USD, RUB … ). +- Binary representation of hashes (`FixedString(16)` for MD5, `FixedString(32)` for SHA256). To store UUID values, use the [UUID](uuid.md) data type. When inserting the data, ClickHouse: -- Complements a string with null bytes if the string contains fewer than `N` bytes. -- Throws the `Too large value for FixedString(N)` exception if the string contains more than `N` bytes. +- Complements a string with null bytes if the string contains fewer than `N` bytes. +- Throws the `Too large value for FixedString(N)` exception if the string contains more than `N` bytes. When selecting the data, ClickHouse does not remove the null bytes at the end of the string. If you use the `WHERE` clause, you should add null bytes manually to match the `FixedString` value. The following example illustrates how to use the `WHERE` clause with `FixedString`. -Let's consider the following table with the single `FixedString(2)` column: +Let’s consider the following table with the single `FixedString(2)` column: -```text +``` text ┌─name──┐ │ b │ └───────┘ @@ -38,18 +38,19 @@ Let's consider the following table with the single `FixedString(2)` column: The query `SELECT * FROM FixedStringTable WHERE a = 'b'` does not return any data as a result. We should complement the filter pattern with null bytes. -```sql +``` sql SELECT * FROM FixedStringTable WHERE a = 'b\0' ``` -```text + +``` text ┌─a─┐ │ b │ └───┘ ``` -This behavior differs from MySQL behavior for the `CHAR` type (where strings are padded with spaces, and the spaces are removed for output). +This behaviour differs from MySQL for the `CHAR` type (where strings are padded with spaces, and the spaces are removed for output). -Note that the length of the `FixedString(N)` value is constant. The [length](../query_language/functions/array_functions.md#array_functions-length) function returns `N` even if the `FixedString(N)` value is filled only with null bytes, but the [empty](../query_language/functions/string_functions.md#string_functions-empty) function returns `1` in this case. +Note that the length of the `FixedString(N)` value is constant. The [length](../query_language/functions/array_functions.md#array_functions-length) function returns `N` even if the `FixedString(N)` value is filled only with null bytes, but the [empty](../query_language/functions/string_functions.md#empty) function returns `1` in this case. 
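To make the last point concrete, here is a small illustrative query (a sketch; it assumes that `toFixedString('', 3)` yields a `FixedString(3)` value padded entirely with null bytes, mirroring the insertion behaviour described above):

``` sql
SELECT
    length(toFixedString('', 3)) AS len,      -- expected to return N, i.e. 3
    empty(toFixedString('', 3)) AS is_empty   -- expected to return 1
```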
[Original article](https://clickhouse.tech/docs/en/data_types/fixedstring/) diff --git a/docs/en/data_types/float.md b/docs/en/data_types/float.md index 1531c8d5722..b04bceae57c 100644 --- a/docs/en/data_types/float.md +++ b/docs/en/data_types/float.md @@ -1,73 +1,80 @@ -# Float32, Float64 +# Float32, Float64 {#float32-float64} [Floating point numbers](https://en.wikipedia.org/wiki/IEEE_754). Types are equivalent to types of C: -- `Float32` - `float` -- `Float64` - `double` +- `Float32` - `float` +- `Float64` - `double` We recommend that you store data in integer form whenever possible. For example, convert fixed precision numbers to integer values, such as monetary amounts or page load times in milliseconds. -## Using Floating-point Numbers +## Using Floating-point Numbers {#using-floating-point-numbers} -- Computations with floating-point numbers might produce a rounding error. +- Computations with floating-point numbers might produce a rounding error. -```sql + + +``` sql SELECT 1 - 0.9 ``` -```text +``` text ┌───────minus(1, 0.9)─┐ │ 0.09999999999999998 │ └─────────────────────┘ ``` -- The result of the calculation depends on the calculation method (the processor type and architecture of the computer system). -- Floating-point calculations might result in numbers such as infinity (`Inf`) and "not-a-number" (`NaN`). This should be taken into account when processing the results of calculations. -- When parsing floating point numbers from text, the result might not be the nearest machine-representable number. +- The result of the calculation depends on the calculation method (the processor type and architecture of the computer system). +- Floating-point calculations might result in numbers such as infinity (`Inf`) and “not-a-number” (`NaN`). This should be taken into account when processing the results of calculations. +- When parsing floating-point numbers from text, the result might not be the nearest machine-representable number. -## NaN and Inf {#data_type-float-nan-inf} +## NaN and Inf {#data-type-float-nan-inf} In contrast to standard SQL, ClickHouse supports the following categories of floating-point numbers: -- `Inf` – Infinity. +- `Inf` – Infinity. -```sql + + +``` sql SELECT 0.5 / 0 ``` -```text +``` text ┌─divide(0.5, 0)─┐ │ inf │ └────────────────┘ ``` -- `-Inf` – Negative infinity. +- `-Inf` – Negative infinity. -```sql + + +``` sql SELECT -0.5 / 0 ``` -```text +``` text ┌─divide(-0.5, 0)─┐ │ -inf │ └─────────────────┘ ``` -- `NaN` – Not a number. +- `NaN` – Not a number. -```sql + + +``` sql SELECT 0 / 0 ``` -```text +``` text ┌─divide(0, 0)─┐ │ nan │ └──────────────┘ ``` - See the rules for `NaN` sorting in the section [ORDER BY clause](../query_language/select.md). - + See the rules for `NaN` sorting in the section [ORDER BY clause](../query_language/select.md). [Original article](https://clickhouse.tech/docs/en/data_types/float/) diff --git a/docs/en/data_types/index.md b/docs/en/data_types/index.md index 17a30842748..0b327e5adb6 100644 --- a/docs/en/data_types/index.md +++ b/docs/en/data_types/index.md @@ -1,8 +1,7 @@ -# Data Types {#data_types} +# Data Types {#data-types} -ClickHouse can store various types of data in table cells. - -This section describes the supported data types and special considerations when using and/or implementing them, if any. +ClickHouse can store various kinds of data in table cells. +This section describes the supported data types and special considerations for using and/or implementing them if any. 
[Original article](https://clickhouse.tech/docs/en/data_types/) diff --git a/docs/en/data_types/int_uint.md b/docs/en/data_types/int_uint.md index ae318445c1a..2864e79394b 100644 --- a/docs/en/data_types/int_uint.md +++ b/docs/en/data_types/int_uint.md @@ -1,21 +1,19 @@ - -# UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64 +# UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64 {#uint8-uint16-uint32-uint64-int8-int16-int32-int64} Fixed-length integers, with or without a sign. -## Int Ranges +## Int Ranges {#int-ranges} -- Int8 - [-128 : 127] -- Int16 - [-32768 : 32767] -- Int32 - [-2147483648 : 2147483647] -- Int64 - [-9223372036854775808 : 9223372036854775807] +- Int8 - \[-128 : 127\] +- Int16 - \[-32768 : 32767\] +- Int32 - \[-2147483648 : 2147483647\] +- Int64 - \[-9223372036854775808 : 9223372036854775807\] -## Uint Ranges - -- UInt8 - [0 : 255] -- UInt16 - [0 : 65535] -- UInt32 - [0 : 4294967295] -- UInt64 - [0 : 18446744073709551615] +## Uint Ranges {#uint-ranges} +- UInt8 - \[0 : 255\] +- UInt16 - \[0 : 65535\] +- UInt32 - \[0 : 4294967295\] +- UInt64 - \[0 : 18446744073709551615\] [Original article](https://clickhouse.tech/docs/en/data_types/int_uint/) diff --git a/docs/en/data_types/nested_data_structures/aggregatefunction.md b/docs/en/data_types/nested_data_structures/aggregatefunction.md index 67520f75670..34ee0f4127d 100644 --- a/docs/en/data_types/nested_data_structures/aggregatefunction.md +++ b/docs/en/data_types/nested_data_structures/aggregatefunction.md @@ -1,4 +1,4 @@ -# AggregateFunction(name, types_of_arguments...) {#data_type-aggregatefunction} +# AggregateFunction(name, types\_of\_arguments…) {#data-type-aggregatefunction} The intermediate state of an aggregate function. To get it, use aggregate functions with the `-State` suffix. To get aggregated data in the future, you must use the same aggregate functions with the `-Merge`suffix. @@ -6,15 +6,15 @@ The intermediate state of an aggregate function. To get it, use aggregate functi **Parameters** -- Name of the aggregate function. +- Name of the aggregate function. - If the function is parametric specify its parameters too. + If the function is parametric specify its parameters too. -- Types of the aggregate function arguments. +- Types of the aggregate function arguments. **Example** -```sql +``` sql CREATE TABLE t ( column1 AggregateFunction(uniq, UInt64), @@ -25,24 +25,24 @@ CREATE TABLE t [uniq](../../query_language/agg_functions/reference.md#agg_function-uniq), anyIf ([any](../../query_language/agg_functions/reference.md#agg_function-any)+[If](../../query_language/agg_functions/combinators.md#agg-functions-combinator-if)) and [quantiles](../../query_language/agg_functions/reference.md) are the aggregate functions supported in ClickHouse. -## Usage +## Usage {#usage} -### Data Insertion +### Data Insertion {#data-insertion} To insert data, use `INSERT SELECT` with aggregate `-State`- functions. **Function examples** -```sql +``` sql uniqState(UserID) quantilesState(0.5, 0.9)(SendTiming) ``` -In contrast to the corresponding functions `uniq` and `quantiles`, `-State`- functions return the state, instead the final value. In other words, they return a value of `AggregateFunction` type. +In contrast to the corresponding functions `uniq` and `quantiles`, `-State`- functions return the state, instead of the final value. In other words, they return a value of `AggregateFunction` type. 
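One way to observe this is to ask for the type directly. The following illustrative query uses the `numbers` table function; the exact type name printed may differ between versions:

``` sql
-- uniqState returns an intermediate aggregation state, not a number.
SELECT toTypeName(uniqState(number)) FROM numbers(10)
```

This should report something like `AggregateFunction(uniq, UInt64)` rather than a plain integer type.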
-In the results of `SELECT` query the values of `AggregateFunction` type have implementation-specific binary representation for all of the ClickHouse output formats. If dump data into, for example, `TabSeparated` format with `SELECT` query then this dump can be loaded back using `INSERT` query. +In the results of a `SELECT` query, the values of `AggregateFunction` type have an implementation-specific binary representation for all of the ClickHouse output formats. If you dump data into, for example, the `TabSeparated` format with a `SELECT` query, then this dump can be loaded back using an `INSERT` query. -### Data Selection +### Data Selection {#data-selection} When selecting data from an `AggregatingMergeTree` table, use the `GROUP BY` clause and the same aggregate functions as when inserting data, but using the `-Merge` suffix. @@ -50,15 +50,14 @@ An aggregate function with `-Merge` suffix takes a set of states, combines them, For example, the following two queries return the same result: -```sql +``` sql SELECT uniq(UserID) FROM table SELECT uniqMerge(state) FROM (SELECT uniqState(UserID) AS state FROM table GROUP BY RegionID) ``` -## Usage Example +## Usage Example {#usage-example} See [AggregatingMergeTree](../../operations/table_engines/aggregatingmergetree.md) engine description. - [Original article](https://clickhouse.tech/docs/en/data_types/nested_data_structures/aggregatefunction/) diff --git a/docs/en/data_types/nested_data_structures/index.md b/docs/en/data_types/nested_data_structures/index.md index 7815b8229b9..f6010f804e9 100644 --- a/docs/en/data_types/nested_data_structures/index.md +++ b/docs/en/data_types/nested_data_structures/index.md @@ -1,4 +1,3 @@ -# Nested Data Structures - +# Nested Data Structures {#nested-data-structures} [Original article](https://clickhouse.tech/docs/en/data_types/nested_data_structures/) diff --git a/docs/en/data_types/nested_data_structures/nested.md b/docs/en/data_types/nested_data_structures/nested.md index 727150ce841..2e40fac00ed 100644 --- a/docs/en/data_types/nested_data_structures/nested.md +++ b/docs/en/data_types/nested_data_structures/nested.md @@ -1,10 +1,10 @@ -# Nested(Name1 Type1, Name2 Type2, ...) +# Nested(Name1 Type1, Name2 Type2, …) {#nestedname1-type1-name2-type2} A nested data structure is like a nested table. The parameters of a nested data structure – the column names and types – are specified the same way as in a CREATE query. Each table row can correspond to any number of rows in a nested data structure. Example: 
@@ -35,7 +35,7 @@ In most cases, when working with a nested data structure, its individual columns Example: -```sql +``` sql SELECT Goals.ID, Goals.EventTime @@ -44,7 +44,7 @@ WHERE CounterID = 101500 AND length(Goals.ID) < 5 LIMIT 10 ``` -```text +``` text ┌─Goals.ID───────────────────────┬─Goals.EventTime───────────────────────────────────────────────────────────────────────────┐ │ [1073752,591325,591325] │ ['2014-03-17 16:38:10','2014-03-17 16:38:48','2014-03-17 16:42:27'] │ │ [1073752] │ ['2014-03-17 00:28:25'] │ @@ -61,9 +61,9 @@ LIMIT 10 It is easiest to think of a nested data structure as a set of multiple column arrays of the same length. -The only place where a SELECT query can specify the name of an entire nested data structure instead of individual columns is the ARRAY JOIN clause. For more information, see "ARRAY JOIN clause". Example: +The only place where a SELECT query can specify the name of an entire nested data structure instead of individual columns is the ARRAY JOIN clause. For more information, see “ARRAY JOIN clause”. Example: -```sql +``` sql SELECT Goal.ID, Goal.EventTime @@ -73,7 +73,7 @@ WHERE CounterID = 101500 AND length(Goals.ID) < 5 LIMIT 10 ``` -```text +``` text ┌─Goal.ID─┬──────Goal.EventTime─┐ │ 1073752 │ 2014-03-17 16:38:10 │ │ 591325 │ 2014-03-17 16:38:48 │ @@ -88,7 +88,7 @@ LIMIT 10 └─────────┴─────────────────────┘ ``` -You can't perform SELECT for an entire nested data structure. You can only explicitly list individual columns that are part of it. +You can’t perform SELECT for an entire nested data structure. You can only explicitly list individual columns that are part of it. For an INSERT query, you should pass all the component column arrays of a nested data structure separately (as if they were individual column arrays). During insertion, the system checks that they have the same length. @@ -96,5 +96,4 @@ For a DESCRIBE query, the columns in a nested data structure are listed separate The ALTER query is very limited for elements in a nested data structure. - [Original article](https://clickhouse.tech/docs/en/data_types/nested_data_structures/nested/) diff --git a/docs/en/data_types/nullable.md b/docs/en/data_types/nullable.md index 3974091b4ce..e6c855860d2 100644 --- a/docs/en/data_types/nullable.md +++ b/docs/en/data_types/nullable.md @@ -1,32 +1,35 @@ -# Nullable(TypeName) {#data_type-nullable} +# Nullable(TypeName) {#data-type-nullable} -Allows to store special marker ([NULL](../query_language/syntax.md)) that denotes "missing value" alongside normal values allowed by `TypeName`. For example, a `Nullable(Int8)` type column can store `Int8` type values, and the rows that don't have a value will store `NULL`. +Allows to store special marker ([NULL](../query_language/syntax.md)) that denotes “missing value” alongside normal values allowed by `TypeName`. For example, a `Nullable(Int8)` type column can store `Int8` type values, and the rows that don’t have a value will store `NULL`. -For a `TypeName`, you can't use composite data types [Array](array.md) and [Tuple](tuple.md). Composite data types can contain `Nullable` type values, such as `Array(Nullable(Int8))`. +For a `TypeName`, you can’t use composite data types [Array](array.md) and [Tuple](tuple.md). Composite data types can contain `Nullable` type values, such as `Array(Nullable(Int8))`. -A `Nullable` type field can't be included in table indexes. +A `Nullable` type field can’t be included in table indexes. 
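As a quick illustration of the composite-type rule above (a sketch; the inferred element type may differ from `Int8`):

``` sql
-- An array is allowed to contain Nullable elements.
SELECT toTypeName([1, NULL]) AS t
```

This should report an `Array(Nullable(...))` type. The reverse, declaring a column as `Nullable(Array(Int8))`, is what the restriction above forbids.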
`NULL` is the default value for any `Nullable` type, unless specified otherwise in the ClickHouse server configuration. -## Storage features +## Storage features {#storage-features} -To store `Nullable` type values in table column, ClickHouse uses a separate file with `NULL` masks in addition to normal file with values. Entries in masks file allow ClickHouse to distinguish between `NULL` and default value of corresponding data type for each table row. Because of additional file, `Nullable` column consumes additional storage space compared to similar normal one. +To store `Nullable` type values in a table column, ClickHouse uses a separate file with `NULL` masks in addition to normal file with values. Entries in masks file allow ClickHouse to distinguish between `NULL` and a default value of corresponding data type for each table row. Because of an additional file, `Nullable` column consumes additional storage space compared to a similar normal one. !!! info "Note" Using `Nullable` almost always negatively affects performance, keep this in mind when designing your databases. -## Usage example +## Usage example {#usage-example} -```sql +``` sql CREATE TABLE t_null(x Int8, y Nullable(Int8)) ENGINE TinyLog ``` -```sql + +``` sql INSERT INTO t_null VALUES (1, NULL), (2, 3) ``` -```sql + +``` sql SELECT x + y FROM t_null ``` -```text + +``` text ┌─plus(x, y)─┐ │ ᴺᵁᴸᴸ │ │ 5 │ diff --git a/docs/en/data_types/special_data_types/expression.md b/docs/en/data_types/special_data_types/expression.md index e3d461e94d2..4eaaa3a6542 100644 --- a/docs/en/data_types/special_data_types/expression.md +++ b/docs/en/data_types/special_data_types/expression.md @@ -1,6 +1,5 @@ -# Expression +# Expression {#expression} Used for representing lambda expressions in high-order functions. - [Original article](https://clickhouse.tech/docs/en/data_types/special_data_types/expression/) diff --git a/docs/en/data_types/special_data_types/index.md b/docs/en/data_types/special_data_types/index.md index 14c2406edc0..6930cb9220d 100644 --- a/docs/en/data_types/special_data_types/index.md +++ b/docs/en/data_types/special_data_types/index.md @@ -1,6 +1,5 @@ -# Special Data Types - -Special data type values can't be saved to a table or output in results, but are used as the intermediate result of running a query. +# Special Data Types {#special-data-types} +Special data type values can’t be saved to a table or output in results, but can be used as the intermediate result when running a query. [Original article](https://clickhouse.tech/docs/en/data_types/special_data_types/) diff --git a/docs/en/data_types/special_data_types/interval.md b/docs/en/data_types/special_data_types/interval.md index b0d152e69bd..372648653b1 100644 --- a/docs/en/data_types/special_data_types/interval.md +++ b/docs/en/data_types/special_data_types/interval.md @@ -3,30 +3,31 @@ The family of data types representing time and date intervals. The resulting types of the [INTERVAL](../../query_language/operators.md#operator-interval) operator. !!! warning "Warning" - You can't use `Interval` data types for storing values in tables. + `Interval` data type values can’t be stored in tables. Structure: -- Time interval as an unsigned integer value. -- Type of an interval. +- Time interval as an unsigned integer value. +- Type of an interval. 
Supported interval types: -- `SECOND` -- `MINUTE` -- `HOUR` -- `DAY` -- `WEEK` -- `MONTH` -- `QUARTER` -- `YEAR` +- `SECOND` +- `MINUTE` +- `HOUR` +- `DAY` +- `WEEK` +- `MONTH` +- `QUARTER` +- `YEAR` For each interval type, there is a separate data type. For example, the `DAY` interval is expressed as the `IntervalDay` data type: -```sql +``` sql SELECT toTypeName(INTERVAL 4 DAY) ``` -```text + +``` text ┌─toTypeName(toIntervalDay(4))─┐ │ IntervalDay │ └──────────────────────────────┘ @@ -36,23 +37,25 @@ SELECT toTypeName(INTERVAL 4 DAY) You can use `Interval`-type values in arithmetical operations with [Date](../../data_types/date.md) and [DateTime](../../data_types/datetime.md)-type values. For example, you can add 4 days to the current time: -```sql +``` sql SELECT now() as current_date_time, current_date_time + INTERVAL 4 DAY ``` -```text + +``` text ┌───current_date_time─┬─plus(now(), toIntervalDay(4))─┐ │ 2019-10-23 10:58:45 │ 2019-10-27 10:58:45 │ └─────────────────────┴───────────────────────────────┘ ``` -Intervals with different types can't be combined. You can't use intervals like `4 DAY 1 HOUR`. Express intervals in units that are smaller or equal to the smallest unit of the interval, for example, the interval `1 day and an hour` interval can be expressed as `25 HOUR` or `90000 SECOND`. +Intervals with different types can’t be combined. You can’t use intervals like `4 DAY 1 HOUR`. Express intervals in units that are smaller or equal to the smallest unit of the interval, for example, the interval `1 day and an hour` interval can be expressed as `25 HOUR` or `90000 SECOND`. -You can't perform arithmetical operations with `Interval`-type values, but you can add intervals of different types consequently to values in `Date` or `DateTime` data types. For example: +You can’t perform arithmetical operations with `Interval`-type values, but you can add intervals of different types consequently to values in `Date` or `DateTime` data types. For example: -```sql +``` sql SELECT now() AS current_date_time, current_date_time + INTERVAL 4 DAY + INTERVAL 3 HOUR ``` -```text + +``` text ┌───current_date_time─┬─plus(plus(now(), toIntervalDay(4)), toIntervalHour(3))─┐ │ 2019-10-23 11:16:28 │ 2019-10-27 14:16:28 │ └─────────────────────┴────────────────────────────────────────────────────────┘ @@ -60,15 +63,16 @@ SELECT now() AS current_date_time, current_date_time + INTERVAL 4 DAY + INTERVAL The following query causes an exception: -```sql +``` sql select now() AS current_date_time, current_date_time + (INTERVAL 4 DAY + INTERVAL 3 HOUR) ``` -```text + +``` text Received exception from server (version 19.14.1): -Code: 43. DB::Exception: Received from localhost:9000. DB::Exception: Wrong argument types for function plus: if one argument is Interval, then another must be Date or DateTime.. +Code: 43. DB::Exception: Received from localhost:9000. DB::Exception: Wrong argument types for function plus: if one argument is Interval, then another must be Date or DateTime.. 
``` -## See Also +## See Also {#see-also} -- [INTERVAL](../../query_language/operators.md#operator-interval) operator -- [toInterval](../../query_language/functions/type_conversion_functions.md#function-tointerval) type convertion functions +- [INTERVAL](../../query_language/operators.md#operator-interval) operator +- [toInterval](../../query_language/functions/type_conversion_functions.md#function-tointerval) type conversion functions diff --git a/docs/en/data_types/special_data_types/nothing.md b/docs/en/data_types/special_data_types/nothing.md index 3540ecf861d..2d4948d5194 100644 --- a/docs/en/data_types/special_data_types/nothing.md +++ b/docs/en/data_types/special_data_types/nothing.md @@ -1,20 +1,19 @@ +# Nothing {#nothing} -# Nothing - -The only purpose of this data type is to represent cases where value is not expected. So you can't create a `Nothing` type value. +The only purpose of this data type is to represent cases where a value is not expected. So you can’t create a `Nothing` type value. For example, literal [NULL](../../query_language/syntax.md#null-literal) has type of `Nullable(Nothing)`. See more about [Nullable](../../data_types/nullable.md). The `Nothing` type can also be used to denote empty arrays: -```sql +``` sql SELECT toTypeName(array()) ``` -```text + +``` text ┌─toTypeName(array())─┐ │ Array(Nothing) │ └─────────────────────┘ ``` - [Original article](https://clickhouse.tech/docs/en/data_types/special_data_types/nothing/) diff --git a/docs/en/data_types/special_data_types/set.md b/docs/en/data_types/special_data_types/set.md index 560a4b4d01f..37489c85d74 100644 --- a/docs/en/data_types/special_data_types/set.md +++ b/docs/en/data_types/special_data_types/set.md @@ -1,6 +1,5 @@ -# Set - -Used for the right half of an IN expression. +# Set {#set} +Used for the right half of an [IN](../../query_language/select.md#select-in-operators) expression. [Original article](https://clickhouse.tech/docs/en/data_types/special_data_types/set/) diff --git a/docs/en/data_types/string.md b/docs/en/data_types/string.md index 59dd88dbf73..de3e74d96da 100644 --- a/docs/en/data_types/string.md +++ b/docs/en/data_types/string.md @@ -1,15 +1,13 @@ - -# String +# String {#string} Strings of an arbitrary length. The length is not limited. The value can contain an arbitrary set of bytes, including null bytes. The String type replaces the types VARCHAR, BLOB, CLOB, and others from other DBMSs. -## Encodings +## Encodings {#encodings} -ClickHouse doesn't have the concept of encodings. Strings can contain an arbitrary set of bytes, which are stored and output as-is. +ClickHouse doesn’t have the concept of encodings. Strings can contain an arbitrary set of bytes, which are stored and output as-is. If you need to store texts, we recommend using UTF-8 encoding. At the very least, if your terminal uses UTF-8 (as recommended), you can read and write your values without making conversions. Similarly, certain functions for working with strings have separate variations that work under the assumption that the string contains a set of bytes representing a UTF-8 encoded text. -For example, the 'length' function calculates the string length in bytes, while the 'lengthUTF8' function calculates the string length in Unicode code points, assuming that the value is UTF-8 encoded. - +For example, the ‘length’ function calculates the string length in bytes, while the ‘lengthUTF8’ function calculates the string length in Unicode code points, assuming that the value is UTF-8 encoded. 
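For example, the difference is easy to see with a short illustrative query (the literal below is three Cyrillic letters, each of which occupies two bytes in UTF-8):

``` sql
SELECT
    length('абв') AS bytes,           -- byte length: expected 6
    lengthUTF8('абв') AS code_points  -- Unicode code points: expected 3
```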
[Original article](https://clickhouse.tech/docs/en/data_types/string/) diff --git a/docs/en/data_types/tuple.md b/docs/en/data_types/tuple.md index 17acf696813..2f9d449b207 100644 --- a/docs/en/data_types/tuple.md +++ b/docs/en/data_types/tuple.md @@ -1,5 +1,4 @@ - -# Tuple(T1, T2, ...) +# Tuple(T1, T2, …) {#tuplet1-t2} A tuple of elements, each having an individual [type](index.md#data_types). @@ -7,39 +6,40 @@ Tuples are used for temporary column grouping. Columns can be grouped when an IN Tuples can be the result of a query. In this case, for text formats other than JSON, values are comma-separated in brackets. In JSON formats, tuples are output as arrays (in square brackets). -## Creating a tuple +## Creating a tuple {#creating-a-tuple} You can use a function to create a tuple: -```sql +``` sql tuple(T1, T2, ...) ``` Example of creating a tuple: -```sql +``` sql SELECT tuple(1,'a') AS x, toTypeName(x) ``` -```text + +``` text ┌─x───────┬─toTypeName(tuple(1, 'a'))─┐ │ (1,'a') │ Tuple(UInt8, String) │ └─────────┴───────────────────────────┘ ``` -## Working with data types +## Working with data types {#working-with-data-types} When creating a tuple on the fly, ClickHouse automatically detects the type of each argument as the minimum of the types which can store the argument value. If the argument is [NULL](../query_language/syntax.md#null-literal), the type of the tuple element is [Nullable](nullable.md). Example of automatic data type detection: -```sql +``` sql SELECT tuple(1, NULL) AS x, toTypeName(x) ``` -```text + +``` text ┌─x────────┬─toTypeName(tuple(1, NULL))──────┐ │ (1,NULL) │ Tuple(UInt8, Nullable(Nothing)) │ └──────────┴─────────────────────────────────┘ ``` - [Original article](https://clickhouse.tech/docs/en/data_types/tuple/) diff --git a/docs/en/data_types/uuid.md b/docs/en/data_types/uuid.md index c5ace976ef9..a04ae21755a 100644 --- a/docs/en/data_types/uuid.md +++ b/docs/en/data_types/uuid.md @@ -2,38 +2,41 @@ A universally unique identifier (UUID) is a 16-byte number used to identify records. For detailed information about the UUID, see [Wikipedia](https://en.wikipedia.org/wiki/Universally_unique_identifier). -The example of UUID type value is represented below: +The example of UUID type value is represented below: -```text +``` text 61f0c404-5cb3-11e7-907b-a6006ad3dba0 ``` If you do not specify the UUID column value when inserting a new record, the UUID value is filled with zero: -```text +``` text 00000000-0000-0000-0000-000000000000 ``` -## How to generate +## How to generate {#how-to-generate} To generate the UUID value, ClickHouse provides the [generateUUIDv4](../query_language/functions/uuid_functions.md) function. -## Usage example +## Usage example {#usage-example} **Example 1** This example demonstrates creating a table with the UUID type column and inserting a value into the table. -```sql +``` sql CREATE TABLE t_uuid (x UUID, y String) ENGINE=TinyLog ``` -```sql + +``` sql INSERT INTO t_uuid SELECT generateUUIDv4(), 'Example 1' ``` -```sql + +``` sql SELECT * FROM t_uuid ``` -```text + +``` text ┌────────────────────────────────────x─┬─y─────────┐ │ 417ddc5d-e556-4d27-95dd-a34d84e46a50 │ Example 1 │ └──────────────────────────────────────┴───────────┘ @@ -43,20 +46,22 @@ SELECT * FROM t_uuid In this example, the UUID column value is not specified when inserting a new record. 
-```sql +``` sql INSERT INTO t_uuid (y) VALUES ('Example 2') ``` -```sql + +``` sql SELECT * FROM t_uuid ``` -```text + +``` text ┌────────────────────────────────────x─┬─y─────────┐ │ 417ddc5d-e556-4d27-95dd-a34d84e46a50 │ Example 1 │ │ 00000000-0000-0000-0000-000000000000 │ Example 2 │ └──────────────────────────────────────┴───────────┘ ``` -## Restrictions +## Restrictions {#restrictions} The UUID data type only supports functions which [String](string.md) data type also supports (for example, [min](../query_language/agg_functions/reference.md#agg_function-min), [max](../query_language/agg_functions/reference.md#agg_function-max), and [count](../query_language/agg_functions/reference.md#agg_function-count)). diff --git a/docs/en/database_engines/index.md b/docs/en/database_engines/index.md index 184cd29982b..d9c7060fe93 100644 --- a/docs/en/database_engines/index.md +++ b/docs/en/database_engines/index.md @@ -1,4 +1,4 @@ -# Database Engines +# Database Engines {#database-engines} Database engines allow you to work with tables. @@ -6,8 +6,8 @@ By default, ClickHouse uses its native database engine, which provides configura You can also use the following database engines: -- [MySQL](mysql.md) +- [MySQL](mysql.md) -- [Lazy](lazy.md) +- [Lazy](lazy.md) [Original article](https://clickhouse.tech/docs/en/database_engines/) diff --git a/docs/en/database_engines/lazy.md b/docs/en/database_engines/lazy.md index d552fbc4589..e9e2bd1608e 100644 --- a/docs/en/database_engines/lazy.md +++ b/docs/en/database_engines/lazy.md @@ -1,13 +1,11 @@ -# Lazy +# Lazy {#lazy} Keeps tables in RAM only `expiration_time_in_seconds` seconds after last access. Can be used only with \*Log tables. -It's optimized for storing many small \*Log tables, for which there is a long time interval between accesses. +It’s optimized for storing many small \*Log tables, for which there is a long time interval between accesses. -## Creating a Database +## Creating a Database {#creating-a-database} -``` -CREATE DATABASE testlazy ENGINE = Lazy(expiration_time_in_seconds); -``` + CREATE DATABASE testlazy ENGINE = Lazy(expiration_time_in_seconds); [Original article](https://clickhouse.tech/docs/en/database_engines/lazy/) diff --git a/docs/en/database_engines/mysql.md b/docs/en/database_engines/mysql.md index fc08c2cc146..9becb908f76 100644 --- a/docs/en/database_engines/mysql.md +++ b/docs/en/database_engines/mysql.md @@ -1,4 +1,4 @@ -# MySQL +# MySQL {#mysql} Allows to connect to databases on a remote MySQL server and perform `INSERT` and `SELECT` queries to exchange data between ClickHouse and MySQL. @@ -6,12 +6,11 @@ The `MySQL` database engine translate queries to the MySQL server so you can per You cannot perform the following queries: -- `RENAME` -- `CREATE TABLE` -- `ALTER` +- `RENAME` +- `CREATE TABLE` +- `ALTER` - -## Creating a Database +## Creating a Database {#creating-a-database} ``` sql CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] @@ -20,40 +19,38 @@ ENGINE = MySQL('host:port', 'database', 'user', 'password') **Engine Parameters** -- `host:port` — MySQL server address. -- `database` — Remote database name. -- `user` — MySQL user. -- `password` — User password. +- `host:port` — MySQL server address. +- `database` — Remote database name. +- `user` — MySQL user. +- `password` — User password. 
+## Data Types Support {#data-types-support} -## Data Types Support - -MySQL | ClickHouse -------|------------ -UNSIGNED TINYINT | [UInt8](../data_types/int_uint.md) -TINYINT | [Int8](../data_types/int_uint.md) -UNSIGNED SMALLINT | [UInt16](../data_types/int_uint.md) -SMALLINT | [Int16](../data_types/int_uint.md) -UNSIGNED INT, UNSIGNED MEDIUMINT | [UInt32](../data_types/int_uint.md) -INT, MEDIUMINT | [Int32](../data_types/int_uint.md) -UNSIGNED BIGINT | [UInt64](../data_types/int_uint.md) -BIGINT | [Int64](../data_types/int_uint.md) -FLOAT | [Float32](../data_types/float.md) -DOUBLE | [Float64](../data_types/float.md) -DATE | [Date](../data_types/date.md) -DATETIME, TIMESTAMP | [DateTime](../data_types/datetime.md) -BINARY | [FixedString](../data_types/fixedstring.md) +| MySQL | ClickHouse | +|----------------------------------|---------------------------------------------| +| UNSIGNED TINYINT | [UInt8](../data_types/int_uint.md) | +| TINYINT | [Int8](../data_types/int_uint.md) | +| UNSIGNED SMALLINT | [UInt16](../data_types/int_uint.md) | +| SMALLINT | [Int16](../data_types/int_uint.md) | +| UNSIGNED INT, UNSIGNED MEDIUMINT | [UInt32](../data_types/int_uint.md) | +| INT, MEDIUMINT | [Int32](../data_types/int_uint.md) | +| UNSIGNED BIGINT | [UInt64](../data_types/int_uint.md) | +| BIGINT | [Int64](../data_types/int_uint.md) | +| FLOAT | [Float32](../data_types/float.md) | +| DOUBLE | [Float64](../data_types/float.md) | +| DATE | [Date](../data_types/date.md) | +| DATETIME, TIMESTAMP | [DateTime](../data_types/datetime.md) | +| BINARY | [FixedString](../data_types/fixedstring.md) | All other MySQL data types are converted into [String](../data_types/string.md). [Nullable](../data_types/nullable.md) is supported. - -## Examples of Use +## Examples of Use {#examples-of-use} Table in MySQL: -```text +``` text mysql> USE test; Database changed @@ -77,42 +74,51 @@ mysql> select * from mysql_table; Database in ClickHouse, exchanging data with the MySQL server: -```sql +``` sql CREATE DATABASE mysql_db ENGINE = MySQL('localhost:3306', 'test', 'my_user', 'user_password') ``` -```sql + +``` sql SHOW DATABASES ``` -```text + +``` text ┌─name─────┐ │ default │ │ mysql_db │ │ system │ └──────────┘ ``` -```sql + +``` sql SHOW TABLES FROM mysql_db ``` -```text + +``` text ┌─name─────────┐ │ mysql_table │ └──────────────┘ ``` -```sql + +``` sql SELECT * FROM mysql_db.mysql_table ``` -```text + +``` text ┌─int_id─┬─value─┐ │ 1 │ 2 │ └────────┴───────┘ ``` -```sql + +``` sql INSERT INTO mysql_db.mysql_table VALUES (3,4) ``` -```sql + +``` sql SELECT * FROM mysql_db.mysql_table ``` -```text + +``` text ┌─int_id─┬─value─┐ │ 1 │ 2 │ │ 3 │ 4 │ diff --git a/docs/en/development/architecture.md b/docs/en/development/architecture.md index 28ddd71f4a2..e9a0c6257cc 100644 --- a/docs/en/development/architecture.md +++ b/docs/en/development/architecture.md @@ -1,73 +1,72 @@ -# Overview of ClickHouse Architecture +# Overview of ClickHouse Architecture {#overview-of-clickhouse-architecture} -ClickHouse is a true column-oriented DBMS. Data is stored by columns, and during the execution of arrays (vectors or chunks of columns). Whenever possible, operations are dispatched on arrays, rather than on individual values. This is called "vectorized query execution," and it helps lower the cost of actual data processing. +ClickHouse is a true column-oriented DBMS. Data is stored by columns and during the execution of arrays (vectors or chunks of columns). 
Whenever possible, operations are dispatched on arrays, rather than on individual values. This is called “vectorized query execution,” and it helps lower the cost of actual data processing. -> This idea is nothing new. It dates back to the `APL` programming language and its descendants: `A +`, `J`, `K`, and `Q`. Array programming is used in scientific data processing. Neither is this idea something new in relational databases: for example, it is used in the `Vectorwise` system. +> This idea is nothing new. It dates back to the `APL` programming language and its descendants: `A +`, `J`, `K`, and `Q`. Array programming is used in the scientific data processing. Neither is this idea something new in relational databases: for example, it is used in the `Vectorwise` system. -There are two different approaches for speeding up the query processing: vectorized query execution and runtime code generation. In the latter, the code is generated for every kind of query on the fly, removing all indirection and dynamic dispatch. Neither of these approaches is strictly better than the other. Runtime code generation can be better when it fuses many operations together, thus fully utilizing CPU execution units and the pipeline. Vectorized query execution can be less practical, because it involves temporary vectors that must be written to the cache and read back. If the temporary data does not fit in the L2 cache, this becomes an issue. But vectorized query execution more easily utilizes the SIMD capabilities of the CPU. A [research paper](http://15721.courses.cs.cmu.edu/spring2016/papers/p5-sompolski.pdf) written by our friends shows that it is better to combine both approaches. ClickHouse uses vectorized query execution and has limited initial support for runtime code generation. +There are two different approaches for speeding up query processing: vectorized query execution and runtime code generation. In the latter, the code is generated for every kind of query on the fly, removing all indirection and dynamic dispatch. Neither of these approaches is strictly better than the other. Runtime code generation can be better when it fuses many operations together, thus fully utilizing CPU execution units and the pipeline. Vectorized query execution can be less practical because it involves temporary vectors that must be written to the cache and read back. If the temporary data does not fit in the L2 cache, this becomes an issue. But vectorized query execution more easily utilizes the SIMD capabilities of the CPU. A [research paper](http://15721.courses.cs.cmu.edu/spring2016/papers/p5-sompolski.pdf) written by our friends shows that it is better to combine both approaches. ClickHouse uses vectorized query execution and has limited initial support for runtime code generation. -## Columns +## Columns {#columns} To represent columns in memory (actually, chunks of columns), the `IColumn` interface is used. This interface provides helper methods for implementation of various relational operators. Almost all operations are immutable: they do not modify the original column, but create a new modified one. For example, the `IColumn :: filter` method accepts a filter byte mask. It is used for the `WHERE` and `HAVING` relational operators. Additional examples: the `IColumn :: permute` method to support `ORDER BY`, the `IColumn :: cut` method to support `LIMIT`, and so on. -Various `IColumn` implementations (`ColumnUInt8`, `ColumnString` and so on) are responsible for the memory layout of columns. 
Memory layout is usually a contiguous array. For the integer type of columns it is just one contiguous array, like `std :: vector`. For `String` and `Array` columns, it is two vectors: one for all array elements, placed contiguously, and a second one for offsets to the beginning of each array. There is also `ColumnConst` that stores just one value in memory, but looks like a column. +Various `IColumn` implementations (`ColumnUInt8`, `ColumnString` and so on) are responsible for the memory layout of columns. The memory layout is usually a contiguous array. For the integer type of columns, it is just one contiguous array, like `std :: vector`. For `String` and `Array` columns, it is two vectors: one for all array elements, placed contiguously, and a second one for offsets to the beginning of each array. There is also `ColumnConst` that stores just one value in memory, but looks like a column. -## Field +## Field {#field} -Nevertheless, it is possible to work with individual values as well. To represent an individual value, the `Field` is used. `Field` is just a discriminated union of `UInt64`, `Int64`, `Float64`, `String` and `Array`. `IColumn` has the `operator[]` method to get the n-th value as a `Field`, and the `insert` method to append a `Field` to the end of a column. These methods are not very efficient, because they require dealing with temporary `Field` objects representing an individual value. There are more efficient methods, such as `insertFrom`, `insertRangeFrom`, and so on. +Nevertheless, it is possible to work with individual values as well. To represent an individual value, the `Field` is used. `Field` is just a discriminated union of `UInt64`, `Int64`, `Float64`, `String` and `Array`. `IColumn` has the `operator[]` method to get the n-th value as a `Field` and the `insert` method to append a `Field` to the end of a column. These methods are not very efficient, because they require dealing with temporary `Field` objects representing an individual value. There are more efficient methods, such as `insertFrom`, `insertRangeFrom`, and so on. -`Field` doesn't have enough information about a specific data type for a table. For example, `UInt8`, `UInt16`, `UInt32`, and `UInt64` are all represented as `UInt64` in a `Field`. +`Field` doesn’t have enough information about a specific data type for a table. For example, `UInt8`, `UInt16`, `UInt32`, and `UInt64` are all represented as `UInt64` in a `Field`. -## Leaky Abstractions +## Leaky Abstractions {#leaky-abstractions} -`IColumn` has methods for common relational transformations of data, but they don't meet all needs. For example, `ColumnUInt64` doesn't have a method to calculate the sum of two columns, and `ColumnString` doesn't have a method to run a substring search. These countless routines are implemented outside of `IColumn`. +`IColumn` has methods for common relational transformations of data, but they don’t meet all needs. For example, `ColumnUInt64` doesn’t have a method to calculate the sum of two columns, and `ColumnString` doesn’t have a method to run a substring search. These countless routines are implemented outside of `IColumn`. -Various functions on columns can be implemented in a generic, non-efficient way using `IColumn` methods to extract `Field` values, or in a specialized way using knowledge of inner memory layout of data in a specific `IColumn` implementation. To do this, functions are cast to a specific `IColumn` type and deal with internal representation directly. 
For example, `ColumnUInt64` has the `getData` method that returns a reference to an internal array, then a separate routine reads or fills that array directly. In fact, we have "leaky abstractions" to allow efficient specializations of various routines. +Various functions on columns can be implemented in a generic, non-efficient way using `IColumn` methods to extract `Field` values, or in a specialized way using knowledge of inner memory layout of data in a specific `IColumn` implementation. To do this, functions are cast to a specific `IColumn` type and deal with internal representation directly. For example, `ColumnUInt64` has the `getData` method that returns a reference to an internal array, then a separate routine reads or fills that array directly. In fact, we have “leaky abstractions” to allow efficient specializations of various routines. -## Data Types +## Data Types {#data-types} -`IDataType` is responsible for serialization and deserialization: for reading and writing chunks of columns or individual values in binary or text form. -`IDataType` directly corresponds to data types in tables. For example, there are `DataTypeUInt32`, `DataTypeDateTime`, `DataTypeString` and so on. +`IDataType` is responsible for serialization and deserialization: for reading and writing chunks of columns or individual values in binary or text form. `IDataType` directly corresponds to data types in tables. For example, there are `DataTypeUInt32`, `DataTypeDateTime`, `DataTypeString` and so on. `IDataType` and `IColumn` are only loosely related to each other. Different data types can be represented in memory by the same `IColumn` implementations. For example, `DataTypeUInt32` and `DataTypeDateTime` are both represented by `ColumnUInt32` or `ColumnConstUInt32`. In addition, the same data type can be represented by different `IColumn` implementations. For example, `DataTypeUInt8` can be represented by `ColumnUInt8` or `ColumnConstUInt8`. -`IDataType` only stores metadata. For instance, `DataTypeUInt8` doesn't store anything at all (except vptr) and `DataTypeFixedString` stores just `N` (the size of fixed-size strings). +`IDataType` only stores metadata. For instance, `DataTypeUInt8` doesn’t store anything at all (except vptr) and `DataTypeFixedString` stores just `N` (the size of fixed-size strings). -`IDataType` has helper methods for various data formats. Examples are methods to serialize a value with possible quoting, to serialize a value for JSON, and to serialize a value as part of XML format. There is no direct correspondence to data formats. For example, the different data formats `Pretty` and `TabSeparated` can use the same `serializeTextEscaped` helper method from the `IDataType` interface. +`IDataType` has helper methods for various data formats. Examples are methods to serialize a value with possible quoting, to serialize a value for JSON, and to serialize a value as part of the XML format. There is no direct correspondence to data formats. For example, the different data formats `Pretty` and `TabSeparated` can use the same `serializeTextEscaped` helper method from the `IDataType` interface. -## Block +## Block {#block} -A `Block` is a container that represents a subset (chunk) of a table in memory. It is just a set of triples: `(IColumn, IDataType, column name)`. During query execution, data is processed by `Block`s. 
If we have a `Block`, we have data (in the `IColumn` object), we have information about its type (in `IDataType`) that tells us how to deal with that column, and we have the column name (either the original column name from the table, or some artificial name assigned for getting temporary results of calculations). +A `Block` is a container that represents a subset (chunk) of a table in memory. It is just a set of triples: `(IColumn, IDataType, column name)`. During query execution, data is processed by `Block`s. If we have a `Block`, we have data (in the `IColumn` object), we have information about its type (in `IDataType`) that tells us how to deal with that column, and we have the column name (either the original column name from the table or some artificial name assigned for getting temporary results of calculations). -When we calculate some function over columns in a block, we add another column with its result to the block, and we don't touch columns for arguments of the function because operations are immutable. Later, unneeded columns can be removed from the block, but not modified. This is convenient for elimination of common subexpressions. +When we calculate some function over columns in a block, we add another column with its result to the block, and we don’t touch columns for arguments of the function because operations are immutable. Later, unneeded columns can be removed from the block, but not modified. This is convenient for the elimination of common subexpressions. -Blocks are created for every processed chunk of data. Note that for the same type of calculation, the column names and types remain the same for different blocks, and only column data changes. It is better to split block data from the block header, because small block sizes will have a high overhead of temporary strings for copying shared_ptrs and column names. +Blocks are created for every processed chunk of data. Note that for the same type of calculation, the column names and types remain the same for different blocks and only column data changes. It is better to split block data from the block header because small block sizes will have a high overhead of temporary strings for copying shared\_ptrs and column names. -## Block Streams +## Block Streams {#block-streams} Block streams are for processing data. We use streams of blocks to read data from somewhere, perform data transformations, or write data to somewhere. `IBlockInputStream` has the `read` method to fetch the next block while available. `IBlockOutputStream` has the `write` method to push the block somewhere. Streams are responsible for: -1. Reading or writing to a table. The table just returns a stream for reading or writing blocks. -2. Implementing data formats. For example, if you want to output data to a terminal in `Pretty` format, you create a block output stream where you push blocks, and it formats them. -3. Performing data transformations. Let's say you have `IBlockInputStream` and want to create a filtered stream. You create `FilterBlockInputStream` and initialize it with your stream. Then when you pull a block from `FilterBlockInputStream`, it pulls a block from your stream, filters it, and returns the filtered block to you. Query execution pipelines are represented this way. +1. Reading or writing to a table. The table just returns a stream for reading or writing blocks. +2. Implementing data formats. 
For example, if you want to output data to a terminal in `Pretty` format, you create a block output stream where you push blocks, and it formats them. +3. Performing data transformations. Let’s say you have `IBlockInputStream` and want to create a filtered stream. You create `FilterBlockInputStream` and initialize it with your stream. Then when you pull a block from `FilterBlockInputStream`, it pulls a block from your stream, filters it, and returns the filtered block to you. Query execution pipelines are represented this way. There are more sophisticated transformations. For example, when you pull from `AggregatingBlockInputStream`, it reads all data from its source, aggregates it, and then returns a stream of aggregated data for you. Another example: `UnionBlockInputStream` accepts many input sources in the constructor and also a number of threads. It launches multiple threads and reads from multiple sources in parallel. -> Block streams use the "pull" approach to control flow: when you pull a block from the first stream, it consequently pulls the required blocks from nested streams, and the entire execution pipeline will work. Neither "pull" nor "push" is the best solution, because control flow is implicit, and that limits implementation of various features like simultaneous execution of multiple queries (merging many pipelines together). This limitation could be overcome with coroutines or just running extra threads that wait for each other. We may have more possibilities if we make control flow explicit: if we locate the logic for passing data from one calculation unit to another outside of those calculation units. Read this [article](http://journal.stuffwithstuff.com/2013/01/13/iteration-inside-and-out/) for more thoughts. +> Block streams use the “pull” approach to control flow: when you pull a block from the first stream, it consequently pulls the required blocks from nested streams, and the entire execution pipeline will work. Neither “pull” nor “push” is the best solution, because control flow is implicit, and that limits the implementation of various features like simultaneous execution of multiple queries (merging many pipelines together). This limitation could be overcome with coroutines or just running extra threads that wait for each other. We may have more possibilities if we make control flow explicit: if we locate the logic for passing data from one calculation unit to another outside of those calculation units. Read this [article](http://journal.stuffwithstuff.com/2013/01/13/iteration-inside-and-out/) for more thoughts. We should note that the query execution pipeline creates temporary data at each step. We try to keep block size small enough so that temporary data fits in the CPU cache. With that assumption, writing and reading temporary data is almost free in comparison with other calculations. We could consider an alternative, which is to fuse many operations in the pipeline together, to make the pipeline as short as possible and remove much of the temporary data. This could be an advantage, but it also has drawbacks. For example, a split pipeline makes it easy to implement caching intermediate data, stealing intermediate data from similar queries running at the same time, and merging pipelines for similar queries. -## Formats +## Formats {#formats} -Data formats are implemented with block streams. There are "presentational" formats only suitable for output of data to the client, such as `Pretty` format, which provides only `IBlockOutputStream`. 
And there are input/output formats, such as `TabSeparated` or `JSONEachRow`. +Data formats are implemented with block streams. There are “presentational” formats only suitable for the output of data to the client, such as `Pretty` format, which provides only `IBlockOutputStream`. And there are input/output formats, such as `TabSeparated` or `JSONEachRow`. -There are also row streams: `IRowInputStream` and `IRowOutputStream`. They allow you to pull/push data by individual rows, not by blocks. And they are only needed to simplify implementation of row-oriented formats. The wrappers `BlockInputStreamFromRowInputStream` and `BlockOutputStreamFromRowOutputStream` allow you to convert row-oriented streams to regular block-oriented streams. +There are also row streams: `IRowInputStream` and `IRowOutputStream`. They allow you to pull/push data by individual rows, not by blocks. And they are only needed to simplify the implementation of row-oriented formats. The wrappers `BlockInputStreamFromRowInputStream` and `BlockOutputStreamFromRowOutputStream` allow you to convert row-oriented streams to regular block-oriented streams. -## I/O +## I/O {#io} -For byte-oriented input/output, there are `ReadBuffer` and `WriteBuffer` abstract classes. They are used instead of C++ `iostream`s. Don't worry: every mature C++ project is using something other than `iostream`s for good reasons. +For byte-oriented input/output, there are `ReadBuffer` and `WriteBuffer` abstract classes. They are used instead of C++ `iostream`s. Don’t worry: every mature C++ project is using something other than `iostream`s for good reasons. `ReadBuffer` and `WriteBuffer` are just a contiguous buffer and a cursor pointing to the position in that buffer. Implementations may own or not own the memory for the buffer. There is a virtual method to fill the buffer with the following data (for `ReadBuffer`) or to flush the buffer somewhere (for `WriteBuffer`). The virtual methods are rarely called. @@ -75,9 +74,9 @@ Implementations of `ReadBuffer`/`WriteBuffer` are used for working with files an Read/WriteBuffers only deal with bytes. To help with formatted input/output (for instance, to write a number in decimal format), there are functions from `ReadHelpers` and `WriteHelpers` header files. -Let's look at what happens when you want to write a result set in `JSON` format to stdout. You have a result set ready to be fetched from `IBlockInputStream`. You create `WriteBufferFromFileDescriptor(STDOUT_FILENO)` to write bytes to stdout. You create `JSONRowOutputStream`, initialized with that `WriteBuffer`, to write rows in `JSON` to stdout. You create `BlockOutputStreamFromRowOutputStream` on top of it, to represent it as `IBlockOutputStream`. Then you call `copyData` to transfer data from `IBlockInputStream` to `IBlockOutputStream`, and everything works. Internally, `JSONRowOutputStream` will write various JSON delimiters and call the `IDataType::serializeTextJSON` method with a reference to `IColumn` and the row number as arguments. Consequently, `IDataType::serializeTextJSON` will call a method from `WriteHelpers.h`: for example, `writeText` for numeric types and `writeJSONString` for `DataTypeString`. +Let’s look at what happens when you want to write a result set in `JSON` format to stdout. You have a result set ready to be fetched from `IBlockInputStream`. You create `WriteBufferFromFileDescriptor(STDOUT_FILENO)` to write bytes to stdout. You create `JSONRowOutputStream`, initialized with that `WriteBuffer`, to write rows in `JSON` to stdout. 
You create `BlockOutputStreamFromRowOutputStream` on top of it, to represent it as `IBlockOutputStream`. Then you call `copyData` to transfer data from `IBlockInputStream` to `IBlockOutputStream`, and everything works. Internally, `JSONRowOutputStream` will write various JSON delimiters and call the `IDataType::serializeTextJSON` method with a reference to `IColumn` and the row number as arguments. Consequently, `IDataType::serializeTextJSON` will call a method from `WriteHelpers.h`: for example, `writeText` for numeric types and `writeJSONString` for `DataTypeString`. -## Tables +## Tables {#tables} Tables are represented by the `IStorage` interface. Different implementations of that interface are different table engines. Examples are `StorageMergeTree`, `StorageMemory`, and so on. Instances of these classes are just tables. @@ -87,105 +86,104 @@ In most cases, the read method is only responsible for reading the specified col But there are notable exceptions: -- The AST query is passed to the `read` method and the table engine can use it to derive index usage and to read less data from a table. -- Sometimes the table engine can process data itself to a specific stage. For example, `StorageDistributed` can send a query to remote servers, ask them to process data to a stage where data from different remote servers can be merged, and return that preprocessed data. -The query interpreter then finishes processing the data. +- The AST query is passed to the `read` method and the table engine can use it to derive index usage and to read less data from a table. +- Sometimes the table engine can process data itself to a specific stage. For example, `StorageDistributed` can send a query to remote servers, ask them to process data to a stage where data from different remote servers can be merged, and return that preprocessed data. The query interpreter then finishes processing the data. -The table's `read` method can return multiple `IBlockInputStream` objects to allow parallel data processing. These multiple block input streams can read from a table in parallel. Then you can wrap these streams with various transformations (such as expression evaluation or filtering) that can be calculated independently and create a `UnionBlockInputStream` on top of them, to read from multiple streams in parallel. +The table’s `read` method can return multiple `IBlockInputStream` objects to allow parallel data processing. These multiple block input streams can read from a table in parallel. Then you can wrap these streams with various transformations (such as expression evaluation or filtering) that can be calculated independently and create a `UnionBlockInputStream` on top of them, to read from multiple streams in parallel. There are also `TableFunction`s. These are functions that return a temporary `IStorage` object to use in the `FROM` clause of a query. To get a quick idea of how to implement your own table engine, look at something simple, like `StorageMemory` or `StorageTinyLog`. -> As the result of the `read` method, `IStorage` returns `QueryProcessingStage` – information about what parts of the query were already calculated inside storage. Currently we have only very coarse granularity for that information. There is no way for the storage to say "I have already processed this part of the expression in WHERE, for this range of data". We need to work on that. 
+> As the result of the `read` method, `IStorage` returns `QueryProcessingStage` – information about what parts of the query were already calculated inside storage. Currently, we have only very coarse granularity for that information. There is no way for the storage to say “I have already processed this part of the expression in WHERE, for this range of data”. We need to work on that. -## Parsers +## Parsers {#parsers} A query is parsed by a hand-written recursive descent parser. For example, `ParserSelectQuery` just recursively calls the underlying parsers for various parts of the query. Parsers create an `AST`. The `AST` is represented by nodes, which are instances of `IAST`. > Parser generators are not used for historical reasons. -## Interpreters +## Interpreters {#interpreters} -Interpreters are responsible for creating the query execution pipeline from an `AST`. There are simple interpreters, such as `InterpreterExistsQuery`and `InterpreterDropQuery`, or the more sophisticated `InterpreterSelectQuery`. The query execution pipeline is a combination of block input or output streams. For example, the result of interpreting the `SELECT` query is the `IBlockInputStream` to read the result set from; the result of the INSERT query is the `IBlockOutputStream` to write data for insertion to; and the result of interpreting the `INSERT SELECT` query is the `IBlockInputStream` that returns an empty result set on the first read, but that copies data from `SELECT` to `INSERT` at the same time. +Interpreters are responsible for creating the query execution pipeline from an `AST`. There are simple interpreters, such as `InterpreterExistsQuery` and `InterpreterDropQuery`, or the more sophisticated `InterpreterSelectQuery`. The query execution pipeline is a combination of block input or output streams. For example, the result of interpreting the `SELECT` query is the `IBlockInputStream` to read the result set from; the result of the INSERT query is the `IBlockOutputStream` to write data for insertion to, and the result of interpreting the `INSERT SELECT` query is the `IBlockInputStream` that returns an empty result set on the first read, but that copies data from `SELECT` to `INSERT` at the same time. `InterpreterSelectQuery` uses `ExpressionAnalyzer` and `ExpressionActions` machinery for query analysis and transformations. This is where most rule-based query optimizations are done. `ExpressionAnalyzer` is quite messy and should be rewritten: various query transformations and optimizations should be extracted to separate classes to allow modular transformations or query. -## Functions +## Functions {#functions} There are ordinary functions and aggregate functions. For aggregate functions, see the next section. -Ordinary functions don't change the number of rows – they work as if they are processing each row independently. In fact, functions are not called for individual rows, but for `Block`'s of data to implement vectorized query execution. +Ordinary functions don’t change the number of rows – they work as if they are processing each row independently. In fact, functions are not called for individual rows, but for `Block`’s of data to implement vectorized query execution. 
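To make the column-oriented execution model more concrete, here is a minimal, self-contained sketch of an "ordinary" function that processes a whole column per call. It deliberately uses plain `std::vector` stand-ins instead of the real `IFunction`/`IColumn` interfaces (which dispatch over many column and type combinations), so treat it as an illustration of the idea rather than actual ClickHouse code:

``` cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Simplified stand-in for a numeric column; the real IColumn hierarchy is far richer.
using ColumnUInt64 = std::vector<std::uint64_t>;

// An "ordinary" function in this model transforms whole columns: a single call
// processes every row of the block, so there is no per-row dispatch inside the
// hot loop and the compiler is free to vectorize it.
ColumnUInt64 executePlus(const ColumnUInt64 & lhs, const ColumnUInt64 & rhs)
{
    ColumnUInt64 result(lhs.size());
    for (std::size_t i = 0; i < lhs.size(); ++i)
        result[i] = lhs[i] + rhs[i]; // one tight loop over the entire column
    return result;
}
```

The resulting column would then be appended to the block as a new column, while the argument columns stay untouched, as described in the section on blocks above.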
There are some miscellaneous functions, like [blockSize](../query_language/functions/other_functions.md#function-blocksize), [rowNumberInBlock](../query_language/functions/other_functions.md#function-rownumberinblock), and [runningAccumulate](../query_language/functions/other_functions.md#function-runningaccumulate), that exploit block processing and violate the independence of rows. -ClickHouse has strong typing, so implicit type conversion doesn't occur. If a function doesn't support a specific combination of types, an exception will be thrown. But functions can work (be overloaded) for many different combinations of types. For example, the `plus` function (to implement the `+` operator) works for any combination of numeric types: `UInt8` + `Float32`, `UInt16` + `Int8`, and so on. Also, some variadic functions can accept any number of arguments, such as the `concat` function. +ClickHouse has strong typing, so implicit type conversion doesn’t occur. If a function doesn’t support a specific combination of types, an exception will be thrown. But functions can work (be overloaded) for many different combinations of types. For example, the `plus` function (to implement the `+` operator) works for any combination of numeric types: `UInt8` + `Float32`, `UInt16` + `Int8`, and so on. Also, some variadic functions can accept any number of arguments, such as the `concat` function. Implementing a function may be slightly inconvenient because a function explicitly dispatches supported data types and supported `IColumns`. For example, the `plus` function has code generated by instantiation of a C++ template for each combination of numeric types, and for constant or non-constant left and right arguments. -> This is a nice place to implement runtime code generation to avoid template code bloat. Also, it will make it possible to add fused functions like fused multiply-add, or to make multiple comparisons in one loop iteration. +> This is a nice place to implement runtime code generation to avoid template code bloat. Also, it will make it possible to add fused functions like fused multiply-add or to make multiple comparisons in one loop iteration. -Due to vectorized query execution, functions are not short-circuit. For example, if you write `WHERE f(x) AND g(y)`, both sides will be calculated, even for rows, when `f(x)` is zero (except when `f(x)` is a zero constant expression). But if selectivity of the `f(x)` condition is high, and calculation of `f(x)` is much cheaper than `g(y)`, it's better to implement multi-pass calculation: first calculate `f(x)`, then filter columns by the result, and then calculate `g(y)` only for smaller, filtered chunks of data. +Due to vectorized query execution, functions are not short-circuited. For example, if you write `WHERE f(x) AND g(y)`, both sides will be calculated, even for rows where `f(x)` is zero (except when `f(x)` is a zero constant expression). But if the selectivity of the `f(x)` condition is high, and calculation of `f(x)` is much cheaper than `g(y)`, it’s better to implement multi-pass calculation: first calculate `f(x)`, then filter columns by the result, and then calculate `g(y)` only for smaller, filtered chunks of data. -## Aggregate Functions +## Aggregate Functions {#aggregate-functions} -Aggregate functions are stateful functions. They accumulate passed values into some state, and allow you to get results from that state. They are managed with the `IAggregateFunction` interface.
States can be rather simple (the state for `AggregateFunctionCount` is just a single `UInt64` value) or quite complex (the state of `AggregateFunctionUniqCombined` is a combination of a linear array, a hash table and a `HyperLogLog` probabilistic data structure). +Aggregate functions are stateful functions. They accumulate passed values into some state and allow you to get results from that state. They are managed with the `IAggregateFunction` interface. States can be rather simple (the state for `AggregateFunctionCount` is just a single `UInt64` value) or quite complex (the state of `AggregateFunctionUniqCombined` is a combination of a linear array, a hash table and a `HyperLogLog` probabilistic data structure). To deal with multiple states while executing a high-cardinality `GROUP BY` query, states are allocated in `Arena` (a memory pool), or they could be allocated in any suitable piece of memory. States can have a non-trivial constructor and destructor: for example, complex aggregation states can allocate additional memory themselves. This requires some attention to creating and destroying states and properly passing their ownership, to keep track of who and when will destroy states. -Aggregation states can be serialized and deserialized to pass over the network during distributed query execution or to write them on disk where there is not enough RAM. They can even be stored in a table with the `DataTypeAggregateFunction` to allow incremental aggregation of data. +Aggregation states can be serialized and deserialized to pass over the network during distributed query execution or to write them on disk when there is not enough RAM. They can even be stored in a table with the `DataTypeAggregateFunction` to allow incremental aggregation of data. > The serialized data format for aggregate function states is not versioned right now. This is ok if aggregate states are only stored temporarily. But we have the `AggregatingMergeTree` table engine for incremental aggregation, and people are already using it in production. This is why we should add support for backward compatibility when changing the serialized format for any aggregate function in the future. -## Server +## Server {#server} The server implements several different interfaces: -- An HTTP interface for any foreign clients. -- A TCP interface for the native ClickHouse client and for cross-server communication during distributed query execution. -- An interface for transferring data for replication. +- An HTTP interface for any foreign clients. +- A TCP interface for the native ClickHouse client and for cross-server communication during distributed query execution. +- An interface for transferring data for replication. Internally, it is just a basic multithreaded server without coroutines, fibers, etc. Since the server is not designed to process a high rate of simple queries but is intended to process a relatively low rate of complex queries, each of them can process a vast amount of data for analytics. The server initializes the `Context` class with the necessary environment for query execution: the list of available databases, users and access rights, settings, clusters, the process list, the query log, and so on. This environment is used by interpreters. -We maintain full backward and forward compatibility for the server TCP protocol: old clients can talk to new servers and new clients can talk to old servers. But we don't want to maintain it eternally, and we are removing support for old versions after about one year.
+We maintain full backwards and forward compatibility for the server TCP protocol: old clients can talk to new servers and new clients can talk to old servers. But we don’t want to maintain it eternally, and we are removing support for old versions after about one year. -> For all external applications, we recommend using the HTTP interface because it is simple and easy to use. The TCP protocol is more tightly linked to internal data structures: it uses an internal format for passing blocks of data and it uses custom framing for compressed data. We haven't released a C library for that protocol because it requires linking most of the ClickHouse codebase, which is not practical. +> For all external applications, we recommend using the HTTP interface because it is simple and easy to use. The TCP protocol is more tightly linked to internal data structures: it uses an internal format for passing blocks of data and it uses custom framing for compressed data. We haven’t released a C library for that protocol because it requires linking most of the ClickHouse codebase, which is not practical. -## Distributed Query Execution +## Distributed Query Execution {#distributed-query-execution} -Servers in a cluster setup are mostly independent. You can create a `Distributed` table on one or all servers in a cluster. The `Distributed` table does not store data itself – it only provides a "view" to all local tables on multiple nodes of a cluster. When you SELECT from a `Distributed` table, it rewrites that query, chooses remote nodes according to load balancing settings, and sends the query to them. The `Distributed` table requests remote servers to process a query just up to a stage where intermediate results from different servers can be merged. Then it receives the intermediate results and merges them. The distributed table tries to distribute as much work as possible to remote servers, and does not send much intermediate data over the network. +Servers in a cluster setup are mostly independent. You can create a `Distributed` table on one or all servers in a cluster. The `Distributed` table does not store data itself – it only provides a “view” to all local tables on multiple nodes of a cluster. When you SELECT from a `Distributed` table, it rewrites that query, chooses remote nodes according to load balancing settings, and sends the query to them. The `Distributed` table requests remote servers to process a query just up to a stage where intermediate results from different servers can be merged. Then it receives the intermediate results and merges them. The distributed table tries to distribute as much work as possible to remote servers and does not send much intermediate data over the network. -> Things become more complicated when you have subqueries in IN or JOIN clauses and each of them uses a `Distributed` table. We have different strategies for execution of these queries. +> Things become more complicated when you have subqueries in IN or JOIN clauses and each of them uses a `Distributed` table. We have different strategies for the execution of these queries. -There is no global query plan for distributed query execution. Each node has its own local query plan for its part of the job. We only have simple one-pass distributed query execution: we send queries for remote nodes and then merge the results. 
But this is not feasible for difficult queries with high cardinality GROUP BYs or with a large amount of temporary data for JOIN: in such cases, we need to "reshuffle" data between servers, which requires additional coordination. ClickHouse does not support that kind of query execution, and we need to work on it. +There is no global query plan for distributed query execution. Each node has its own local query plan for its part of the job. We only have simple one-pass distributed query execution: we send queries for remote nodes and then merge the results. But this is not feasible for difficult queries with high cardinality GROUP BYs or with a large amount of temporary data for JOIN: in such cases, we need to “reshuffle” data between servers, which requires additional coordination. ClickHouse does not support that kind of query execution, and we need to work on it. -## Merge Tree +## Merge Tree {#merge-tree} -`MergeTree` is a family of storage engines that supports indexing by primary key. The primary key can be an arbitrary tuple of columns or expressions. Data in a `MergeTree` table is stored in "parts". Each part stores data in the primary key order (data is ordered lexicographically by the primary key tuple). All the table columns are stored in separate `column.bin` files in these parts. The files consist of compressed blocks. Each block is usually from 64 KB to 1 MB of uncompressed data, depending on the average value size. The blocks consist of column values placed contiguously one after the other. Column values are in the same order for each column (the order is defined by the primary key), so when you iterate by many columns, you get values for the corresponding rows. +`MergeTree` is a family of storage engines that supports indexing by primary key. The primary key can be an arbitrary tuple of columns or expressions. Data in a `MergeTree` table is stored in “parts”. Each part stores data in the primary key order (data is ordered lexicographically by the primary key tuple). All the table columns are stored in separate `column.bin` files in these parts. The files consist of compressed blocks. Each block is usually from 64 KB to 1 MB of uncompressed data, depending on the average value size. The blocks consist of column values placed contiguously one after the other. Column values are in the same order for each column (the order is defined by the primary key), so when you iterate by many columns, you get values for the corresponding rows. -The primary key itself is "sparse". It doesn't address each single row, but only some ranges of data. A separate `primary.idx` file has the value of the primary key for each N-th row, where N is called `index_granularity` (usually, N = 8192). Also, for each column, we have `column.mrk` files with "marks," which are offsets to each N-th row in the data file. Each mark is a pair: the offset in the file to the beginning of the compressed block, and the offset in the decompressed block to the beginning of data. Usually compressed blocks are aligned by marks, and the offset in the decompressed block is zero. Data for `primary.idx` always resides in memory and data for `column.mrk` files is cached. +The primary key itself is “sparse”. It doesn’t address every single row, but only some ranges of data. A separate `primary.idx` file has the value of the primary key for each N-th row, where N is called `index_granularity` (usually, N = 8192). Also, for each column, we have `column.mrk` files with “marks,” which are offsets to each N-th row in the data file. 
Each mark is a pair: the offset in the file to the beginning of the compressed block, and the offset in the decompressed block to the beginning of data. Usually, compressed blocks are aligned by marks, and the offset in the decompressed block is zero. Data for `primary.idx` always resides in memory and data for `column.mrk` files is cached. When we are going to read something from a part in `MergeTree`, we look at `primary.idx` data and locate ranges that could possibly contain requested data, then look at `column.mrk` data and calculate offsets for where to start reading those ranges. Because of sparseness, excess data may be read. ClickHouse is not suitable for a high load of simple point queries, because the entire range with `index_granularity` rows must be read for each key, and the entire compressed block must be decompressed for each column. We made the index sparse because we must be able to maintain trillions of rows per single server without noticeable memory consumption for the index. Also, because the primary key is sparse, it is not unique: it cannot check the existence of the key in the table at INSERT time. You could have many rows with the same key in a table. -When you `INSERT` a bunch of data into `MergeTree`, that bunch is sorted by primary key order and forms a new part. To keep the number of parts relatively low, there are background threads that periodically select some parts and merge them to a single sorted part. That's why it is called `MergeTree`. Of course, merging leads to "write amplification". All parts are immutable: they are only created and deleted, but not modified. When SELECT is run, it holds a snapshot of the table (a set of parts). After merging, we also keep old parts for some time to make recovery after failure easier, so if we see that some merged part is probably broken, we can replace it with its source parts. +When you `INSERT` a bunch of data into `MergeTree`, that bunch is sorted by primary key order and forms a new part. To keep the number of parts relatively low, there are background threads that periodically select some parts and merge them into a single sorted part. That’s why it is called `MergeTree`. Of course, merging leads to “write amplification”. All parts are immutable: they are only created and deleted, but not modified. When SELECT is run, it holds a snapshot of the table (a set of parts). After merging, we also keep old parts for some time to make recovery after failure easier, so if we see that some merged part is probably broken, we can replace it with its source parts. -`MergeTree` is not an LSM tree because it doesn't contain "memtable" and "log": inserted data is written directly to the filesystem. This makes it suitable only to INSERT data in batches, not by individual row and not very frequently – about once per second is ok, but a thousand times a second is not. We did it this way for simplicity's sake, and because we are already inserting data in batches in our applications. +`MergeTree` is not an LSM tree because it doesn’t contain “memtable” and “log”: inserted data is written directly to the filesystem. This makes it suitable only to INSERT data in batches, not by individual row and not very frequently – about once per second is ok, but a thousand times a second is not. We did it this way for simplicity’s sake, and because we are already inserting data in batches in our applications. -> MergeTree tables can only have one (primary) index: there aren't any secondary indices. 
It would be nice to allow multiple physical representations under one logical table, for example, to store data in more than one physical order or even to allow representations with pre-aggregated data along with original data. +> MergeTree tables can only have one (primary) index: there aren’t any secondary indices. It would be nice to allow multiple physical representations under one logical table, for example, to store data in more than one physical order or even to allow representations with pre-aggregated data along with original data. -There are MergeTree engines that are doing additional work during background merges. Examples are `CollapsingMergeTree` and `AggregatingMergeTree`. This could be treated as special support for updates. Keep in mind that these are not real updates because users usually have no control over the time when background merges will be executed, and data in a `MergeTree` table is almost always stored in more than one part, not in completely merged form. +There are MergeTree engines that are doing additional work during background merges. Examples are `CollapsingMergeTree` and `AggregatingMergeTree`. This could be treated as a special support for updates. Keep in mind that these are not real updates because users usually have no control over the time when background merges will be executed, and data in a `MergeTree` table is almost always stored in more than one part, not in completely merged form. -## Replication +## Replication {#replication} Replication in ClickHouse is implemented on a per-table basis. You could have some replicated and some non-replicated tables on the same server. You could also have tables replicated in different ways, such as one table with two-factor replication and another with three-factor. Replication is implemented in the `ReplicatedMergeTree` storage engine. The path in `ZooKeeper` is specified as a parameter for the storage engine. All tables with the same path in `ZooKeeper` become replicas of each other: they synchronize their data and maintain consistency. Replicas can be added and removed dynamically simply by creating or dropping a table. -Replication uses an asynchronous multi-master scheme. You can insert data into any replica that has a session with `ZooKeeper`, and data is replicated to all other replicas asynchronously. Because ClickHouse doesn't support UPDATEs, replication is conflict-free. As there is no quorum acknowledgment of inserts, just-inserted data might be lost if one node fails. +Replication uses an asynchronous multi-master scheme. You can insert data into any replica that has a session with `ZooKeeper`, and data is replicated to all other replicas asynchronously. Because ClickHouse doesn’t support UPDATEs, replication is conflict-free. As there is no quorum acknowledgement of inserts, just-inserted data might be lost if one node fails. -Metadata for replication is stored in ZooKeeper. There is a replication log that lists what actions to do. Actions are: get part; merge parts; drop partition, etc. Each replica copies the replication log to its queue and then executes the actions from the queue. For example, on insertion, the "get part" action is created in the log, and every replica downloads that part. Merges are coordinated between replicas to get byte-identical results. All parts are merged in the same way on all replicas. To achieve this, one replica is elected as the leader, and that replica initiates merges and writes "merge parts" actions to the log. 
+Metadata for replication is stored in ZooKeeper. There is a replication log that lists what actions to do. Actions are: get part; merge parts; drop partition, etc. Each replica copies the replication log to its queue and then executes the actions from the queue. For example, on insertion, the “get part” action is created in the log, and every replica downloads that part. Merges are coordinated between replicas to get byte-identical results. All parts are merged in the same way on all replicas. To achieve this, one replica is elected as the leader, and that replica initiates merges and writes “merge parts” actions to the log. Replication is physical: only compressed parts are transferred between nodes, not queries. To lower the network cost (to avoid network amplification), merges are processed on each replica independently in most cases. Large merged parts are sent over the network only in cases of significant replication lag. @@ -193,5 +191,4 @@ In addition, each replica stores its state in ZooKeeper as the set of parts and > The ClickHouse cluster consists of independent shards, and each shard consists of replicas. The cluster is not elastic, so after adding a new shard, data is not rebalanced between shards automatically. Instead, the cluster load will be uneven. This implementation gives you more control, and it is fine for relatively small clusters such as tens of nodes. But for clusters with hundreds of nodes that we are using in production, this approach becomes a significant drawback. We should implement a table engine that will span its data across the cluster with dynamically replicated regions that could be split and balanced between clusters automatically. - [Original article](https://clickhouse.tech/docs/en/development/architecture/) diff --git a/docs/en/development/browse_code.md b/docs/en/development/browse_code.md index 0a99f5128cb..2a109c9323c 100644 --- a/docs/en/development/browse_code.md +++ b/docs/en/development/browse_code.md @@ -1,7 +1,7 @@ -# Browse ClickHouse Source Code +# Browse ClickHouse Source Code {#browse-clickhouse-source-code} You can use **Woboq** online code browser available [here](https://clickhouse-test-reports.s3.yandex.net/codebrowser/html_report///ClickHouse/dbms/src/index.html). It provides code navigation and semantic highlighting, search and indexing. The code snapshot is updated daily. -Also you can browse sources on [GitHub](https://github.com/ClickHouse/ClickHouse) as usual. +Also, you can browse sources on [GitHub](https://github.com/ClickHouse/ClickHouse) as usual. -If you're interested what IDE to use, we recommend CLion, QT Creator, VS Code and KDevelop (with caveats). You can use any favourite IDE. Vim and Emacs also count. +If you’re interested in what IDE to use, we recommend CLion, QT Creator, VS Code and KDevelop (with caveats). You can use your favourite IDE. Vim and Emacs also count. diff --git a/docs/en/development/build.md b/docs/en/development/build.md index ea3f68355ca..57873226f7c 100644 --- a/docs/en/development/build.md +++ b/docs/en/development/build.md @@ -1,54 +1,56 @@ -# How to Build ClickHouse for Development +# How to Build ClickHouse for Development {#how-to-build-clickhouse-for-development} The following tutorial is based on the Ubuntu Linux system. With appropriate changes, it should also work on any other Linux distribution. -Supported platforms: x86_64 and AArch64. Support for Power9 is experimental. +Supported platforms: x86\_64 and AArch64. Support for Power9 is experimental.
-## Install Git, CMake, Python and Ninja +## Install Git, CMake, Python and Ninja {#install-git-cmake-python-and-ninja} -```bash +``` bash $ sudo apt-get install git cmake python ninja-build ``` Or cmake3 instead of cmake on older systems. -## Install GCC 9 +## Install GCC 9 {#install-gcc-9} There are several ways to do this. -### Install from a PPA Package +### Install from a PPA Package {#install-from-a-ppa-package} -```bash +``` bash $ sudo apt-get install software-properties-common $ sudo apt-add-repository ppa:ubuntu-toolchain-r/test $ sudo apt-get update $ sudo apt-get install gcc-9 g++-9 ``` -### Install from Sources +### Install from Sources {#install-from-sources} Look at [utils/ci/build-gcc-from-sources.sh](https://github.com/ClickHouse/ClickHouse/blob/master/utils/ci/build-gcc-from-sources.sh) -## Use GCC 9 for Builds +## Use GCC 9 for Builds {#use-gcc-9-for-builds} -```bash +``` bash $ export CC=gcc-9 $ export CXX=g++-9 ``` -## Checkout ClickHouse Sources +## Checkout ClickHouse Sources {#checkout-clickhouse-sources} -```bash +``` bash $ git clone --recursive git@github.com:ClickHouse/ClickHouse.git ``` + or -```bash + +``` bash $ git clone --recursive https://github.com/ClickHouse/ClickHouse.git ``` -## Build ClickHouse +## Build ClickHouse {#build-clickhouse} -```bash +``` bash $ cd ClickHouse $ mkdir build $ cd build @@ -60,82 +62,73 @@ $ cd .. To create an executable, run `ninja clickhouse`. This will create the `dbms/programs/clickhouse` executable, which can be used with `client` or `server` arguments. +# How to Build ClickHouse on Any Linux {#how-to-build-clickhouse-on-any-linux} -# How to Build ClickHouse on Any Linux +The build requires the following components: -The build requires the following componenets: - -- Git (is used only to checkout the sources, it's not needed for build) -- CMake 3.10 or newer -- Ninja (recommended) or Make -- C++ compiler: gcc 9 or clang 8 or newer -- Linker: lld or gold (the classic GNU ld won't work) -- Python (is only used inside LLVM build and it is optional) +- Git (is used only to checkout the sources, it’s not needed for the build) +- CMake 3.10 or newer +- Ninja (recommended) or Make +- C++ compiler: gcc 9 or clang 8 or newer +- Linker: lld or gold (the classic GNU ld won’t work) +- Python (is only used inside LLVM build and it is optional) If all the components are installed, you may build in the same way as the steps above. 
Example for Ubuntu Eoan: -``` -sudo apt update -sudo apt install git cmake ninja-build g++ python -git clone --recursive https://github.com/ClickHouse/ClickHouse.git -mkdir build && cd build -cmake ../ClickHouse -ninja -``` + sudo apt update + sudo apt install git cmake ninja-build g++ python + git clone --recursive https://github.com/ClickHouse/ClickHouse.git + mkdir build && cd build + cmake ../ClickHouse + ninja Example for OpenSUSE Tumbleweed: -``` -sudo zypper install git cmake ninja gcc-c++ python lld -git clone --recursive https://github.com/ClickHouse/ClickHouse.git -mkdir build && cd build -cmake ../ClickHouse -ninja -``` + sudo zypper install git cmake ninja gcc-c++ python lld + git clone --recursive https://github.com/ClickHouse/ClickHouse.git + mkdir build && cd build + cmake ../ClickHouse + ninja Example for Fedora Rawhide: -``` -sudo yum update -yum --nogpg install git cmake make gcc-c++ python2 -git clone --recursive https://github.com/ClickHouse/ClickHouse.git -mkdir build && cd build -cmake ../ClickHouse -make -j $(nproc) -``` + sudo yum update + yum --nogpg install git cmake make gcc-c++ python2 + git clone --recursive https://github.com/ClickHouse/ClickHouse.git + mkdir build && cd build + cmake ../ClickHouse + make -j $(nproc) -# You Don't Have to Build ClickHouse +# You Don’t Have to Build ClickHouse {#you-dont-have-to-build-clickhouse} ClickHouse is available in pre-built binaries and packages. Binaries are portable and can be run on any Linux flavour. -They are build for stable, prestable and testing releases as long as for every commit to master and for every pull request. +They are built for stable, prestable and testing releases, as well as for every commit to master and for every pull request. -To find the most fresh build from `master`, go to [commits page](https://github.com/ClickHouse/ClickHouse/commits/master), click on the first green check mark or red cross near commit, and click to the "Details" link right after "ClickHouse Build Check". +To find the freshest build from `master`, go to the [commits page](https://github.com/ClickHouse/ClickHouse/commits/master), click on the first green checkmark or red cross near a commit, and click the “Details” link right after “ClickHouse Build Check”.
+# How to Build ClickHouse Debian Package {#how-to-build-clickhouse-debian-package} -# How to Build ClickHouse Debian Package +## Install Git and Pbuilder {#install-git-and-pbuilder} -## Install Git and Pbuilder - -```bash +``` bash $ sudo apt-get update $ sudo apt-get install git python pbuilder debhelper lsb-release fakeroot sudo debian-archive-keyring debian-keyring ``` -## Checkout ClickHouse Sources +## Checkout ClickHouse Sources {#checkout-clickhouse-sources-1} -```bash +``` bash $ git clone --recursive --branch master https://github.com/ClickHouse/ClickHouse.git $ cd ClickHouse ``` -## Run Release Script +## Run Release Script {#run-release-script} -```bash +``` bash $ ./release ``` - [Original article](https://clickhouse.tech/docs/en/development/build/) diff --git a/docs/en/development/build_cross_arm.md b/docs/en/development/build_cross_arm.md index 68a4596559f..cf6e73308c4 100644 --- a/docs/en/development/build_cross_arm.md +++ b/docs/en/development/build_cross_arm.md @@ -1,32 +1,32 @@ -# How to Build ClickHouse on Linux for AARCH64 (ARM64) architecture +# How to Build ClickHouse on Linux for AARCH64 (ARM64) architecture {#how-to-build-clickhouse-on-linux-for-aarch64-arm64-architecture} This is for the case when you have Linux machine and want to use it to build `clickhouse` binary that will run on another Linux machine with AARCH64 CPU architecture. This is intended for continuous integration checks that run on Linux servers. The cross-build for AARCH64 is based on the [Build instructions](build.md), follow them first. -# Install Clang-8 +# Install Clang-8 {#install-clang-8} Follow the instructions from https://apt.llvm.org/ for your Ubuntu or Debian setup. For example, in Ubuntu Bionic you can use the following commands: -```bash +``` bash echo "deb [trusted=yes] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-8 main" | sudo tee /etc/apt/sources.list.d/llvm.list sudo apt-get update sudo apt-get install clang-8 ``` -# Install Cross-Compilation Toolset +# Install Cross-Compilation Toolset {#install-cross-compilation-toolset} -```bash +``` bash cd ClickHouse mkdir -p build-aarch64/cmake/toolchain/linux-aarch64 wget 'https://developer.arm.com/-/media/Files/downloads/gnu-a/8.3-2019.03/binrel/gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz?revision=2e88a73f-d233-4f96-b1f4-d8b36e9bb0b9&la=en' -O gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz tar xJf gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz -C build-aarch64/cmake/toolchain/linux-aarch64 --strip-components=1 ``` -# Build ClickHouse +# Build ClickHouse {#build-clickhouse} -```bash +``` bash cd ClickHouse mkdir build-arm64 CC=clang-8 CXX=clang++-8 cmake . -Bbuild-arm64 -DCMAKE_TOOLCHAIN_FILE=cmake/linux/toolchain-aarch64.cmake diff --git a/docs/en/development/build_cross_osx.md b/docs/en/development/build_cross_osx.md index b8e38b44e1e..f79fddc64d1 100644 --- a/docs/en/development/build_cross_osx.md +++ b/docs/en/development/build_cross_osx.md @@ -1,24 +1,24 @@ -# How to Build ClickHouse on Linux for Mac OS X +# How to Build ClickHouse on Linux for Mac OS X {#how-to-build-clickhouse-on-linux-for-mac-os-x} -This is for the case when you have Linux machine and want to use it to build `clickhouse` binary that will run on OS X. This is intended for continuous integration checks that run on Linux servers. 
If you want to build ClickHouse directly on Mac OS X, then proceed with another instruction: https://clickhouse.tech/docs/en/development/build_osx/ +This is for the case when you have Linux machine and want to use it to build `clickhouse` binary that will run on OS X. This is intended for continuous integration checks that run on Linux servers. If you want to build ClickHouse directly on Mac OS X, then proceed with [another instruction](build_osx.md). The cross-build for Mac OS X is based on the [Build instructions](build.md), follow them first. -# Install Clang-8 +# Install Clang-8 {#install-clang-8} Follow the instructions from https://apt.llvm.org/ for your Ubuntu or Debian setup. For example the commands for Bionic are like: -```bash +``` bash sudo echo "deb [trusted=yes] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-8 main" >> /etc/apt/sources.list sudo apt-get install clang-8 ``` -# Install Cross-Compilation Toolset +# Install Cross-Compilation Toolset {#install-cross-compilation-toolset} -Let's remember the path where we install `cctools` as ${CCTOOLS} +Let’s remember the path where we install `cctools` as ${CCTOOLS} -```bash +``` bash mkdir ${CCTOOLS} git clone https://github.com/tpoechtrager/apple-libtapi.git @@ -33,18 +33,18 @@ cd cctools-port/cctools make install ``` -Also, we need to download MacOS X SDK into the working tree. +Also, we need to download macOS X SDK into the working tree. -```bash +``` bash cd ClickHouse wget 'https://github.com/phracker/MacOSX-SDKs/releases/download/10.14-beta4/MacOSX10.14.sdk.tar.xz' mkdir -p build-darwin/cmake/toolchain/darwin-x86_64 tar xJf MacOSX10.14.sdk.tar.xz -C build-darwin/cmake/toolchain/darwin-x86_64 --strip-components=1 ``` -# Build ClickHouse +# Build ClickHouse {#build-clickhouse} -```bash +``` bash cd ClickHouse mkdir build-osx CC=clang-8 CXX=clang++-8 cmake . -Bbuild-osx -DCMAKE_TOOLCHAIN_FILE=cmake/darwin/toolchain-x86_64.cmake \ @@ -54,4 +54,4 @@ CC=clang-8 CXX=clang++-8 cmake . -Bbuild-osx -DCMAKE_TOOLCHAIN_FILE=cmake/darwin ninja -C build-osx ``` -The resulting binary will have Mach-O executable format and can't be run on Linux. +The resulting binary will have a Mach-O executable format and can’t be run on Linux. diff --git a/docs/en/development/build_osx.md b/docs/en/development/build_osx.md index a31264325b4..e4a26f9e6b4 100644 --- a/docs/en/development/build_osx.md +++ b/docs/en/development/build_osx.md @@ -1,34 +1,36 @@ -# How to Build ClickHouse on Mac OS X +# How to Build ClickHouse on Mac OS X {#how-to-build-clickhouse-on-mac-os-x} Build should work on Mac OS X 10.15 (Catalina) -## Install Homebrew +## Install Homebrew {#install-homebrew} -```bash +``` bash $ /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" ``` -## Install Required Compilers, Tools, and Libraries +## Install Required Compilers, Tools, and Libraries {#install-required-compilers-tools-and-libraries} -```bash +``` bash $ brew install cmake ninja libtool gettext ``` -## Checkout ClickHouse Sources +## Checkout ClickHouse Sources {#checkout-clickhouse-sources} -```bash +``` bash $ git clone --recursive git@github.com:ClickHouse/ClickHouse.git ``` + or -```bash + +``` bash $ git clone --recursive https://github.com/ClickHouse/ClickHouse.git $ cd ClickHouse ``` -## Build ClickHouse +## Build ClickHouse {#build-clickhouse} -```bash +``` bash $ mkdir build $ cd build $ cmake .. -DCMAKE_CXX_COMPILER=`which clang++` -DCMAKE_C_COMPILER=`which clang` @@ -36,17 +38,18 @@ $ ninja $ cd .. 
``` -## Caveats +## Caveats {#caveats} -If you intend to run clickhouse-server, make sure to increase the system's maxfiles variable. +If you intend to run clickhouse-server, make sure to increase the system’s maxfiles variable. !!! info "Note" - You'll need to use sudo. + You’ll need to use sudo. To do so, create the following file: /Library/LaunchDaemons/limit.maxfiles.plist: -```xml + +``` xml @@ -71,13 +74,13 @@ To do so, create the following file: ``` Execute the following command: -```bash + +``` bash $ sudo chown root:wheel /Library/LaunchDaemons/limit.maxfiles.plist ``` Reboot. -To check if it's working, you can use `ulimit -n` command. - +To check if it’s working, you can use `ulimit -n` command. [Original article](https://clickhouse.tech/docs/en/development/build_osx/) diff --git a/docs/en/development/contrib.md b/docs/en/development/contrib.md index 36aedff5943..1dc203ca89c 100644 --- a/docs/en/development/contrib.md +++ b/docs/en/development/contrib.md @@ -1,35 +1,35 @@ -# Third-Party Libraries Used +# Third-Party Libraries Used {#third-party-libraries-used} -| Library | License | -| ------- | ------- | -| base64 | [BSD 2-Clause License](https://github.com/aklomp/base64/blob/a27c565d1b6c676beaf297fe503c4518185666f7/LICENSE) | -| boost | [Boost Software License 1.0](https://github.com/ClickHouse-Extras/boost-extra/blob/6883b40449f378019aec792f9983ce3afc7ff16e/LICENSE_1_0.txt) | -| brotli | [MIT](https://github.com/google/brotli/blob/master/LICENSE) | -| capnproto | [MIT](https://github.com/capnproto/capnproto/blob/master/LICENSE) | -| cctz | [Apache License 2.0](https://github.com/google/cctz/blob/4f9776a310f4952454636363def82c2bf6641d5f/LICENSE.txt) | -| double-conversion | [BSD 3-Clause License](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) | -| FastMemcpy | [MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) | -| googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) | -| h3 | [Apache License 2.0](https://github.com/uber/h3/blob/master/LICENSE) -| hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) | -| libbtrie | [BSD 2-Clause License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) | -| libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | -| libdivide | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | -| libgsasl | [LGPL v2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) -| libhdfs3 | [Apache License 2.0](https://github.com/ClickHouse-Extras/libhdfs3/blob/bd6505cbb0c130b0db695305b9a38546fa880e5a/LICENSE.txt) | -| libmetrohash | [Apache License 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libmetrohash/LICENSE) | -| libpcg-random | [Apache License 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libpcg-random/LICENSE-APACHE.txt) | -| libressl | [OpenSSL License](https://github.com/ClickHouse-Extras/ssl/blob/master/COPYING) | -| librdkafka | [BSD 2-Clause License](https://github.com/edenhill/librdkafka/blob/363dcad5a23dc29381cc626620e68ae418b3af19/LICENSE) | -| libwidechar\_width | [CC0 1.0 Universal](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libwidechar_width/LICENSE) | -| llvm | [BSD 3-Clause 
License](https://github.com/ClickHouse-Extras/llvm/blob/163def217817c90fb982a6daf384744d8472b92b/llvm/LICENSE.TXT) | -| lz4 | [BSD 2-Clause License](https://github.com/lz4/lz4/blob/c10863b98e1503af90616ae99725ecd120265dfb/LICENSE) | -| mariadb-connector-c | [LGPL v2.1](https://github.com/ClickHouse-Extras/mariadb-connector-c/blob/3.1/COPYING.LIB) | -| murmurhash | [Public Domain](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/murmurhash/LICENSE) -| pdqsort | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/pdqsort/license.txt) | -| poco | [Boost Software License - Version 1.0](https://github.com/ClickHouse-Extras/poco/blob/fe5505e56c27b6ecb0dcbc40c49dc2caf4e9637f/LICENSE) | -| protobuf | [BSD 3-Clause License](https://github.com/ClickHouse-Extras/protobuf/blob/12735370922a35f03999afff478e1c6d7aa917a4/LICENSE) | -| re2 | [BSD 3-Clause License](https://github.com/google/re2/blob/7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0/LICENSE) | -| UnixODBC | [LGPL v2.1](https://github.com/ClickHouse-Extras/UnixODBC/tree/b0ad30f7f6289c12b76f04bfb9d466374bb32168) | -| zlib-ng | [Zlib License](https://github.com/ClickHouse-Extras/zlib-ng/blob/develop/LICENSE.md) | -| zstd | [BSD 3-Clause License](https://github.com/facebook/zstd/blob/dev/LICENSE) | +| Library | License | +|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------| +| base64 | [BSD 2-Clause License](https://github.com/aklomp/base64/blob/a27c565d1b6c676beaf297fe503c4518185666f7/LICENSE) | +| boost | [Boost Software License 1.0](https://github.com/ClickHouse-Extras/boost-extra/blob/6883b40449f378019aec792f9983ce3afc7ff16e/LICENSE_1_0.txt) | +| brotli | [MIT](https://github.com/google/brotli/blob/master/LICENSE) | +| capnproto | [MIT](https://github.com/capnproto/capnproto/blob/master/LICENSE) | +| cctz | [Apache License 2.0](https://github.com/google/cctz/blob/4f9776a310f4952454636363def82c2bf6641d5f/LICENSE.txt) | +| double-conversion | [BSD 3-Clause License](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) | +| FastMemcpy | [MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) | +| googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) | +| h3 | [Apache License 2.0](https://github.com/uber/h3/blob/master/LICENSE) | +| hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) | +| libbtrie | [BSD 2-Clause License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) | +| libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | +| libdivide | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | +| libgsasl | [LGPL v2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) | +| libhdfs3 | [Apache License 2.0](https://github.com/ClickHouse-Extras/libhdfs3/blob/bd6505cbb0c130b0db695305b9a38546fa880e5a/LICENSE.txt) | +| libmetrohash | [Apache License 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libmetrohash/LICENSE) | +| libpcg-random | [Apache License 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libpcg-random/LICENSE-APACHE.txt) | +| libressl | [OpenSSL 
License](https://github.com/ClickHouse-Extras/ssl/blob/master/COPYING) | +| librdkafka | [BSD 2-Clause License](https://github.com/edenhill/librdkafka/blob/363dcad5a23dc29381cc626620e68ae418b3af19/LICENSE) | +| libwidechar\_width | [CC0 1.0 Universal](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libwidechar_width/LICENSE) | +| llvm | [BSD 3-Clause License](https://github.com/ClickHouse-Extras/llvm/blob/163def217817c90fb982a6daf384744d8472b92b/llvm/LICENSE.TXT) | +| lz4 | [BSD 2-Clause License](https://github.com/lz4/lz4/blob/c10863b98e1503af90616ae99725ecd120265dfb/LICENSE) | +| mariadb-connector-c | [LGPL v2.1](https://github.com/ClickHouse-Extras/mariadb-connector-c/blob/3.1/COPYING.LIB) | +| murmurhash | [Public Domain](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/murmurhash/LICENSE) | +| pdqsort | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/pdqsort/license.txt) | +| poco | [Boost Software License - Version 1.0](https://github.com/ClickHouse-Extras/poco/blob/fe5505e56c27b6ecb0dcbc40c49dc2caf4e9637f/LICENSE) | +| protobuf | [BSD 3-Clause License](https://github.com/ClickHouse-Extras/protobuf/blob/12735370922a35f03999afff478e1c6d7aa917a4/LICENSE) | +| re2 | [BSD 3-Clause License](https://github.com/google/re2/blob/7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0/LICENSE) | +| UnixODBC | [LGPL v2.1](https://github.com/ClickHouse-Extras/UnixODBC/tree/b0ad30f7f6289c12b76f04bfb9d466374bb32168) | +| zlib-ng | [Zlib License](https://github.com/ClickHouse-Extras/zlib-ng/blob/develop/LICENSE.md) | +| zstd | [BSD 3-Clause License](https://github.com/facebook/zstd/blob/dev/LICENSE) | diff --git a/docs/en/development/developer_instruction.md b/docs/en/development/developer_instruction.md index d636ee7189c..8f21b2b0dd9 100644 --- a/docs/en/development/developer_instruction.md +++ b/docs/en/development/developer_instruction.md @@ -1,122 +1,113 @@ Building of ClickHouse is supported on Linux, FreeBSD and Mac OS X. -# If you use Windows +# If you use Windows {#if-you-use-windows} -If you use Windows, you need to create a virtual machine with Ubuntu. To start working with a virtual machine please install VirtualBox. You can download Ubuntu from the website: https://www.ubuntu.com/#download. Please create a virtual machine from the downloaded image (you should reserve at least 4GB of RAM for it). To run a command line terminal in Ubuntu, please locate a program containing the word "terminal" in its name (gnome-terminal, konsole etc.) or just press Ctrl+Alt+T. +If you use Windows, you need to create a virtual machine with Ubuntu. To start working with a virtual machine please install VirtualBox. You can download Ubuntu from the website: https://www.ubuntu.com/\#download. Please create a virtual machine from the downloaded image (you should reserve at least 4GB of RAM for it). To run a command-line terminal in Ubuntu, please locate a program containing the word “terminal” in its name (gnome-terminal, konsole etc.) or just press Ctrl+Alt+T. -# If you use 32-bit system +# If you use a 32-bit system {#if-you-use-a-32-bit-system} -ClickHouse cannot work or build on 32-bit system. You should acquire access to 64-bit system and you can continue reading. +ClickHouse cannot work or build on a 32-bit system. You should acquire access to a 64-bit system and you can continue reading. - -# Creating a repository on GitHub +# Creating a repository on GitHub {#creating-a-repository-on-github} To start working with ClickHouse repository you will need a GitHub account. 
-You probably already have one, but if you don't, please register at https://github.com. In case you do not have SSH keys, you should generate them and then upload them on GitHub. It is required for sending over your patches. It is also possible to use the same SSH keys that you use with any other SSH servers - probably you already have those. +You probably already have one, but if you don’t, please register at https://github.com. In case you do not have SSH keys, you should generate them and then upload them on GitHub. It is required for sending over your patches. It is also possible to use the same SSH keys that you use with any other SSH servers - probably you already have those. -Create a fork of ClickHouse repository. To do that please click on the "fork" button in the upper right corner at https://github.com/ClickHouse/ClickHouse. It will fork your own copy of ClickHouse/ClickHouse to your account. +Create a fork of ClickHouse repository. To do that please click on the “fork” button in the upper right corner at https://github.com/ClickHouse/ClickHouse. It will fork your own copy of ClickHouse/ClickHouse to your account. -Development process consists of first committing the intended changes into your fork of ClickHouse and then creating a "pull request" for these changes to be accepted into the main repository (ClickHouse/ClickHouse). +The development process consists of first committing the intended changes into your fork of ClickHouse and then creating a “pull request” for these changes to be accepted into the main repository (ClickHouse/ClickHouse). To work with git repositories, please install `git`. To do that in Ubuntu you would run in the command line terminal: -``` -sudo apt update -sudo apt install git -``` + + sudo apt update + sudo apt install git A brief manual on using Git can be found here: https://services.github.com/on-demand/downloads/github-git-cheat-sheet.pdf. -For a detailed manual on Git see: https://git-scm.com/book/ru/v2. +For a detailed manual on Git see https://git-scm.com/book/ru/v2. +# Cloning a repository to your development machine {#cloning-a-repository-to-your-development-machine} -# Cloning a repository to your development machine - -Next, you need to download the source files onto your working machine. This is called "to clone a repository" because it creates a local copy of the repository on your working machine. +Next, you need to download the source files onto your working machine. This is called “to clone a repository” because it creates a local copy of the repository on your working machine. In the command line terminal run: -``` -git clone --recursive git@guthub.com:your_github_username/ClickHouse.git -cd ClickHouse -``` -Note: please, substitute *your_github_username* with what is appropriate! + + git clone --recursive git@github.com:your_github_username/ClickHouse.git + cd ClickHouse + +Note: please, substitute *your\_github\_username* with what is appropriate! This command will create a directory `ClickHouse` containing the working copy of the project. It is important that the path to the working directory contains no whitespaces as it may lead to problems with running the build system. -Please note that ClickHouse repository uses `submodules`. That is what the references to additional repositories are called (i.e. external libraries on which the project depends). It means that when cloning the repository you need to specify the `--recursive` flag as in the example above.
If the repository has been cloned without submodules, to download them you need to run the following: -``` -git submodule init -git submodule update -``` -You can check status with command: `git submodule status`. +Please note that ClickHouse repository uses `submodules`. That is what the references to additional repositories are called (i.e. external libraries on which the project depends). It means that when cloning the repository you need to specify the `--recursive` flag as in the example above. If the repository has been cloned without submodules, to download them you need to run the following: + + git submodule init + git submodule update + +You can check the status with the command: `git submodule status`. If you get the following error message: -``` -Permission denied (publickey). -fatal: Could not read from remote repository. -Please make sure you have the correct access rights -and the repository exists. -``` + Permission denied (publickey). + fatal: Could not read from remote repository. + + Please make sure you have the correct access rights + and the repository exists. + It generally means that the SSH keys for connecting to GitHub are missing. These keys are normally located in `~/.ssh`. For SSH keys to be accepted you need to upload them in the settings section of GitHub UI. You can also clone the repository via https protocol: -``` -git clone https://github.com/ClickHouse/ClickHouse.git -``` -This however will not let you send your changes to the server. You can still use it temporarily and add the SSH keys later replacing the remote address of the repository with `git remote` command. -You can also add original ClickHouse repo's address to your local repository to pull updates from there: -``` -git remote add upstream git@github.com:ClickHouse/ClickHouse.git -``` + git clone https://github.com/ClickHouse/ClickHouse.git + +This, however, will not let you send your changes to the server. You can still use it temporarily and add the SSH keys later replacing the remote address of the repository with `git remote` command. + +You can also add original ClickHouse repo’s address to your local repository to pull updates from there: + + git remote add upstream git@github.com:ClickHouse/ClickHouse.git + After successfully running this command you will be able to pull updates from the main ClickHouse repo by running `git pull upstream master`. - -## Working with submodules +## Working with submodules {#working-with-submodules} Working with submodules in git could be painful. Next commands will help to manage it: -``` -# ! each command accepts --recursive -# Update remote URLs for submodules. Barely rare case -git submodule sync -# Add new submodules -git submodule init -# Update existing submodules to the current state -git submodule update -# Two last commands could be merged together -git submodule update --init -``` + # ! each command accepts --recursive + # Update remote URLs for submodules. Barely rare case + git submodule sync + # Add new submodules + git submodule init + # Update existing submodules to the current state + git submodule update + # Two last commands could be merged together + git submodule update --init -The next commands would help you to reset all submodules to the initial state (!WARING! - any chenges inside will be deleted): +The next commands would help you to reset all submodules to the initial state (!WARNING! 
- any changes inside will be deleted): -``` -# Synchronizes submodules' remote URL with .gitmodules -git submodule sync --recursive -# Update the registered submodules with initialize not yet initialized -git submodule update --init --recursive -# Reset all changes done after HEAD -git submodule foreach git reset --hard -# Clean files from .gitignore -git submodule foreach git clean -xfd -# Repeat last 4 commands for all submodule -git submodule foreach git submodule sync --recursive -git submodule foreach git submodule update --init --recursive -git submodule foreach git submodule foreach git reset --hard -git submodule foreach git submodule foreach git clean -xfd -``` + # Synchronizes submodules' remote URL with .gitmodules + git submodule sync --recursive + # Update the registered submodules with initialize not yet initialized + git submodule update --init --recursive + # Reset all changes done after HEAD + git submodule foreach git reset --hard + # Clean files from .gitignore + git submodule foreach git clean -xfd + # Repeat last 4 commands for all submodule + git submodule foreach git submodule sync --recursive + git submodule foreach git submodule update --init --recursive + git submodule foreach git submodule foreach git reset --hard + git submodule foreach git submodule foreach git clean -xfd - -# Build System +# Build System {#build-system} ClickHouse uses CMake and Ninja for building. CMake - a meta-build system that can generate Ninja files (build tasks). -Ninja - a smaller build system with focus on speed used to execute those cmake generated tasks. +Ninja - a smaller build system with a focus on the speed used to execute those cmake generated tasks. To install on Ubuntu, Debian or Mint run `sudo apt install cmake ninja-build`. @@ -125,20 +116,17 @@ On CentOS, RedHat run `sudo yum install cmake ninja-build`. If you use Arch or Gentoo, you probably know it yourself how to install CMake. For installing CMake and Ninja on Mac OS X first install Homebrew and then install everything else via brew: -``` -/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" -brew install cmake ninja -``` + + /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" + brew install cmake ninja Next, check the version of CMake: `cmake --version`. If it is below 3.3, you should install a newer version from the website: https://cmake.org/download/. - -# Optional External Libraries +# Optional External Libraries {#optional-external-libraries} ClickHouse uses several external libraries for building. All of them do not need to be installed separately as they are built together with ClickHouse from the sources located in the submodules. You can check the list in `contrib`. - -# C++ Compiler +# C++ Compiler {#c-compiler} Compilers GCC starting from version 9 and Clang version 8 or above are supported for building ClickHouse. @@ -146,120 +134,107 @@ Official Yandex builds currently use GCC because it generates machine code of sl To install GCC on Ubuntu run: `sudo apt install gcc g++` -Check the version of gcc: `gcc --version`. If it is below 9, then follow the instruction here: https://clickhouse.tech/docs/en/development/build/#install-gcc-9. +Check the version of gcc: `gcc --version`. If it is below 9, then follow the instruction here: https://clickhouse.tech/docs/en/development/build/\#install-gcc-9. Mac OS X build is supported only for Clang. 
Just run `brew install llvm` If you decide to use Clang, you can also install `libc++` and `lld`, if you know what it is. Using `ccache` is also recommended. - -# The Building process +# The Building process {#the-building-process} Now that you are ready to build ClickHouse we recommend you to create a separate directory `build` inside `ClickHouse` that will contain all of the build artefacts: -``` -mkdir build -cd build -``` -You can have several different directories (build_release, build_debug, etc.) for different types of build. -While inside the `build` directory, configure your build by running CMake. Before the first run you need to define environment variables that specify compiler (version 9 gcc compiler in this example). + mkdir build + cd build + +You can have several different directories (build\_release, build\_debug, etc.) for different types of build. + +While inside the `build` directory, configure your build by running CMake. Before the first run, you need to define environment variables that specify compiler (version 9 gcc compiler in this example). Linux: -``` -export CC=gcc-9 CXX=g++-9 -cmake .. -``` + + export CC=gcc-9 CXX=g++-9 + cmake .. Mac OS X: -``` -export CC=clang CXX=clang++ -cmake .. -``` + + export CC=clang CXX=clang++ + cmake .. The `CC` variable specifies the compiler for C (short for C Compiler), and `CXX` variable instructs which C++ compiler is to be used for building. -For a faster build you can resort to the `debug` build type - a build with no optimizations. For that supply the following parameter `-D CMAKE_BUILD_TYPE=Debug`: -``` -cmake -D CMAKE_BUILD_TYPE=Debug .. -``` +For a faster build, you can resort to the `debug` build type - a build with no optimizations. For that supply the following parameter `-D CMAKE_BUILD_TYPE=Debug`: + + cmake -D CMAKE_BUILD_TYPE=Debug .. + You can change the type of build by running this command in the `build` directory. Run ninja to build: -``` -ninja clickhouse-server clickhouse-client -``` + + ninja clickhouse-server clickhouse-client + Only the required binaries are going to be built in this example. If you require to build all the binaries (utilities and tests), you should run ninja with no parameters: -``` -ninja -``` + + ninja Full build requires about 30GB of free disk space or 15GB to build the main binaries. -When large amount of RAM is available on build machine you should limit the number of build tasks run in parallel with `-j` param: -``` -ninja -j 1 clickhouse-server clickhouse-client -``` -On machines with 4GB of RAM it is recommended to specify 1, for 8GB of RAM `-j 2` is recommended. +When the amount of RAM available on the build machine is limited, you should restrict the number of build tasks run in parallel with the `-j` param: + + ninja -j 1 clickhouse-server clickhouse-client + +On machines with 4GB of RAM, it is recommended to specify 1, for 8GB of RAM `-j 2` is recommended. If you get the message: `ninja: error: loading 'build.ninja': No such file or directory`, it means that generating a build configuration has failed and you need to inspect the message above. -Upon successful start of the building process you'll see the build progress - the number of processed tasks and the total number of tasks. +Upon the successful start of the building process, you’ll see the build progress - the number of processed tasks and the total number of tasks. While building messages about protobuf files in libhdfs2 library like `libprotobuf WARNING` may show up. They affect nothing and are safe to be ignored.
Upon successful build you get an executable file `ClickHouse//dbms/programs/clickhouse`: -``` -ls -l dbms/programs/clickhouse -``` + ls -l dbms/programs/clickhouse -# Running the built executable of ClickHouse +# Running the built executable of ClickHouse {#running-the-built-executable-of-clickhouse} To run the server under the current user you need to navigate to `ClickHouse/dbms/programs/server/` (located outside of `build`) and run: -``` -../../../build/dbms/programs/clickhouse server -``` + ../../../build/dbms/programs/clickhouse server -In this case ClickHouse will use config files located in the current directory. You can run `clickhouse server` from any directory specifying the path to a config file as a command line parameter `--config-file`. +In this case, ClickHouse will use config files located in the current directory. You can run `clickhouse server` from any directory specifying the path to a config file as a command-line parameter `--config-file`. To connect to ClickHouse with clickhouse-client in another terminal navigate to `ClickHouse/build/dbms/programs/` and run `clickhouse client`. If you get `Connection refused` message on Mac OS X or FreeBSD, try specifying host address 127.0.0.1: -``` -clickhouse client --host 127.0.0.1 -``` -You can replace production version of ClickHouse binary installed in your system with your custom built ClickHouse binary. To do that install ClickHouse on your machine following the instructions from the official website. Next, run the following: -``` -sudo service clickhouse-server stop -sudo cp ClickHouse/build/dbms/programs/clickhouse /usr/bin/ -sudo service clickhouse-server start -``` + clickhouse client --host 127.0.0.1 + +You can replace the production version of ClickHouse binary installed in your system with your custom-built ClickHouse binary. To do that install ClickHouse on your machine following the instructions from the official website. Next, run the following: + + sudo service clickhouse-server stop + sudo cp ClickHouse/build/dbms/programs/clickhouse /usr/bin/ + sudo service clickhouse-server start Note that `clickhouse-client`, `clickhouse-server` and others are symlinks to the commonly shared `clickhouse` binary. -You can also run your custom built ClickHouse binary with the config file from the ClickHouse package installed on your system: -``` -sudo service clickhouse-server stop -sudo -u clickhouse ClickHouse/build/dbms/programs/clickhouse server --config-file /etc/clickhouse-server/config.xml -``` +You can also run your custom-built ClickHouse binary with the config file from the ClickHouse package installed on your system: + sudo service clickhouse-server stop + sudo -u clickhouse ClickHouse/build/dbms/programs/clickhouse server --config-file /etc/clickhouse-server/config.xml -# IDE (Integrated Development Environment) +# IDE (Integrated Development Environment) {#ide-integrated-development-environment} -If you do not know which IDE to use, we recommend that you use CLion. CLion is a commercial software, but it offers 30 day free trial period. It is also free of charge for students. CLion can be used both on Linux and on Mac OS X. +If you do not know which IDE to use, we recommend that you use CLion. CLion is commercial software, but it offers 30 days free trial period. It is also free of charge for students. CLion can be used both on Linux and on Mac OS X. -KDevelop and QTCreator are another great alternatives of an IDE for developing ClickHouse. KDevelop comes in as a very handy IDE although unstable. 
If KDevelop crashes after a while upon opening project, you should click "Stop All" button as soon as it has opened the list of project's files. After doing so KDevelop should be fine to work with. +KDevelop and QTCreator are other great alternatives of an IDE for developing ClickHouse. KDevelop comes in as a very handy IDE although unstable. If KDevelop crashes after a while upon opening project, you should click “Stop All” button as soon as it has opened the list of project’s files. After doing so KDevelop should be fine to work with. -As simple code editors you can use Sublime Text or Visual Studio Code, or Kate (all of which are available on Linux). +As simple code editors, you can use Sublime Text or Visual Studio Code, or Kate (all of which are available on Linux). -Just in case, it is worth mentioning that CLion creates `build` path on its own, it also on its own selects `debug` for build type, for configuration it uses a version of CMake that is defined in CLion and not the one installed by you, and finally CLion will use `make` to run build tasks instead of `ninja`. This is a normal behaviour, just keep that in mind to avoid confusion. +Just in case, it is worth mentioning that CLion creates `build` path on its own, it also on its own selects `debug` for build type, for configuration it uses a version of CMake that is defined in CLion and not the one installed by you, and finally, CLion will use `make` to run build tasks instead of `ninja`. This is normal behaviour, just keep that in mind to avoid confusion. - -# Writing Code +# Writing Code {#writing-code} The description of ClickHouse architecture can be found here: https://clickhouse.tech/docs/en/development/architecture/ @@ -267,42 +242,37 @@ The Code Style Guide: https://clickhouse.tech/docs/en/development/style/ Writing tests: https://clickhouse.tech/docs/en/development/tests/ -List of tasks: https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/instructions/easy_tasks_sorted_en.md +List of tasks: https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/instructions/easy\_tasks\_sorted\_en.md +# Test Data {#test-data} -# Test Data +Developing ClickHouse often requires loading realistic datasets. It is particularly important for performance testing. We have a specially prepared set of anonymized data from Yandex.Metrica. It requires additionally some 3GB of free disk space. Note that this data is not required to accomplish most of the development tasks. -Developing ClickHouse often requires loading realistic datasets. It is particularly important for performance testing. We have a specially prepared set of anonymized data from Yandex.Metrica. It requires additionally some 3GB of free disk space. Note that this data is not required to accomplish most of development tasks. 
+ sudo apt install wget xz-utils -``` -sudo apt install wget xz-utils + wget https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz + wget https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz -wget https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz -wget https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz + xz -v -d hits_v1.tsv.xz + xz -v -d visits_v1.tsv.xz -xz -v -d hits_v1.tsv.xz -xz -v -d visits_v1.tsv.xz + clickhouse-client -clickhouse-client + CREATE TABLE test.hits ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree PARTITION BY toYYYYMM(EventDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID), 
EventTime); -CREATE TABLE test.hits ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree PARTITION BY toYYYYMM(EventDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID), EventTime); + CREATE TABLE test.visits ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories 
Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), `Goals.ID` Array(UInt32), `Goals.Serial` Array(UInt32), `Goals.EventTime` Array(DateTime), `Goals.Price` Array(Int64), `Goals.OrderID` Array(String), `Goals.CurrencyID` Array(UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, `TraficSource.ID` Array(Int8), `TraficSource.SearchEngineID` Array(UInt16), `TraficSource.AdvEngineID` Array(UInt8), `TraficSource.PlaceID` Array(UInt16), `TraficSource.SocialSourceNetworkID` Array(UInt8), `TraficSource.Domain` Array(String), `TraficSource.SearchPhrase` Array(String), `TraficSource.SocialSourcePage` Array(String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), `Market.Type` Array(UInt8), `Market.GoalID` Array(UInt32), `Market.OrderID` 
Array(String), `Market.OrderPrice` Array(Int64), `Market.PP` Array(UInt32), `Market.DirectPlaceID` Array(UInt32), `Market.DirectOrderID` Array(UInt32), `Market.DirectBannerID` Array(UInt32), `Market.GoodID` Array(String), `Market.GoodName` Array(String), `Market.GoodQuantity` Array(Int32), `Market.GoodPrice` Array(Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID); -CREATE TABLE test.visits ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), `Goals.ID` Array(UInt32), `Goals.Serial` Array(UInt32), `Goals.EventTime` Array(DateTime), `Goals.Price` Array(Int64), `Goals.OrderID` Array(String), `Goals.CurrencyID` Array(UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, `TraficSource.ID` Array(Int8), `TraficSource.SearchEngineID` Array(UInt16), `TraficSource.AdvEngineID` Array(UInt8), `TraficSource.PlaceID` Array(UInt16), `TraficSource.SocialSourceNetworkID` 
Array(UInt8), `TraficSource.Domain` Array(String), `TraficSource.SearchPhrase` Array(String), `TraficSource.SocialSourcePage` Array(String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), `Market.Type` Array(UInt8), `Market.GoalID` Array(UInt32), `Market.OrderID` Array(String), `Market.OrderPrice` Array(Int64), `Market.PP` Array(UInt32), `Market.DirectPlaceID` Array(UInt32), `Market.DirectOrderID` Array(UInt32), `Market.DirectBannerID` Array(UInt32), `Market.GoodID` Array(String), `Market.GoodName` Array(String), `Market.GoodQuantity` Array(Int32), `Market.GoodPrice` Array(Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID); + clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.hits FORMAT TSV" < hits_v1.tsv + clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.visits FORMAT TSV" < visits_v1.tsv -clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.hits FORMAT TSV" < hits_v1.tsv -clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.visits FORMAT TSV" < visits_v1.tsv -``` +# Creating Pull Request {#creating-pull-request} +Navigate to your fork repository in GitHub’s UI. If you have been developing in a branch, you need to select that branch. There will be a “Pull request” button located on the screen. In essence, this means “create a request for accepting my changes into the main repository”. +A pull request can be created even if the work is not completed yet. In this case please put the word “WIP” (work in progress) at the beginning of the title, it can be changed later. This is useful for cooperative reviewing and discussion of changes as well as for running all of the available tests. It is important that you provide a brief description of your changes, it will later be used for generating release changelogs. -# Creating Pull Request +Testing will commence as soon as Yandex employees label your PR with a tag “can be tested”. The results of some first checks (e.g. code style) will come in within several minutes. Build check results will arrive within half an hour. And the main set of tests will report itself within an hour. -Navigate to your fork repository in GitHub's UI. If you have been developing in a branch, you need to select that branch. There will be a "Pull request" button located on the screen. In essence this means "create a request for accepting my changes into the main repository". +The system will prepare ClickHouse binary builds for your pull request individually. To retrieve these builds click the “Details” link next to “ClickHouse build check” entry in the list of checks. 
There you will find direct links to the built .deb packages of ClickHouse which you can deploy even on your production servers (if you have no fear). -A pull request can be created even if the work is not completed yet. In this case please put the word "WIP" (work in progress) at the beginning of the title, it can be changed later. This is useful for cooperative reviewing and discussion of changes as well as for running all of the available tests. It is important that you provide a brief description of your changes, it will later be used for generating realease changelogs. - -Testing will commence as soon as Yandex employees label your PR with a tag "can be tested". The results of some first checks (e.g. code style) will come in within several minutes. Build check results will arrive within a half an hour. And the main set of tests will report itself within an hour. - -The system will prepare ClickHouse binary builds for your pull request individually. To retrieve these builds click the "Details" link next to "ClickHouse build check" entry in the list of checks. There you will find direct links to the built .deb packages of ClickHouse which you can deploy even on your production servers (if you have no fear). - -Most probably some of the builds will fail at first times. This is due to the fact that we check builds both with gcc as well as with clang, with almost all of existing warnings (always with the `-Werror` flag) enabled for clang. On that same page you can find all of the build logs so that you do not have to build ClickHouse in all of the possible ways. +Most probably some of the builds will fail at first times. This is due to the fact that we check builds both with gcc as well as with clang, with almost all of existing warnings (always with the `-Werror` flag) enabled for clang. On that same page, you can find all of the build logs so that you do not have to build ClickHouse in all of the possible ways. diff --git a/docs/en/development/index.md b/docs/en/development/index.md index 19074d6f102..a18972cdaa2 100644 --- a/docs/en/development/index.md +++ b/docs/en/development/index.md @@ -1,4 +1,3 @@ -# ClickHouse Development - +# ClickHouse Development {#clickhouse-development} [Original article](https://clickhouse.tech/docs/en/development/) diff --git a/docs/en/development/style.md b/docs/en/development/style.md index e896b1356ac..645fe5210c3 100644 --- a/docs/en/development/style.md +++ b/docs/en/development/style.md @@ -1,6 +1,6 @@ -# How to Write C++ Code +# How to Write C++ Code {#how-to-write-c-code} -## General Recommendations +## General Recommendations {#general-recommendations} **1.** The following are recommendations, not requirements. @@ -10,7 +10,7 @@ **4.** Many of the rules do not have logical reasons; they are dictated by established practices. -## Formatting +## Formatting {#formatting} **1.** Most of the formatting will be done automatically by `clang-format`. @@ -18,7 +18,7 @@ **3.** Opening and closing curly brackets must be on a separate line. -```cpp +``` cpp inline void readBoolText(bool & x, ReadBuffer & buf) { char tmp = '0'; @@ -29,30 +29,30 @@ inline void readBoolText(bool & x, ReadBuffer & buf) **4.** If the entire function body is a single `statement`, it can be placed on a single line. Place spaces around curly braces (besides the space at the end of the line). -```cpp +``` cpp inline size_t mask() const { return buf_size() - 1; } inline size_t place(HashValue x) const { return x & mask(); } ``` -**5.** For functions. Don't put spaces around brackets. 
+**5.** For functions. Don’t put spaces around brackets. -```cpp +``` cpp void reinsert(const Value & x) ``` -```cpp +``` cpp memcpy(&buf[place_value], &x, sizeof(x)); ``` **6.** In `if`, `for`, `while` and other expressions, a space is inserted in front of the opening bracket (as opposed to function calls). -```cpp +``` cpp for (size_t i = 0; i < rows; i += storage.index_granularity) ``` -**7.** Add spaces around binary operators (`+`, `-`, `*`, `/`, `%`, ...) and the ternary operator `?:`. +**7.** Add spaces around binary operators (`+`, `-`, `*`, `/`, `%`, …) and the ternary operator `?:`. -```cpp +``` cpp UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); UInt8 month = (s[5] - '0') * 10 + (s[6] - '0'); UInt8 day = (s[8] - '0') * 10 + (s[9] - '0'); @@ -60,7 +60,7 @@ UInt8 day = (s[8] - '0') * 10 + (s[9] - '0'); **8.** If a line feed is entered, put the operator on a new line and increase the indent before it. -```cpp +``` cpp if (elapsed_ns) message << " (" << rows_read_on_server * 1000000000 / elapsed_ns << " rows/s., " @@ -69,17 +69,17 @@ if (elapsed_ns) **9.** You can use spaces for alignment within a line, if desired. -```cpp +``` cpp dst.ClickLogID = click.LogID; dst.ClickEventID = click.EventID; dst.ClickGoodEvent = click.GoodEvent; ``` -**10.** Don't use spaces around the operators `.`, `->`. +**10.** Don’t use spaces around the operators `.`, `->`. If necessary, the operator can be wrapped to the next line. In this case, the offset in front of it is increased. -**11.** Do not use a space to separate unary operators (`--`, `++`, `*`, `&`, ...) from the argument. +**11.** Do not use a space to separate unary operators (`--`, `++`, `*`, `&`, …) from the argument. **12.** Put a space after a comma, but not before it. The same rule goes for a semicolon inside a `for` expression. @@ -87,7 +87,7 @@ If necessary, the operator can be wrapped to the next line. In this case, the of **14.** In a `template <...>` expression, use a space between `template` and `<`; no spaces after `<` or before `>`. -```cpp +``` cpp template struct AggregatedStatElement {} @@ -95,7 +95,7 @@ struct AggregatedStatElement **15.** In classes and structures, write `public`, `private`, and `protected` on the same level as `class/struct`, and indent the rest of the code. -```cpp +``` cpp template class MultiVersion { @@ -106,25 +106,25 @@ public: } ``` -**16.** If the same `namespace` is used for the entire file, and there isn't anything else significant, an offset is not necessary inside `namespace`. +**16.** If the same `namespace` is used for the entire file, and there isn’t anything else significant, an offset is not necessary inside `namespace`. -**17.** If the block for an `if`, `for`, `while`, or other expression consists of a single `statement`, the curly brackets are optional. Place the `statement` on a separate line, instead. This rule is also valid for nested `if`, `for`, `while`, ... +**17.** If the block for an `if`, `for`, `while`, or other expression consists of a single `statement`, the curly brackets are optional. Place the `statement` on a separate line, instead. This rule is also valid for nested `if`, `for`, `while`, … But if the inner `statement` contains curly brackets or `else`, the external block should be written in curly brackets. -```cpp +``` cpp /// Finish write. for (auto & stream : streams) stream.second->finalize(); ``` -**18.** There shouldn't be any spaces at the ends of lines. +**18.** There shouldn’t be any spaces at the ends of lines. 
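As an illustrative addition from the editor (not part of the original guide), the hypothetical fragment below is meant to follow the spacing rules above: a space before the opening bracket of `for` and `if`, a space after each comma, no spaces around `.`, unary operators attached to their argument, and single-statement blocks placed on their own line without curly brackets. The names `columns` and `res` are made up for this example.

``` cpp
/// Space before the opening bracket in `for`/`if`, a space after each comma,
/// no spaces around `.`, and `!`/`++` attached to their argument.
for (size_t i = 0; i < columns.size(); ++i)
    if (!columns[i].empty())
        res.insert(res.end(), columns[i].begin(), columns[i].end());
```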
**19.** Source files are UTF-8 encoded. **20.** Non-ASCII characters can be used in string literals. -```cpp +``` cpp << ", " << (timer.elapsed() / chunks_stats.hits) << " μsec/hit."; ``` @@ -136,7 +136,7 @@ for (auto & stream : streams) **24.** `A const` (related to a value) must be written before the type name. -```cpp +``` cpp //correct const char * pos const std::string & s @@ -146,7 +146,7 @@ char const * pos **25.** When declaring a pointer or reference, the `*` and `&` symbols should be separated by spaces on both sides. -```cpp +``` cpp //correct const char * pos //incorrect @@ -156,11 +156,11 @@ const char *pos **26.** When using template types, alias them with the `using` keyword (except in the simplest cases). -In other words, the template parameters are specified only in `using` and aren't repeated in the code. +In other words, the template parameters are specified only in `using` and aren’t repeated in the code. `using` can be declared locally, such as inside a function. -```cpp +``` cpp //correct using FileStreams = std::map>; FileStreams streams; @@ -170,14 +170,14 @@ std::map> streams; **27.** Do not declare several variables of different types in one statement. -```cpp +``` cpp //incorrect int x, *y; ``` **28.** Do not use C-style casts. -```cpp +``` cpp //incorrect std::cerr << (int)c <<; std::endl; //correct @@ -190,23 +190,23 @@ std::cerr << static_cast(c) << std::endl; The same is true for small methods in any classes or structs. -For templated classes and structs, don't separate the method declarations from the implementation (because otherwise they must be defined in the same translation unit). +For templated classes and structs, don’t separate the method declarations from the implementation (because otherwise they must be defined in the same translation unit). **31.** You can wrap lines at 140 characters, instead of 80. **32.** Always use the prefix increment/decrement operators if postfix is not required. -```cpp +``` cpp for (Names::const_iterator it = column_names.begin(); it != column_names.end(); ++it) ``` -## Comments +## Comments {#comments} **1.** Be sure to add comments for all non-trivial parts of code. -This is very important. Writing the comment might help you realize that the code isn't necessary, or that it is designed wrong. +This is very important. Writing the comment might help you realize that the code isn’t necessary, or that it is designed wrong. -```cpp +``` cpp /** Part of piece of memory, that can be used. * For example, if internal_buffer is 1MB, and there was only 10 bytes loaded to buffer from file for reading, * then working_buffer will have size of only 10 bytes @@ -218,7 +218,7 @@ This is very important. Writing the comment might help you realize that the code **3.** Place comments before the code they describe. In rare cases, comments can come after the code, on the same line. -```cpp +``` cpp /** Parses and executes the query. */ void executeQuery( @@ -236,7 +236,7 @@ void executeQuery( **6.** Do not add comments that do not provide additional information. In particular, do not leave empty comments like this: -```cpp +``` cpp /* * Procedure Name: * Original procedure name: @@ -257,11 +257,11 @@ void executeQuery( */ ``` -The example is borrowed from the resource [http://home.tamk.fi/~jaalto/course/coding-style/doc/unmaintainable-code/](http://home.tamk.fi/~jaalto/course/coding-style/doc/unmaintainable-code/). +The example is borrowed from the resource http://home.tamk.fi/~jaalto/course/coding-style/doc/unmaintainable-code/. 
**7.** Do not write garbage comments (author, creation date ..) at the beginning of each file. -**8.** Single-line comments begin with three slashes: `///` and multi-line comments begin with `/**`. These comments are considered "documentation". +**8.** Single-line comments begin with three slashes: `///` and multi-line comments begin with `/**`. These comments are considered “documentation”. Note: You can use Doxygen to generate documentation from these comments. But Doxygen is not generally used because it is more convenient to navigate the code in the IDE. @@ -275,45 +275,45 @@ Note: You can use Doxygen to generate documentation from these comments. But Dox **13.** Do not use uppercase letters. Do not use excessive punctuation. -```cpp +``` cpp /// WHAT THE FAIL??? ``` **14.** Do not use comments to make delimeters. -```cpp +``` cpp ///****************************************************** ``` **15.** Do not start discussions in comments. -```cpp +``` cpp /// Why did you do this stuff? ``` -**16.** There's no need to write a comment at the end of a block describing what it was about. +**16.** There’s no need to write a comment at the end of a block describing what it was about. -```cpp +``` cpp /// for ``` -## Names +## Names {#names} **1.** Use lowercase letters with underscores in the names of variables and class members. -```cpp +``` cpp size_t max_block_size; ``` **2.** For the names of functions (methods), use camelCase beginning with a lowercase letter. -```cpp +``` cpp std::string getName() const override { return "Memory"; } ``` **3.** For the names of classes (structs), use CamelCase beginning with an uppercase letter. Prefixes other than I are not used for interfaces. -```cpp +``` cpp class StorageMemory : public IStorage ``` @@ -323,21 +323,21 @@ class StorageMemory : public IStorage For more complex cases, either follow the rules for class names, or add the prefix `T`. -```cpp +``` cpp template struct AggregatedStatElement ``` **6.** Names of template constant arguments: either follow the rules for variable names, or use `N` in simple cases. -```cpp +``` cpp template struct ExtractDomain ``` **7.** For abstract classes (interfaces) you can add the `I` prefix. -```cpp +``` cpp class IBlockInputStream ``` @@ -345,13 +345,13 @@ class IBlockInputStream In all other cases, use a name that describes the meaning. -```cpp +``` cpp bool info_successfully_loaded = false; ``` -**9.** Names of `define`s and global constants use ALL_CAPS with underscores. +**9.** Names of `define`s and global constants use ALL\_CAPS with underscores. -```cpp +``` cpp #define MAX_SRC_TABLE_NAMES_TO_STORE 1000 ``` @@ -363,12 +363,12 @@ If the file contains a single function, name the file the same way as the functi **11.** If the name contains an abbreviation, then: -- For variable names, the abbreviation should use lowercase letters `mysql_connection` (not `mySQL_connection`). -- For names of classes and functions, keep the uppercase letters in the abbreviation`MySQLConnection` (not `MySqlConnection`). +- For variable names, the abbreviation should use lowercase letters `mysql_connection` (not `mySQL_connection`). +- For names of classes and functions, keep the uppercase letters in the abbreviation`MySQLConnection` (not `MySqlConnection`). **12.** Constructor arguments that are used just to initialize the class members should be named the same way as the class members, but with an underscore at the end. 
-```cpp +``` cpp FileQueueProcessor( const std::string & path_, const std::string & prefix_, @@ -385,13 +385,13 @@ The underscore suffix can be omitted if the argument is not used in the construc **13.** There is no difference in the names of local variables and class members (no prefixes required). -```cpp +``` cpp timer (not m_timer) ``` -**14.** For the constants in an `enum`, use CamelCase with a capital letter. ALL_CAPS is also acceptable. If the `enum` is non-local, use an `enum class`. +**14.** For the constants in an `enum`, use CamelCase with a capital letter. ALL\_CAPS is also acceptable. If the `enum` is non-local, use an `enum class`. -```cpp +``` cpp enum class CompressionMethod { QuickLZ = 0, @@ -401,17 +401,13 @@ enum class CompressionMethod **15.** All names must be in English. Transliteration of Russian words is not allowed. -``` -not Stroka -``` + not Stroka **16.** Abbreviations are acceptable if they are well known (when you can easily find the meaning of the abbreviation in Wikipedia or in a search engine). -``` -`AST`, `SQL`. + `AST`, `SQL`. -Not `NVDH` (some random letters) -``` + Not `NVDH` (some random letters) Incomplete words are acceptable if the shortened version is common use. @@ -419,7 +415,7 @@ You can also use an abbreviation if the full name is included next to it in the **17.** File names with C++ source code must have the `.cpp` extension. Header files must have the `.h` extension. -## How to Write Code +## How to Write Code {#how-to-write-code} **1.** Memory management. @@ -431,9 +427,9 @@ In application code, memory must be freed by the object that owns it. Examples: -- The easiest way is to place an object on the stack, or make it a member of another class. -- For a large number of small objects, use containers. -- For automatic deallocation of a small number of objects that reside in the heap, use `shared_ptr/unique_ptr`. +- The easiest way is to place an object on the stack, or make it a member of another class. +- For a large number of small objects, use containers. +- For automatic deallocation of a small number of objects that reside in the heap, use `shared_ptr/unique_ptr`. **2.** Resource management. @@ -441,15 +437,15 @@ Use `RAII` and see above. **3.** Error handling. -Use exceptions. In most cases, you only need to throw an exception, and don't need to catch it (because of `RAII`). +Use exceptions. In most cases, you only need to throw an exception, and don’t need to catch it (because of `RAII`). -In offline data processing applications, it's often acceptable to not catch exceptions. +In offline data processing applications, it’s often acceptable to not catch exceptions. -In servers that handle user requests, it's usually enough to catch exceptions at the top level of the connection handler. +In servers that handle user requests, it’s usually enough to catch exceptions at the top level of the connection handler. In thread functions, you should catch and keep all exceptions to rethrow them in the main thread after `join`. -```cpp +``` cpp /// If there weren't any calculations yet, calculate the first block synchronously if (!started) { @@ -465,14 +461,14 @@ if (exception) Never hide exceptions without handling. Never just blindly put all exceptions to log. -```cpp +``` cpp //Not correct catch (...) {} ``` If you need to ignore some exceptions, do so only for specific ones and rethrow the rest. 
-```cpp +``` cpp catch (const DB::Exception & e) { if (e.code() == ErrorCodes::UNKNOWN_AGGREGATE_FUNCTION) @@ -484,7 +480,7 @@ catch (const DB::Exception & e) When using functions with response codes or `errno`, always check the result and throw an exception in case of error. -```cpp +``` cpp if (0 != close(fd)) throwFromErrno("Cannot close file " + file_name, ErrorCodes::CANNOT_CLOSE_FILE); ``` @@ -501,16 +497,16 @@ This is not recommended, but it is allowed. Use the following options: -- Create a function (`done()` or `finalize()`) that will do all the work in advance that might lead to an exception. If that function was called, there should be no exceptions in the destructor later. -- Tasks that are too complex (such as sending messages over the network) can be put in separate method that the class user will have to call before destruction. -- If there is an exception in the destructor, it’s better to log it than to hide it (if the logger is available). -- In simple applications, it is acceptable to rely on `std::terminate` (for cases of `noexcept` by default in C++11) to handle exceptions. +- Create a function (`done()` or `finalize()`) that will do all the work in advance that might lead to an exception. If that function was called, there should be no exceptions in the destructor later. +- Tasks that are too complex (such as sending messages over the network) can be put in separate method that the class user will have to call before destruction. +- If there is an exception in the destructor, it’s better to log it than to hide it (if the logger is available). +- In simple applications, it is acceptable to rely on `std::terminate` (for cases of `noexcept` by default in C++11) to handle exceptions. **6.** Anonymous code blocks. You can create a separate code block inside a single function in order to make certain variables local, so that the destructors are called when exiting the block. -```cpp +``` cpp Block block = data.in->read(); { @@ -526,11 +522,11 @@ ready_any.set(); In offline data processing programs: -- Try to get the best possible performance on a single CPU core. You can then parallelize your code if necessary. +- Try to get the best possible performance on a single CPU core. You can then parallelize your code if necessary. In server applications: -- Use the thread pool to process requests. At this point, we haven't had any tasks that required userspace context switching. +- Use the thread pool to process requests. At this point, we haven’t had any tasks that required userspace context switching. Fork is not used for parallelization. @@ -566,7 +562,7 @@ Use `unsigned` if necessary. Use the types `UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, and `Int64`, as well as `size_t`, `ssize_t`, and `ptrdiff_t`. -Don't use these types for numbers: `signed/unsigned long`, `long long`, `short`, `signed/unsigned char`, `char`. +Don’t use these types for numbers: `signed/unsigned long`, `long long`, `short`, `signed/unsigned char`, `char`. **13.** Passing arguments. @@ -582,7 +578,7 @@ If the function allocates an object on heap and returns it, use `shared_ptr` or In rare cases you might need to return the value via an argument. In this case, the argument should be a reference. -```cpp +``` cpp using AggregateFunctionPtr = std::shared_ptr; /** Allows creating an aggregate function by its name. @@ -598,23 +594,23 @@ public: There is no need to use a separate `namespace` for application code. -Small libraries don't need this, either. 
+Small libraries don’t need this, either. For medium to large libraries, put everything in a `namespace`. -In the library's `.h` file, you can use `namespace detail` to hide implementation details not needed for the application code. +In the library’s `.h` file, you can use `namespace detail` to hide implementation details not needed for the application code. In a `.cpp` file, you can use a `static` or anonymous namespace to hide symbols. -Also, a `namespace` can be used for an `enum` to prevent the corresponding names from falling into an external `namespace` (but it's better to use an `enum class`). +Also, a `namespace` can be used for an `enum` to prevent the corresponding names from falling into an external `namespace` (but it’s better to use an `enum class`). **16.** Deferred initialization. -If arguments are required for initialization, then you normally shouldn't write a default constructor. +If arguments are required for initialization, then you normally shouldn’t write a default constructor. If later you’ll need to delay initialization, you can add a default constructor that will create an invalid object. Or, for a small number of objects, you can use `shared_ptr/unique_ptr`. -```cpp +``` cpp Loader(DB::Connection * connection_, const std::string & query, size_t max_block_size_); /// For deferred initialization @@ -651,7 +647,7 @@ Use UTF-8 encoding in the log. In rare cases you can use non-ASCII characters in **20.** Input-output. -Don't use `iostreams` in internal cycles that are critical for application performance (and never use `stringstream`). +Don’t use `iostreams` in internal cycles that are critical for application performance (and never use `stringstream`). Use the `DB/IO` library instead. @@ -669,13 +665,13 @@ Always use `#pragma once` instead of include guards. **24.** Do not use `trailing return type` for functions unless necessary. -```cpp +``` cpp [auto f() -> void;]{.strike} ``` **25.** Declaration and initialization of variables. -```cpp +``` cpp //right way std::string s = "Hello"; std::string s{"Hello"}; @@ -686,13 +682,13 @@ auto s = std::string{"Hello"}; **26.** For virtual functions, write `virtual` in the base class, but write `override` instead of `virtual` in descendent classes. -## Unused Features of C++ +## Unused Features of C++ {#unused-features-of-c} **1.** Virtual inheritance is not used. **2.** Exception specifiers from C++03 are not used. -## Platform +## Platform {#platform} **1.** We write code for a specific platform. @@ -706,7 +702,7 @@ The standard library is used (`libstdc++` or `libc++`). **4.**OS: Linux Ubuntu, not older than Precise. -**5.**Code is written for x86_64 CPU architecture. +**5.**Code is written for x86\_64 CPU architecture. The CPU instruction set is the minimum supported set among our servers. Currently, it is SSE 4.2. @@ -716,7 +712,7 @@ The CPU instruction set is the minimum supported set among our servers. Currentl **8.** Code is developed and debugged with release settings. -## Tools +## Tools {#tools} **1.** KDevelop is a good IDE. @@ -738,13 +734,13 @@ Though only selected revisions are considered workable. Use branches for this purpose. -If your code in the `master` branch is not buildable yet, exclude it from the build before the `push`. You'll need to finish it or remove it within a few days. +If your code in the `master` branch is not buildable yet, exclude it from the build before the `push`. You’ll need to finish it or remove it within a few days. 
**9.** For non-trivial changes, use branches and publish them on the server. **10.** Unused code is removed from the repository. -## Libraries +## Libraries {#libraries} **1.** The C++14 standard library is used (experimental extensions are allowed), as well as `boost` and `Poco` frameworks. @@ -754,19 +750,19 @@ If there is a good solution already available, then use it, even if it means you (But be prepared to remove bad libraries from code.) -**3.** You can install a library that isn't in the packages, if the packages don't have what you need or have an outdated version or the wrong type of compilation. +**3.** You can install a library that isn’t in the packages, if the packages don’t have what you need or have an outdated version or the wrong type of compilation. -**4.** If the library is small and doesn't have its own complex build system, put the source files in the `contrib` folder. +**4.** If the library is small and doesn’t have its own complex build system, put the source files in the `contrib` folder. **5.** Preference is always given to libraries that are already in use. -## General Recommendations +## General Recommendations {#general-recommendations-1} **1.** Write as little code as possible. **2.** Try the simplest solution. -**3.** Don't write code until you know how it's going to work and how the inner loop will function. +**3.** Don’t write code until you know how it’s going to work and how the inner loop will function. **4.** In the simplest cases, use `using` instead of classes or structs. @@ -774,11 +770,11 @@ If there is a good solution already available, then use it, even if it means you **6.** Code simplification is encouraged. Reduce the size of your code where possible. -## Additional Recommendations +## Additional Recommendations {#additional-recommendations} **1.** Explicitly specifying `std::` for types from `stddef.h` -is not recommended. In other words, we recommend writing `size_t` instead `std::size_t`, because it's shorter. +is not recommended. In other words, we recommend writing `size_t` instead `std::size_t`, because it’s shorter. It is acceptable to add `std::`. @@ -802,32 +798,32 @@ For example, use `memcpy` instead of `std::copy` for copying large chunks of mem Any of the following wrapping styles are allowed: -```cpp +``` cpp function( T1 x1, T2 x2) ``` -```cpp +``` cpp function( size_t left, size_t right, const & RangesInDataParts ranges, size_t limit) ``` -```cpp +``` cpp function(size_t left, size_t right, const & RangesInDataParts ranges, size_t limit) ``` -```cpp +``` cpp function(size_t left, size_t right, const & RangesInDataParts ranges, size_t limit) ``` -```cpp +``` cpp function( size_t left, size_t right, diff --git a/docs/en/development/tests.md b/docs/en/development/tests.md index bc692ce4041..e061f1c3144 100644 --- a/docs/en/development/tests.md +++ b/docs/en/development/tests.md @@ -1,7 +1,6 @@ -# ClickHouse Testing +# ClickHouse Testing {#clickhouse-testing} - -## Functional Tests +## Functional Tests {#functional-tests} Functional tests are the most simple and convenient to use. Most of ClickHouse features can be tested with functional tests and they are mandatory to use for every change in ClickHouse code that can be tested that way. @@ -28,45 +27,39 @@ meaning. `long` is for tests that run slightly longer that one second. You can disable these groups of tests using `--no-zookeeper`, `--no-shard` and `--no-long` options, respectively. 
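For illustration, a hypothetical invocation that skips all three groups at once; it assumes the functional tests are driven by the `clickhouse-test` runner script from `dbms/tests`, which is not shown in this excerpt:

``` bash
# Run the functional test suite, skipping tests that need ZooKeeper,
# a configured cluster shard, or a long time to finish
# (the three options come from the text above; the runner name is an assumption).
./clickhouse-test --no-zookeeper --no-shard --no-long
```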
- -## Known bugs +## Known bugs {#known-bugs} If we know some bugs that can be easily reproduced by functional tests, we place prepared functional tests in `dbms/tests/queries/bugs` directory. These tests will be moved to `dbms/tests/queries/0_stateless` when bugs are fixed. - -## Integration Tests +## Integration Tests {#integration-tests} Integration tests allow to test ClickHouse in clustered configuration and ClickHouse interaction with other servers like MySQL, Postgres, MongoDB. They are useful to emulate network splits, packet drops, etc. These tests are run under Docker and create multiple containers with various software. See `dbms/tests/integration/README.md` on how to run these tests. -Note that integration of ClickHouse with third-party drivers is not tested. Also we currently don't have integration tests with our JDBC and ODBC drivers. +Note that integration of ClickHouse with third-party drivers is not tested. Also we currently don’t have integration tests with our JDBC and ODBC drivers. - -## Unit Tests +## Unit Tests {#unit-tests} Unit tests are useful when you want to test not the ClickHouse as a whole, but a single isolated library or class. You can enable or disable build of tests with `ENABLE_TESTS` CMake option. Unit tests (and other test programs) are located in `tests` subdirectories across the code. To run unit tests, type `ninja test`. Some tests use `gtest`, but some are just programs that return non-zero exit code on test failure. -It's not necessarily to have unit tests if the code is already covered by functional tests (and functional tests are usually much more simple to use). +It’s not necessarily to have unit tests if the code is already covered by functional tests (and functional tests are usually much more simple to use). - -## Performance Tests +## Performance Tests {#performance-tests} Performance tests allow to measure and compare performance of some isolated part of ClickHouse on synthetic queries. Tests are located at `dbms/tests/performance`. Each test is represented by `.xml` file with description of test case. Tests are run with `clickhouse performance-test` tool (that is embedded in `clickhouse` binary). See `--help` for invocation. -Each test run one or miltiple queries (possibly with combinations of parameters) in a loop with some conditions for stop (like "maximum execution speed is not changing in three seconds") and measure some metrics about query performance (like "maximum execution speed"). Some tests can contain preconditions on preloaded test dataset. +Each test run one or miltiple queries (possibly with combinations of parameters) in a loop with some conditions for stop (like “maximum execution speed is not changing in three seconds”) and measure some metrics about query performance (like “maximum execution speed”). Some tests can contain preconditions on preloaded test dataset. If you want to improve performance of ClickHouse in some scenario, and if improvements can be observed on simple queries, it is highly recommended to write a performance test. It always makes sense to use `perf top` or other perf tools during your tests. - -## Test Tools And Scripts +## Test Tools And Scripts {#test-tools-and-scripts} Some programs in `tests` directory are not prepared tests, but are test tools. For example, for `Lexer` there is a tool `dbms/src/Parsers/tests/lexer` that just do tokenization of stdin and writes colorized result to stdout. You can use these kind of tools as a code examples and for exploration and manual testing. 
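As a sketch of such manual exploration, a run of the `Lexer` tool might look like this (assuming the `lexer` binary has already been built and is invoked from the directory where it was produced):

``` bash
# Hypothetical run: tokenize a small query from stdin and print
# the colorized token stream to stdout (tool path follows the text above).
echo "SELECT 1 + 1 AS x" | ./dbms/src/Parsers/tests/lexer
```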
You can also place pair of files `.sh` and `.reference` along with the tool to run it on some predefined input - then script result can be compared to `.reference` file. These kind of tests are not automated. - -## Miscellanous Tests +## Miscellanous Tests {#miscellanous-tests} There are tests for external dictionaries located at `dbms/tests/external_dictionaries` and for machine learned models in `dbms/tests/external_models`. These tests are not updated and must be transferred to integration tests. @@ -74,8 +67,7 @@ There is separate test for quorum inserts. This test run ClickHouse cluster on s Quorum test was written by separate team before ClickHouse was open-sourced. This team no longer work with ClickHouse. Test was accidentially written in Java. For these reasons, quorum test must be rewritten and moved to integration tests. - -## Manual Testing +## Manual Testing {#manual-testing} When you develop a new feature, it is reasonable to also test it manually. You can do it with the following steps: @@ -86,47 +78,49 @@ Note that all clickhouse tools (server, client, etc) are just symlinks to a sing Alternatively you can install ClickHouse package: either stable release from Yandex repository or you can build package for yourself with `./release` in ClickHouse sources root. Then start the server with `sudo service clickhouse-server start` (or stop to stop the server). Look for logs at `/etc/clickhouse-server/clickhouse-server.log`. When ClickHouse is already installed on your system, you can build a new `clickhouse` binary and replace the existing binary: -```bash + +``` bash $ sudo service clickhouse-server stop $ sudo cp ./clickhouse /usr/bin/ $ sudo service clickhouse-server start ``` Also you can stop system clickhouse-server and run your own with the same configuration but with logging to terminal: -```bash + +``` bash $ sudo service clickhouse-server stop $ sudo -u clickhouse /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml ``` Example with gdb: -```bash + +``` bash $ sudo -u clickhouse gdb --args /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml ``` -If the system clickhouse-server is already running and you don't want to stop it, you can change port numbers in your `config.xml` (or override them in a file in `config.d` directory), provide appropriate data path, and run it. +If the system clickhouse-server is already running and you don’t want to stop it, you can change port numbers in your `config.xml` (or override them in a file in `config.d` directory), provide appropriate data path, and run it. `clickhouse` binary has almost no dependencies and works across wide range of Linux distributions. To quick and dirty test your changes on a server, you can simply `scp` your fresh built `clickhouse` binary to your server and then run it as in examples above. - -## Testing Environment +## Testing Environment {#testing-environment} Before publishing release as stable we deploy it on testing environment. Testing environment is a cluster that process 1/39 part of [Yandex.Metrica](https://metrica.yandex.com/) data. We share our testing environment with Yandex.Metrica team. ClickHouse is upgraded without downtime on top of existing data. We look at first that data is processed successfully without lagging from realtime, the replication continue to work and there is no issues visible to Yandex.Metrica team. 
First check can be done in the following way: -```sql +``` sql SELECT hostName() AS h, any(version()), any(uptime()), max(UTCEventTime), count() FROM remote('example01-01-{1..3}t', merge, hits) WHERE EventDate >= today() - 2 GROUP BY h ORDER BY h; ``` In some cases we also deploy to testing environment of our friend teams in Yandex: Market, Cloud, etc. Also we have some hardware servers that are used for development purposes. - -## Load Testing +## Load Testing {#load-testing} After deploying to testing environment we run load testing with queries from production cluster. This is done manually. Make sure you have enabled `query_log` on your production cluster. Collect query log for a day or more: -```bash + +``` bash $ clickhouse-client --query="SELECT DISTINCT query FROM system.query_log WHERE event_date = today() AND query LIKE '%ym:%' AND query NOT LIKE '%system.query_log%' AND type = 2 AND is_initial_query" > queries.tsv ``` @@ -134,19 +128,19 @@ This is a way complicated example. `type = 2` will filter queries that are execu `scp` this log to your testing cluster and run it as following: -```bash +``` bash $ clickhouse benchmark --concurrency 16 < queries.tsv ``` + (probably you also want to specify a `--user`) Then leave it for a night or weekend and go take a rest. -You should check that `clickhouse-server` doesn't crash, memory footprint is bounded and performance not degrading over time. +You should check that `clickhouse-server` doesn’t crash, memory footprint is bounded and performance not degrading over time. Precise query execution timings are not recorded and not compared due to high variability of queries and environment. - -## Build Tests +## Build Tests {#build-tests} Build tests allow to check that build is not broken on various alternative configurations and on some foreign systems. Tests are located at `ci` directory. They run build from source inside Docker, Vagrant, and sometimes with `qemu-user-static` inside Docker. These tests are under development and test runs are not automated. @@ -154,23 +148,21 @@ Motivation: Normally we release and run all tests on a single variant of ClickHouse build. But there are alternative build variants that are not thoroughly tested. Examples: -- build on FreeBSD; -- build on Debian with libraries from system packages; -- build with shared linking of libraries; -- build on AArch64 platform; -- build on PowerPc platform. +- build on FreeBSD; +- build on Debian with libraries from system packages; +- build with shared linking of libraries; +- build on AArch64 platform; +- build on PowerPc platform. For example, build with system packages is bad practice, because we cannot guarantee what exact version of packages a system will have. But this is really needed by Debian maintainers. For this reason we at least have to support this variant of build. Another example: shared linking is a common source of trouble, but it is needed for some enthusiasts. Though we cannot run all tests on all variant of builds, we want to check at least that various build variants are not broken. For this purpose we use build tests. - -## Testing For Protocol Compatibility +## Testing For Protocol Compatibility {#testing-for-protocol-compatibility} When we extend ClickHouse network protocol, we test manually that old clickhouse-client works with new clickhouse-server and new clickhouse-client works with old clickhouse-server (simply by running binaries from corresponding packages). 
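A minimal sketch of such a manual cross-version check, assuming `clickhouse-old` and `clickhouse-new` are binaries unpacked from the corresponding packages (the two names are hypothetical; the `server`/`client` invocation style follows the examples earlier in this document):

``` bash
# Start the new server, then query it with the old client (then swap the roles).
./clickhouse-new server --config-file=/etc/clickhouse-server/config.xml &
./clickhouse-old client --query "SELECT version()"
```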
- -## Help From The Compiler +## Help From The Compiler {#help-from-the-compiler} Main ClickHouse code (that is located in `dbms` directory) is built with `-Wall -Wextra -Werror` and with some additional enabled warnings. Although these options are not enabled for third-party libraries. @@ -178,8 +170,7 @@ Clang has even more useful warnings - you can look for them with `-Weverything` For production builds, gcc is used (it still generates slightly more efficient code than clang). For development, clang is usually more convenient to use. You can build on your own machine with debug mode (to save battery of your laptop), but please note that compiler is able to generate more warnings with `-O3` due to better control flow and inter-procedure analysis. When building with clang, `libc++` is used instead of `libstdc++` and when building with debug mode, debug version of `libc++` is used that allows to catch more errors at runtime. - -## Sanitizers +## Sanitizers {#sanitizers} **Address sanitizer**. We run functional and integration tests under ASan on per-commit basis. @@ -191,71 +182,63 @@ We run functional tests under Valgrind overnight. It takes multiple hours. Curre We run functional and integration tests under ASan on per-commit basis. **Thread sanitizer**. -We run functional tests under TSan on per-commit basis. We still don't run integration tests under TSan on per-commit basis. +We run functional tests under TSan on per-commit basis. We still don’t run integration tests under TSan on per-commit basis. **Memory sanitizer**. -Currently we still don't use MSan. +Currently we still don’t use MSan. **Debug allocator.** Debug version of `jemalloc` is used for debug build. +## Fuzzing {#fuzzing} -## Fuzzing +We use simple fuzz test to generate random SQL queries and to check that the server doesn’t die. Fuzz testing is performed with Address sanitizer. You can find it in `00746_sql_fuzzy.pl`. This test should be run continuously (overnight and longer). -We use simple fuzz test to generate random SQL queries and to check that the server doesn't die. Fuzz testing is performed with Address sanitizer. You can find it in `00746_sql_fuzzy.pl`. This test should be run continuously (overnight and longer). +As of December 2018, we still don’t use isolated fuzz testing of library code. -As of December 2018, we still don't use isolated fuzz testing of library code. - - -## Security Audit +## Security Audit {#security-audit} People from Yandex Cloud department do some basic overview of ClickHouse capabilities from the security standpoint. - -## Static Analyzers +## Static Analyzers {#static-analyzers} We run `PVS-Studio` on per-commit basis. We have evaluated `clang-tidy`, `Coverity`, `cppcheck`, `PVS-Studio`, `tscancode`. You will find instructions for usage in `dbms/tests/instructions/` directory. Also you can read [the article in russian](https://habr.com/company/yandex/blog/342018/). If you use `CLion` as an IDE, you can leverage some `clang-tidy` checks out of the box. +## Hardening {#hardening} -## Hardening +`FORTIFY_SOURCE` is used by default. It is almost useless, but still makes sense in rare cases and we don’t disable it. -`FORTIFY_SOURCE` is used by default. It is almost useless, but still makes sense in rare cases and we don't disable it. - - -## Code Style +## Code Style {#code-style} Code style rules are described [here](https://clickhouse.tech/docs/en/development/style/). To check for some common style violations, you can use `utils/check-style` script. 
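A minimal way to run it, assuming the script is executed from the sources root and needs no extra arguments:

``` bash
# Run the style checks over the source tree and review anything they print
# (assumption: no arguments are required for a whole-tree check).
./utils/check-style
```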
-To force proper style of your code, you can use `clang-format`. File `.clang-format` is located at the sources root. It mostly corresponding with our actual code style. But it's not recommended to apply `clang-format` to existing files because it makes formatting worse. You can use `clang-format-diff` tool that you can find in clang source repository. +To force proper style of your code, you can use `clang-format`. File `.clang-format` is located at the sources root. It mostly corresponding with our actual code style. But it’s not recommended to apply `clang-format` to existing files because it makes formatting worse. You can use `clang-format-diff` tool that you can find in clang source repository. Alternatively you can try `uncrustify` tool to reformat your code. Configuration is in `uncrustify.cfg` in the sources root. It is less tested than `clang-format`. `CLion` has its own code formatter that has to be tuned for our code style. - -## Metrica B2B Tests +## Metrica B2B Tests {#metrica-b2b-tests} Each ClickHouse release is tested with Yandex Metrica and AppMetrica engines. Testing and stable versions of ClickHouse are deployed on VMs and run with a small copy of Metrica engine that is processing fixed sample of input data. Then results of two instances of Metrica engine are compared together. These tests are automated by separate team. Due to high number of moving parts, tests are fail most of the time by completely unrelated reasons, that are very difficult to figure out. Most likely these tests have negative value for us. Nevertheless these tests was proved to be useful in about one or two times out of hundreds. +## Test Coverage {#test-coverage} -## Test Coverage +As of July 2018 we don’t track test coverage. -As of July 2018 we don't track test coverage. +## Test Automation {#test-automation} +We run tests with Yandex internal CI and job automation system named “Sandbox”. -## Test Automation +Build jobs and tests are run in Sandbox on per commit basis. Resulting packages and test results are published in GitHub and can be downloaded by direct links. Artifacts are stored eternally. When you send a pull request on GitHub, we tag it as “can be tested” and our CI system will build ClickHouse packages (release, debug, with address sanitizer, etc) for you. -We run tests with Yandex internal CI and job automation system named "Sandbox". - -Build jobs and tests are run in Sandbox on per commit basis. Resulting packages and test results are published in GitHub and can be downloaded by direct links. Artifacts are stored eternally. When you send a pull request on GitHub, we tag it as "can be tested" and our CI system will build ClickHouse packages (release, debug, with address sanitizer, etc) for you. - -We don't use Travis CI due to the limit on time and computational power. -We don't use Jenkins. It was used before and now we are happy we are not using Jenkins. +We don’t use Travis CI due to the limit on time and computational power. +We don’t use Jenkins. It was used before and now we are happy we are not using Jenkins. [Original article](https://clickhouse.tech/docs/en/development/tests/) diff --git a/docs/en/faq/general.md b/docs/en/faq/general.md index fb753026812..bd573590190 100644 --- a/docs/en/faq/general.md +++ b/docs/en/faq/general.md @@ -1,35 +1,32 @@ -# General Questions +# General Questions {#general-questions} -## Why Not Use Something Like MapReduce? +## Why Not Use Something Like MapReduce? 
{#why-not-use-something-like-mapreduce} -We can refer to systems like MapReduce as distributed computing systems in which the reduce operation is based on distributed sorting. The most common open source solution in this class is [Apache Hadoop](http://hadoop.apache.org). Yandex uses their in-house solution, YT. +We can refer to systems like MapReduce as distributed computing systems in which the reduce operation is based on distributed sorting. The most common open-source solution in this class is [Apache Hadoop](http://hadoop.apache.org). Yandex uses its in-house solution, YT. -These systems aren't appropriate for online queries due to their high latency. In other words, they can't be used as the back-end for a web interface. -These types of systems aren't useful for real-time data updates. -Distributed sorting isn't the best way to perform reduce operations if the result of the operation and all the intermediate results (if there are any) are located in the RAM of a single server, which is usually the case for online queries. In such a case, a hash table is the optimal way to perform reduce operations. A common approach to optimizing map-reduce tasks is pre-aggregation (partial reduce) using a hash table in RAM. The user performs this optimization manually. -Distributed sorting is one of the main causes of reduced performance when running simple map-reduce tasks. +These systems aren’t appropriate for online queries due to their high latency. In other words, they can’t be used as the back-end for a web interface. These types of systems aren’t useful for real-time data updates. Distributed sorting isn’t the best way to perform reduce operations if the result of the operation and all the intermediate results (if there are any) are located in the RAM of a single server, which is usually the case for online queries. In such a case, a hash table is an optimal way to perform reduce operations. A common approach to optimizing map-reduce tasks is pre-aggregation (partial reduce) using a hash table in RAM. The user performs this optimization manually. Distributed sorting is one of the main causes of reduced performance when running simple map-reduce tasks. -Most MapReduce implementations allow you to execute arbitrary code on a cluster. But a declarative query language is better suited to OLAP in order to run experiments quickly. For example, Hadoop has Hive and Pig. Also consider Cloudera Impala or Shark (outdated) for Spark, as well as Spark SQL, Presto, and Apache Drill. Performance when running such tasks is highly sub-optimal compared to specialized systems, but relatively high latency makes it unrealistic to use these systems as the backend for a web interface. +Most MapReduce implementations allow you to execute arbitrary code on a cluster. But a declarative query language is better suited to OLAP to run experiments quickly. For example, Hadoop has Hive and Pig. Also consider Cloudera Impala or Shark (outdated) for Spark, as well as Spark SQL, Presto, and Apache Drill. Performance when running such tasks is highly sub-optimal compared to specialized systems, but relatively high latency makes it unrealistic to use these systems as the backend for a web interface. ## What If I Have a Problem with Encodings When Using Oracle Through ODBC? {#oracle-odbc-encodings} -If you use Oracle through the ODBC driver as a source of external dictionaries, you need to set the correct value for the `NLS_LANG` environment variable in `/etc/default/clickhouse`. 
For more information, see the [Oracle NLS_LANG FAQ](https://www.oracle.com/technetwork/products/globalization/nls-lang-099431.html). +If you use Oracle through the ODBC driver as a source of external dictionaries, you need to set the correct value for the `NLS_LANG` environment variable in `/etc/default/clickhouse`. For more information, see the [Oracle NLS\_LANG FAQ](https://www.oracle.com/technetwork/products/globalization/nls-lang-099431.html). **Example** -```sql +``` sql NLS_LANG=RUSSIAN_RUSSIA.UTF8 ``` ## How Do I Export Data from ClickHouse to a File? {#how-to-export-to-file} -### Using INTO OUTFILE Clause +### Using INTO OUTFILE Clause {#using-into-outfile-clause} Add an [INTO OUTFILE](../query_language/select/#into-outfile-clause) clause to your query. For example: -```sql +``` sql SELECT * FROM table INTO OUTFILE 'file' ``` @@ -37,17 +34,17 @@ By default, ClickHouse uses the [TabSeparated](../interfaces/formats.md#tabsepar For example: -```sql +``` sql SELECT * FROM table INTO OUTFILE 'file' FORMAT CSV ``` -### Using a File-Engine Table +### Using a File-Engine Table {#using-a-file-engine-table} See [File](../operations/table_engines/file.md). -### Using Command-Line Redirection +### Using Command-Line Redirection {#using-command-line-redirection} -```sql +``` sql $ clickhouse-client --query "SELECT * from table" --format FormatName > result.txt ``` diff --git a/docs/en/getting_started/example_datasets/amplab_benchmark.md b/docs/en/getting_started/example_datasets/amplab_benchmark.md index 67ac38406ee..7a9473e6eb0 100644 --- a/docs/en/getting_started/example_datasets/amplab_benchmark.md +++ b/docs/en/getting_started/example_datasets/amplab_benchmark.md @@ -1,12 +1,12 @@ -# AMPLab Big Data Benchmark +# AMPLab Big Data Benchmark {#amplab-big-data-benchmark} -See +See https://amplab.cs.berkeley.edu/benchmark/ -Sign up for a free account at . You will need a credit card, email and phone number.Get a new access key at +Sign up for a free account at https://aws.amazon.com. You will need a credit card, email and phone number. Get a new access key at https://console.aws.amazon.com/iam/home?nc2=h\_m\_sc\#security\_credential Run the following in the console: -```bash +``` bash $ sudo apt-get install s3cmd $ mkdir tiny; cd tiny; $ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ . @@ -21,7 +21,7 @@ $ cd .. 
Run the following ClickHouse queries: -```sql +``` sql CREATE TABLE rankings_tiny ( pageURL String, @@ -85,7 +85,7 @@ CREATE TABLE uservisits_5nodes_on_single Go back to the console: -```bash +``` bash $ for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done $ for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done $ for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done @@ -119,5 +119,4 @@ ORDER BY totalRevenue DESC LIMIT 1 ``` - [Original article](https://clickhouse.tech/docs/en/getting_started/example_datasets/amplab_benchmark/) diff --git a/docs/en/getting_started/example_datasets/criteo.md b/docs/en/getting_started/example_datasets/criteo.md index d9a7f5b56c1..a1bca49597b 100644 --- a/docs/en/getting_started/example_datasets/criteo.md +++ b/docs/en/getting_started/example_datasets/criteo.md @@ -1,16 +1,16 @@ -# Terabyte of Click Logs from Criteo +# Terabyte of Click Logs from Criteo {#terabyte-of-click-logs-from-criteo} -Download the data from +Download the data from http://labs.criteo.com/downloads/download-terabyte-click-logs/ Create a table to import the log to: -```sql +``` sql CREATE TABLE criteo_log (date Date, clicked UInt8, int1 Int32, int2 Int32, int3 Int32, int4 Int32, int5 Int32, int6 Int32, int7 Int32, int8 Int32, int9 Int32, int10 Int32, int11 Int32, int12 Int32, int13 Int32, cat1 String, cat2 String, cat3 String, cat4 String, cat5 String, cat6 String, cat7 String, cat8 String, cat9 String, cat10 String, cat11 String, cat12 String, cat13 String, cat14 String, cat15 String, cat16 String, cat17 String, cat18 String, cat19 String, cat20 String, cat21 String, cat22 String, cat23 String, cat24 String, cat25 String, cat26 String) ENGINE = Log ``` Download the data: -```bash +``` bash $ for i in {00..23}; do echo $i; zcat datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done ``` @@ -65,11 +65,10 @@ CREATE TABLE criteo Transform data from the raw log and put it in the second table: -```sql +``` sql INSERT INTO criteo SELECT date, clicked, int1, int2, int3, int4, int5, int6, int7, int8, int9, int10, int11, int12, int13, reinterpretAsUInt32(unhex(cat1)) AS icat1, reinterpretAsUInt32(unhex(cat2)) AS icat2, reinterpretAsUInt32(unhex(cat3)) AS icat3, reinterpretAsUInt32(unhex(cat4)) AS icat4, reinterpretAsUInt32(unhex(cat5)) AS icat5, reinterpretAsUInt32(unhex(cat6)) AS icat6, reinterpretAsUInt32(unhex(cat7)) AS icat7, reinterpretAsUInt32(unhex(cat8)) AS icat8, reinterpretAsUInt32(unhex(cat9)) AS icat9, reinterpretAsUInt32(unhex(cat10)) AS icat10, reinterpretAsUInt32(unhex(cat11)) AS icat11, reinterpretAsUInt32(unhex(cat12)) AS icat12, reinterpretAsUInt32(unhex(cat13)) AS icat13, reinterpretAsUInt32(unhex(cat14)) AS icat14, reinterpretAsUInt32(unhex(cat15)) AS icat15, reinterpretAsUInt32(unhex(cat16)) AS icat16, reinterpretAsUInt32(unhex(cat17)) AS icat17, reinterpretAsUInt32(unhex(cat18)) AS icat18, reinterpretAsUInt32(unhex(cat19)) AS icat19, reinterpretAsUInt32(unhex(cat20)) AS icat20, reinterpretAsUInt32(unhex(cat21)) AS icat21, reinterpretAsUInt32(unhex(cat22)) AS icat22, reinterpretAsUInt32(unhex(cat23)) 
AS icat23, reinterpretAsUInt32(unhex(cat24)) AS icat24, reinterpretAsUInt32(unhex(cat25)) AS icat25, reinterpretAsUInt32(unhex(cat26)) AS icat26 FROM criteo_log; DROP TABLE criteo_log; ``` - [Original article](https://clickhouse.tech/docs/en/getting_started/example_datasets/criteo/) diff --git a/docs/en/getting_started/example_datasets/metrica.md b/docs/en/getting_started/example_datasets/metrica.md index a4d2687c8e3..fb6d3ae0a69 100644 --- a/docs/en/getting_started/example_datasets/metrica.md +++ b/docs/en/getting_started/example_datasets/metrica.md @@ -1,9 +1,10 @@ -# Anonymized Yandex.Metrica Data +# Anonymized Yandex.Metrica Data {#anonymized-yandex-metrica-data} + Dataset consists of two tables containing anonymized data about hits (`hits_v1`) and visits (`visits_v1`) of Yandex.Metrica. You can read more about Yandex.Metrica in [ClickHouse history](../../introduction/history.md) section. -The dataset consists of two tables, either of them can be downloaded as a compressed `tsv.xz` file or as prepared partitions. In addition to that, an extended version of the `hits` table containing 100 million rows is available as TSV at and as prepared partitions at . +The dataset consists of two tables, either of them can be downloaded as a compressed `tsv.xz` file or as prepared partitions. In addition to that, an extended version of the `hits` table containing 100 million rows is available as TSV at https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits\_100m\_obfuscated\_v1.tsv.xz and as prepared partitions at https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits\_100m\_obfuscated\_v1.tar.xz. -## Obtaining Tables from Prepared Partitions +## Obtaining Tables from Prepared Partitions {#obtaining-tables-from-prepared-partitions} Download and import hits table: @@ -25,12 +26,12 @@ sudo service clickhouse-server restart clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" ``` -## Obtaining Tables from Compressed TSV File +## Obtaining Tables from Compressed TSV File {#obtaining-tables-from-compressed-tsv-file} Download and import hits from compressed TSV file: ``` bash -curl https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv +curl https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv # now create table clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" clickhouse-client --query "CREATE TABLE datasets.hits_v1 ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 
UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192" @@ -44,7 +45,7 @@ clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1" Download and import visits from compressed tsv-file: ``` bash -curl https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv +curl https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv # now create table clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" clickhouse-client --query "CREATE TABLE datasets.visits_v1 ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, 
ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), Goals Nested(ID UInt32, Serial UInt32, EventTime DateTime, Price Int64, OrderID String, CurrencyID UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, TraficSource Nested(ID Int8, SearchEngineID UInt16, AdvEngineID UInt8, PlaceID UInt16, SocialSourceNetworkID UInt8, Domain String, SearchPhrase String, SocialSourcePage String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), Market Nested(Type UInt8, GoalID UInt32, OrderID String, OrderPrice Int64, PP UInt32, DirectPlaceID UInt32, DirectOrderID UInt32, DirectBannerID UInt32, GoodID String, GoodName String, GoodQuantity Int32, GoodPrice Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign)" @@ -55,7 +56,7 @@ clickhouse-client --query "OPTIMIZE TABLE datasets.visits_v1 FINAL" clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" ``` -## Example Queries +## Example Queries {#example-queries} [ClickHouse tutorial](../../getting_started/tutorial.md) is based on Yandex.Metrica dataset and the recommended way to get started with this dataset is to just go through tutorial. 
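Beyond the tutorial, here is an illustrative sanity-check query (not part of the tutorial) that uses only columns present in the `hits_v1` definition above:

``` bash
clickhouse-client --query "
    SELECT SearchPhrase, count() AS c
    FROM datasets.hits_v1
    WHERE SearchPhrase != ''
    GROUP BY SearchPhrase
    ORDER BY c DESC
    LIMIT 10"
```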
diff --git a/docs/en/getting_started/example_datasets/nyc_taxi.md b/docs/en/getting_started/example_datasets/nyc_taxi.md index ab91089b0dd..84a24717024 100644 --- a/docs/en/getting_started/example_datasets/nyc_taxi.md +++ b/docs/en/getting_started/example_datasets/nyc_taxi.md @@ -1,20 +1,20 @@ -# New York Taxi Data +# New York Taxi Data {#new-york-taxi-data} This dataset can be obtained in two ways: -- import from raw data -- download of prepared partitions +- import from raw data +- download of prepared partitions -## How to Import The Raw Data +## How to Import The Raw Data {#how-to-import-the-raw-data} -See and for the description of a dataset and instructions for downloading. +See https://github.com/toddwschneider/nyc-taxi-data and http://tech.marksblogg.com/billion-nyc-taxi-rides-redshift.html for the description of a dataset and instructions for downloading. Downloading will result in about 227 GB of uncompressed data in CSV files. The download takes about an hour over a 1 Gbit connection (parallel downloading from s3.amazonaws.com recovers at least half of a 1 Gbit channel). Some of the files might not download fully. Check the file sizes and re-download any that seem doubtful. Some of the files might contain invalid rows. You can fix them as follows: -```bash +``` bash sed -E '/(.*,){18,}/d' data/yellow_tripdata_2010-02.csv > data/yellow_tripdata_2010-02.csv_ sed -E '/(.*,){18,}/d' data/yellow_tripdata_2010-03.csv > data/yellow_tripdata_2010-03.csv_ mv data/yellow_tripdata_2010-02.csv_ data/yellow_tripdata_2010-02.csv @@ -25,11 +25,11 @@ Then the data must be pre-processed in PostgreSQL. This will create selections o Be careful when running `initialize_database.sh` and manually re-check that all the tables were created correctly. -It takes about 20-30 minutes to process each month's worth of data in PostgreSQL, for a total of about 48 hours. +It takes about 20-30 minutes to process each month’s worth of data in PostgreSQL, for a total of about 48 hours. You can check the number of downloaded rows as follows: -```bash +``` bash $ time psql nyc-taxi-data -c "SELECT count(*) FROM trips;" ## Count 1298979494 @@ -44,7 +44,7 @@ The data in PostgreSQL uses 370 GB of space. Exporting the data from PostgreSQL: -```sql +``` sql COPY ( SELECT trips.id, @@ -119,7 +119,7 @@ This takes about 5 hours. The resulting TSV file is 590612904969 bytes. Create a temporary table in ClickHouse: -```sql +``` sql CREATE TABLE trips ( trip_id UInt32, @@ -178,7 +178,7 @@ dropoff_puma Nullable(String) It is needed for converting fields to more correct data types and, if possible, to eliminate NULLs. -```bash +``` bash $ time clickhouse-client --query="INSERT INTO trips FORMAT TabSeparated" < trips.tsv real 75m56.214s @@ -188,15 +188,15 @@ Data is read at a speed of 112-140 Mb/second. Loading data into a Log type table in one stream took 76 minutes. The data in this table uses 142 GB. -(Importing data directly from Postgres is also possible using ` COPY ... TO PROGRAM`.) +(Importing data directly from Postgres is also possible using `COPY ... TO PROGRAM`.) -Unfortunately, all the fields associated with the weather (precipitation...average_wind_speed) were filled with NULL. Because of this, we will remove them from the final data set. +Unfortunately, all the fields associated with the weather (precipitation…average\_wind\_speed) were filled with NULL. Because of this, we will remove them from the final data set. -To start, we'll create a table on a single server. Later we will make the table distributed. 
+To start, we’ll create a table on a single server. Later we will make the table distributed. Create and populate a summary table: -```sql +``` sql CREATE TABLE trips_mergetree ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) AS SELECT @@ -263,20 +263,21 @@ To load it faster, you can create the table with the `Log` engine instead of `Me The table uses 126 GB of disk space. -```sql +``` sql SELECT formatReadableSize(sum(bytes)) FROM system.parts WHERE table = 'trips_mergetree' AND active ``` -```text + +``` text ┌─formatReadableSize(sum(bytes))─┐ │ 126.18 GiB │ └────────────────────────────────┘ ``` -Among other things, you can run the OPTIMIZE query on MergeTree. But it's not required, since everything will be fine without it. +Among other things, you can run the OPTIMIZE query on MergeTree. But it’s not required since everything will be fine without it. -## Download of Prepared Partitions +## Download of Prepared Partitions {#download-of-prepared-partitions} -```bash +``` bash $ curl -O https://clickhouse-datasets.s3.yandex.net/trips_mergetree/partitions/trips_mergetree.tar $ tar xvf trips_mergetree.tar -C /var/lib/clickhouse # path to ClickHouse data directory $ # check permissions of unpacked data, fix if required @@ -284,16 +285,14 @@ $ sudo service clickhouse-server restart $ clickhouse-client --query "select count(*) from datasets.trips_mergetree" ``` -!!!info - If you will run queries described below, you have to use full table name, - `datasets.trips_mergetree`. +!!! info "Info" + If you will run the queries described below, you have to use the full table name, `datasets.trips_mergetree`. - -## Results on Single Server +## Results on Single Server {#results-on-single-server} Q1: -```sql +``` sql SELECT cab_type, count(*) FROM trips_mergetree GROUP BY cab_type ``` @@ -301,7 +300,7 @@ SELECT cab_type, count(*) FROM trips_mergetree GROUP BY cab_type Q2: -```sql +``` sql SELECT passenger_count, avg(total_amount) FROM trips_mergetree GROUP BY passenger_count ``` @@ -309,7 +308,7 @@ SELECT passenger_count, avg(total_amount) FROM trips_mergetree GROUP BY passenge Q3: -```sql +``` sql SELECT passenger_count, toYear(pickup_date) AS year, count(*) FROM trips_mergetree GROUP BY passenger_count, year ``` @@ -317,7 +316,7 @@ SELECT passenger_count, toYear(pickup_date) AS year, count(*) FROM trips_mergetr Q4: -```sql +``` sql SELECT passenger_count, toYear(pickup_date) AS year, round(trip_distance) AS distance, count(*) FROM trips_mergetree GROUP BY passenger_count, year, distance @@ -330,25 +329,25 @@ The following server was used: Two Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz, 16 physical kernels total,128 GiB RAM,8x6 TB HD on hardware RAID-5 -Execution time is the best of three runsBut starting from the second run, queries read data from the file system cache. No further caching occurs: the data is read out and processed in each run. +Execution time is the best of three runs. But starting from the second run, queries read data from the file system cache. No further caching occurs: the data is read out and processed in each run. 
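For instance, Q1 from above can be timed in this way (a sketch: `--time` asks `clickhouse-client` to report the elapsed time, and the best of the three runs is taken manually):

``` bash
# Repeat Q1 three times and note the fastest run, following the methodology above.
for i in 1 2 3; do
    clickhouse-client --time --query "SELECT cab_type, count(*) FROM trips_mergetree GROUP BY cab_type"
done
```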
Creating a table on three servers: On each server: -```sql +``` sql CREATE TABLE default.trips_mergetree_third ( trip_id UInt32, vendor_id Enum8('1' = 1, '2' = 2, 'CMT' = 3, 'VTS' = 4, 'DDS' = 5, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14), pickup_date Date, pickup_datetime DateTime, dropoff_date Date, dropoff_datetime DateTime, store_and_fwd_flag UInt8, rate_code_id UInt8, pickup_longitude Float64, pickup_latitude Float64, dropoff_longitude Float64, dropoff_latitude Float64, passenger_count UInt8, trip_distance Float64, fare_amount Float32, extra Float32, mta_tax Float32, tip_amount Float32, tolls_amount Float32, ehail_fee Float32, improvement_surcharge Float32, total_amount Float32, payment_type_ Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4), trip_type UInt8, pickup FixedString(25), dropoff FixedString(25), cab_type Enum8('yellow' = 1, 'green' = 2, 'uber' = 3), pickup_nyct2010_gid UInt8, pickup_ctlabel Float32, pickup_borocode UInt8, pickup_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), pickup_ct2010 FixedString(6), pickup_boroct2010 FixedString(7), pickup_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), pickup_ntacode FixedString(4), pickup_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), pickup_puma UInt16, dropoff_nyct2010_gid UInt8, dropoff_ctlabel Float32, dropoff_borocode UInt8, dropoff_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), dropoff_ct2010 FixedString(6), dropoff_boroct2010 FixedString(7), dropoff_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), dropoff_ntacode FixedString(4), dropoff_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), dropoff_puma UInt16) ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) ``` On the source server: -```sql +``` sql CREATE TABLE trips_mergetree_x3 AS trips_mergetree_third ENGINE = Distributed(perftest, default, trips_mergetree_third, rand()) ``` The following query redistributes data: -```sql +``` sql INSERT INTO trips_mergetree_x3 SELECT * FROM trips_mergetree ``` @@ -363,7 +362,7 @@ Q4: 1.241 seconds. No surprises here, since the queries are scaled linearly. -We also have results from a cluster of 140 servers: +We also have the results from a cluster of 140 servers: Q1: 0.028 sec. Q2: 0.043 sec. @@ -373,13 +372,12 @@ Q4: 0.072 sec. In this case, the query processing time is determined above all by network latency. We ran queries using a client located in a Yandex datacenter in Finland on a cluster in Russia, which added about 20 ms of latency. -## Summary - -| servers | Q1 | Q2 | Q3 | Q4 | -| ------- | ----- | ----- | ----- | ----- | -| 1 | 0.490 | 1.224 | 2.104 | 3.593 | -| 3 | 0.212 | 0.438 | 0.733 | 1.241 | -| 140 | 0.028 | 0.043 | 0.051 | 0.072 | +## Summary {#summary} +| servers | Q1 | Q2 | Q3 | Q4 | +|---------|-------|-------|-------|-------| +| 1 | 0.490 | 1.224 | 2.104 | 3.593 | +| 3 | 0.212 | 0.438 | 0.733 | 1.241 | +| 140 | 0.028 | 0.043 | 0.051 | 0.072 | [Original article](https://clickhouse.tech/docs/en/getting_started/example_datasets/nyc_taxi/) diff --git a/docs/en/getting_started/example_datasets/ontime.md b/docs/en/getting_started/example_datasets/ontime.md index e29305bcef8..ebad5d98241 100644 --- a/docs/en/getting_started/example_datasets/ontime.md +++ b/docs/en/getting_started/example_datasets/ontime.md @@ -1,16 +1,15 @@ - -# OnTime +# OnTime {#ontime} This dataset can be obtained in two ways: -- import from raw data -- download of prepared partitions +- import from raw data +- download of prepared partitions -## Import From Raw Data +## Import From Raw Data {#import-from-raw-data} Downloading data: -```bash +``` bash for s in `seq 1987 2018` do for m in `seq 1 12` @@ -20,11 +19,11 @@ done done ``` -(from ) +(from https://github.com/Percona-Lab/ontime-airline-performance/blob/master/download.sh ) Creating a table: -```sql +``` sql CREATE TABLE `ontime` ( `Year` UInt16, `Quarter` UInt8, @@ -135,21 +134,21 @@ CREATE TABLE `ontime` ( `Div5LongestGTime` String, `Div5WheelsOff` String, `Div5TailNum` String -) ENGINE = MergeTree -PARTITION BY Year -ORDER BY (Carrier, FlightDate) +) ENGINE = MergeTree +PARTITION BY Year +ORDER BY (Carrier, FlightDate) SETTINGS index_granularity = 8192; ``` Loading data: -```bash +``` bash $ for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done ``` -## Download of Prepared Partitions +## Download of Prepared Partitions {#download-of-prepared-partitions} -```bash +``` bash $ curl -O https://clickhouse-datasets.s3.yandex.net/ontime/partitions/ontime.tar $ tar xvf ontime.tar -C /var/lib/clickhouse # path to ClickHouse data directory $ # check permissions of unpacked data, fix if 
required @@ -157,15 +156,14 @@ $ sudo service clickhouse-server restart $ clickhouse-client --query "select count(*) from datasets.ontime" ``` -!!!info - If you will run queries described below, you have to use full table name, - `datasets.ontime`. +!!! info "Info" + If you will run the queries described below, you have to use the full table name, `datasets.ontime`. -## Queries +## Queries {#queries} Q0. -```sql +``` sql SELECT avg(c1) FROM ( @@ -177,7 +175,7 @@ FROM Q1. The number of flights per day from the year 2000 to 2008 -```sql +``` sql SELECT DayOfWeek, count(*) AS c FROM ontime WHERE Year>=2000 AND Year<=2008 @@ -187,7 +185,7 @@ ORDER BY c DESC; Q2. The number of flights delayed by more than 10 minutes, grouped by the day of the week, for 2000-2008 -```sql +``` sql SELECT DayOfWeek, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year>=2000 AND Year<=2008 @@ -195,9 +193,9 @@ GROUP BY DayOfWeek ORDER BY c DESC; ``` -Q3. The number of delays by airport for 2000-2008 +Q3. The number of delays by the airport for 2000-2008 -```sql +``` sql SELECT Origin, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year>=2000 AND Year<=2008 @@ -208,7 +206,7 @@ LIMIT 10; Q4. The number of delays by carrier for 2007 -```sql +``` sql SELECT Carrier, count(*) FROM ontime WHERE DepDelay>10 AND Year=2007 @@ -218,7 +216,7 @@ ORDER BY count(*) DESC; Q5. The percentage of delays by carrier for 2007 -```sql +``` sql SELECT Carrier, c, c2, c*100/c2 as c3 FROM ( @@ -244,7 +242,7 @@ ORDER BY c3 DESC; Better version of the same query: -```sql +``` sql SELECT Carrier, avg(DepDelay>10)*100 AS c3 FROM ontime WHERE Year=2007 @@ -254,7 +252,7 @@ ORDER BY c3 DESC Q6. The previous request for a broader range of years, 2000-2008 -```sql +``` sql SELECT Carrier, c, c2, c*100/c2 as c3 FROM ( @@ -280,7 +278,7 @@ ORDER BY c3 DESC; Better version of the same query: -```sql +``` sql SELECT Carrier, avg(DepDelay>10)*100 AS c3 FROM ontime WHERE Year>=2000 AND Year<=2008 @@ -290,7 +288,7 @@ ORDER BY c3 DESC; Q7. Percentage of flights delayed for more than 10 minutes, by year -```sql +``` sql SELECT Year, c1/c2 FROM ( @@ -314,7 +312,7 @@ ORDER BY Year; Better version of the same query: -```sql +``` sql SELECT Year, avg(DepDelay>10)*100 FROM ontime GROUP BY Year @@ -323,8 +321,8 @@ ORDER BY Year; Q8. The most popular destinations by the number of directly connected cities for various year ranges -```sql -SELECT DestCityName, uniqExact(OriginCityName) AS u +``` sql +SELECT DestCityName, uniqExact(OriginCityName) AS u FROM ontime WHERE Year >= 2000 and Year <= 2010 GROUP BY DestCityName @@ -333,7 +331,7 @@ ORDER BY u DESC LIMIT 10; Q9. -```sql +``` sql SELECT Year, count(*) AS c1 FROM ontime GROUP BY Year; @@ -341,7 +339,7 @@ GROUP BY Year; Q10. -```sql +``` sql SELECT min(Year), max(Year), Carrier, count(*) AS cnt, sum(ArrDelayMinutes>30) AS flights_delayed, @@ -359,7 +357,7 @@ LIMIT 1000; Bonus: -```sql +``` sql SELECT avg(cnt) FROM ( @@ -397,12 +395,11 @@ LIMIT 10; This performance test was created by Vadim Tkachenko. 
See: -- -- -- -- -- -- - +- https://www.percona.com/blog/2009/10/02/analyzing-air-traffic-performance-with-infobright-and-monetdb/ +- https://www.percona.com/blog/2009/10/26/air-traffic-queries-in-luciddb/ +- https://www.percona.com/blog/2009/11/02/air-traffic-queries-in-infinidb-early-alpha/ +- https://www.percona.com/blog/2014/04/21/using-apache-hadoop-and-impala-together-with-mysql-for-data-analysis/ +- https://www.percona.com/blog/2016/01/07/apache-spark-with-air-ontime-performance-data/ +- http://nickmakos.blogspot.ru/2012/08/analyzing-air-traffic-performance-with.html [Original article](https://clickhouse.tech/docs/en/getting_started/example_datasets/ontime/) diff --git a/docs/en/getting_started/example_datasets/star_schema.md b/docs/en/getting_started/example_datasets/star_schema.md index 93760bf3c90..6763ec63b61 100644 --- a/docs/en/getting_started/example_datasets/star_schema.md +++ b/docs/en/getting_started/example_datasets/star_schema.md @@ -1,8 +1,8 @@ -# Star Schema Benchmark +# Star Schema Benchmark {#star-schema-benchmark} Compiling dbgen: -```bash +``` bash $ git clone git@github.com:vadimtk/ssb-dbgen.git $ cd ssb-dbgen $ make @@ -11,10 +11,9 @@ $ make Generating data: !!! warning "Attention" - -s 100 -- dbgen generates 600 million rows (67 GB) - -s 1000 -- dbgen generates 6 billion rows (takes a lot of time) + With `-s 100` dbgen generates 600 million rows (67 GB), while while `-s 1000` it generates 6 billion rows (which takes a lot of time) -```bash +``` bash $ ./dbgen -s 1000 -T c $ ./dbgen -s 1000 -T l $ ./dbgen -s 1000 -T p @@ -24,7 +23,7 @@ $ ./dbgen -s 1000 -T d Creating tables in ClickHouse: -```sql +``` sql CREATE TABLE customer ( C_CUSTKEY UInt32, @@ -89,16 +88,16 @@ ENGINE = MergeTree ORDER BY S_SUPPKEY; Inserting data: -```bash +``` bash $ clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl $ clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl $ clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl $ clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl ``` -Converting "star schema" to denormalized "flat schema": +Converting “star schema” to denormalized “flat schema”: -```sql +``` sql SET max_memory_usage = 20000000000; CREATE TABLE lineorder_flat @@ -148,32 +147,38 @@ FROM lineorder AS l INNER JOIN customer AS c ON c.C_CUSTKEY = l.LO_CUSTKEY INNER JOIN supplier AS s ON s.S_SUPPKEY = l.LO_SUPPKEY INNER JOIN part AS p ON p.P_PARTKEY = l.LO_PARTKEY; - ``` Running the queries: Q1.1 -```sql + +``` sql SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toYear(LO_ORDERDATE) = 1993 AND LO_DISCOUNT BETWEEN 1 AND 3 AND LO_QUANTITY < 25; ``` + Q1.2 -```sql + +``` sql SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toYYYYMM(LO_ORDERDATE) = 199401 AND LO_DISCOUNT BETWEEN 4 AND 6 AND LO_QUANTITY BETWEEN 26 AND 35; ``` + Q1.3 -```sql + +``` sql SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat -WHERE toISOWeek(LO_ORDERDATE) = 6 AND toYear(LO_ORDERDATE) = 1994 +WHERE toISOWeek(LO_ORDERDATE) = 6 AND toYear(LO_ORDERDATE) = 1994 AND LO_DISCOUNT BETWEEN 5 AND 7 AND LO_QUANTITY BETWEEN 26 AND 35; ``` + Q2.1 -```sql + +``` sql SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, @@ -187,8 +192,10 @@ ORDER BY year, P_BRAND; ``` + Q2.2 -```sql + +``` sql SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, @@ -202,8 +209,10 @@ ORDER BY year, P_BRAND; ``` + Q2.3 -```sql + +``` sql SELECT sum(LO_REVENUE), 
toYear(LO_ORDERDATE) AS year, @@ -217,8 +226,10 @@ ORDER BY year, P_BRAND; ``` + Q3.1 -```sql + +``` sql SELECT C_NATION, S_NATION, @@ -234,8 +245,10 @@ ORDER BY year ASC, revenue DESC; ``` + Q3.2 -```sql + +``` sql SELECT C_CITY, S_CITY, @@ -251,8 +264,10 @@ ORDER BY year ASC, revenue DESC; ``` + Q3.3 -```sql + +``` sql SELECT C_CITY, S_CITY, @@ -268,8 +283,10 @@ ORDER BY year ASC, revenue DESC; ``` + Q3.4 -```sql + +``` sql SELECT C_CITY, S_CITY, @@ -285,8 +302,10 @@ ORDER BY year ASC, revenue DESC; ``` + Q4.1 -```sql + +``` sql SELECT toYear(LO_ORDERDATE) AS year, C_NATION, @@ -300,8 +319,10 @@ ORDER BY year ASC, C_NATION ASC; ``` + Q4.2 -```sql + +``` sql SELECT toYear(LO_ORDERDATE) AS year, S_NATION, @@ -318,8 +339,10 @@ ORDER BY S_NATION ASC, P_CATEGORY ASC; ``` + Q4.3 -```sql + +``` sql SELECT toYear(LO_ORDERDATE) AS year, S_CITY, diff --git a/docs/en/getting_started/example_datasets/wikistat.md b/docs/en/getting_started/example_datasets/wikistat.md index b6827375c57..c1448284b82 100644 --- a/docs/en/getting_started/example_datasets/wikistat.md +++ b/docs/en/getting_started/example_datasets/wikistat.md @@ -1,10 +1,10 @@ -# WikiStat +# WikiStat {#wikistat} -See: +See: http://dumps.wikimedia.org/other/pagecounts-raw/ Creating a table: -```sql +``` sql CREATE TABLE wikistat ( date Date, @@ -19,11 +19,10 @@ CREATE TABLE wikistat Loading data: -```bash +``` bash $ for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt $ cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done $ ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done ``` - [Original article](https://clickhouse.tech/docs/en/getting_started/example_datasets/wikistat/) diff --git a/docs/en/getting_started/index.md b/docs/en/getting_started/index.md index 7ffdceebdea..f9e6c4bd5d3 100644 --- a/docs/en/getting_started/index.md +++ b/docs/en/getting_started/index.md @@ -1,8 +1,8 @@ -# Getting Started +# Getting Started {#getting-started} -If you are new to ClickHouse and want to get a hands-on feeling of it's performance, first of all you need to go through the [installation process](install.md). After that you can: +If you are new to ClickHouse and want to get a hands-on feeling of its performance, first of all, you need to go through the [installation process](install.md). 
After that you can: -* [Go through detailed tutorial](tutorial.md) -* [Experiment with example datasets](example_datasets/ontime.md) +- [Go through detailed tutorial](tutorial.md) +- [Experiment with example datasets](example_datasets/ontime.md) [Original article](https://clickhouse.tech/docs/en/getting_started/) diff --git a/docs/en/getting_started/install.md b/docs/en/getting_started/install.md index 9bcff1cbeab..2636e9f8bb1 100644 --- a/docs/en/getting_started/install.md +++ b/docs/en/getting_started/install.md @@ -1,10 +1,10 @@ -# Installation +# Installation {#installation} -## System Requirements +## System Requirements {#system-requirements} ClickHouse can run on any Linux, FreeBSD or Mac OS X with x86\_64, AArch64 or PowerPC64LE CPU architecture. -Official pre-built binaries are typically compiled for x86\_64 and leverage SSE 4.2 instruction set, so unless otherwise stated usage of CPU that supports it becomes an additional system requirement. Here's the command to check if current CPU has support for SSE 4.2: +Official pre-built binaries are typically compiled for x86\_64 and leverage SSE 4.2 instruction set, so unless otherwise stated usage of CPU that supports it becomes an additional system requirement. Here’s the command to check if current CPU has support for SSE 4.2: ``` bash $ grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported" @@ -12,7 +12,7 @@ $ grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not To run ClickHouse on processors that do not support SSE 4.2 or have AArch64 or PowerPC64LE architecture, you should [build ClickHouse from sources](#from-sources) with proper configuration adjustments. -## Available Installation Options +## Available Installation Options {#available-installation-options} ### From DEB Packages {#install-from-deb-packages} @@ -20,37 +20,35 @@ It is recommended to use official pre-compiled `deb` packages for Debian or Ubun To install official packages add the Yandex repository in `/etc/apt/sources.list` or in a separate `/etc/apt/sources.list.d/clickhouse.list` file: -``` -deb http://repo.clickhouse.tech/deb/stable/ main/ -``` + deb http://repo.clickhouse.tech/deb/stable/ main/ If you want to use the most recent version, replace `stable` with `testing` (this is recommended for your testing environments). Then run these commands to actually install packages: -```bash +``` bash sudo apt-get install dirmngr # optional sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 # optional sudo apt-get update sudo apt-get install clickhouse-client clickhouse-server ``` -You can also download and install packages manually from here: . +You can also download and install packages manually from here: https://repo.yandex.ru/clickhouse/deb/stable/main/. -#### Packages +#### Packages {#packages} -- `clickhouse-common-static` — Installs ClickHouse compiled binary files. -- `clickhouse-server` — Creates symbolic link for `clickhouse-server`. Installs server configuration. -- `clickhouse-client` — Creates symbolic link for `clickhouse-client` and other client-related tools. Installs client configurations. -- `clickhouse-common-static-dbg` — Installs ClickHouse compiled binary files with debug info. +- `clickhouse-common-static` — Installs ClickHouse compiled binary files. +- `clickhouse-server` — Creates a symbolic link for `clickhouse-server`. Installs server configuration. +- `clickhouse-client` — Creates a symbolic link for `clickhouse-client` and other client-related tools. 
Installs client configurations. +- `clickhouse-common-static-dbg` — Installs ClickHouse compiled binary files with debug info. -### From RPM Packages +### From RPM Packages {#from-rpm-packages} It is recommended to use official pre-compiled `rpm` packages for CentOS, RedHat and all other rpm-based Linux distributions. -First you need to add the official repository: +First, you need to add the official repository: -```bash +``` bash sudo yum install yum-utils sudo rpm --import https://repo.clickhouse.tech/CLICKHOUSE-KEY.GPG sudo yum-config-manager --add-repo https://repo.clickhouse.tech/rpm/stable/x86_64 @@ -58,21 +56,22 @@ sudo yum-config-manager --add-repo https://repo.clickhouse.tech/rpm/stable/x86_6 If you want to use the most recent version, replace `stable` with `testing` (this is recommended for your testing environments). The `prestable` tag is sometimes available too. -Then run these commands to actually install packages: +Then run these commands to install packages: -```bash +``` bash sudo yum install clickhouse-server clickhouse-client ``` -You can also download and install packages manually from here: . +You can also download and install packages manually from here: https://repo.clickhouse.tech/rpm/stable/x86\_64. ### From tgz archives {#from-tgz-archives} It is recommended to use official pre-compiled `tgz` archives for all Linux distributions, where installation of `deb` or `rpm` packages is not possible. -Required version can be dowloaded with `curl` or `wget` from repository . +The required version can be downloaded with `curl` or `wget` from repository https://repo.yandex.ru/clickhouse/tgz/. After that downloaded archives should be unpacked and installed with installation scripts. Example for the latest version: -```bash + +``` bash export LATEST_VERSION=`curl https://api.github.com/repos/ClickHouse/ClickHouse/tags 2>/dev/null | grep -Eo '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | head -n 1` curl -O https://repo.clickhouse.tech/tgz/clickhouse-common-static-$LATEST_VERSION.tgz curl -O https://repo.clickhouse.tech/tgz/clickhouse-common-static-dbg-$LATEST_VERSION.tgz @@ -93,32 +92,29 @@ tar -xzvf clickhouse-client-$LATEST_VERSION.tgz sudo clickhouse-client-$LATEST_VERSION/install/doinst.sh ``` -For production environments it's recommended to use latest `stable`-version. You can find it's number on github page https://github.com/ClickHouse/ClickHouse/tags with postfix `-stable`. +For production environments, it’s recommended to use the latest `stable`-version. You can find its number on GitHub page https://github.com/ClickHouse/ClickHouse/tags with postfix `-stable`. -### From Docker Image +### From Docker Image {#from-docker-image} To run ClickHouse inside Docker follow the guide on [Docker Hub](https://hub.docker.com/r/yandex/clickhouse-server/). Those images use official `deb` packages inside. -### From Sources +### From Sources {#from-sources} To manually compile ClickHouse, follow the instructions for [Linux](../development/build.md) or [Mac OS X](../development/build_osx.md). You can compile packages and install them or use programs without installing packages. Also by building manually you can disable SSE 4.2 requirement or build for AArch64 CPUs. -``` -Client: dbms/programs/clickhouse-client -Server: dbms/programs/clickhouse-server -``` + Client: dbms/programs/clickhouse-client + Server: dbms/programs/clickhouse-server -You'll need to create a data and metadata folders and `chown` them for the desired user. 
Their paths can be changed in server config (src/dbms/programs/server/config.xml), by default they are: -``` -/opt/clickhouse/data/default/ -/opt/clickhouse/metadata/default/ -``` +You’ll need to create a data and metadata folders and `chown` them for the desired user. Their paths can be changed in server config (src/dbms/programs/server/config.xml), by default they are: -On Gentoo you can just use `emerge clickhouse` to install ClickHouse from sources. + /opt/clickhouse/data/default/ + /opt/clickhouse/metadata/default/ -## Launch +On Gentoo, you can just use `emerge clickhouse` to install ClickHouse from sources. + +## Launch {#launch} To start the server as a daemon, run: @@ -126,16 +122,15 @@ To start the server as a daemon, run: $ sudo service clickhouse-server start ``` -If you don't have `service` command, run as +If you don’t have `service` command, run as ``` bash $ sudo /etc/init.d/clickhouse-server start ``` - See the logs in the `/var/log/clickhouse-server/` directory. -If the server doesn't start, check the configurations in the file `/etc/clickhouse-server/config.xml`. +If the server doesn’t start, check the configurations in the file `/etc/clickhouse-server/config.xml`. You can also manually launch the server from the console: @@ -144,11 +139,11 @@ $ clickhouse-server --config-file=/etc/clickhouse-server/config.xml ``` In this case, the log will be printed to the console, which is convenient during development. -If the configuration file is in the current directory, you don't need to specify the `--config-file` parameter. By default, it uses `./config.xml`. +If the configuration file is in the current directory, you don’t need to specify the `--config-file` parameter. By default, it uses `./config.xml`. ClickHouse supports access restriction settings. They are located in the `users.xml` file (next to `config.xml`). By default, access is allowed from anywhere for the `default` user, without a password. See `user/default/networks`. -For more information, see the section ["Configuration Files"](../operations/configuration_files.md). +For more information, see the section [“Configuration Files”](../operations/configuration_files.md). After launching server, you can use the command-line client to connect to it: @@ -156,12 +151,13 @@ After launching server, you can use the command-line client to connect to it: $ clickhouse-client ``` -By default it connects to `localhost:9000` on behalf of the user `default` without a password. It can also be used to connect to a remote server using `--host` argument. +By default, it connects to `localhost:9000` on behalf of the user `default` without a password. It can also be used to connect to a remote server using `--host` argument. The terminal must use UTF-8 encoding. -For more information, see the section ["Command-line client"](../interfaces/cli.md). +For more information, see the section [“Command-line client”](../interfaces/cli.md). Example: + ``` bash $ ./clickhouse-client ClickHouse client version 0.0.18749. @@ -183,7 +179,6 @@ SELECT 1 **Congratulations, the system works!** -To continue experimenting, you can download one of test data sets or go through [tutorial](https://clickhouse.tech/tutorial.html). - +To continue experimenting, you can download one of the test data sets or go through [tutorial](https://clickhouse.tech/tutorial.html). 
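A quick way to try the remote-connection and batch-mode options mentioned above is a sketch like the following; the host name is a placeholder, and `version()` / `uptime()` are assumed built-in functions rather than anything shown on this page:

``` bash
# Hypothetical remote server; replace the host with your own.
# --host/--port select the server, --query runs a single statement in batch mode.
clickhouse-client --host=ch1.example.com --port=9000 --user=default \
    --query="SELECT version(), uptime()"
```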
[Original article](https://clickhouse.tech/docs/en/getting_started/install/) diff --git a/docs/en/getting_started/tutorial.md b/docs/en/getting_started/tutorial.md index 4bc6ceb5a6b..deaa6099200 100644 --- a/docs/en/getting_started/tutorial.md +++ b/docs/en/getting_started/tutorial.md @@ -1,14 +1,15 @@ -# ClickHouse Tutorial +# ClickHouse Tutorial {#clickhouse-tutorial} -## What to Expect from This Tutorial? +## What to Expect from This Tutorial? {#what-to-expect-from-this-tutorial} -By going through this tutorial you'll learn how to set up basic ClickHouse cluster, it'll be small, but fault tolerant and scalable. We will use one of example datasets to fill it with data and execute some demo queries. +By going through this tutorial you’ll learn how to set up basic ClickHouse cluster, it’ll be small, but fault-tolerant and scalable. We will use one of the example datasets to fill it with data and execute some demo queries. -## Single Node Setup +## Single Node Setup {#single-node-setup} -To postpone complexities of distributed environment, we'll start with deploying ClickHouse on a single server or virtual machine. ClickHouse is usually installed from [deb](index.md#install-from-deb-packages) or [rpm](index.md#from-rpm-packages) packages, but there are [alternatives](index.md#from-docker-image) for the operating systems that do no support them. +To postpone complexities of distributed environment, we’ll start with deploying ClickHouse on a single server or virtual machine. ClickHouse is usually installed from [deb](index.md#install-from-deb-packages) or [rpm](index.md#from-rpm-packages) packages, but there are [alternatives](index.md#from-docker-image) for the operating systems that do no support them. For example, you have chosen `deb` packages and executed: + ``` bash sudo apt-get install dirmngr sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 @@ -21,41 +22,47 @@ sudo apt-get install -y clickhouse-server clickhouse-client What do we have in the packages that got installed: -* `clickhouse-client` package contains [clickhouse-client](../interfaces/cli.md) application, interactive ClickHouse console client. -* `clickhouse-common` package contains a ClickHouse executable file. -* `clickhouse-server` package contains configuration files to run ClickHouse as a server. +- `clickhouse-client` package contains [clickhouse-client](../interfaces/cli.md) application, interactive ClickHouse console client. +- `clickhouse-common` package contains a ClickHouse executable file. +- `clickhouse-server` package contains configuration files to run ClickHouse as a server. -Server config files are located in `/etc/clickhouse-server/`. Before going further please notice the `` element in `config.xml`. Path determines the location for data storage, so it should be located on volume with large disk capacity, the default value is `/var/lib/clickhouse/`. If you want to adjust the configuration it's not really handy to directly edit `config.xml` file, considering it might get rewritten on future package updates. Recommended way to override the config elements is to create [files in config.d directory](../operations/configuration_files.md) which serve as "patches" to config.xml. +Server config files are located in `/etc/clickhouse-server/`. Before going further please notice the `` element in `config.xml`. Path determines the location for data storage, so it should be located on volume with large disk capacity, the default value is `/var/lib/clickhouse/`. 
If you want to adjust the configuration it’s not handy to directly edit `config.xml` file, considering it might get rewritten on future package updates. The recommended way to override the config elements is to create [files in config.d directory](../operations/configuration_files.md) which serve as “patches” to config.xml. -As you might have noticed, `clickhouse-server` is not launched automatically after package installation. It won't be automatically restarted after updates either. The way you start the server depends on your init system, usually it's: +As you might have noticed, `clickhouse-server` is not launched automatically after package installation. It won’t be automatically restarted after updates either. The way you start the server depends on your init system, usually, it’s: ``` bash sudo service clickhouse-server start ``` + or ``` bash sudo /etc/init.d/clickhouse-server start ``` -The default location for server logs is `/var/log/clickhouse-server/`. Server will be ready to handle client connections once `Ready for connections` message was logged. +The default location for server logs is `/var/log/clickhouse-server/`. The server will be ready to handle client connections once `Ready for connections` message was logged. Once the `clickhouse-server` is up and running, we can use `clickhouse-client` to connect to the server and run some test queries like `SELECT "Hello, world!";`. -
<details markdown="1"><summary>Quick tips for clickhouse-client</summary> +<details markdown="1">
+ +Quick tips for clickhouse-client Interactive mode: + ``` bash clickhouse-client clickhouse-client --host=... --port=... --user=... --password=... ``` Enable multiline queries: + ``` bash clickhouse-client -m clickhouse-client --multiline ``` Run queries in batch-mode: + ``` bash clickhouse-client --query='SELECT 1' echo 'SELECT 1' | clickhouse-client @@ -63,17 +70,19 @@ clickhouse-client <<< 'SELECT 1' ``` Insert data from a file in specified format: + ``` bash clickhouse-client --query='INSERT INTO table VALUES' < data.txt clickhouse-client --query='INSERT INTO table FORMAT TabSeparated' < data.tsv ``` +
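One more tip in the same spirit, not part of the original list: batch mode can also export data, since the `FORMAT` clause controls the output and the shell can redirect it to a file. The `system.numbers` table is a built-in helper table assumed here only for illustration:

``` bash
# Export query results to a TSV file; the file name is arbitrary.
clickhouse-client --query="SELECT number FROM system.numbers LIMIT 10 FORMAT TSV" > numbers.tsv
```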
-## Import Sample Dataset +## Import Sample Dataset {#import-sample-dataset} -Now it's time to fill our ClickHouse server with some sample data. In this tutorial we'll use anonymized data of Yandex.Metrica, the first service that run ClickHouse in production way before it became open-source (more on that in [history section](../introduction/history.md)). There are [multiple ways to import Yandex.Metrica dataset](example_datasets/metrica.md) and for the sake of the tutorial we'll go with the most realistic one. +Now it’s time to fill our ClickHouse server with some sample data. In this tutorial, we’ll use anonymized data of Yandex.Metrica, the first service that runs ClickHouse in production way before it became open-source (more on that in [history section](../introduction/history.md)). There are [multiple ways to import Yandex.Metrica dataset](example_datasets/metrica.md) and for the sake of the tutorial, we’ll go with the most realistic one. -### Download and Extract Table Data +### Download and Extract Table Data {#download-and-extract-table-data} ``` bash curl https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv @@ -82,9 +91,9 @@ curl https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz | unx The extracted files are about 10GB in size. -### Create Tables +### Create Tables {#create-tables} -Tables are logically grouped into "databases". There's a `default` database, but we'll create a new one named `tutorial`: +Tables are logically grouped into “databases”. There’s a `default` database, but we’ll create a new one named `tutorial`: ``` bash clickhouse-client --query "CREATE DATABASE IF NOT EXISTS tutorial" @@ -92,16 +101,16 @@ clickhouse-client --query "CREATE DATABASE IF NOT EXISTS tutorial" Syntax for creating tables is way more complicated compared to databases (see [reference](../query_language/create.md). In general `CREATE TABLE` statement has to specify three key things: -1. Name of table to create. -2. Table schema, i.e. list of columns and their [data types](../data_types/index.md). -3. [Table engine](../operations/table_engines/index.md) and it's settings, which determines all the details on how queries to this table will be physically executed. +1. Name of table to create. +2. Table schema, i.e. list of columns and their [data types](../data_types/index.md). +3. [Table engine](../operations/table_engines/index.md) and it’s settings, which determines all the details on how queries to this table will be physically executed. -Yandex.Metrica is a web analytics service and sample dataset doesn't cover it's full functionality, so there are only two tables to create: +Yandex.Metrica is a web analytics service and sample dataset doesn’t cover its full functionality, so there are only two tables to create: -* `hits` is a table with each action done by all users on all websites covered by the service. -* `visits` is a table that contains pre-built sessions instead of individual actions. +- `hits` is a table with each action done by all users on all websites covered by the service. +- `visits` is a table that contains pre-built sessions instead of individual actions. 
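Before the real (much wider) tables below, a minimal sketch of those three parts (table name, column list with types, table engine) could look like this; the table and columns are invented purely for illustration and are not part of the tutorial dataset:

``` bash
# Toy table showing the three parts of CREATE TABLE: name, schema, engine.
clickhouse-client --query "
CREATE TABLE tutorial.example_toy
(
    EventDate Date,      -- column list with data types
    UserID    UInt64,
    PageViews UInt32
)
ENGINE = MergeTree()     -- table engine and its settings
ORDER BY (EventDate, UserID)
"
# It is only an illustration, so it can be removed right away:
clickhouse-client --query "DROP TABLE tutorial.example_toy"
```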
-Let's see and execute the real create table queries for these tables: +Let’s see and execute the real create table queries for these tables: ``` sql CREATE TABLE tutorial.hits_v1 @@ -444,15 +453,15 @@ SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192 ``` -You can execute those queries using interactive mode of `clickhouse-client` (just launch it in terminal without specifying a query in advance) or try some [alternative interface](../interfaces/index.md) if you want. +You can execute those queries using the interactive mode of `clickhouse-client` (just launch it in a terminal without specifying a query in advance) or try some [alternative interface](../interfaces/index.md) if you want. As we can see, `hits_v1` uses the [basic MergeTree engine](../operations/table_engines/mergetree.md), while the `visits_v1` uses the [Collapsing](../operations/table_engines/collapsingmergetree.md) variant. -### Import Data +### Import Data {#import-data} Data import to ClickHouse is done via [INSERT INTO](../query_language/insert_into.md) query like in many other SQL databases. However data is usually provided in one of the [supported formats](../interfaces/formats.md) instead of `VALUES` clause (which is also supported). -The files we downloaded earlier are in tab-separated format, so here's how to import them via console client: +The files we downloaded earlier are in tab-separated format, so here’s how to import them via console client: ``` bash clickhouse-client --query "INSERT INTO tutorial.hits_v1 FORMAT TSV" --max_insert_block_size=100000 < hits_v1.tsv @@ -470,21 +479,23 @@ FORMAT TSV max_insert_block_size 1048576 0 "The maximum block size for insertion, if we control the creation of blocks for insertion." ``` -Optionally you can [OPTIMIZE](../query_language/misc/#misc_operations-optimize) the tables after import. Tables that are configured with MergeTree-family engine always do merges of data parts in background to optimize data storage (or at least check if it makes sense). These queries will just force table engine to do storage optimization right now instead of some time later: +Optionally you can [OPTIMIZE](../query_language/misc/#misc_operations-optimize) the tables after import. Tables that are configured with MergeTree-family engine always do merges of data parts in background to optimize data storage (or at least check if it makes sense). These queries will just force the table engine to do storage optimization right now instead of some time later: + ``` bash clickhouse-client --query "OPTIMIZE TABLE tutorial.hits_v1 FINAL" clickhouse-client --query "OPTIMIZE TABLE tutorial.visits_v1 FINAL" ``` -This is I/O and CPU intensive operation so if the table constantly receives new data it's better to leave it alone and let merges run in background. +This is I/O and CPU intensive operation so if the table constantly receives new data it’s better to leave it alone and let merges run in background. Now we can check that the tables are successfully imported: + ``` bash clickhouse-client --query "SELECT COUNT(*) FROM tutorial.hits_v1" clickhouse-client --query "SELECT COUNT(*) FROM tutorial.visits_v1" ``` -## Example Queries +## Example Queries {#example-queries} ``` sql SELECT @@ -506,18 +517,19 @@ FROM tutorial.visits_v1 WHERE (CounterID = 912887) AND (toYYYYMM(StartDate) = 201403) AND (domain(StartURL) = 'yandex.ru') ``` -## Cluster Deployment +## Cluster Deployment {#cluster-deployment} ClickHouse cluster is a homogenous cluster. Steps to set up: -1. 
Install ClickHouse server on all machines of the cluster -2. Set up cluster configs in configuration files -3. Create local tables on each instance -4. Create a [Distributed table](../operations/table_engines/distributed.md) +1. Install ClickHouse server on all machines of the cluster +2. Set up cluster configs in configuration files +3. Create local tables on each instance +4. Create a [Distributed table](../operations/table_engines/distributed.md) -[Distributed table](../operations/table_engines/distributed.md) is actually a kind of "view" to local tables of ClickHouse cluster. SELECT query from a distributed table will be executed using resources of all cluster's shards. You may specify configs for multiple clusters and create multiple distributed tables providing views to different clusters. +[Distributed table](../operations/table_engines/distributed.md) is actually a kind of “view” to local tables of ClickHouse cluster. SELECT query from a distributed table will be executed using resources of all cluster’s shards. You may specify configs for multiple clusters and create multiple distributed tables providing views to different clusters. + +Example config for a cluster with three shards, one replica each: -Example config for cluster with three shards, one replica each: ``` xml @@ -543,35 +555,38 @@ Example config for cluster with three shards, one replica each: ``` -For further demonstration let's create new local table with exactly the same `CREATE TABLE` query that we used for `hits_v1`, but different table name: +For further demonstration let’s create a new local table with the same `CREATE TABLE` query that we used for `hits_v1`, but different table name: + ``` sql CREATE TABLE tutorial.hits_local (...) ENGINE = MergeTree() ... ``` Creating a distributed table providing a view into local tables of the cluster: + ``` sql CREATE TABLE tutorial.hits_all AS tutorial.hits_local ENGINE = Distributed(perftest_3shards_1replicas, tutorial, hits_local, rand()); ``` -Common practice is to create similar Distributed tables on all machines of the cluster. This would allow to run distributed queries on any machine of the cluster. Also there's an alternative option to create temporary distributed table for a given SELECT query using [remote](../query_language/table_functions/remote.md) table function. +A common practice is to create similar Distributed tables on all machines of the cluster. This would allow running distributed queries on any machine of the cluster. Also there’s an alternative option to create temporary distributed table for a given SELECT query using [remote](../query_language/table_functions/remote.md) table function. -Let's run [INSERT SELECT](../query_language/insert_into.md) into Distributed table to spread the table to multiple servers. +Let’s run [INSERT SELECT](../query_language/insert_into.md) into the Distributed table to spread the table to multiple servers. ``` sql INSERT INTO tutorial.hits_all SELECT * FROM tutorial.hits_v1; ``` !!! warning "Notice" - This approach is not suitable for sharding of large tables. There's a separate tool [clickhouse-copier](../operations/utils/clickhouse-copier.md) that can re-shard arbitrary large tables. + This approach is not suitable for sharding of large tables. There’s a separate tool [clickhouse-copier](../operations/utils/clickhouse-copier.md) that can re-shard arbitrary large tables. As you could expect computationally heavy queries are executed N times faster being launched on 3 servers instead of one. 
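A rough way to see that effect is to run the same aggregation against the local table and then against the Distributed one. The `--time` flag is an assumption here (it prints elapsed time to stderr and is not shown elsewhere on this page):

``` bash
# Same query, first on a single shard's local table, then fanned out via the
# Distributed table to every shard in the cluster.
clickhouse-client --time --query "SELECT uniqExact(UserID) FROM tutorial.hits_local"
clickhouse-client --time --query "SELECT uniqExact(UserID) FROM tutorial.hits_all"
```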
-In this case we have used a cluster with 3 shards each contains a single replica. +In this case, we have used a cluster with 3 shards each contains a single replica. -To provide resilience in production environment we recommend that each shard should contain 2-3 replicas distributed between multiple data-centers. Note that ClickHouse supports unlimited number of replicas. +To provide resilience in a production environment we recommend that each shard should contain 2-3 replicas distributed between multiple datacenters. Note that ClickHouse supports an unlimited number of replicas. + +Example config for a cluster of one shard containing three replicas: -Example config for cluster of one shard containing three replicas: ``` xml ... @@ -595,12 +610,13 @@ Example config for cluster of one shard containing three replicas: ``` To enable native replication ZooKeeper is required. ClickHouse will take care of data consistency on all replicas and run restore procedure after failure - automatically. It's recommended to deploy ZooKeeper cluster to separate servers. +automatically. It’s recommended to deploy ZooKeeper cluster to separate servers. -ZooKeeper is not a strict requirement: in some simple cases you can duplicate the data by writing it into all the replicas from your application code. This approach is **not** recommended, in this case ClickHouse won't be able to - guarantee data consistency on all replicas. This remains the responsibility of your application. +ZooKeeper is not a strict requirement: in some simple cases, you can duplicate the data by writing it into all the replicas from your application code. This approach is **not** recommended, in this case, ClickHouse won’t be able to +guarantee data consistency on all replicas. This remains the responsibility of your application. + +ZooKeeper locations need to be specified in the configuration file: -ZooKeeper locations need to be specified in configuration file: ``` xml @@ -618,7 +634,8 @@ ZooKeeper locations need to be specified in configuration file: ``` -Also we need to set macros for identifying each shard and replica, it will be used on table creation: +Also, we need to set macros for identifying each shard and replica, it will be used on table creation: + ``` xml 01 @@ -626,7 +643,7 @@ Also we need to set macros for identifying each shard and replica, it will be us ``` -If there are no replicas at the moment on replicated table creation, a new first replica will be instantiated. If there are already live replicas, new replica will clone the data from existing ones. You have an option to create all replicated tables first and that insert data to it. Another option is to create some replicas and add the others after or during data insertion. +If there are no replicas at the moment on replicated table creation, a new first replica will be instantiated. If there are already live replicas, the new replica will clone the data from existing ones. You have an option to create all replicated tables first and that insert data to it. Another option is to create some replicas and add the others after or during data insertion. ``` sql CREATE TABLE tutorial.hits_replica (...) @@ -642,4 +659,7 @@ Here we use [ReplicatedMergeTree](../operations/table_engines/replication.md) ta ``` sql INSERT INTO tutorial.hits_replica SELECT * FROM tutorial.hits_local; ``` -Replication operates in multi-master mode. Data can be loaded into any replica and it will be synced with other instances automatically. 
Replication is asynchronous so at a given moment of time not all replicas may contain recently inserted data. To allow data insertion at least one replica should be up. Others will sync up data and repair consistency once they will become active again. Please notice that such approach allows for the low possibility of loss of just appended data. + +Replication operates in multi-master mode. Data can be loaded into any replica and it will be synced with other instances automatically. Replication is asynchronous so at a given moment, not all replicas may contain recently inserted data. To allow data insertion at least one replica should be up. Others will sync up data and repair consistency once they will become active again. Please notice that such an approach allows for the low possibility of a loss of just appended data. + +[Original article](https://clickhouse.tech/docs/en/getting_started/tutorial/) diff --git a/docs/en/guides/apply_catboost_model.md b/docs/en/guides/apply_catboost_model.md index a9d8707f5ca..bddde92d46d 100644 --- a/docs/en/guides/apply_catboost_model.md +++ b/docs/en/guides/apply_catboost_model.md @@ -6,16 +6,16 @@ With this instruction, you will learn to apply pre-trained models in ClickHouse To apply a CatBoost model in ClickHouse: -1. [Create a Table](#create-table). -2. [Insert the Data to the Table](#insert-data-to-table). -3. [Integrate CatBoost into ClickHouse](#integrate-catboost-into-clickhouse) (Optional step). -4. [Run the Model Inference from SQL](#run-model-inference). +1. [Create a Table](#create-table). +2. [Insert the Data to the Table](#insert-data-to-table). +3. [Integrate CatBoost into ClickHouse](#integrate-catboost-into-clickhouse) (Optional step). +4. [Run the Model Inference from SQL](#run-model-inference). For more information about training CatBoost models, see [Training and applying models](https://catboost.ai/docs/features/training.html#training). ## Prerequisites {#prerequisites} -If you don't have the [Docker](https://docs.docker.com/install/) yet, install it. +If you don’t have the [Docker](https://docs.docker.com/install/) yet, install it. !!! note "Note" [Docker](https://www.docker.com) is a software platform that allows you to create containers that isolate a CatBoost and ClickHouse installation from the rest of the system. @@ -24,7 +24,7 @@ Before applying a CatBoost model: **1.** Pull the [Docker image](https://hub.docker.com/r/yandex/tutorial-catboost-clickhouse) from the registry: -```bash +``` bash $ docker pull yandex/tutorial-catboost-clickhouse ``` @@ -32,7 +32,7 @@ This Docker image contains everything you need to run CatBoost and ClickHouse: c **2.** Make sure the Docker image has been successfully pulled: -```bash +``` bash $ docker image ls REPOSITORY TAG IMAGE ID CREATED SIZE yandex/tutorial-catboost-clickhouse latest 622e4d17945b 22 hours ago 1.37GB @@ -40,17 +40,17 @@ yandex/tutorial-catboost-clickhouse latest 622e4d17945b 22 **3.** Start a Docker container based on this image: -```bash +``` bash $ docker run -it -p 8888:8888 yandex/tutorial-catboost-clickhouse ``` ## 1. 
Create a Table {#create-table} -To create a ClickHouse table for the train sample: +To create a ClickHouse table for the training sample: -**1.** Start ClickHouse console client in interactive mode: +**1.** Start ClickHouse console client in the interactive mode: -```bash +``` bash $ clickhouse client ``` @@ -59,19 +59,19 @@ $ clickhouse client **2.** Create the table using the command: -```sql +``` sql :) CREATE TABLE amazon_train ( - date Date MATERIALIZED today(), - ACTION UInt8, - RESOURCE UInt32, - MGR_ID UInt32, - ROLE_ROLLUP_1 UInt32, - ROLE_ROLLUP_2 UInt32, - ROLE_DEPTNAME UInt32, - ROLE_TITLE UInt32, - ROLE_FAMILY_DESC UInt32, - ROLE_FAMILY UInt32, + date Date MATERIALIZED today(), + ACTION UInt8, + RESOURCE UInt32, + MGR_ID UInt32, + ROLE_ROLLUP_1 UInt32, + ROLE_ROLLUP_2 UInt32, + ROLE_DEPTNAME UInt32, + ROLE_TITLE UInt32, + ROLE_FAMILY_DESC UInt32, + ROLE_FAMILY UInt32, ROLE_CODE UInt32 ) ENGINE = MergeTree ORDER BY date @@ -79,7 +79,7 @@ ENGINE = MergeTree ORDER BY date **3.** Exit from ClickHouse console client: -```sql +``` sql :) exit ``` @@ -89,19 +89,19 @@ To insert the data: **1.** Run the following command: -```bash +``` bash $ clickhouse client --host 127.0.0.1 --query 'INSERT INTO amazon_train FORMAT CSVWithNames' < ~/amazon/train.csv ``` -**2.** Start ClickHouse console client in interactive mode: +**2.** Start ClickHouse console client in the interactive mode: -```bash +``` bash $ clickhouse client ``` **3.** Make sure the data has been uploaded: -```sql +``` sql :) SELECT count() FROM amazon_train SELECT count() @@ -127,11 +127,11 @@ The fastest way to evaluate a CatBoost model is compile `libcatboostmodel. @@ -148,7 +148,7 @@ The fastest way to evaluate a CatBoost model is compile `libcatboostmodel. /home/catboost/data/libcatboostmodel.so /home/catboost/models/*_model.xml @@ -158,11 +158,11 @@ The fastest way to evaluate a CatBoost model is compile `libcatboostmodel. 0 AS prediction, + ROLE_CODE) > 0 AS prediction, ACTION AS target FROM amazon_train LIMIT 10 @@ -180,11 +180,11 @@ LIMIT 10 !!! note "Note" Function [modelEvaluate](../query_language/functions/other_functions.md#function-modelevaluate) returns tuple with per-class raw predictions for multiclass models. -Let's predict probability: +Let’s predict the probability: -```sql -:) SELECT - modelEvaluate('amazon', +``` sql +:) SELECT + modelEvaluate('amazon', RESOURCE, MGR_ID, ROLE_ROLLUP_1, @@ -194,7 +194,7 @@ Let's predict probability: ROLE_FAMILY_DESC, ROLE_FAMILY, ROLE_CODE) AS prediction, - 1. / (1 + exp(-prediction)) AS probability, + 1. / (1 + exp(-prediction)) AS probability, ACTION AS target FROM amazon_train LIMIT 10 @@ -203,14 +203,14 @@ LIMIT 10 !!! note "Note" More info about [exp()](../query_language/functions/math_functions.md) function. -Let's calculate LogLoss on the sample: +Let’s calculate LogLoss on the sample: -```sql +``` sql :) SELECT -avg(tg * log(prob) + (1 - tg) * log(1 - prob)) AS logloss -FROM +FROM ( - SELECT - modelEvaluate('amazon', + SELECT + modelEvaluate('amazon', RESOURCE, MGR_ID, ROLE_ROLLUP_1, @@ -220,7 +220,7 @@ FROM ROLE_FAMILY_DESC, ROLE_FAMILY, ROLE_CODE) AS prediction, - 1. / (1. + exp(-prediction)) AS prob, + 1. / (1. + exp(-prediction)) AS prob, ACTION AS tg FROM amazon_train ) @@ -228,3 +228,5 @@ FROM !!! note "Note" More info about [avg()](../query_language/agg_functions/reference.md#agg_function-avg) and [log()](../query_language/functions/math_functions.md) functions. 
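As an extra sanity check that is not part of the original guide, the same `modelEvaluate()` call can be folded into a simple accuracy estimate at a 0.5 probability threshold:

``` bash
# Share of rows where the thresholded prediction matches ACTION.
clickhouse client --host 127.0.0.1 --query "
SELECT avg((prob > 0.5) = tg) AS accuracy
FROM
(
    SELECT
        1. / (1. + exp(-modelEvaluate('amazon',
            RESOURCE, MGR_ID, ROLE_ROLLUP_1, ROLE_ROLLUP_2, ROLE_DEPTNAME,
            ROLE_TITLE, ROLE_FAMILY_DESC, ROLE_FAMILY, ROLE_CODE))) AS prob,
        ACTION AS tg
    FROM amazon_train
)"
```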
+ +[Original article](https://clickhouse.tech/docs/en/guides/apply_catboost_model/) diff --git a/docs/en/guides/index.md b/docs/en/guides/index.md index 32c2da1ad2f..3e7e1ae9026 100644 --- a/docs/en/guides/index.md +++ b/docs/en/guides/index.md @@ -1,5 +1,7 @@ -# ClickHouse Guides +# ClickHouse Guides {#clickhouse-guides} Detailed step-by-step instructions that will help you solve various tasks using ClickHouse. -- [Applying a CatBoost Model in ClickHouse](apply_catboost_model.md) \ No newline at end of file +- [Applying a CatBoost Model in ClickHouse](apply_catboost_model.md) + +[Original article](https://clickhouse.tech/docs/en/guides/) diff --git a/docs/en/index.md b/docs/en/index.md index 3a3c2595ec0..da2cfdd9cdf 100644 --- a/docs/en/index.md +++ b/docs/en/index.md @@ -1,61 +1,57 @@ -# What is ClickHouse? +# What is ClickHouse? {#what-is-clickhouse} ClickHouse is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP). -In a "normal" row-oriented DBMS, data is stored in this order: +In a “normal” row-oriented DBMS, data is stored in this order: -| Row | WatchID | JavaEnable | Title | GoodEvent | EventTime | -| ------ | ------------------- | ---------- | ------------------ | --------- | ------------------- | -| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | -| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | -| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | -| #N | ... | ... | ... | ... | ... | +| Row | WatchID | JavaEnable | Title | GoodEvent | EventTime | +|-----|-------------|------------|--------------------|-----------|---------------------| +| \#0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | +| \#1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | +| \#2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | +| \#N | … | … | … | … | … | In other words, all the values related to a row are physically stored next to each other. -Examples of a row-oriented DBMS are MySQL, Postgres, and MS SQL Server. -{: .grey } +Examples of a row-oriented DBMS are MySQL, Postgres, and MS SQL Server. {: .grey } In a column-oriented DBMS, data is stored like this: -| Row: | #0 | #1 | #2 | #N | -| ----------- | ------------------- | ------------------- | ------------------- | ------------------- | -| WatchID: | 89354350662 | 90329509958 | 89953706054 | ... | -| JavaEnable: | 1 | 0 | 1 | ... | -| Title: | Investor Relations | Contact us | Mission | ... | -| GoodEvent: | 1 | 1 | 1 | ... | -| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | ... | +| Row: | \#0 | \#1 | \#2 | \#N | +|-------------|---------------------|---------------------|---------------------|-----| +| WatchID: | 89354350662 | 90329509958 | 89953706054 | … | +| JavaEnable: | 1 | 0 | 1 | … | +| Title: | Investor Relations | Contact us | Mission | … | +| GoodEvent: | 1 | 1 | 1 | … | +| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | … | -These examples only show the order that data is arranged in. -The values from different columns are stored separately, and data from the same column is stored together. +These examples only show the order that data is arranged in. The values from different columns are stored separately, and data from the same column is stored together. 
-Examples of a column-oriented DBMS: Vertica, Paraccel (Actian Matrix and Amazon Redshift), Sybase IQ, Exasol, Infobright, InfiniDB, MonetDB (VectorWise and Actian Vector), LucidDB, SAP HANA, Google Dremel, Google PowerDrill, Druid, and kdb+. -{: .grey } +Examples of a column-oriented DBMS: Vertica, Paraccel (Actian Matrix and Amazon Redshift), Sybase IQ, Exasol, Infobright, InfiniDB, MonetDB (VectorWise and Actian Vector), LucidDB, SAP HANA, Google Dremel, Google PowerDrill, Druid, and kdb+. {: .grey } -Different orders for storing data are better suited to different scenarios. -The data access scenario refers to what queries are made, how often, and in what proportion; how much data is read for each type of query – rows, columns, and bytes; the relationship between reading and updating data; the working size of the data and how locally it is used; whether transactions are used, and how isolated they are; requirements for data replication and logical integrity; requirements for latency and throughput for each type of query, and so on. +Different orders for storing data are better suited to different scenarios. The data access scenario refers to what queries are made, how often, and in what proportion; how much data is read for each type of query – rows, columns, and bytes; the relationship between reading and updating data; the working size of the data and how locally it is used; whether transactions are used, and how isolated they are; requirements for data replication and logical integrity; requirements for latency and throughput for each type of query, and so on. The higher the load on the system, the more important it is to customize the system set up to match the requirements of the usage scenario, and the more fine grained this customization becomes. There is no system that is equally well-suited to significantly different scenarios. If a system is adaptable to a wide set of scenarios, under a high load, the system will handle all the scenarios equally poorly, or will work well for just one or few of possible scenarios. -## Key Properties of OLAP Scenario +## Key Properties of OLAP Scenario {#key-properties-of-olap-scenario} -- The vast majority of requests are for read access. -- Data is updated in fairly large batches (> 1000 rows), not by single rows; or it is not updated at all. -- Data is added to the DB but is not modified. -- For reads, quite a large number of rows are extracted from the DB, but only a small subset of columns. -- Tables are "wide," meaning they contain a large number of columns. -- Queries are relatively rare (usually hundreds of queries per server or less per second). -- For simple queries, latencies around 50 ms are allowed. -- Column values are fairly small: numbers and short strings (for example, 60 bytes per URL). -- Requires high throughput when processing a single query (up to billions of rows per second per server). -- Transactions are not necessary. -- Low requirements for data consistency. -- There is one large table per query. All tables are small, except for one. -- A query result is significantly smaller than the source data. In other words, data is filtered or aggregated, so the result fits in a single server's RAM. +- The vast majority of requests are for read access. +- Data is updated in fairly large batches (\> 1000 rows), not by single rows; or it is not updated at all. +- Data is added to the DB but is not modified. +- For reads, quite a large number of rows are extracted from the DB, but only a small subset of columns. 
+- Tables are “wide,” meaning they contain a large number of columns. +- Queries are relatively rare (usually hundreds of queries per server or less per second). +- For simple queries, latencies around 50 ms are allowed. +- Column values are fairly small: numbers and short strings (for example, 60 bytes per URL). +- Requires high throughput when processing a single query (up to billions of rows per second per server). +- Transactions are not necessary. +- Low requirements for data consistency. +- There is one large table per query. All tables are small, except for one. +- A query result is significantly smaller than the source data. In other words, data is filtered or aggregated, so the result fits in a single server’s RAM. -It is easy to see that the OLAP scenario is very different from other popular scenarios (such as OLTP or Key-Value access). So it doesn't make sense to try to use OLTP or a Key-Value DB for processing analytical queries if you want to get decent performance. For example, if you try to use MongoDB or Redis for analytics, you will get very poor performance compared to OLAP databases. +It is easy to see that the OLAP scenario is very different from other popular scenarios (such as OLTP or Key-Value access). So it doesn’t make sense to try to use OLTP or a Key-Value DB for processing analytical queries if you want to get decent performance. For example, if you try to use MongoDB or Redis for analytics, you will get very poor performance compared to OLAP databases. -## Why Column-Oriented Databases Work Better in the OLAP Scenario +## Why Column-Oriented Databases Work Better in the OLAP Scenario {#why-column-oriented-databases-work-better-in-the-olap-scenario} Column-oriented databases are better suited to OLAP scenarios: they are at least 100 times faster in processing most queries. The reasons are explained in detail below, but the fact is easier to demonstrate visually: @@ -69,25 +65,30 @@ Column-oriented databases are better suited to OLAP scenarios: they are at least See the difference? -### Input/output +### Input/output {#inputoutput} -1. For an analytical query, only a small number of table columns need to be read. In a column-oriented database, you can read just the data you need. For example, if you need 5 columns out of 100, you can expect a 20-fold reduction in I/O. -2. Since data is read in packets, it is easier to compress. Data in columns is also easier to compress. This further reduces the I/O volume. -3. Due to the reduced I/O, more data fits in the system cache. +1. For an analytical query, only a small number of table columns need to be read. In a column-oriented database, you can read just the data you need. For example, if you need 5 columns out of 100, you can expect a 20-fold reduction in I/O. +2. Since data is read in packets, it is easier to compress. Data in columns is also easier to compress. This further reduces the I/O volume. +3. Due to the reduced I/O, more data fits in the system cache. -For example, the query "count the number of records for each advertising platform" requires reading one "advertising platform ID" column, which takes up 1 byte uncompressed. If most of the traffic was not from advertising platforms, you can expect at least 10-fold compression of this column. When using a quick compression algorithm, data decompression is possible at a speed of at least several gigabytes of uncompressed data per second. In other words, this query can be processed at a speed of approximately several billion rows per second on a single server. 
This speed is actually achieved in practice. +For example, the query “count the number of records for each advertising platform” requires reading one “advertising platform ID” column, which takes up 1 byte uncompressed. If most of the traffic was not from advertising platforms, you can expect at least 10-fold compression of this column. When using a quick compression algorithm, data decompression is possible at a speed of at least several gigabytes of uncompressed data per second. In other words, this query can be processed at a speed of approximately several billion rows per second on a single server. This speed is actually achieved in practice. -
Example -```bash +
+ +Example + +``` bash $ clickhouse-client ClickHouse client version 0.0.52053. Connecting to localhost:9000. Connected to ClickHouse server version 0.0.52053. ``` -```sql + +``` sql SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 ``` -```text + +``` text ┌─CounterID─┬──count()─┐ │ 114208 │ 56057344 │ │ 115080 │ 51619590 │ @@ -114,19 +115,18 @@ SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIM
-### CPU +### CPU {#cpu} -Since executing a query requires processing a large number of rows, it helps to dispatch all operations for entire vectors instead of for separate rows, or to implement the query engine so that there is almost no dispatching cost. If you don't do this, with any half-decent disk subsystem, the query interpreter inevitably stalls the CPU. -It makes sense to both store data in columns and process it, when possible, by columns. +Since executing a query requires processing a large number of rows, it helps to dispatch all operations for entire vectors instead of for separate rows, or to implement the query engine so that there is almost no dispatching cost. If you don’t do this, with any half-decent disk subsystem, the query interpreter inevitably stalls the CPU. It makes sense to both store data in columns and process it, when possible, by columns. There are two ways to do this: -1. A vector engine. All operations are written for vectors, instead of for separate values. This means you don't need to call operations very often, and dispatching costs are negligible. Operation code contains an optimized internal cycle. +1. A vector engine. All operations are written for vectors, instead of for separate values. This means you don’t need to call operations very often, and dispatching costs are negligible. Operation code contains an optimized internal cycle. -2. Code generation. The code generated for the query has all the indirect calls in it. +2. Code generation. The code generated for the query has all the indirect calls in it. -This is not done in "normal" databases, because it doesn't make sense when running simple queries. However, there are exceptions. For example, MemSQL uses code generation to reduce latency when processing SQL queries. (For comparison, analytical DBMSs require optimization of throughput, not latency.) +This is not done in “normal” databases, because it doesn’t make sense when running simple queries. However, there are exceptions. For example, MemSQL uses code generation to reduce latency when processing SQL queries. (For comparison, analytical DBMSs require optimization of throughput, not latency.) Note that for CPU efficiency, the query language must be declarative (SQL or MDX), or at least a vector (J, K). The query should only contain implicit loops, allowing for optimization. -[Original article](https://clickhouse.tech/docs/en/) +{## [Original article](https://clickhouse.tech/docs/en/) ##} diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index 48965d11062..a1d527da7dd 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -1,10 +1,10 @@ -# Command-line Client +# Command-line Client {#command-line-client} ClickHouse provides a native command-line client: `clickhouse-client`. The client supports command-line options and configuration files. For more information, see [Configuring](#interfaces_cli_configuration). [Install](../getting_started/index.md) it from the `clickhouse-client` package and run it with the command `clickhouse-client`. -```bash +``` bash $ clickhouse-client ClickHouse client version 19.17.1.1579 (official build). Connecting to localhost:9000 as user default. @@ -15,19 +15,15 @@ Connected to ClickHouse server version 19.17.1 revision 54428. Different client and server versions are compatible with one another, but some features may not be available in older clients. We recommend using the same version of the client as the server app. 
When you try to use a client of the older version, then the server, `clickhouse-client` displays the message: -``` -ClickHouse client version is older than ClickHouse server. It may lack support for new features. -``` + ClickHouse client version is older than ClickHouse server. It may lack support for new features. -## Usage {#cli_usage} +## Usage {#cli-usage} -The client can be used in interactive and non-interactive (batch) mode. -To use batch mode, specify the 'query' parameter, or send data to 'stdin' (it verifies that 'stdin' is not a terminal), or both. -Similar to the HTTP interface, when using the 'query' parameter and sending data to 'stdin', the request is a concatenation of the 'query' parameter, a line feed, and the data in 'stdin'. This is convenient for large INSERT queries. +The client can be used in interactive and non-interactive (batch) mode. To use batch mode, specify the ‘query’ parameter, or send data to ‘stdin’ (it verifies that ‘stdin’ is not a terminal), or both. Similar to the HTTP interface, when using the ‘query’ parameter and sending data to ‘stdin’, the request is a concatenation of the ‘query’ parameter, a line feed, and the data in ‘stdin’. This is convenient for large INSERT queries. Example of using the client to insert data: -```bash +``` bash $ echo -ne "1, 'some text', '2016-08-14 00:00:00'\n2, 'some more text', '2016-08-14 00:00:01'" | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; $ cat <<_EOF | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; @@ -40,12 +36,11 @@ $ cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FOR In batch mode, the default data format is TabSeparated. You can set the format in the FORMAT clause of the query. -By default, you can only process a single query in batch mode. To make multiple queries from a "script," use the --multiquery parameter. This works for all queries except INSERT. Query results are output consecutively without additional separators. -Similarly, to process a large number of queries, you can run 'clickhouse-client' for each query. Note that it may take tens of milliseconds to launch the 'clickhouse-client' program. +By default, you can only process a single query in batch mode. To make multiple queries from a “script,” use the `--multiquery` parameter. This works for all queries except INSERT. Query results are output consecutively without additional separators. Similarly, to process a large number of queries, you can run ‘clickhouse-client’ for each query. Note that it may take tens of milliseconds to launch the ‘clickhouse-client’ program. In interactive mode, you get a command line where you can enter queries. -If 'multiline' is not specified (the default): To run the query, press Enter. The semicolon is not necessary at the end of the query. To enter a multiline query, enter a backslash `\` before the line feed. After you press Enter, you will be asked to enter the next line of the query. +If ‘multiline’ is not specified (the default): To run the query, press Enter. The semicolon is not necessary at the end of the query. To enter a multiline query, enter a backslash `\` before the line feed. After you press Enter, you will be asked to enter the next line of the query. If multiline is specified: To run a query, end it with a semicolon and press Enter. If the semicolon was omitted at the end of the entered line, you will be asked to enter the next line of the query. 
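For instance, several queries can be sent in a single batch invocation (a minimal sketch, assuming a server is running on localhost with the default port):

``` bash
$ clickhouse-client --multiquery --query="SELECT 1; SELECT 2; SELECT 3;"
1
2
3
```

The results are printed one after another in the default batch format (TabSeparated), without additional separators.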
@@ -53,29 +48,28 @@ Only a single query is run, so everything after the semicolon is ignored. You can specify `\G` instead of or after the semicolon. This indicates Vertical format. In this format, each value is printed on a separate line, which is convenient for wide tables. This unusual feature was added for compatibility with the MySQL CLI. -The command line is based on 'replxx' (similar to 'readline'). In other words, it uses the familiar keyboard shortcuts and keeps a history. -The history is written to `~/.clickhouse-client-history`. +The command line is based on ‘replxx’ (similar to ‘readline’). In other words, it uses the familiar keyboard shortcuts and keeps a history. The history is written to `~/.clickhouse-client-history`. By default, the format used is PrettyCompact. You can change the format in the FORMAT clause of the query, or by specifying `\G` at the end of the query, using the `--format` or `--vertical` argument in the command line, or using the client configuration file. -To exit the client, press Ctrl+D (or Ctrl+C), or enter one of the following instead of a query: "exit", "quit", "logout", "exit;", "quit;", "logout;", "q", "Q", ":q" +To exit the client, press Ctrl+D (or Ctrl+C), or enter one of the following instead of a query: “exit”, “quit”, “logout”, “exit;”, “quit;”, “logout;”, “q”, “Q”, “:q” When processing a query, the client shows: -1. Progress, which is updated no more than 10 times per second (by default). For quick queries, the progress might not have time to be displayed. -2. The formatted query after parsing, for debugging. -3. The result in the specified format. -4. The number of lines in the result, the time passed, and the average speed of query processing. +1. Progress, which is updated no more than 10 times per second (by default). For quick queries, the progress might not have time to be displayed. +2. The formatted query after parsing, for debugging. +3. The result in the specified format. +4. The number of lines in the result, the time passed, and the average speed of query processing. -You can cancel a long query by pressing Ctrl+C. However, you will still need to wait a little for the server to abort the request. It is not possible to cancel a query at certain stages. If you don't wait and press Ctrl+C a second time, the client will exit. +You can cancel a long query by pressing Ctrl+C. However, you will still need to wait for a little for the server to abort the request. It is not possible to cancel a query at certain stages. If you don’t wait and press Ctrl+C a second time, the client will exit. -The command-line client allows passing external data (external temporary tables) for querying. For more information, see the section "External data for query processing". +The command-line client allows passing external data (external temporary tables) for querying. For more information, see the section “External data for query processing”. ### Queries with Parameters {#cli-queries-with-parameters} You can create a query with parameters and pass values to them from client application. This allows to avoid formatting query with specific dynamic values on client side. 
For example: -```bash +``` bash $ clickhouse-client --param_parName="[1, 2]" -q "SELECT * FROM table WHERE a = {parName:Array(UInt16)}" ``` @@ -83,62 +77,61 @@ $ clickhouse-client --param_parName="[1, 2]" -q "SELECT * FROM table WHERE a = Format a query as usual, then place the values that you want to pass from the app parameters to the query in braces in the following format: -```sql +``` sql {:} ``` -- `name` — Placeholder identifier. In the console client it should be used in app parameters as `--param_ = value`. -- `data type` — [Data type](../data_types/index.md) of the app parameter value. For example, a data structure like `(integer, ('string', integer))` can have the `Tuple(UInt8, Tuple(String, UInt8))` data type (you can also use another [integer](../data_types/int_uint.md) types). +- `name` — Placeholder identifier. In the console client it should be used in app parameters as `--param_ = value`. +- `data type` — [Data type](../data_types/index.md) of the app parameter value. For example, a data structure like `(integer, ('string', integer))` can have the `Tuple(UInt8, Tuple(String, UInt8))` data type (you can also use another [integer](../data_types/int_uint.md) types). -#### Example +#### Example {#example} -```bash +``` bash $ clickhouse-client --param_tuple_in_tuple="(10, ('dt', 10))" -q "SELECT * FROM table WHERE val = {tuple_in_tuple:Tuple(UInt8, Tuple(String, UInt8))}" ``` -## Configuring {#interfaces_cli_configuration} +## Configuring {#interfaces-cli-configuration} You can pass parameters to `clickhouse-client` (all parameters have a default value) using: -- From the Command Line +- From the Command Line - Command-line options override the default values and settings in configuration files. + Command-line options override the default values and settings in configuration files. -- Configuration files. +- Configuration files. - Settings in the configuration files override the default values. + Settings in the configuration files override the default values. -### Command Line Options +### Command Line Options {#command-line-options} -- `--host, -h` -– The server name, 'localhost' by default. You can use either the name or the IPv4 or IPv6 address. -- `--port` – The port to connect to. Default value: 9000. Note that the HTTP interface and the native interface use different ports. -- `--user, -u` – The username. Default value: default. -- `--password` – The password. Default value: empty string. -- `--query, -q` – The query to process when using non-interactive mode. -- `--database, -d` – Select the current default database. Default value: the current database from the server settings ('default' by default). -- `--multiline, -m` – If specified, allow multiline queries (do not send the query on Enter). -- `--multiquery, -n` – If specified, allow processing multiple queries separated by semicolons. -- `--format, -f` – Use the specified default format to output the result. -- `--vertical, -E` – If specified, use the Vertical format by default to output the result. This is the same as '--format=Vertical'. In this format, each value is printed on a separate line, which is helpful when displaying wide tables. -- `--time, -t` – If specified, print the query execution time to 'stderr' in non-interactive mode. -- `--stacktrace` – If specified, also print the stack trace if an exception occurs. -- `--config-file` – The name of the configuration file. -- `--secure` – If specified, will connect to server over secure connection. 
-- `--param_` — Value for a [query with parameters](#cli-queries-with-parameters). +- `--host, -h` -– The server name, ‘localhost’ by default. You can use either the name or the IPv4 or IPv6 address. +- `--port` – The port to connect to. Default value: 9000. Note that the HTTP interface and the native interface use different ports. +- `--user, -u` – The username. Default value: default. +- `--password` – The password. Default value: empty string. +- `--query, -q` – The query to process when using non-interactive mode. +- `--database, -d` – Select the current default database. Default value: the current database from the server settings (‘default’ by default). +- `--multiline, -m` – If specified, allow multiline queries (do not send the query on Enter). +- `--multiquery, -n` – If specified, allow processing multiple queries separated by semicolons. +- `--format, -f` – Use the specified default format to output the result. +- `--vertical, -E` – If specified, use the Vertical format by default to output the result. This is the same as ‘–format=Vertical’. In this format, each value is printed on a separate line, which is helpful when displaying wide tables. +- `--time, -t` – If specified, print the query execution time to ‘stderr’ in non-interactive mode. +- `--stacktrace` – If specified, also print the stack trace if an exception occurs. +- `--config-file` – The name of the configuration file. +- `--secure` – If specified, will connect to server over secure connection. +- `--param_` — Value for a [query with parameters](#cli-queries-with-parameters). - -### Configuration Files +### Configuration Files {#configuration-files} `clickhouse-client` uses the first existing file of the following: -- Defined in the `--config-file` parameter. -- `./clickhouse-client.xml` -- `~/.clickhouse-client/config.xml` -- `/etc/clickhouse-client/config.xml` +- Defined in the `--config-file` parameter. +- `./clickhouse-client.xml` +- `~/.clickhouse-client/config.xml` +- `/etc/clickhouse-client/config.xml` Example of a config file: -```xml +``` xml username password @@ -146,5 +139,4 @@ Example of a config file: ``` - [Original article](https://clickhouse.tech/docs/en/interfaces/cli/) diff --git a/docs/en/interfaces/cpp.md b/docs/en/interfaces/cpp.md index cc5e670a00a..71880c31635 100644 --- a/docs/en/interfaces/cpp.md +++ b/docs/en/interfaces/cpp.md @@ -1,4 +1,4 @@ -# C++ Client Library +# C++ Client Library {#c-client-library} See README at [clickhouse-cpp](https://github.com/ClickHouse/clickhouse-cpp) repository. diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index a6deb4ccb02..b3a6976b3ec 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -5,58 +5,58 @@ results of a `SELECT`, and to perform `INSERT`s into a file-backed table. 
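For instance, the format of a particular query result can be chosen with the `FORMAT` clause (a brief sketch, assuming a running local server; any output format from the table below can be substituted):

``` bash
$ clickhouse-client --query="SELECT 1 AS x, 'hello' AS s FORMAT JSONEachRow"
{"x":1,"s":"hello"}
```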
The supported formats are: -| Format | Input | Output | -| ------- | -------- | -------- | -| [TabSeparated](#tabseparated) | ✔ | ✔ | -| [TabSeparatedRaw](#tabseparatedraw) | ✗ | ✔ | -| [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ | -| [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ | -| [Template](#format-template) | ✔ | ✔ | -| [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | -| [CSV](#csv) | ✔ | ✔ | -| [CSVWithNames](#csvwithnames) | ✔ | ✔ | -| [CustomSeparated](#format-customseparated) | ✔ | ✔ | -| [Values](#data-format-values) | ✔ | ✔ | -| [Vertical](#vertical) | ✗ | ✔ | -| [VerticalRaw](#verticalraw) | ✗ | ✔ | -| [JSON](#json) | ✗ | ✔ | -| [JSONCompact](#jsoncompact) | ✗ | ✔ | -| [JSONEachRow](#jsoneachrow) | ✔ | ✔ | -| [TSKV](#tskv) | ✔ | ✔ | -| [Pretty](#pretty) | ✗ | ✔ | -| [PrettyCompact](#prettycompact) | ✗ | ✔ | -| [PrettyCompactMonoBlock](#prettycompactmonoblock) | ✗ | ✔ | -| [PrettyNoEscapes](#prettynoescapes) | ✗ | ✔ | -| [PrettySpace](#prettyspace) | ✗ | ✔ | -| [Protobuf](#protobuf) | ✔ | ✔ | -| [Avro](#data-format-avro) | ✔ | ✔ | -| [AvroConfluent](#data-format-avro-confluent) | ✔ | ✗ | -| [Parquet](#data-format-parquet) | ✔ | ✔ | -| [ORC](#data-format-orc) | ✔ | ✗ | -| [RowBinary](#rowbinary) | ✔ | ✔ | -| [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | -| [Native](#native) | ✔ | ✔ | -| [Null](#null) | ✗ | ✔ | -| [XML](#xml) | ✗ | ✔ | -| [CapnProto](#capnproto) | ✔ | ✗ | +| Format | Input | Output | +|-----------------------------------------------------------------|-------|--------| +| [TabSeparated](#tabseparated) | ✔ | ✔ | +| [TabSeparatedRaw](#tabseparatedraw) | ✗ | ✔ | +| [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ | +| [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ | +| [Template](#format-template) | ✔ | ✔ | +| [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | +| [CSV](#csv) | ✔ | ✔ | +| [CSVWithNames](#csvwithnames) | ✔ | ✔ | +| [CustomSeparated](#format-customseparated) | ✔ | ✔ | +| [Values](#data-format-values) | ✔ | ✔ | +| [Vertical](#vertical) | ✗ | ✔ | +| [VerticalRaw](#verticalraw) | ✗ | ✔ | +| [JSON](#json) | ✗ | ✔ | +| [JSONCompact](#jsoncompact) | ✗ | ✔ | +| [JSONEachRow](#jsoneachrow) | ✔ | ✔ | +| [TSKV](#tskv) | ✔ | ✔ | +| [Pretty](#pretty) | ✗ | ✔ | +| [PrettyCompact](#prettycompact) | ✗ | ✔ | +| [PrettyCompactMonoBlock](#prettycompactmonoblock) | ✗ | ✔ | +| [PrettyNoEscapes](#prettynoescapes) | ✗ | ✔ | +| [PrettySpace](#prettyspace) | ✗ | ✔ | +| [Protobuf](#protobuf) | ✔ | ✔ | +| [Avro](#data-format-avro) | ✔ | ✔ | +| [AvroConfluent](#data-format-avro-confluent) | ✔ | ✗ | +| [Parquet](#data-format-parquet) | ✔ | ✔ | +| [ORC](#data-format-orc) | ✔ | ✗ | +| [RowBinary](#rowbinary) | ✔ | ✔ | +| [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | +| [Native](#native) | ✔ | ✔ | +| [Null](#null) | ✗ | ✔ | +| [XML](#xml) | ✗ | ✔ | +| [CapnProto](#capnproto) | ✔ | ✗ | You can control some format processing parameters with the ClickHouse settings. For more information read the [Settings](../operations/settings/settings.md) section. ## TabSeparated {#tabseparated} -In TabSeparated format, data is written by row. Each row contains values separated by tabs. Each value is follow by a tab, except the last value in the row, which is followed by a line feed. Strictly Unix line feeds are assumed everywhere. The last row also must contain a line feed at the end. 
Values are written in text format, without enclosing quotation marks, and with special characters escaped. +In TabSeparated format, data is written by row. Each row contains values separated by tabs. Each value is followed by a tab, except the last value in the row, which is followed by a line feed. Strictly Unix line feeds are assumed everywhere. The last row also must contain a line feed at the end. Values are written in text format, without enclosing quotation marks, and with special characters escaped. This format is also available under the name `TSV`. -The `TabSeparated` format is convenient for processing data using custom programs and scripts. It is used by default in the HTTP interface, and in the command-line client's batch mode. This format also allows transferring data between different DBMSs. For example, you can get a dump from MySQL and upload it to ClickHouse, or vice versa. +The `TabSeparated` format is convenient for processing data using custom programs and scripts. It is used by default in the HTTP interface, and in the command-line client’s batch mode. This format also allows transferring data between different DBMSs. For example, you can get a dump from MySQL and upload it to ClickHouse, or vice versa. -The `TabSeparated` format supports outputting total values (when using WITH TOTALS) and extreme values (when 'extremes' is set to 1). In these cases, the total values and extremes are output after the main data. The main result, total values, and extremes are separated from each other by an empty line. Example: +The `TabSeparated` format supports outputting total values (when using WITH TOTALS) and extreme values (when ‘extremes’ is set to 1). In these cases, the total values and extremes are output after the main data. The main result, total values, and extremes are separated from each other by an empty line. Example: -```sql +``` sql SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT TabSeparated`` ``` -```text +``` text 2014-03-17 1406958 2014-03-18 1383658 2014-03-19 1405797 @@ -71,24 +71,24 @@ SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORD 2014-03-23 1406958 ``` -### Data Formatting +### Data Formatting {#data-formatting} -Integer numbers are written in decimal form. Numbers can contain an extra "+" character at the beginning (ignored when parsing, and not recorded when formatting). Non-negative numbers can't contain the negative sign. When reading, it is allowed to parse an empty string as a zero, or (for signed types) a string consisting of just a minus sign as a zero. Numbers that do not fit into the corresponding data type may be parsed as a different number, without an error message. +Integer numbers are written in decimal form. Numbers can contain an extra “+” character at the beginning (ignored when parsing, and not recorded when formatting). Non-negative numbers can’t contain the negative sign. When reading, it is allowed to parse an empty string as a zero, or (for signed types) a string consisting of just a minus sign as a zero. Numbers that do not fit into the corresponding data type may be parsed as a different number, without an error message. -Floating-point numbers are written in decimal form. The dot is used as the decimal separator. Exponential entries are supported, as are 'inf', '+inf', '-inf', and 'nan'. An entry of floating-point numbers may begin or end with a decimal point. +Floating-point numbers are written in decimal form. 
The dot is used as the decimal separator. Exponential entries are supported, as are ‘inf’, ‘+inf’, ‘-inf’, and ‘nan’. An entry of floating-point numbers may begin or end with a decimal point. During formatting, accuracy may be lost on floating-point numbers. During parsing, it is not strictly required to read the nearest machine-representable number. Dates are written in YYYY-MM-DD format and parsed in the same format, but with any characters as separators. -Dates with times are written in the format YYYY-MM-DD hh:mm:ss and parsed in the same format, but with any characters as separators. -This all occurs in the system time zone at the time the client or server starts (depending on which one formats data). For dates with times, daylight saving time is not specified. So if a dump has times during daylight saving time, the dump does not unequivocally match the data, and parsing will select one of the two times. +Dates with times are written in the format `YYYY-MM-DD hh:mm:ss` and parsed in the same format, but with any characters as separators. +This all occurs in the system time zone at the time the client or server starts (depending on which of them formats data). For dates with times, daylight saving time is not specified. So if a dump has times during daylight saving time, the dump does not unequivocally match the data, and parsing will select one of the two times. During a read operation, incorrect dates and dates with times can be parsed with natural overflow or as null dates and times, without an error message. As an exception, parsing dates with times is also supported in Unix timestamp format, if it consists of exactly 10 decimal digits. The result is not time zone-dependent. The formats YYYY-MM-DD hh:mm:ss and NNNNNNNNNN are differentiated automatically. -Strings are output with backslash-escaped special characters. The following escape sequences are used for output: `\b`, `\f`, `\r`, `\n`, `\t`, `\0`, `\'`, `\\`. Parsing also supports the sequences `\a`, `\v`, and `\xHH` (hex escape sequences) and any `\c` sequences, where `c` is any character (these sequences are converted to `c`). Thus, reading data supports formats where a line feed can be written as `\n` or `\`, or as a line feed. For example, the string `Hello world` with a line feed between the words instead of a space can be parsed in any of the following variations: +Strings are output with backslash-escaped special characters. The following escape sequences are used for output: `\b`, `\f`, `\r`, `\n`, `\t`, `\0`, `\'`, `\\`. Parsing also supports the sequences `\a`, `\v`, and `\xHH` (hex escape sequences) and any `\c` sequences, where `c` is any character (these sequences are converted to `c`). Thus, reading data supports formats where a line feed can be written as `\n` or `\`, or as a line feed. For example, the string `Hello world` with a line feed between the words instead of space can be parsed in any of the following variations: -```text +``` text Hello\nworld Hello\ @@ -109,7 +109,7 @@ Each element of [Nested](../data_types/nested_data_structures/nested.md) structu For example: -```sql +``` sql CREATE TABLE nestedt ( `id` UInt8, @@ -120,14 +120,17 @@ CREATE TABLE nestedt ) ENGINE = TinyLog ``` -```sql + +``` sql INSERT INTO nestedt Values ( 1, [1], ['a']) ``` -```sql + +``` sql SELECT * FROM nestedt FORMAT TSV ``` -```text -1 [1] ['a'] + +``` text +1 [1] ['a'] ``` ## TabSeparatedRaw {#tabseparatedraw} @@ -140,7 +143,7 @@ This format is also available under the name `TSVRaw`. 
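To see the difference between escaped and raw output, a quick sketch (assuming a running local server):

``` bash
$ clickhouse-client --query="SELECT 'a\tb' AS s FORMAT TabSeparated"
a\tb
$ clickhouse-client --query="SELECT 'a\tb' AS s FORMAT TabSeparatedRaw"
a	b
```

`TabSeparated` prints the escape sequence `\t`, while `TabSeparatedRaw` writes the actual tab character.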
## TabSeparatedWithNames {#tabseparatedwithnames} Differs from the `TabSeparated` format in that the column names are written in the first row. -During parsing, the first row is completely ignored. You can't use column names to determine their position or to check their correctness. +During parsing, the first row is completely ignored. You can’t use column names to determine their position or to check their correctness. (Support for parsing the header row may be added in the future.) This format is also available under the name `TSVWithNames`. @@ -154,59 +157,62 @@ This format is also available under the name `TSVWithNamesAndTypes`. ## Template {#format-template} -This format allows to specify a custom format string with placeholders for values with specified escaping rule. +This format allows specifying a custom format string with placeholders for values with a specified escaping rule. -It uses settings `format_template_resultset`, `format_template_row`, `format_template_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) +It uses settings `format_template_resultset`, `format_template_row`, `format_template_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) Setting `format_template_row` specifies path to file, which contains format string for rows with the following syntax: - `delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`, +`delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`, - where `delimiter_i` is a delimiter between values (`$` symbol can be escaped as `$$`), - `column_i` is a name or index of a column whose values are to be selected or inserted (if empty, then column will be skipped), - `serializeAs_i` is an escaping rule for the column values. The following escaping rules are supported: +where `delimiter_i` is a delimiter between values (`$` symbol can be escaped as `$$`), +`column_i` is a name or index of a column whose values are to be selected or inserted (if empty, then column will be skipped), +`serializeAs_i` is an escaping rule for the column values. The following escaping rules are supported: - - `CSV`, `JSON`, `XML` (similarly to the formats of the same names) - - `Escaped` (similarly to `TSV`) - - `Quoted` (similarly to `Values`) - - `Raw` (without escaping, similarly to `TSVRaw`) - - `None` (no escaping rule, see further) +- `CSV`, `JSON`, `XML` (similarly to the formats of the same names) +- `Escaped` (similarly to `TSV`) +- `Quoted` (similarly to `Values`) +- `Raw` (without escaping, similarly to `TSVRaw`) +- `None` (no escaping rule, see further) - If escaping rule is omitted, then`None` will be used. `XML` and `Raw` are suitable only for output. +If an escaping rule is omitted, then `None` will be used. `XML` and `Raw` are suitable only for output. - So, for the following format string: +So, for the following format string: - `Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}, ad price: $$${price:JSON};` + `Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}, ad price: $$${price:JSON};` - the values of `SearchPhrase`, `c` and `price` columns, which are escaped as `Quoted`, `Escaped` and `JSON` will be printed (for select) or will be expected (for insert) between `Search phrase: `, `, count: `, `, ad price: $` and `;` delimiters respectively. 
For example: +the values of `SearchPhrase`, `c` and `price` columns, which are escaped as `Quoted`, `Escaped` and `JSON` will be printed (for select) or will be expected (for insert) between `Search phrase:`, `, count:`, `, ad price: $` and `;` delimiters respectively. For example: - `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` +`Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` - The `format_template_rows_between_delimiter` setting specifies delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default) +The `format_template_rows_between_delimiter` setting specifies delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default) -Setting `format_template_resultset` specifies path to file, which contains format string for resultset. Format string for resultset has the same syntax as format string for row and allows to specify a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names: +Setting `format_template_resultset` specifies the path to file, which contains a format string for resultset. Format string for resultset has the same syntax as a format string for row and allows to specify a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names: - - `data` is the rows with data in `format_template_row` format, separated by `format_template_rows_between_delimiter`. This placeholder must be the first placeholder in the format string. - - `totals` is the row with total values in `format_template_row` format (when using WITH TOTALS) - - `min` is the row with minimum values in `format_template_row` format (when extremes is set to 1) - - `max` is the row with maximum values in `format_template_row` format (when extremes is set to 1) - - `rows` is the total number of output rows - - `rows_before_limit` is the minimal number of rows there would have been without LIMIT. Output only if the query contains LIMIT. If the query contains GROUP BY, rows_before_limit_at_least is the exact number of rows there would have been without a LIMIT. - - `time` is the request execution time in seconds - - `rows_read` is the number of rows have been read - - `bytes_read` is the number of bytes (uncompressed) have been read +- `data` is the rows with data in `format_template_row` format, separated by `format_template_rows_between_delimiter`. This placeholder must be the first placeholder in the format string. +- `totals` is the row with total values in `format_template_row` format (when using WITH TOTALS) +- `min` is the row with minimum values in `format_template_row` format (when extremes are set to 1) +- `max` is the row with maximum values in `format_template_row` format (when extremes are set to 1) +- `rows` is the total number of output rows +- `rows_before_limit` is the minimal number of rows there would have been without LIMIT. Output only if the query contains LIMIT. If the query contains GROUP BY, rows\_before\_limit\_at\_least is the exact number of rows there would have been without a LIMIT. +- `time` is the request execution time in seconds +- `rows_read` is the number of rows has been read +- `bytes_read` is the number of bytes (uncompressed) has been read - The placeholders `data`, `totals`, `min` and `max` must not have escaping rule specified (or `None` must be specified explicitly). 
The remaining placeholders may have any escaping rule specified. - If the `format_template_resultset` setting is an empty string, `${data}` is used as default value. - For insert queries format allows to skip some columns or some fields if prefix or suffix (see example). +The placeholders `data`, `totals`, `min` and `max` must not have escaping rule specified (or `None` must be specified explicitly). The remaining placeholders may have any escaping rule specified. +If the `format_template_resultset` setting is an empty string, `${data}` is used as default value. +For insert queries format allows skipping some columns or some fields if prefix or suffix (see example). - Select example: -```sql +Select example: + +``` sql SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 5 FORMAT Template SETTINGS format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format', format_template_rows_between_delimiter = '\n ' ``` + `/some/path/resultset.format`: -```text + +``` text Search phrases @@ -221,12 +227,16 @@ format_template_resultset = '/some/path/resultset.format', format_template_row = ``` + `/some/path/row.format`: -```text + +``` text
``` + Result: -```html + +``` html Search phrases @@ -247,42 +257,54 @@ Result: ``` Insert example: -```text + +``` text Some header Page views: 5, User id: 4324182021466249494, Useless field: hello, Duration: 146, Sign: -1 Page views: 6, User id: 4324182021466249494, Useless field: world, Duration: 185, Sign: 1 Total rows: 2 ``` -```sql + +``` sql INSERT INTO UserActivity FORMAT Template SETTINGS format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format' ``` + `/some/path/resultset.format`: -```text + +``` text Some header\n${data}\nTotal rows: ${:CSV}\n ``` + `/some/path/row.format`: -```text + +``` text Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV} ``` -`PageViews`, `UserID`, `Duration` and `Sign` inside placeholders are names of columns in the table. Values after `Useless field` in rows and after `\nTotal rows: ` in suffix will be ignored. + +`PageViews`, `UserID`, `Duration` and `Sign` inside placeholders are names of columns in the table. Values after `Useless field` in rows and after `\nTotal rows:` in suffix will be ignored. All delimiters in the input data must be strictly equal to delimiters in specified format strings. ## TemplateIgnoreSpaces {#templateignorespaces} This format is suitable only for input. -Similar to `Template`, but skips whitespace characters between delimiters and values in the input stream. However, if format strings contain whitespace characters, these characters will be expected in the input stream. Also allows to specify empty placeholders (`${}` or `${:None}`) to split some delimiter into separate parts to ignore spaces between them. Such placeholders are used only for skipping whitespace characters. -It's possible to read `JSON` using this format, if values of columns have the same order in all rows. For example, the following request can be used for inserting data from output example of format [JSON](#json): -```sql +Similar to `Template`, but skips whitespace characters between delimiters and values in the input stream. However, if format strings contain whitespace characters, these characters will be expected in the input stream. Also allows to specify empty placeholders (`${}` or `${:None}`) to split some delimiter into separate parts to ignore spaces between them. Such placeholders are used only for skipping whitespace characters. +It’s possible to read `JSON` using this format, if values of columns have the same order in all rows. For example, the following request can be used for inserting data from output example of format [JSON](#json): + +``` sql INSERT INTO table_name FORMAT TemplateIgnoreSpaces SETTINGS format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format', format_template_rows_between_delimiter = ',' ``` + `/some/path/resultset.format`: -```text + +``` text {${}"meta"${}:${:JSON},${}"data"${}:${}[${data}]${},${}"totals"${}:${:JSON},${}"extremes"${}:${:JSON},${}"rows"${}:${:JSON},${}"rows_before_limit_at_least"${}:${:JSON}${}} ``` + `/some/path/row.format`: -```text + +``` text {${}"SearchPhrase"${}:${}${phrase:JSON}${},${}"c"${}:${}${cnt:JSON}${}} ``` @@ -290,7 +312,7 @@ format_template_resultset = '/some/path/resultset.format', format_template_row = Similar to TabSeparated, but outputs a value in name=value format. Names are escaped the same way as in TabSeparated format, and the = symbol is also escaped. 
-```text +``` text SearchPhrase= count()=8267016 SearchPhrase=bathroom interior design count()=2166 SearchPhrase=yandex count()=1655 @@ -309,8 +331,8 @@ SearchPhrase=baku count()=1000 SELECT * FROM t_null FORMAT TSKV ``` -```text -x=1 y=\N +``` text +x=1 y=\N ``` When there is a large number of small columns, this format is ineffective, and there is generally no reason to use it. Nevertheless, it is no worse than JSONEachRow in terms of efficiency. @@ -323,25 +345,25 @@ Parsing allows the presence of the additional field `tskv` without the equal sig Comma Separated Values format ([RFC](https://tools.ietf.org/html/rfc4180)). -When formatting, rows are enclosed in double quotes. A double quote inside a string is output as two double quotes in a row. There are no other rules for escaping characters. Date and date-time are enclosed in double quotes. Numbers are output without quotes. Values are separated by a delimiter character, which is `,` by default. The delimiter character is defined in the setting [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter). Rows are separated using the Unix line feed (LF). Arrays are serialized in CSV as follows: first the array is serialized to a string as in TabSeparated format, and then the resulting string is output to CSV in double quotes. Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost). +When formatting, rows are enclosed in double-quotes. A double quote inside a string is output as two double quotes in a row. There are no other rules for escaping characters. Date and date-time are enclosed in double-quotes. Numbers are output without quotes. Values are separated by a delimiter character, which is `,` by default. The delimiter character is defined in the setting [format\_csv\_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter). Rows are separated using the Unix line feed (LF). Arrays are serialized in CSV as follows: first, the array is serialized to a string as in TabSeparated format, and then the resulting string is output to CSV in double-quotes. Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost). -```bash +``` bash $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV" < data.csv ``` -*By default, the delimiter is `,`. See the [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter) setting for more information. +\*By default, the delimiter is `,`. See the [format\_csv\_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter) setting for more information. When parsing, all values can be parsed either with or without quotes. Both double and single quotes are supported. Rows can also be arranged without quotes. In this case, they are parsed up to the delimiter character or line feed (CR or LF). In violation of the RFC, when parsing rows without quotes, the leading and trailing spaces and tabs are ignored. For the line feed, Unix (LF), Windows (CR LF) and Mac OS Classic (CR LF) types are all supported. Empty unquoted input values are replaced with default values for the respective columns, if -[input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) +[input\_format\_defaults\_for\_omitted\_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) is enabled. 
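As an illustration of this substitution (a sketch with a hypothetical table name; assumes a running local server and that the setting is passed as a command-line option):

``` bash
$ clickhouse-client --query="CREATE TABLE test.csv_defaults (x UInt32, s String DEFAULT 'none') ENGINE = Memory"
$ echo '1,' | clickhouse-client --input_format_defaults_for_omitted_fields=1 --query="INSERT INTO test.csv_defaults FORMAT CSV"
$ clickhouse-client --query="SELECT * FROM test.csv_defaults FORMAT CSV"
1,"none"
```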
-`NULL` is formatted as `\N` or `NULL` or an empty unquoted string (see settings [input_format_csv_unquoted_null_literal_as_null](../operations/settings/settings.md#settings-input_format_csv_unquoted_null_literal_as_null) and [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields)). +`NULL` is formatted as `\N` or `NULL` or an empty unquoted string (see settings [input\_format\_csv\_unquoted\_null\_literal\_as\_null](../operations/settings/settings.md#settings-input_format_csv_unquoted_null_literal_as_null) and [input\_format\_defaults\_for\_omitted\_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields)). The CSV format supports the output of totals and extremes the same way as `TabSeparated`. -## CSVWithNames +## CSVWithNames {#csvwithnames} Also prints the header row, similar to `TabSeparatedWithNames`. @@ -352,13 +374,13 @@ There is also `CustomSeparatedIgnoreSpaces` format, which is similar to `Templat ## JSON {#json} -Outputs data in JSON format. Besides data tables, it also outputs column names and types, along with some additional information: the total number of output rows, and the number of rows that could have been output if there weren't a LIMIT. Example: +Outputs data in JSON format. Besides data tables, it also outputs column names and types, along with some additional information: the total number of output rows, and the number of rows that could have been output if there weren’t a LIMIT. Example: -```sql +``` sql SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTALS ORDER BY c DESC LIMIT 5 FORMAT JSON ``` -```json +``` json { "meta": [ @@ -422,16 +444,16 @@ SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTA } ``` -The JSON is compatible with JavaScript. To ensure this, some characters are additionally escaped: the slash `/` is escaped as `\/`; alternative line breaks `U+2028` and `U+2029`, which break some browsers, are escaped as `\uXXXX`. ASCII control characters are escaped: backspace, form feed, line feed, carriage return, and horizontal tab are replaced with `\b`, `\f`, `\n`, `\r`, `\t` , as well as the remaining bytes in the 00-1F range using `\uXXXX` sequences. Invalid UTF-8 sequences are changed to the replacement character � so the output text will consist of valid UTF-8 sequences. For compatibility with JavaScript, Int64 and UInt64 integers are enclosed in double quotes by default. To remove the quotes, you can set the configuration parameter [output_format_json_quote_64bit_integers](../operations/settings/settings.md#session_settings-output_format_json_quote_64bit_integers) to 0. +The JSON is compatible with JavaScript. To ensure this, some characters are additionally escaped: the slash `/` is escaped as `\/`; alternative line breaks `U+2028` and `U+2029`, which break some browsers, are escaped as `\uXXXX`. ASCII control characters are escaped: backspace, form feed, line feed, carriage return, and horizontal tab are replaced with `\b`, `\f`, `\n`, `\r`, `\t` , as well as the remaining bytes in the 00-1F range using `\uXXXX` sequences. Invalid UTF-8 sequences are changed to the replacement character � so the output text will consist of valid UTF-8 sequences. For compatibility with JavaScript, Int64 and UInt64 integers are enclosed in double-quotes by default. 
To remove the quotes, you can set the configuration parameter [output\_format\_json\_quote\_64bit\_integers](../operations/settings/settings.md#session_settings-output_format_json_quote_64bit_integers) to 0. `rows` – The total number of output rows. `rows_before_limit_at_least` The minimal number of rows there would have been without LIMIT. Output only if the query contains LIMIT. -If the query contains GROUP BY, rows_before_limit_at_least is the exact number of rows there would have been without a LIMIT. +If the query contains GROUP BY, rows\_before\_limit\_at\_least is the exact number of rows there would have been without a LIMIT. `totals` – Total values (when using WITH TOTALS). -`extremes` – Extreme values (when extremes is set to 1). +`extremes` – Extreme values (when extremes are set to 1). This format is only appropriate for outputting a query result, but not for parsing (retrieving data to insert in a table). @@ -445,7 +467,7 @@ Differs from JSON only in that data rows are output in arrays, not in objects. Example: -```json +``` json { "meta": [ @@ -489,7 +511,7 @@ See also the `JSONEachRow` format. When using this format, ClickHouse outputs rows as separated, newline-delimited JSON objects, but the data as a whole is not valid JSON. -```json +``` json {"SearchPhrase":"curtain designs","count()":"1064"} {"SearchPhrase":"baku","count()":"1000"} {"SearchPhrase":"","count()":"8267016"} @@ -497,28 +519,28 @@ When using this format, ClickHouse outputs rows as separated, newline-delimited When inserting the data, you should provide a separate JSON object for each row. -### Inserting Data +### Inserting Data {#inserting-data} -```sql +``` sql INSERT INTO UserActivity FORMAT JSONEachRow {"PageViews":5, "UserID":"4324182021466249494", "Duration":146,"Sign":-1} {"UserID":"4324182021466249494","PageViews":6,"Duration":185,"Sign":1} ``` ClickHouse allows: -- Any order of key-value pairs in the object. -- Omitting some values. +- Any order of key-value pairs in the object. +- Omitting some values. -ClickHouse ignores spaces between elements and commas after the objects. You can pass all the objects in one line. You don't have to separate them with line breaks. +ClickHouse ignores spaces between elements and commas after the objects. You can pass all the objects in one line. You don’t have to separate them with line breaks. **Omitted values processing** ClickHouse substitutes omitted values with the default values for the corresponding [data types](../data_types/index.md). -If `DEFAULT expr` is specified, ClickHouse uses different substitution rules depending on the [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) setting. +If `DEFAULT expr` is specified, ClickHouse uses different substitution rules depending on the [input\_format\_defaults\_for\_omitted\_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) setting. Consider the following table: -```sql +``` sql CREATE TABLE IF NOT EXISTS example_table ( x UInt32, @@ -526,17 +548,17 @@ CREATE TABLE IF NOT EXISTS example_table ) ENGINE = Memory; ``` -- If `input_format_defaults_for_omitted_fields = 0`, then the default value for `x` and `a` equals `0` (as the default value for the `UInt32` data type). -- If `input_format_defaults_for_omitted_fields = 1`, then the default value for `x` equals `0`, but the default value of `a` equals `x * 2`. 
+- If `input_format_defaults_for_omitted_fields = 0`, then the default value for `x` and `a` equals `0` (as the default value for the `UInt32` data type). +- If `input_format_defaults_for_omitted_fields = 1`, then the default value for `x` equals `0`, but the default value of `a` equals `x * 2`. !!! note "Warning" When inserting data with `insert_sample_with_metadata = 1`, ClickHouse consumes more computational resources, compared to insertion with `insert_sample_with_metadata = 0`. -### Selecting Data +### Selecting Data {#selecting-data} Consider the `UserActivity` table as an example: -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ -1 │ │ 4324182021466249494 │ 6 │ 185 │ 1 │ @@ -545,7 +567,7 @@ Consider the `UserActivity` table as an example: The query `SELECT * FROM UserActivity FORMAT JSONEachRow` returns: -```text +``` text {"UserID":"4324182021466249494","PageViews":5,"Duration":146,"Sign":-1} {"UserID":"4324182021466249494","PageViews":6,"Duration":185,"Sign":1} ``` @@ -557,23 +579,23 @@ Unlike the [JSON](#json) format, there is no substitution of invalid UTF-8 seque ### Usage of Nested Structures {#jsoneachrow-nested} -If you have a table with [Nested](../data_types/nested_data_structures/nested.md) data type columns, you can insert JSON data with the same structure. Enable this feature with the [input_format_import_nested_json](../operations/settings/settings.md#settings-input_format_import_nested_json) setting. +If you have a table with [Nested](../data_types/nested_data_structures/nested.md) data type columns, you can insert JSON data with the same structure. Enable this feature with the [input\_format\_import\_nested\_json](../operations/settings/settings.md#settings-input_format_import_nested_json) setting. For example, consider the following table: -```sql +``` sql CREATE TABLE json_each_row_nested (n Nested (s String, i Int32) ) ENGINE = Memory ``` As you can see in the `Nested` data type description, ClickHouse treats each component of the nested structure as a separate column (`n.s` and `n.i` for our table). You can insert data in the following way: -```sql +``` sql INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n.s": ["abc", "def"], "n.i": [1, 23]} ``` -To insert data as a hierarchical JSON object, set [input_format_import_nested_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json). +To insert data as a hierarchical JSON object, set [input\_format\_import\_nested\_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json). -```json +``` json { "n": { "s": ["abc", "def"], @@ -584,26 +606,31 @@ To insert data as a hierarchical JSON object, set [input_format_import_nested_js Without this setting, ClickHouse throws an exception. -```sql +``` sql SELECT name, value FROM system.settings WHERE name = 'input_format_import_nested_json' ``` -```text + +``` text ┌─name────────────────────────────┬─value─┐ │ input_format_import_nested_json │ 0 │ └─────────────────────────────────┴───────┘ ``` -```sql + +``` sql INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} ``` -```text + +``` text Code: 117. 
DB::Exception: Unknown field found while parsing JSONEachRow format: n: (at row 1) ``` -```sql + +``` sql SET input_format_import_nested_json=1 INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} SELECT * FROM json_each_row_nested ``` -```text + +``` text ┌─n.s───────────┬─n.i────┐ │ ['abc','def'] │ [1,23] │ └───────────────┴────────┘ @@ -611,18 +638,18 @@ SELECT * FROM json_each_row_nested ## Native {#native} -The most efficient format. Data is written and read by blocks in binary format. For each block, the number of rows, number of columns, column names and types, and parts of columns in this block are recorded one after another. In other words, this format is "columnar" – it doesn't convert columns to rows. This is the format used in the native interface for interaction between servers, for using the command-line client, and for C++ clients. +The most efficient format. Data is written and read by blocks in binary format. For each block, the number of rows, number of columns, column names and types, and parts of columns in this block are recorded one after another. In other words, this format is “columnar” – it doesn’t convert columns to rows. This is the format used in the native interface for interaction between servers, for using the command-line client, and for C++ clients. -You can use this format to quickly generate dumps that can only be read by the ClickHouse DBMS. It doesn't make sense to work with this format yourself. +You can use this format to quickly generate dumps that can only be read by the ClickHouse DBMS. It doesn’t make sense to work with this format yourself. ## Null {#null} -Nothing is output. However, the query is processed, and when using the command-line client, data is transmitted to the client. This is used for tests, including productivity testing. +Nothing is output. However, the query is processed, and when using the command-line client, data is transmitted to the client. This is used for tests, including performance testing. Obviously, this format is only appropriate for output, not for parsing. ## Pretty {#pretty} -Outputs data as Unicode-art tables, also using ANSI-escape sequences for setting colors in the terminal. +Outputs data as Unicode-art tables, also using ANSI-escape sequences for setting colours in the terminal. A full grid of the table is drawn, and each row occupies two lines in the terminal. Each result block is output as a separate table. This is necessary so that blocks can be output without buffering results (buffering would be necessary in order to pre-calculate the visible width of all the values). @@ -630,38 +657,38 @@ Each result block is output as a separate table. This is necessary so that block Example (shown for the [PrettyCompact](#prettycompact) format): -```sql +``` sql SELECT * FROM t_null ``` -```text +``` text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ └───┴──────┘ ``` -Rows are not escaped in Pretty* formats. Example is shown for the [PrettyCompact](#prettycompact) format: +Rows are not escaped in Pretty\* formats. Example is shown for the [PrettyCompact](#prettycompact) format: -```sql +``` sql SELECT 'String with \'quotes\' and \t character' AS Escaping_test ``` -```text +``` text ┌─Escaping_test────────────────────────┐ -│ String with 'quotes' and character │ +│ String with 'quotes' and character │ └──────────────────────────────────────┘ ``` -To avoid dumping too much data to the terminal, only the first 10,000 rows are printed. 
If the number of rows is greater than or equal to 10,000, the message "Showed first 10 000" is printed. +To avoid dumping too much data to the terminal, only the first 10,000 rows are printed. If the number of rows is greater than or equal to 10,000, the message “Showed first 10 000” is printed. This format is only appropriate for outputting a query result, but not for parsing (retrieving data to insert in a table). -The Pretty format supports outputting total values (when using WITH TOTALS) and extremes (when 'extremes' is set to 1). In these cases, total values and extreme values are output after the main data, in separate tables. Example (shown for the [PrettyCompact](#prettycompact) format): +The Pretty format supports outputting total values (when using WITH TOTALS) and extremes (when ‘extremes’ is set to 1). In these cases, total values and extreme values are output after the main data, in separate tables. Example (shown for the [PrettyCompact](#prettycompact) format): -```sql +``` sql SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT PrettyCompact ``` -```text +``` text ┌──EventDate─┬───────c─┐ │ 2014-03-17 │ 1406958 │ │ 2014-03-18 │ 1383658 │ @@ -695,21 +722,21 @@ Differs from [PrettyCompact](#prettycompact) in that up to 10,000 rows are buffe ## PrettyNoEscapes {#prettynoescapes} -Differs from Pretty in that ANSI-escape sequences aren't used. This is necessary for displaying this format in a browser, as well as for using the 'watch' command-line utility. +Differs from Pretty in that ANSI-escape sequences aren’t used. This is necessary for displaying this format in a browser, as well as for using the ‘watch’ command-line utility. Example: -```bash +``` bash $ watch -n1 "clickhouse-client --query='SELECT event, value FROM system.events FORMAT PrettyCompactNoEscapes'" ``` You can use the HTTP interface for displaying in the browser. -### PrettyCompactNoEscapes +### PrettyCompactNoEscapes {#prettycompactnoescapes} The same as the previous setting. -### PrettySpaceNoEscapes +### PrettySpaceNoEscapes {#prettyspacenoescapes} The same as the previous setting. @@ -720,9 +747,9 @@ Differs from [PrettyCompact](#prettycompact) in that whitespace (space character ## RowBinary {#rowbinary} Formats and parses data by row in binary format. Rows and values are listed consecutively, without separators. -This format is less efficient than the Native format, since it is row-based. +This format is less efficient than the Native format since it is row-based. -Integers use fixed-length little endian representation. For example, UInt64 uses 8 bytes. +Integers use fixed-length little-endian representation. For example, UInt64 uses 8 bytes. DateTime is represented as UInt32 containing the Unix timestamp as the value. Date is represented as a UInt16 object that contains the number of days since 1970-01-01 as the value. String is represented as a varint length (unsigned [LEB128](https://en.wikipedia.org/wiki/LEB128)), followed by the bytes of the string. 
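Because the layout above is fixed, RowBinary dumps round-trip cleanly through `clickhouse-client`. A minimal sketch (the table names are hypothetical; the source and target tables must have identical structure):

``` bash
# Dump a table in RowBinary and load the dump into a table with the same structure
# (hypothetical table names used for illustration).
$ clickhouse-client --query="SELECT * FROM src_table FORMAT RowBinary" > dump.bin
$ cat dump.bin | clickhouse-client --query="INSERT INTO dst_table FORMAT RowBinary"
```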
@@ -736,48 +763,49 @@ For [NULL](../query_language/syntax.md#null-literal) support, an additional byte Similar to [RowBinary](#rowbinary), but with added header: - * [LEB128](https://en.wikipedia.org/wiki/LEB128)-encoded number of columns (N) - * N `String`s specifying column names - * N `String`s specifying column types +- [LEB128](https://en.wikipedia.org/wiki/LEB128)-encoded number of columns (N) +- N `String`s specifying column names +- N `String`s specifying column types ## Values {#data-format-values} -Prints every row in brackets. Rows are separated by commas. There is no comma after the last row. The values inside the brackets are also comma-separated. Numbers are output in decimal format without quotes. Arrays are output in square brackets. Strings, dates, and dates with times are output in quotes. Escaping rules and parsing are similar to the [TabSeparated](#tabseparated) format. During formatting, extra spaces aren't inserted, but during parsing, they are allowed and skipped (except for spaces inside array values, which are not allowed). [NULL](../query_language/syntax.md) is represented as `NULL`. +Prints every row in brackets. Rows are separated by commas. There is no comma after the last row. The values inside the brackets are also comma-separated. Numbers are output in a decimal format without quotes. Arrays are output in square brackets. Strings, dates, and dates with times are output in quotes. Escaping rules and parsing are similar to the [TabSeparated](#tabseparated) format. During formatting, extra spaces aren’t inserted, but during parsing, they are allowed and skipped (except for spaces inside array values, which are not allowed). [NULL](../query_language/syntax.md) is represented as `NULL`. The minimum set of characters that you need to escape when passing data in Values ​​format: single quotes and backslashes. This is the format that is used in `INSERT INTO t VALUES ...`, but you can also use it for formatting query results. -See also: [input_format_values_interpret_expressions](../operations/settings/settings.md#settings-input_format_values_interpret_expressions) and [input_format_values_deduce_templates_of_expressions](../operations/settings/settings.md#settings-input_format_values_deduce_templates_of_expressions) settings. +See also: [input\_format\_values\_interpret\_expressions](../operations/settings/settings.md#settings-input_format_values_interpret_expressions) and [input\_format\_values\_deduce\_templates\_of\_expressions](../operations/settings/settings.md#settings-input_format_values_deduce_templates_of_expressions) settings. ## Vertical {#vertical} -Prints each value on a separate line with the column name specified. This format is convenient for printing just one or a few rows, if each row consists of a large number of columns. +Prints each value on a separate line with the column name specified. This format is convenient for printing just one or a few rows if each row consists of a large number of columns. [NULL](../query_language/syntax.md) is output as `ᴺᵁᴸᴸ`. 
Example: -```sql +``` sql SELECT * FROM t_null FORMAT Vertical ``` -```text +``` text Row 1: ────── x: 1 y: ᴺᵁᴸᴸ ``` + Rows are not escaped in Vertical format: -```sql +``` sql SELECT 'string with \'quotes\' and \t with some special \n characters' AS test FORMAT Vertical ``` -```text +``` text Row 1: ────── -test: string with 'quotes' and with some special +test: string with 'quotes' and with some special characters ``` @@ -791,7 +819,7 @@ Similar to [Vertical](#vertical), but with escaping disabled. This format is onl XML format is suitable only for output, not for parsing. Example: -```xml +``` xml @@ -853,7 +881,7 @@ XML format is suitable only for output, not for parsing. Example: ``` -If the column name does not have an acceptable format, just 'field' is used as the element name. In general, the XML structure follows the JSON structure. +If the column name does not have an acceptable format, just ‘field’ is used as the element name. In general, the XML structure follows the JSON structure. Just as for JSON, invalid UTF-8 sequences are changed to the replacement character � so the output text will consist of valid UTF-8 sequences. In string values, the characters `<` and `&` are escaped as `<` and `&`. @@ -862,24 +890,24 @@ Arrays are output as `HelloWorld...`,an ## CapnProto {#capnproto} -Cap'n Proto is a binary message format similar to Protocol Buffers and Thrift, but not like JSON or MessagePack. +Cap’n Proto is a binary message format similar to Protocol Buffers and Thrift, but not like JSON or MessagePack. -Cap'n Proto messages are strictly typed and not self-describing, meaning they need an external schema description. The schema is applied on the fly and cached for each query. +Cap’n Proto messages are strictly typed and not self-describing, meaning they need an external schema description. The schema is applied on the fly and cached for each query. -```bash +``` bash $ cat capnproto_messages.bin | clickhouse-client --query "INSERT INTO test.hits FORMAT CapnProto SETTINGS format_schema='schema:Message'" ``` Where `schema.capnp` looks like this: -```capnp +``` capnp struct Message { SearchPhrase @0 :Text; c @1 :Uint64; } ``` -Deserialization is effective and usually doesn't increase the system load. +Deserialization is effective and usually doesn’t increase the system load. See also [Format Schema](#formatschema). @@ -892,17 +920,17 @@ ClickHouse supports both `proto2` and `proto3` syntaxes. Repeated/optional/requi Usage examples: -```sql +``` sql SELECT * FROM test.table FORMAT Protobuf SETTINGS format_schema = 'schemafile:MessageType' ``` -```bash +``` bash cat protobuf_messages.bin | clickhouse-client --query "INSERT INTO test.table FORMAT Protobuf SETTINGS format_schema='schemafile:MessageType'" ``` where the file `schemafile.proto` looks like this: -```capnp +``` capnp syntax = "proto3"; message MessageType { @@ -913,13 +941,13 @@ message MessageType { }; ``` -To find the correspondence between table columns and fields of Protocol Buffers' message type ClickHouse compares their names. +To find the correspondence between table columns and fields of Protocol Buffers’ message type ClickHouse compares their names. This comparison is case-insensitive and the characters `_` (underscore) and `.` (dot) are considered as equal. -If types of a column and a field of Protocol Buffers' message are different the necessary conversion is applied. +If types of a column and a field of Protocol Buffers’ message are different the necessary conversion is applied. Nested messages are supported. 
For example, for the field `z` in the following message type -```capnp +``` capnp message MessageType { message XType { message YType { @@ -936,7 +964,7 @@ Nested messages are suitable to input or output a [nested data structures](../da Default values defined in a protobuf schema like this -```capnp +``` capnp syntax = "proto2"; message MessageType { @@ -952,43 +980,41 @@ See also [how to read/write length-delimited protobuf messages in popular langua ## Avro {#data-format-avro} -[Apache Avro](http://avro.apache.org/) is a row-oriented data serialization framework developed within Apache's Hadoop project. +[Apache Avro](http://avro.apache.org/) is a row-oriented data serialization framework developed within Apache’s Hadoop project. ClickHouse Avro format supports reading and writing [Avro data files](http://avro.apache.org/docs/current/spec.html#Object+Container+Files). -### Data Types Matching +### Data Types Matching {#data-types-matching} The table below shows supported data types and how they match ClickHouse [data types](../data_types/index.md) in `INSERT` and `SELECT` queries. -| Avro data type `INSERT` | ClickHouse data type | Avro data type `SELECT` | -| -------------------- | -------------------- | ------------------ | -| `boolean`, `int`, `long`, `float`, `double` | [Int(8\|16\|32\)](../data_types/int_uint.md), [UInt(8\|16\|32)](../data_types/int_uint.md) | `int` | -| `boolean`, `int`, `long`, `float`, `double` | [Int64](../data_types/int_uint.md), [UInt64](../data_types/int_uint.md) | `long` | -| `boolean`, `int`, `long`, `float`, `double` | [Float32](../data_types/float.md) | `float` | -| `boolean`, `int`, `long`, `float`, `double` | [Float64](../data_types/float.md) | `double` | -| `bytes`, `string`, `fixed`, `enum` | [String](../data_types/string.md) | `bytes` | -| `bytes`, `string`, `fixed` | [FixedString(N)](../data_types/fixedstring.md) | `fixed(N)` | -| `enum` | [Enum(8\|16)](../data_types/enum.md) | `enum` | -| `array(T)` | [Array(T)](../data_types/array.md) | `array(T)` | -| `union(null, T)`, `union(T, null)` | [Nullable(T)](../data_types/date.md) | `union(null, T)`| -| `null` | [Nullable(Nothing)](../data_types/special_data_types/nothing.md) | `null` | -| `int (date)` * | [Date](../data_types/date.md) | `int (date)` * | -| `long (timestamp-millis)` * | [DateTime64(3)](../data_types/datetime.md) | `long (timestamp-millis)` * | -| `long (timestamp-micros)` * | [DateTime64(6)](../data_types/datetime.md) | `long (timestamp-micros)` * | +| Avro data type `INSERT` | ClickHouse data type | Avro data type `SELECT` | +|---------------------------------------------|-------------------------------------------------------------------------------------------|------------------------------| +| `boolean`, `int`, `long`, `float`, `double` | [Int(8\|16\|32)](../data_types/int_uint.md), [UInt(8\|16\|32)](../data_types/int_uint.md) | `int` | +| `boolean`, `int`, `long`, `float`, `double` | [Int64](../data_types/int_uint.md), [UInt64](../data_types/int_uint.md) | `long` | +| `boolean`, `int`, `long`, `float`, `double` | [Float32](../data_types/float.md) | `float` | +| `boolean`, `int`, `long`, `float`, `double` | [Float64](../data_types/float.md) | `double` | +| `bytes`, `string`, `fixed`, `enum` | [String](../data_types/string.md) | `bytes` | +| `bytes`, `string`, `fixed` | [FixedString(N)](../data_types/fixedstring.md) | `fixed(N)` | +| `enum` | [Enum(8\|16)](../data_types/enum.md) | `enum` | +| `array(T)` | [Array(T)](../data_types/array.md) | `array(T)` | +| `union(null, T)`, 
`union(T, null)` | [Nullable(T)](../data_types/date.md) | `union(null, T)` | +| `null` | [Nullable(Nothing)](../data_types/special_data_types/nothing.md) | `null` | +| `int (date)` \* | [Date](../data_types/date.md) | `int (date)` \* | +| `long (timestamp-millis)` \* | [DateTime64(3)](../data_types/datetime.md) | `long (timestamp-millis)` \* | +| `long (timestamp-micros)` \* | [DateTime64(6)](../data_types/datetime.md) | `long (timestamp-micros)` \* | \* [Avro logical types](http://avro.apache.org/docs/current/spec.html#Logical+Types) - - Unsupported Avro data types: `record` (non-root), `map` Unsupported Avro logical data types: `uuid`, `time-millis`, `time-micros`, `duration` -### Inserting Data +### Inserting Data {#inserting-data-1} To insert data from an Avro file into ClickHouse table: -```bash +``` bash $ cat file.avro | clickhouse-client --query="INSERT INTO {some_table} FORMAT Avro" ``` @@ -997,22 +1023,22 @@ The root schema of input Avro file must be of `record` type. To find the correspondence between table columns and fields of Avro schema ClickHouse compares their names. This comparison is case-sensitive. Unused fields are skipped. -Data types of a ClickHouse table columns can differ from the corresponding fields of the Avro data inserted. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) the data to corresponding column type. +Data types of ClickHouse table columns can differ from the corresponding fields of the Avro data inserted. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) the data to corresponding column type. -### Selecting Data +### Selecting Data {#selecting-data-1} To select data from ClickHouse table into an Avro file: -```bash +``` bash $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Avro" > file.avro ``` Column names must: -- start with `[A-Za-z_]` -- subsequently contain only `[A-Za-z0-9_]` +- start with `[A-Za-z_]` +- subsequently contain only `[A-Za-z0-9_]` -Output Avro file compression and sync interval can be configured with [output_format_avro_codec](../operations/settings/settings.md#settings-output_format_avro_codec) and [output_format_avro_sync_interval](../operations/settings/settings.md#settings-output_format_avro_sync_interval) respectively. +Output Avro file compression and sync interval can be configured with [output\_format\_avro\_codec](../operations/settings/settings.md#settings-output_format_avro_codec) and [output\_format\_avro\_sync\_interval](../operations/settings/settings.md#settings-output_format_avro_sync_interval) respectively. ## AvroConfluent {#data-format-avro-confluent} @@ -1022,17 +1048,17 @@ Each Avro message embeds a schema id that can be resolved to the actual schema w Schemas are cached once resolved. 
-Schema Registry URL is configured with [format_avro_schema_registry_url](../operations/settings/settings.md#settings-format_avro_schema_registry_url) +Schema Registry URL is configured with [format\_avro\_schema\_registry\_url](../operations/settings/settings.md#settings-format_avro_schema_registry_url) -### Data Types Matching +### Data Types Matching {#data-types-matching-1} Same as [Avro](#data-format-avro) -### Usage +### Usage {#usage} To quickly verify schema resolution you can use [kafkacat](https://github.com/edenhill/kafkacat) with [clickhouse-local](../operations/utils/clickhouse-local.md): -```bash +``` bash $ kafkacat -b kafka-broker -C -t topic1 -o beginning -f '%s' -c 3 | clickhouse-local --input-format AvroConfluent --format_avro_schema_registry_url 'http://schema-registry' -S "field1 Int64, field2 String" -q 'select * from table' 1 a 2 b @@ -1040,7 +1066,8 @@ $ kafkacat -b kafka-broker -C -t topic1 -o beginning -f '%s' -c 3 | clickhouse- ``` To use `AvroConfluent` with [Kafka](../operations/table_engines/kafka.md): -```sql + +``` sql CREATE TABLE topic1_stream ( field1 String, @@ -1059,52 +1086,51 @@ SELECT * FROM topic1_stream; ``` !!! note "Warning" - Setting `format_avro_schema_registry_url` needs to be configured in `users.xml` to maintain it's value after a restart. - + Setting `format_avro_schema_registry_url` needs to be configured in `users.xml` to maintain its value after a restart. ## Parquet {#data-format-parquet} [Apache Parquet](http://parquet.apache.org/) is a columnar storage format widespread in the Hadoop ecosystem. ClickHouse supports read and write operations for this format. -### Data Types Matching +### Data Types Matching {#data-types-matching-2} The table below shows supported data types and how they match ClickHouse [data types](../data_types/index.md) in `INSERT` and `SELECT` queries.
-| Parquet data type (`INSERT`) | ClickHouse data type | Parquet data type (`SELECT`) | -| -------------------- | ------------------ | ---- | -| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) | `UINT8` | -| `INT8` | [Int8](../data_types/int_uint.md) | `INT8` | -| `UINT16` | [UInt16](../data_types/int_uint.md) | `UINT16` | -| `INT16` | [Int16](../data_types/int_uint.md) | `INT16` | -| `UINT32` | [UInt32](../data_types/int_uint.md) | `UINT32` | -| `INT32` | [Int32](../data_types/int_uint.md) | `INT32` | -| `UINT64` | [UInt64](../data_types/int_uint.md) | `UINT64` | -| `INT64` | [Int64](../data_types/int_uint.md) | `INT64` | -| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) | `FLOAT` | -| `DOUBLE` | [Float64](../data_types/float.md) | `DOUBLE` | -| `DATE32` | [Date](../data_types/date.md) | `UINT16` | -| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) | `UINT32` | -| `STRING`, `BINARY` | [String](../data_types/string.md) | `STRING` | -| — | [FixedString](../data_types/fixedstring.md) | `STRING` | -| `DECIMAL` | [Decimal](../data_types/decimal.md) | `DECIMAL` | +| Parquet data type (`INSERT`) | ClickHouse data type | Parquet data type (`SELECT`) | +|------------------------------|---------------------------------------------|------------------------------| +| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) | `UINT8` | +| `INT8` | [Int8](../data_types/int_uint.md) | `INT8` | +| `UINT16` | [UInt16](../data_types/int_uint.md) | `UINT16` | +| `INT16` | [Int16](../data_types/int_uint.md) | `INT16` | +| `UINT32` | [UInt32](../data_types/int_uint.md) | `UINT32` | +| `INT32` | [Int32](../data_types/int_uint.md) | `INT32` | +| `UINT64` | [UInt64](../data_types/int_uint.md) | `UINT64` | +| `INT64` | [Int64](../data_types/int_uint.md) | `INT64` | +| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) | `FLOAT` | +| `DOUBLE` | [Float64](../data_types/float.md) | `DOUBLE` | +| `DATE32` | [Date](../data_types/date.md) | `UINT16` | +| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) | `UINT32` | +| `STRING`, `BINARY` | [String](../data_types/string.md) | `STRING` | +| — | [FixedString](../data_types/fixedstring.md) | `STRING` | +| `DECIMAL` | [Decimal](../data_types/decimal.md) | `DECIMAL` | ClickHouse supports configurable precision of `Decimal` type. The `INSERT` query treats the Parquet `DECIMAL` type as the ClickHouse `Decimal128` type. Unsupported Parquet data types: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. -Data types of a ClickHouse table columns can differ from the corresponding fields of the Parquet data inserted. When inserting data, ClickHouse interprets data types according to the table above and then [cast](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) the data to that data type which is set for the ClickHouse table column. +Data types of ClickHouse table columns can differ from the corresponding fields of the Parquet data inserted. When inserting data, ClickHouse interprets data types according to the table above and then [cast](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) the data to that data type which is set for the ClickHouse table column. 
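For example (a sketch with hypothetical table and file names), a Parquet column stored as `INT64` can be loaded into a `UInt16` column; the conversion from the table above is applied during the `INSERT`:

``` bash
# The Parquet file stores `id` as INT64, but the ClickHouse column is UInt16;
# ClickHouse casts the values on insert (hypothetical names).
$ clickhouse-client --query="CREATE TABLE parquet_cast_demo (id UInt16) ENGINE = Memory"
$ cat data.parquet | clickhouse-client --query="INSERT INTO parquet_cast_demo FORMAT Parquet"
```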
-### Inserting and Selecting Data +### Inserting and Selecting Data {#inserting-and-selecting-data} You can insert Parquet data from a file into ClickHouse table by the following command: -```bash +``` bash $ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet" ``` You can select data from a ClickHouse table and save them into some file in the Parquet format by the following command: -```bash +``` bash $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_file.pq} ``` @@ -1114,38 +1140,38 @@ To exchange data with Hadoop, you can use [HDFS table engine](../operations/tabl [Apache ORC](https://orc.apache.org/) is a columnar storage format widespread in the Hadoop ecosystem. You can only insert data in this format to ClickHouse. -### Data Types Matching +### Data Types Matching {#data-types-matching-3} The table below shows supported data types and how they match ClickHouse [data types](../data_types/index.md) in `INSERT` queries. -| ORC data type (`INSERT`) | ClickHouse data type | -| -------------------- | ------------------ | -| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) | -| `INT8` | [Int8](../data_types/int_uint.md) | -| `UINT16` | [UInt16](../data_types/int_uint.md) | -| `INT16` | [Int16](../data_types/int_uint.md) | -| `UINT32` | [UInt32](../data_types/int_uint.md) | -| `INT32` | [Int32](../data_types/int_uint.md) | -| `UINT64` | [UInt64](../data_types/int_uint.md) | -| `INT64` | [Int64](../data_types/int_uint.md) | -| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) | -| `DOUBLE` | [Float64](../data_types/float.md) | -| `DATE32` | [Date](../data_types/date.md) | -| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) | -| `STRING`, `BINARY` | [String](../data_types/string.md) | -| `DECIMAL` | [Decimal](../data_types/decimal.md) | +| ORC data type (`INSERT`) | ClickHouse data type | +|--------------------------|---------------------------------------| +| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) | +| `INT8` | [Int8](../data_types/int_uint.md) | +| `UINT16` | [UInt16](../data_types/int_uint.md) | +| `INT16` | [Int16](../data_types/int_uint.md) | +| `UINT32` | [UInt32](../data_types/int_uint.md) | +| `INT32` | [Int32](../data_types/int_uint.md) | +| `UINT64` | [UInt64](../data_types/int_uint.md) | +| `INT64` | [Int64](../data_types/int_uint.md) | +| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) | +| `DOUBLE` | [Float64](../data_types/float.md) | +| `DATE32` | [Date](../data_types/date.md) | +| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) | +| `STRING`, `BINARY` | [String](../data_types/string.md) | +| `DECIMAL` | [Decimal](../data_types/decimal.md) | ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type. Unsupported ORC data types: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. -The data types of ClickHouse table columns don't have to match the corresponding ORC data fields. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) the data to the data type set for the ClickHouse table column. +The data types of ClickHouse table columns don’t have to match the corresponding ORC data fields. 
When inserting data, ClickHouse interprets data types according to the table above and then [casts](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) the data to the data type set for the ClickHouse table column. -### Inserting Data +### Inserting Data {#inserting-data-2} You can insert ORC data from a file into ClickHouse table by the following command: -```bash +``` bash $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` @@ -1154,26 +1180,26 @@ To exchange data with Hadoop, you can use [HDFS table engine](../operations/tabl ## Format Schema {#formatschema} The file name containing the format schema is set by the setting `format_schema`. -It's required to set this setting when it is used one of the formats `Cap'n Proto` and `Protobuf`. -The format schema is a combination of a file name and the name of a message type in this file, delimited by colon, -e.g. `schemafile.proto:MessageType`. +It’s required to set this setting when using one of the formats `Cap'n Proto` and `Protobuf`. +The format schema is a combination of a file name and the name of a message type in this file, delimited by a colon, +e.g. `schemafile.proto:MessageType`. If the file has the standard extension for the format (for example, `.proto` for `Protobuf`), -it can be omitted and in this case the format schema looks like `schemafile:MessageType`. +it can be omitted and in this case, the format schema looks like `schemafile:MessageType`. If you input or output data via the [client](../interfaces/cli.md) in the [interactive mode](../interfaces/cli.md#cli_usage), the file name specified in the format schema can contain an absolute path or a path relative to the current directory on the client. If you use the client in the [batch mode](../interfaces/cli.md#cli_usage), the path to the schema must be relative due to security reasons. If you input or output data via the [HTTP interface](../interfaces/http.md) the file name specified in the format schema -should be located in the directory specified in [format_schema_path](../operations/server_settings/settings.md#server_settings-format_schema_path) +should be located in the directory specified in [format\_schema\_path](../operations/server_settings/settings.md#server_settings-format_schema_path) in the server configuration. -[Original article](https://clickhouse.tech/docs/en/interfaces/formats/) - ## Skipping Errors {#skippingerrors} -Some formats such as `CSV`, `TabSeparated`, `TSKV`, `JSONEachRow`, `Template`, `CustomSeparated` and `Protobuf` can skip broken row if parsing error occurred and continue parsing from the beginning of next row. See [input_format_allow_errors_num](../operations/settings/settings.md#settings-input_format_allow_errors_num) and -[input_format_allow_errors_ratio](../operations/settings/settings.md#settings-input_format_allow_errors_ratio) settings. +Some formats such as `CSV`, `TabSeparated`, `TSKV`, `JSONEachRow`, `Template`, `CustomSeparated` and `Protobuf` can skip a broken row if a parsing error occurs and continue parsing from the beginning of the next row. See [input\_format\_allow\_errors\_num](../operations/settings/settings.md#settings-input_format_allow_errors_num) and +[input\_format\_allow\_errors\_ratio](../operations/settings/settings.md#settings-input_format_allow_errors_ratio) settings. Limitations: - - In case of parsing error `JSONEachRow` skips all data until the new line (or EOF), so rows must be delimited by `\n` to count errors correctly.
- - `Template` and `CustomSeparated` use delimiter after the last column and delimiter between rows to find the beginning of next row, so skipping errors works only if at least one of them is not empty. +- In case of a parsing error, `JSONEachRow` skips all data until the new line (or EOF), so rows must be delimited by `\n` to count errors correctly. +- `Template` and `CustomSeparated` use the delimiter after the last column and the delimiter between rows to find the beginning of the next row, so skipping errors works only if at least one of them is not empty. + +[Original article](https://clickhouse.tech/docs/en/interfaces/formats/) diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md index 0ce700bdc54..fbc116c4c97 100644 --- a/docs/en/interfaces/http.md +++ b/docs/en/interfaces/http.md @@ -1,32 +1,33 @@ -# HTTP Interface {#http_interface} +# HTTP Interface {#http-interface} The HTTP interface lets you use ClickHouse on any platform from any programming language. We use it for working from Java and Perl, as well as shell scripts. In other departments, the HTTP interface is used from Perl, Python, and Go. The HTTP interface is more limited than the native interface, but it has better compatibility. By default, clickhouse-server listens for HTTP on port 8123 (this can be changed in the config). -If you make a GET / request without parameters, it returns 200 response code and the string which defined in [http_server_default_response](../operations/server_settings/settings.md#server_settings-http_server_default_response) default value "Ok." (with a line feed at the end) -```bash +If you make a GET / request without parameters, it returns a 200 response code and the string defined in [http\_server\_default\_response](../operations/server_settings/settings.md#server_settings-http_server_default_response); the default value is “Ok.” (with a line feed at the end) + +``` bash $ curl 'http://localhost:8123/' Ok. ``` -Use GET /ping request in health-check scripts. This handler always return "Ok." (with a line feed at the end). Available from version 18.12.13. -```bash +Use the GET /ping request in health-check scripts. This handler always returns “Ok.” (with a line feed at the end). Available from version 18.12.13. + +``` bash $ curl 'http://localhost:8123/ping' Ok. ``` - -Send the request as a URL 'query' parameter, or as a POST. Or send the beginning of the query in the 'query' parameter, and the rest in the POST (we'll explain later why this is necessary). The size of the URL is limited to 16 KB, so keep this in mind when sending large queries. +Send the request as a URL ‘query’ parameter, or as a POST. Or send the beginning of the query in the ‘query’ parameter, and the rest in the POST (we’ll explain later why this is necessary). The size of the URL is limited to 16 KB, so keep this in mind when sending large queries. If successful, you receive the 200 response code and the result in the response body. If an error occurs, you receive the 500 response code and an error description text in the response body. -When using the GET method, 'readonly' is set. In other words, for queries that modify data, you can only use the POST method. You can send the query itself either in the POST body, or in the URL parameter. +When using the GET method, ‘readonly’ is set. In other words, for queries that modify data, you can only use the POST method. You can send the query itself either in the POST body or in the URL parameter.
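For example (a sketch; the statement and table name are only illustrative), a data-modifying statement sent via GET is rejected because the query runs in read-only mode, while the same statement sent as a POST body is executed:

``` bash
# GET requests run in read-only mode, so a modifying query is rejected (hypothetical table name).
$ curl 'http://localhost:8123/?query=DROP%20TABLE%20t'
# The same statement sent in the POST body is allowed.
$ echo 'DROP TABLE t' | curl 'http://localhost:8123/' --data-binary @-
```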
Examples: -```bash +``` bash $ curl 'http://localhost:8123/?query=SELECT%201' 1 @@ -46,9 +47,9 @@ X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","writ ``` As you can see, curl is somewhat inconvenient in that spaces must be URL escaped. -Although wget escapes everything itself, we don't recommend using it because it doesn't work well over HTTP 1.1 when using keep-alive and Transfer-Encoding: chunked. +Although wget escapes everything itself, we don’t recommend using it because it doesn’t work well over HTTP 1.1 when using keep-alive and Transfer-Encoding: chunked. -```bash +``` bash $ echo 'SELECT 1' | curl 'http://localhost:8123/' --data-binary @- 1 @@ -60,19 +61,19 @@ $ echo '1' | curl 'http://localhost:8123/?query=SELECT' --data-binary @- ``` If part of the query is sent in the parameter, and part in the POST, a line feed is inserted between these two data parts. -Example (this won't work): +Example (this won’t work): -```bash +``` bash $ echo 'ECT 1' | curl 'http://localhost:8123/?query=SEL' --data-binary @- Code: 59, e.displayText() = DB::Exception: Syntax error: failed at position 0: SEL ECT 1 , expected One of: SHOW TABLES, SHOW DATABASES, SELECT, INSERT, CREATE, ATTACH, RENAME, DROP, DETACH, USE, SET, OPTIMIZE., e.what() = DB::Exception ``` -By default, data is returned in TabSeparated format (for more information, see the "Formats" section). +By default, data is returned in TabSeparated format (for more information, see the “Formats” section). You use the FORMAT clause of the query to request any other format. -```bash +``` bash $ echo 'SELECT 1 FORMAT Pretty' | curl 'http://localhost:8123/?' --data-binary @- ┏━━━┓ ┃ 1 ┃ @@ -85,37 +86,37 @@ The POST method of transmitting data is necessary for INSERT queries. In this ca Examples: Creating a table: -```bash +``` bash $ echo 'CREATE TABLE t (a UInt8) ENGINE = Memory' | curl 'http://localhost:8123/' --data-binary @- ``` Using the familiar INSERT query for data insertion: -```bash +``` bash $ echo 'INSERT INTO t VALUES (1),(2),(3)' | curl 'http://localhost:8123/' --data-binary @- ``` Data can be sent separately from the query: -```bash +``` bash $ echo '(4),(5),(6)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20VALUES' --data-binary @- ``` -You can specify any data format. The 'Values' format is the same as what is used when writing INSERT INTO t VALUES: +You can specify any data format. The ‘Values’ format is the same as what is used when writing INSERT INTO t VALUES: -```bash +``` bash $ echo '(7),(8),(9)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20Values' --data-binary @- ``` To insert data from a tab-separated dump, specify the corresponding format: -```bash +``` bash $ echo -ne '10\n11\n12\n' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20TabSeparated' --data-binary @- ``` Reading the table contents. Data is output in random order due to parallel query processing: -```bash +``` bash $ curl 'http://localhost:8123/?query=SELECT%20a%20FROM%20t' 7 8 @@ -133,24 +134,24 @@ $ curl 'http://localhost:8123/?query=SELECT%20a%20FROM%20t' Deleting the table. -```bash +``` bash $ echo 'DROP TABLE t' | curl 'http://localhost:8123/' --data-binary @- ``` -For successful requests that don't return a data table, an empty response body is returned. +For successful requests that don’t return a data table, an empty response body is returned. -You can use the internal ClickHouse compression format when transmitting data. 
The compressed data has a non-standard format, and you will need to use the special `clickhouse-compressor` program to work with it (it is installed with the `clickhouse-client` package). To increase the efficiency of data insertion, you can disable server-side checksum verification by using the [http_native_compression_disable_checksumming_on_decompress](../operations/settings/settings.md#settings-http_native_compression_disable_checksumming_on_decompress) setting. +You can use the internal ClickHouse compression format when transmitting data. The compressed data has a non-standard format, and you will need to use the special `clickhouse-compressor` program to work with it (it is installed with the `clickhouse-client` package). To increase the efficiency of data insertion, you can disable server-side checksum verification by using the [http\_native\_compression\_disable\_checksumming\_on\_decompress](../operations/settings/settings.md#settings-http_native_compression_disable_checksumming_on_decompress) setting. If you specified `compress=1` in the URL, the server compresses the data it sends you. If you specified `decompress=1` in the URL, the server decompresses the same data that you pass in the `POST` method. -You can also choose to use [HTTP compression](https://en.wikipedia.org/wiki/HTTP_compression). To send a compressed `POST` request, append the request header `Content-Encoding: compression_method`. In order for ClickHouse to compress the response, you must append `Accept-Encoding: compression_method`. ClickHouse supports `gzip`, `br`, and `deflate` [compression methods](https://en.wikipedia.org/wiki/HTTP_compression#Content-Encoding_tokens). To enable HTTP compression, you must use the ClickHouse [enable_http_compression](../operations/settings/settings.md#settings-enable_http_compression) setting. You can configure the data compression level in the [http_zlib_compression_level](#settings-http_zlib_compression_level) setting for all the compression methods. +You can also choose to use [HTTP compression](https://en.wikipedia.org/wiki/HTTP_compression). To send a compressed `POST` request, append the request header `Content-Encoding: compression_method`. In order for ClickHouse to compress the response, you must append `Accept-Encoding: compression_method`. ClickHouse supports `gzip`, `br`, and `deflate` [compression methods](https://en.wikipedia.org/wiki/HTTP_compression#Content-Encoding_tokens). To enable HTTP compression, you must use the ClickHouse [enable\_http\_compression](../operations/settings/settings.md#settings-enable_http_compression) setting. You can configure the data compression level in the [http\_zlib\_compression\_level](#settings-http_zlib_compression_level) setting for all the compression methods. You can use this to reduce network traffic when transmitting a large amount of data, or for creating dumps that are immediately compressed. Examples of sending data with compression: -```bash +``` bash #Sending data to the server: $ curl -vsS "http://localhost:8123/?enable_http_compression=1" -d 'SELECT number FROM system.numbers LIMIT 10' -H 'Accept-Encoding: gzip' @@ -161,9 +162,9 @@ $ echo "SELECT 1" | gzip -c | curl -sS --data-binary @- -H 'Content-Encoding: gz !!! note "Note" Some HTTP clients might decompress data from the server by default (with `gzip` and `deflate`) and you might get decompressed data even if you use the compression settings correctly. -You can use the 'database' URL parameter to specify the default database. 
+You can use the ‘database’ URL parameter to specify the default database. -```bash +``` bash $ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?database=system' --data-binary @- 0 1 @@ -177,34 +178,40 @@ $ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?data 9 ``` -By default, the database that is registered in the server settings is used as the default database. By default, this is the database called 'default'. Alternatively, you can always specify the database using a dot before the table name. +By default, the database that is registered in the server settings is used as the default database. By default, this is the database called ‘default’. Alternatively, you can always specify the database using a dot before the table name. The username and password can be indicated in one of three ways: -1. Using HTTP Basic Authentication. Example: +1. Using HTTP Basic Authentication. Example: -```bash + + +``` bash $ echo 'SELECT 1' | curl 'http://user:password@localhost:8123/' -d @- ``` -2. In the 'user' and 'password' URL parameters. Example: +1. In the ‘user’ and ‘password’ URL parameters. Example: -```bash + + +``` bash $ echo 'SELECT 1' | curl 'http://localhost:8123/?user=user&password=password' -d @- ``` -3. Using ‘X-ClickHouse-User’ and ‘X-ClickHouse-Key’ headers. Example: +1. Using ‘X-ClickHouse-User’ and ‘X-ClickHouse-Key’ headers. Example: -```bash + + +``` bash $ echo 'SELECT 1' | curl -H 'X-ClickHouse-User: user' -H 'X-ClickHouse-Key: password' 'http://localhost:8123/' -d @- ``` If the user name is not specified, the `default` name is used. If the password is not specified, the empty password is used. -You can also use the URL parameters to specify any settings for processing a single query, or entire profiles of settings. Example:http://localhost:8123/?profile=web&max_rows_to_read=1000000000&query=SELECT+1 +You can also use the URL parameters to specify any settings for processing a single query or entire profiles of settings. Example:http://localhost:8123/?profile=web&max\_rows\_to\_read=1000000000&query=SELECT+1 For more information, see the [Settings](../operations/settings/index.md) section. -```bash +``` bash $ echo 'SELECT number FROM system.numbers LIMIT 10' | curl 'http://localhost:8123/?' --data-binary @- 0 1 @@ -218,13 +225,13 @@ $ echo 'SELECT number FROM system.numbers LIMIT 10' | curl 'http://localhost:812 9 ``` -For information about other parameters, see the section "SET". +For information about other parameters, see the section “SET”. Similarly, you can use ClickHouse sessions in the HTTP protocol. To do this, you need to add the `session_id` GET parameter to the request. You can use any string as the session ID. By default, the session is terminated after 60 seconds of inactivity. To change this timeout, modify the `default_session_timeout` setting in the server configuration, or add the `session_timeout` GET parameter to the request. To check the session status, use the `session_check=1` parameter. Only one query at a time can be executed within a single session. -You can receive information about the progress of a query in `X-ClickHouse-Progress` response headers. To do this, enable [send_progress_in_http_headers](../operations/settings/settings.md#settings-send_progress_in_http_headers). Example of the header sequence: +You can receive information about the progress of a query in `X-ClickHouse-Progress` response headers. 
To do this, enable [send\_progress\_in\_http\_headers](../operations/settings/settings.md#settings-send_progress_in_http_headers). Example of the header sequence: -```text +``` text X-ClickHouse-Progress: {"read_rows":"2752512","read_bytes":"240570816","total_rows_to_read":"8880128"} X-ClickHouse-Progress: {"read_rows":"5439488","read_bytes":"482285394","total_rows_to_read":"8880128"} X-ClickHouse-Progress: {"read_rows":"8783786","read_bytes":"819092887","total_rows_to_read":"8880128"} @@ -232,42 +239,42 @@ X-ClickHouse-Progress: {"read_rows":"8783786","read_bytes":"819092887","total_ro Possible header fields: -- `read_rows` — Number of rows read. -- `read_bytes` — Volume of data read in bytes. -- `total_rows_to_read` — Total number of rows to be read. -- `written_rows` — Number of rows written. -- `written_bytes` — Volume of data written in bytes. +- `read_rows` — Number of rows read. +- `read_bytes` — Volume of data read in bytes. +- `total_rows_to_read` — Total number of rows to be read. +- `written_rows` — Number of rows written. +- `written_bytes` — Volume of data written in bytes. -Running requests don't stop automatically if the HTTP connection is lost. Parsing and data formatting are performed on the server side, and using the network might be ineffective. -The optional 'query_id' parameter can be passed as the query ID (any string). For more information, see the section "Settings, replace_running_query". +Running requests don’t stop automatically if the HTTP connection is lost. Parsing and data formatting are performed on the server-side, and using the network might be ineffective. +The optional ‘query\_id’ parameter can be passed as the query ID (any string). For more information, see the section “Settings, replace\_running\_query”. -The optional 'quota_key' parameter can be passed as the quota key (any string). For more information, see the section "Quotas". +The optional ‘quota\_key’ parameter can be passed as the quota key (any string). For more information, see the section “Quotas”. -The HTTP interface allows passing external data (external temporary tables) for querying. For more information, see the section "External data for query processing". +The HTTP interface allows passing external data (external temporary tables) for querying. For more information, see the section “External data for query processing”. -## Response Buffering +## Response Buffering {#response-buffering} -You can enable response buffering on the server side. The `buffer_size` and `wait_end_of_query` URL parameters are provided for this purpose. +You can enable response buffering on the server-side. The `buffer_size` and `wait_end_of_query` URL parameters are provided for this purpose. -`buffer_size` determines the number of bytes in the result to buffer in the server memory. If the result body is larger than this threshold, the buffer is written to the HTTP channel, and the remaining data is sent directly to the HTTP channel. +`buffer_size` determines the number of bytes in the result to buffer in the server memory. If a result body is larger than this threshold, the buffer is written to the HTTP channel, and the remaining data is sent directly to the HTTP channel. To ensure that the entire response is buffered, set `wait_end_of_query=1`. In this case, the data that is not stored in memory will be buffered in a temporary server file. 
Example: -```bash +``` bash $ curl -sS 'http://localhost:8123/?max_result_bytes=4000000&buffer_size=3000000&wait_end_of_query=1' -d 'SELECT toUInt8(number) FROM system.numbers LIMIT 9000000 FORMAT RowBinary' ``` -Use buffering to avoid situations where a query processing error occurred after the response code and HTTP headers were sent to the client. In this situation, an error message is written at the end of the response body, and on the client side, the error can only be detected at the parsing stage. +Use buffering to avoid situations where a query processing error occurred after the response code and HTTP headers were sent to the client. In this situation, an error message is written at the end of the response body, and on the client-side, the error can only be detected at the parsing stage. ### Queries with Parameters {#cli-queries-with-parameters} You can create a query with parameters and pass values for them from the corresponding HTTP request parameters. For more information, see [Queries with Parameters for CLI](cli.md#cli-queries-with-parameters). -### Example +### Example {#example} -```bash +``` bash $ curl -sS "
?param_id=2¶m_phrase=test" -d "SELECT * FROM table WHERE int_column = {id:UInt8} and string_column = {phrase:String}" ``` diff --git a/docs/en/interfaces/index.md b/docs/en/interfaces/index.md index 9af7c9863dc..b40f60a1a55 100644 --- a/docs/en/interfaces/index.md +++ b/docs/en/interfaces/index.md @@ -2,20 +2,20 @@ ClickHouse provides two network interfaces (both can be optionally wrapped in TLS for additional security): -* [HTTP](http.md), which is documented and easy to use directly. -* [Native TCP](tcp.md), which has less overhead. +- [HTTP](http.md), which is documented and easy to use directly. +- [Native TCP](tcp.md), which has less overhead. In most cases it is recommended to use appropriate tool or library instead of interacting with those directly. Officially supported by Yandex are the following: -* [Command-line client](cli.md) -* [JDBC driver](jdbc.md) -* [ODBC driver](odbc.md) -* [C++ client library](cpp.md) +- [Command-line client](cli.md) +- [JDBC driver](jdbc.md) +- [ODBC driver](odbc.md) +- [C++ client library](cpp.md) There are also a wide range of third-party libraries for working with ClickHouse: -* [Client libraries](third-party/client_libraries.md) -* [Integrations](third-party/integrations.md) -* [Visual interfaces](third-party/gui.md) +- [Client libraries](third-party/client_libraries.md) +- [Integrations](third-party/integrations.md) +- [Visual interfaces](third-party/gui.md) [Original article](https://clickhouse.tech/docs/en/interfaces/) diff --git a/docs/en/interfaces/jdbc.md b/docs/en/interfaces/jdbc.md index 039e3abfa31..af4c2949ee4 100644 --- a/docs/en/interfaces/jdbc.md +++ b/docs/en/interfaces/jdbc.md @@ -1,9 +1,8 @@ -# JDBC Driver - -* **[Official driver](https://github.com/ClickHouse/clickhouse-jdbc)** -* Third-party drivers: - * [ClickHouse-Native-JDBC](https://github.com/housepower/ClickHouse-Native-JDBC) - * [clickhouse4j](https://github.com/blynkkk/clickhouse4j) +# JDBC Driver {#jdbc-driver} +- **[Official driver](https://github.com/ClickHouse/clickhouse-jdbc)** +- Third-party drivers: + - [ClickHouse-Native-JDBC](https://github.com/housepower/ClickHouse-Native-JDBC) + - [clickhouse4j](https://github.com/blynkkk/clickhouse4j) [Original article](https://clickhouse.tech/docs/en/interfaces/jdbc/) diff --git a/docs/en/interfaces/mysql.md b/docs/en/interfaces/mysql.md index 454cdb9160d..088305e156e 100644 --- a/docs/en/interfaces/mysql.md +++ b/docs/en/interfaces/mysql.md @@ -1,17 +1,20 @@ -# MySQL interface {#mysql_interface} +# MySQL interface {#mysql-interface} -ClickHouse supports MySQL wire protocol. It can be enabled by [mysql_port](../operations/server_settings/settings.md#server_settings-mysql_port) setting in configuration file: -```xml +ClickHouse supports MySQL wire protocol. It can be enabled by [mysql\_port](../operations/server_settings/settings.md#server_settings-mysql_port) setting in configuration file: + +``` xml 9004 ``` -Example of connecting using command-line tool mysql: -```bash +Example of connecting using command-line tool `mysql`: + +``` bash $ mysql --protocol tcp -u default -P 9004 ``` -Output if connection succeeded: -```text +Output if a connection succeeded: + +``` text Welcome to the MySQL monitor. Commands end with ; or \g. Your MySQL connection id is 4 Server version: 20.2.1.1-ClickHouse @@ -24,14 +27,16 @@ owners. Type 'help;' or '\h' for help. Type '\c' to clear the current input statement. 
-mysql> +mysql> ``` For compatibility with all MySQL clients, it is recommended to specify user password with [double SHA1](../operations/settings/settings_users.md#password_double_sha1_hex) in configuration file. -If user password is specified using [SHA256](../operations/settings/settings_users.md#password_sha256_hex), some clients won't be able to authenticate (mysqljs and old versions of command-line tool mysql). +If user password is specified using [SHA256](../operations/settings/settings_users.md#password_sha256_hex), some clients won’t be able to authenticate (mysqljs and old versions of command-line tool mysql). Restrictions: -- prepared queries are not supported +- prepared queries are not supported -- some data types are sent as strings +- some data types are sent as strings + +[Original article](https://clickhouse.tech/docs/en/interfaces/mysql/) diff --git a/docs/en/interfaces/odbc.md b/docs/en/interfaces/odbc.md index 9e65027b6a8..f2982abf998 100644 --- a/docs/en/interfaces/odbc.md +++ b/docs/en/interfaces/odbc.md @@ -1,6 +1,5 @@ -# ODBC Driver - -- [Official driver](https://github.com/ClickHouse/clickhouse-odbc). +# ODBC Driver {#odbc-driver} +- [Official driver](https://github.com/ClickHouse/clickhouse-odbc). [Original article](https://clickhouse.tech/docs/en/interfaces/odbc/) diff --git a/docs/en/interfaces/tcp.md b/docs/en/interfaces/tcp.md index 6194809db82..03122beb8c3 100644 --- a/docs/en/interfaces/tcp.md +++ b/docs/en/interfaces/tcp.md @@ -1,5 +1,5 @@ -# Native Interface (TCP) +# Native Interface (TCP) {#native-interface-tcp} -The native protocol is used in the [command-line client](cli.md), for interserver communication during distributed query processing, and also in other C++ programs. Unfortunately, native ClickHouse protocol does not have formal specification yet, but it can be reverse engineered from ClickHouse source code (starting [around here](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/src/Client)) and/or by intercepting and analyzing TCP traffic. +The native protocol is used in the [command-line client](cli.md), for inter-server communication during distributed query processing, and also in other C++ programs. Unfortunately, native ClickHouse protocol does not have formal specification yet, but it can be reverse-engineered from ClickHouse source code (starting [around here](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/src/Client)) and/or by intercepting and analyzing TCP traffic. [Original article](https://clickhouse.tech/docs/en/interfaces/tcp/) diff --git a/docs/en/interfaces/third-party/client_libraries.md b/docs/en/interfaces/third-party/client_libraries.md index adba376413f..7df61ed0adc 100644 --- a/docs/en/interfaces/third-party/client_libraries.md +++ b/docs/en/interfaces/third-party/client_libraries.md @@ -1,51 +1,51 @@ -# Client Libraries from Third-party Developers +# Client Libraries from Third-party Developers {#client-libraries-from-third-party-developers} !!! warning "Disclaimer" - Yandex does **not** maintain the libraries listed below and haven't done any extensive testing to ensure their quality. 
- -- Python - - [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm) - - [clickhouse-driver](https://github.com/mymarilyn/clickhouse-driver) - - [clickhouse-client](https://github.com/yurial/clickhouse-client) - - [aiochclient](https://github.com/maximdanilchenko/aiochclient) -- PHP - - [SeasClick](https://github.com/SeasX/SeasClick) - - [phpClickHouse](https://github.com/smi2/phpClickHouse) - - [clickhouse-php-client](https://github.com/8bitov/clickhouse-php-client) - - [clickhouse-client](https://github.com/bozerkins/clickhouse-client) - - [PhpClickHouseClient](https://github.com/SevaCode/PhpClickHouseClient) -- Go - - [clickhouse](https://github.com/kshvakov/clickhouse/) - - [go-clickhouse](https://github.com/roistat/go-clickhouse) - - [mailrugo-clickhouse](https://github.com/mailru/go-clickhouse) - - [golang-clickhouse](https://github.com/leprosus/golang-clickhouse) -- NodeJs - - [clickhouse (NodeJs)](https://github.com/TimonKK/clickhouse) - - [node-clickhouse](https://github.com/apla/node-clickhouse) -- Perl - - [perl-DBD-ClickHouse](https://github.com/elcamlost/perl-DBD-ClickHouse) - - [HTTP-ClickHouse](https://metacpan.org/release/HTTP-ClickHouse) - - [AnyEvent-ClickHouse](https://metacpan.org/release/AnyEvent-ClickHouse) -- Ruby - - [ClickHouse (Ruby)](https://github.com/shlima/click_house) -- R - - [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r) - - [RClickhouse](https://github.com/IMSMWU/RClickhouse) -- Java - - [clickhouse-client-java](https://github.com/VirtusAI/clickhouse-client-java) - - [clickhouse-client](https://github.com/Ecwid/clickhouse-client) -- Scala - - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) -- Kotlin - - [AORM](https://github.com/TanVD/AORM) -- C# - - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) - - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) - - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) -- Elixir - - [clickhousex](https://github.com/appodeal/clickhousex/) -- Nim - - [nim-clickhouse](https://github.com/leonardoce/nim-clickhouse) + Yandex does **not** maintain the libraries listed below and haven’t done any extensive testing to ensure their quality. 
+- Python + - [infi.clickhouse\_orm](https://github.com/Infinidat/infi.clickhouse_orm) + - [clickhouse-driver](https://github.com/mymarilyn/clickhouse-driver) + - [clickhouse-client](https://github.com/yurial/clickhouse-client) + - [aiochclient](https://github.com/maximdanilchenko/aiochclient) +- PHP + - [smi2/phpclickhouse](https://packagist.org/packages/smi2/phpClickHouse) + - [8bitov/clickhouse-php-client](https://packagist.org/packages/8bitov/clickhouse-php-client) + - [bozerkins/clickhouse-client](https://packagist.org/packages/bozerkins/clickhouse-client) + - [simpod/clickhouse-client](https://packagist.org/packages/simpod/clickhouse-client) + - [seva-code/php-click-house-client](https://packagist.org/packages/seva-code/php-click-house-client) + - [SeasClick C++ client](https://github.com/SeasX/SeasClick) +- Go + - [clickhouse](https://github.com/kshvakov/clickhouse/) + - [go-clickhouse](https://github.com/roistat/go-clickhouse) + - [mailrugo-clickhouse](https://github.com/mailru/go-clickhouse) + - [golang-clickhouse](https://github.com/leprosus/golang-clickhouse) +- NodeJs + - [clickhouse (NodeJs)](https://github.com/TimonKK/clickhouse) + - [node-clickhouse](https://github.com/apla/node-clickhouse) +- Perl + - [perl-DBD-ClickHouse](https://github.com/elcamlost/perl-DBD-ClickHouse) + - [HTTP-ClickHouse](https://metacpan.org/release/HTTP-ClickHouse) + - [AnyEvent-ClickHouse](https://metacpan.org/release/AnyEvent-ClickHouse) +- Ruby + - [ClickHouse (Ruby)](https://github.com/shlima/click_house) +- R + - [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r) + - [RClickhouse](https://github.com/IMSMWU/RClickhouse) +- Java + - [clickhouse-client-java](https://github.com/VirtusAI/clickhouse-client-java) + - [clickhouse-client](https://github.com/Ecwid/clickhouse-client) +- Scala + - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) +- Kotlin + - [AORM](https://github.com/TanVD/AORM) +- C\# + - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) + - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) + - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) +- Elixir + - [clickhousex](https://github.com/appodeal/clickhousex/) +- Nim + - [nim-clickhouse](https://github.com/leonardoce/nim-clickhouse) [Original article](https://clickhouse.tech/docs/en/interfaces/third-party/client_libraries/) diff --git a/docs/en/interfaces/third-party/gui.md b/docs/en/interfaces/third-party/gui.md index a01f524f3c7..98f4bf09140 100644 --- a/docs/en/interfaces/third-party/gui.md +++ b/docs/en/interfaces/third-party/gui.md @@ -1,54 +1,54 @@ -# Visual Interfaces from Third-party Developers +# Visual Interfaces from Third-party Developers {#visual-interfaces-from-third-party-developers} -## Open-Source +## Open-Source {#open-source} -### Tabix +### Tabix {#tabix} Web interface for ClickHouse in the [Tabix](https://github.com/tabixio/tabix) project. Features: -- Works with ClickHouse directly from the browser, without the need to install additional software. -- Query editor with syntax highlighting. -- Auto-completion of commands. -- Tools for graphical analysis of query execution. -- Color scheme options. +- Works with ClickHouse directly from the browser, without the need to install additional software. +- Query editor with syntax highlighting. +- Auto-completion of commands. +- Tools for graphical analysis of query execution. +- Colour scheme options. [Tabix documentation](https://tabix.io/doc/). 
-### HouseOps +### HouseOps {#houseops} [HouseOps](https://github.com/HouseOps/HouseOps) is a UI/IDE for OSX, Linux and Windows. Features: -- Query builder with syntax highlighting. View the response in a table or JSON view. -- Export query results as CSV or JSON. -- List of processes with descriptions. Write mode. Ability to stop (`KILL`) a process. -- Database graph. Shows all tables and their columns with additional information. -- Quick view of the column size. -- Server configuration. +- Query builder with syntax highlighting. View the response in a table or JSON view. +- Export query results as CSV or JSON. +- List of processes with descriptions. Write mode. Ability to stop (`KILL`) a process. +- Database graph. Shows all tables and their columns with additional information. +- A quick view of the column size. +- Server configuration. The following features are planned for development: -- Database management. -- User management. -- Real-time data analysis. -- Cluster monitoring. -- Cluster management. -- Monitoring replicated and Kafka tables. +- Database management. +- User management. +- Real-time data analysis. +- Cluster monitoring. +- Cluster management. +- Monitoring replicated and Kafka tables. -### LightHouse +### LightHouse {#lighthouse} [LightHouse](https://github.com/VKCOM/lighthouse) is a lightweight web interface for ClickHouse. Features: -- Table list with filtering and metadata. -- Table preview with filtering and sorting. -- Read-only queries execution. +- Table list with filtering and metadata. +- Table preview with filtering and sorting. +- Read-only queries execution. -### Redash +### Redash {#redash} [Redash](https://github.com/getredash/redash) is a platform for data visualization. @@ -56,50 +56,50 @@ Supports for multiple data sources including ClickHouse, Redash can join results Features: -- Powerful editor of queries. -- Database explorer. -- Visualization tools, that allow you to represent data in different forms. +- Powerful editor of queries. +- Database explorer. +- Visualization tools, that allow you to represent data in different forms. -### DBeaver +### DBeaver {#dbeaver} [DBeaver](https://dbeaver.io/) - universal desktop database client with ClickHouse support. Features: -- Query development with syntax highlight and autocompletion. -- Table list with filters and metadata search. -- Table data preview. -- Full text search. +- Query development with syntax highlight and autocompletion. +- Table list with filters and metadata search. +- Table data preview. +- Full-text search. -### clickhouse-cli +### clickhouse-cli {#clickhouse-cli} -[clickhouse-cli](https://github.com/hatarist/clickhouse-cli) is an alternative command line client for ClickHouse, written in Python 3. +[clickhouse-cli](https://github.com/hatarist/clickhouse-cli) is an alternative command-line client for ClickHouse, written in Python 3. Features: -- Autocompletion. -- Syntax highlighting for the queries and data output. -- Pager support for the data output. -- Custom PostgreSQL-like commands. +- Autocompletion. +- Syntax highlighting for the queries and data output. +- Pager support for the data output. +- Custom PostgreSQL-like commands. -### clickhouse-flamegraph +### clickhouse-flamegraph {#clickhouse-flamegraph} [clickhouse-flamegraph](https://github.com/Slach/clickhouse-flamegraph) is a specialized tool to visualize the `system.trace_log` as [flamegraph](http://www.brendangregg.com/flamegraphs.html). 
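clickhouse-flamegraph builds its graphs from the sampling-profiler rows in `system.trace_log`. As a rough sketch of what that data looks like, the query below counts profiler samples per query; it assumes a reasonably recent server with the query profiler enabled (for example, `query_profiler_real_time_period_ns` set to a non-zero value) and reuses the `clickhouse-driver` client shown earlier.

```python
from clickhouse_driver import Client

client = Client(host='localhost')

# Count profiler samples per query for today. Column names assume a
# reasonably recent server; the table stays empty unless the sampling
# profiler is enabled (query_profiler_real_time_period_ns > 0).
rows = client.execute("""
    SELECT query_id, count() AS samples
    FROM system.trace_log
    WHERE event_date = today()
    GROUP BY query_id
    ORDER BY samples DESC
    LIMIT 10
""")
for query_id, samples in rows:
    print(f'{samples:8d}  {query_id}')
```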
-## Commercial +## Commercial {#commercial} -### DataGrip +### DataGrip {#datagrip} -[DataGrip](https://www.jetbrains.com/datagrip/) is a database IDE from JetBrains with dedicated support for ClickHouse. It is also embedded into other IntelliJ-based tools: PyCharm, IntelliJ IDEA, GoLand, PhpStorm and others. +[DataGrip](https://www.jetbrains.com/datagrip/) is a database IDE from JetBrains with dedicated support for ClickHouse. It is also embedded in other IntelliJ-based tools: PyCharm, IntelliJ IDEA, GoLand, PhpStorm and others. Features: -- Very fast code completion. -- ClickHouse syntax highlighting. -- Support for features specific to ClickHouse, for example nested columns, table engines. -- Data Editor. -- Refactorings. -- Search and Navigation. +- Very fast code completion. +- ClickHouse syntax highlighting. +- Support for features specific to ClickHouse, for example, nested columns, table engines. +- Data Editor. +- Refactorings. +- Search and Navigation. ### Yandex DataLens {#yandex-datalens} @@ -107,38 +107,38 @@ Features: Features: -- Wide range of available visualizations, from simple bar charts to complex dashboards. -- Dashboards could be made publicly available. -- Support for multiple data sources including ClickHouse. -- Storage for materialized data based on ClickHouse. +- Wide range of available visualizations, from simple bar charts to complex dashboards. +- Dashboards could be made publicly available. +- Support for multiple data sources including ClickHouse. +- Storage for materialized data based on ClickHouse. DataLens is [available for free](https://cloud.yandex.com/docs/datalens/pricing) for low-load projects, even for commercial use. -- [DataLens documentation](https://cloud.yandex.com/docs/datalens/). -- [Tutorial](https://cloud.yandex.com/docs/datalens/solutions/data-from-db-visualization) on visualizing data from a ClickHouse database. +- [DataLens documentation](https://cloud.yandex.com/docs/datalens/). +- [Tutorial](https://cloud.yandex.com/docs/solutions/datalens/data-from-ch-visualization) on visualizing data from a ClickHouse database. -### Holistics Software +### Holistics Software {#holistics-software} -[Holistics](https://www.holistics.io/) is a full-stack data platform and business intelligence tool. +[Holistics](https://www.holistics.io/) is a full-stack data platform and business intelligence tool. Features: -- Automated email, Slack and Google Sheet schedules of reports. -- SQL editor with visualizations, version control, auto-completion, reusable query components and dynamic filters. -- Embedded analytics of reports and dashboards via iframe. -- Data preparation and ETL capabilities. -- SQL data modeling support for relational mapping of data. +- Automated email, Slack and Google Sheet schedules of reports. +- SQL editor with visualizations, version control, auto-completion, reusable query components and dynamic filters. +- Embedded analytics of reports and dashboards via iframe. +- Data preparation and ETL capabilities. +- SQL data modelling support for relational mapping of data. + +### Looker {#looker} -### Looker [Looker](https://looker.com) is a data platform and business intelligence tool with support for 50+ database dialects including ClickHouse. Looker is available as a SaaS platform and self-hosted. Users can use Looker via the browser to explore data, build visualizations and dashboards, schedule reports, and share their insights with colleagues. 
Looker provides a rich set of tools to embed these features in other applications, and an API to integrate data with other applications. Features: -- Easy and agile development using LookML, a language which supports currated -[Data Modeling](https://looker.com/platform/data-modeling) to support report writers and end users. -- Powerful workflow integration via Looker's [Data Actions](https://looker.com/platform/actions). - +- Easy and agile development using LookML, a language which supports curated + [Data Modeling](https://looker.com/platform/data-modeling) to support report writers and end-users. +- Powerful workflow integration via Looker’s [Data Actions](https://looker.com/platform/actions). [How to configure ClickHouse in Looker.](https://docs.looker.com/setup-and-management/database-config/clickhouse) diff --git a/docs/en/interfaces/third-party/integrations.md b/docs/en/interfaces/third-party/integrations.md index 62ca059fac2..ffdd2e7ce7d 100644 --- a/docs/en/interfaces/third-party/integrations.md +++ b/docs/en/interfaces/third-party/integrations.md @@ -1,89 +1,88 @@ -# Integration Libraries from Third-party Developers +# Integration Libraries from Third-party Developers {#integration-libraries-from-third-party-developers} !!! warning "Disclaimer" - Yandex does **not** maintain the tools and libraries listed below and haven't done any extensive testing to ensure their quality. + Yandex does **not** maintain the tools and libraries listed below and haven’t done any extensive testing to ensure their quality. -## Infrastructure Products +## Infrastructure Products {#infrastructure-products} -- Relational database management systems - - [MySQL](https://www.mysql.com) - - [ProxySQL](https://github.com/sysown/proxysql/wiki/ClickHouse-Support) - - [clickhouse-mysql-data-reader](https://github.com/Altinity/clickhouse-mysql-data-reader) - - [horgh-replicator](https://github.com/larsnovikov/horgh-replicator) - - [PostgreSQL](https://www.postgresql.org) - - [clickhousedb_fdw](https://github.com/Percona-Lab/clickhousedb_fdw) - - [infi.clickhouse_fdw](https://github.com/Infinidat/infi.clickhouse_fdw) (uses [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm)) - - [pg2ch](https://github.com/mkabilov/pg2ch) - - [clickhouse_fdw](https://github.com/adjust/clickhouse_fdw) - - [MSSQL](https://en.wikipedia.org/wiki/Microsoft_SQL_Server) - - [ClickHouseMigrator](https://github.com/zlzforever/ClickHouseMigrator) -- Message queues - - [Kafka](https://kafka.apache.org) - - [clickhouse_sinker](https://github.com/housepower/clickhouse_sinker) (uses [Go client](https://github.com/kshvakov/clickhouse/)) -- Object storages - - [S3](https://en.wikipedia.org/wiki/Amazon_S3) - - [clickhouse-backup](https://github.com/AlexAkulov/clickhouse-backup) -- Container orchestration - - [Kubernetes](https://kubernetes.io) - - [clickhouse-operator](https://github.com/Altinity/clickhouse-operator) -- Configuration management - - [puppet](https://puppet.com) - - [innogames/clickhouse](https://forge.puppet.com/innogames/clickhouse) - - [mfedotov/clickhouse](https://forge.puppet.com/mfedotov/clickhouse) -- Monitoring - - [Graphite](https://graphiteapp.org) - - [graphouse](https://github.com/yandex/graphouse) - - [carbon-clickhouse](https://github.com/lomik/carbon-clickhouse) + - - [graphite-clickhouse](https://github.com/lomik/graphite-clickhouse) - - [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer) - optimizes staled partitions in 
[\*GraphiteMergeTree](../../operations/table_engines/graphitemergetree.md#graphitemergetree) if rules from [rollup configuration](../../operations/table_engines/graphitemergetree.md#rollup-configuration) could be applied - - [Grafana](https://grafana.com/) - - [clickhouse-grafana](https://github.com/Vertamedia/clickhouse-grafana) - - [Prometheus](https://prometheus.io/) - - [clickhouse_exporter](https://github.com/f1yegor/clickhouse_exporter) - - [PromHouse](https://github.com/Percona-Lab/PromHouse) - - [clickhouse_exporter](https://github.com/hot-wifi/clickhouse_exporter) (uses [Go client](https://github.com/kshvakov/clickhouse/)) - - [Nagios](https://www.nagios.org/) - - [check_clickhouse](https://github.com/exogroup/check_clickhouse/) - - [Zabbix](https://www.zabbix.com) - - [clickhouse-zabbix-template](https://github.com/Altinity/clickhouse-zabbix-template) - - [Sematext](https://sematext.com/) - - [clickhouse integration](https://github.com/sematext/sematext-agent-integrations/tree/master/clickhouse) -- Logging - - [rsyslog](https://www.rsyslog.com/) - - [omclickhouse](https://www.rsyslog.com/doc/master/configuration/modules/omclickhouse.html) - - [fluentd](https://www.fluentd.org) - - [loghouse](https://github.com/flant/loghouse) (for [Kubernetes](https://kubernetes.io)) - - [logagent](https://www.sematext.com/logagent) - - [logagent output-plugin-clickhouse](https://sematext.com/docs/logagent/output-plugin-clickhouse/) -- Geo - - [MaxMind](https://dev.maxmind.com/geoip/) - - [clickhouse-maxmind-geoip](https://github.com/AlexeyKupershtokh/clickhouse-maxmind-geoip) +- Relational database management systems + - [MySQL](https://www.mysql.com) + - [ProxySQL](https://github.com/sysown/proxysql/wiki/ClickHouse-Support) + - [clickhouse-mysql-data-reader](https://github.com/Altinity/clickhouse-mysql-data-reader) + - [horgh-replicator](https://github.com/larsnovikov/horgh-replicator) + - [PostgreSQL](https://www.postgresql.org) + - [clickhousedb\_fdw](https://github.com/Percona-Lab/clickhousedb_fdw) + - [infi.clickhouse\_fdw](https://github.com/Infinidat/infi.clickhouse_fdw) (uses [infi.clickhouse\_orm](https://github.com/Infinidat/infi.clickhouse_orm)) + - [pg2ch](https://github.com/mkabilov/pg2ch) + - [clickhouse\_fdw](https://github.com/adjust/clickhouse_fdw) + - [MSSQL](https://en.wikipedia.org/wiki/Microsoft_SQL_Server) + - [ClickHouseMigrator](https://github.com/zlzforever/ClickHouseMigrator) +- Message queues + - [Kafka](https://kafka.apache.org) + - [clickhouse\_sinker](https://github.com/housepower/clickhouse_sinker) (uses [Go client](https://github.com/kshvakov/clickhouse/)) +- Object storages + - [S3](https://en.wikipedia.org/wiki/Amazon_S3) + - [clickhouse-backup](https://github.com/AlexAkulov/clickhouse-backup) +- Container orchestration + - [Kubernetes](https://kubernetes.io) + - [clickhouse-operator](https://github.com/Altinity/clickhouse-operator) +- Configuration management + - [puppet](https://puppet.com) + - [innogames/clickhouse](https://forge.puppet.com/innogames/clickhouse) + - [mfedotov/clickhouse](https://forge.puppet.com/mfedotov/clickhouse) +- Monitoring + - [Graphite](https://graphiteapp.org) + - [graphouse](https://github.com/yandex/graphouse) + - [carbon-clickhouse](https://github.com/lomik/carbon-clickhouse) + + - [graphite-clickhouse](https://github.com/lomik/graphite-clickhouse) + - [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer) - optimizes staled partitions in 
[\*GraphiteMergeTree](../../operations/table_engines/graphitemergetree.md#graphitemergetree) if rules from [rollup configuration](../../operations/table_engines/graphitemergetree.md#rollup-configuration) could be applied + - [Grafana](https://grafana.com/) + - [clickhouse-grafana](https://github.com/Vertamedia/clickhouse-grafana) + - [Prometheus](https://prometheus.io/) + - [clickhouse\_exporter](https://github.com/f1yegor/clickhouse_exporter) + - [PromHouse](https://github.com/Percona-Lab/PromHouse) + - [clickhouse\_exporter](https://github.com/hot-wifi/clickhouse_exporter) (uses [Go client](https://github.com/kshvakov/clickhouse/)) + - [Nagios](https://www.nagios.org/) + - [check\_clickhouse](https://github.com/exogroup/check_clickhouse/) + - [Zabbix](https://www.zabbix.com) + - [clickhouse-zabbix-template](https://github.com/Altinity/clickhouse-zabbix-template) + - [Sematext](https://sematext.com/) + - [clickhouse integration](https://github.com/sematext/sematext-agent-integrations/tree/master/clickhouse) +- Logging + - [rsyslog](https://www.rsyslog.com/) + - [omclickhouse](https://www.rsyslog.com/doc/master/configuration/modules/omclickhouse.html) + - [fluentd](https://www.fluentd.org) + - [loghouse](https://github.com/flant/loghouse) (for [Kubernetes](https://kubernetes.io)) + - [logagent](https://www.sematext.com/logagent) + - [logagent output-plugin-clickhouse](https://sematext.com/docs/logagent/output-plugin-clickhouse/) +- Geo + - [MaxMind](https://dev.maxmind.com/geoip/) + - [clickhouse-maxmind-geoip](https://github.com/AlexeyKupershtokh/clickhouse-maxmind-geoip) -## Programming Language Ecosystems - -- Python - - [SQLAlchemy](https://www.sqlalchemy.org) - - [sqlalchemy-clickhouse](https://github.com/cloudflare/sqlalchemy-clickhouse) (uses [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm)) - - [pandas](https://pandas.pydata.org) - - [pandahouse](https://github.com/kszucs/pandahouse) -- R - - [dplyr](https://db.rstudio.com/dplyr/) - - [RClickhouse](https://github.com/IMSMWU/RClickhouse) (uses [clickhouse-cpp](https://github.com/artpaul/clickhouse-cpp)) -- Java - - [Hadoop](http://hadoop.apache.org) - - [clickhouse-hdfs-loader](https://github.com/jaykelin/clickhouse-hdfs-loader) (uses [JDBC](../../query_language/table_functions/jdbc.md)) -- Scala - - [Akka](https://akka.io) - - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) -- C# - - [ADO.NET](https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/ado-net-overview) - - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) - - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) - - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) - - [ClickHouse.Net.Migrations](https://github.com/ilyabreev/ClickHouse.Net.Migrations) -- Elixir - - [Ecto](https://github.com/elixir-ecto/ecto) - - [clickhouse_ecto](https://github.com/appodeal/clickhouse_ecto) +## Programming Language Ecosystems {#programming-language-ecosystems} +- Python + - [SQLAlchemy](https://www.sqlalchemy.org) + - [sqlalchemy-clickhouse](https://github.com/cloudflare/sqlalchemy-clickhouse) (uses [infi.clickhouse\_orm](https://github.com/Infinidat/infi.clickhouse_orm)) + - [pandas](https://pandas.pydata.org) + - [pandahouse](https://github.com/kszucs/pandahouse) +- R + - [dplyr](https://db.rstudio.com/dplyr/) + - [RClickhouse](https://github.com/IMSMWU/RClickhouse) (uses [clickhouse-cpp](https://github.com/artpaul/clickhouse-cpp)) +- Java + - [Hadoop](http://hadoop.apache.org) + - 
[clickhouse-hdfs-loader](https://github.com/jaykelin/clickhouse-hdfs-loader) (uses [JDBC](../../query_language/table_functions/jdbc.md)) +- Scala + - [Akka](https://akka.io) + - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) +- C\# + - [ADO.NET](https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/ado-net-overview) + - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) + - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) + - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) + - [ClickHouse.Net.Migrations](https://github.com/ilyabreev/ClickHouse.Net.Migrations) +- Elixir + - [Ecto](https://github.com/elixir-ecto/ecto) + - [clickhouse\_ecto](https://github.com/appodeal/clickhouse_ecto) [Original article](https://clickhouse.tech/docs/en/interfaces/third-party/integrations/) diff --git a/docs/en/interfaces/third-party/proxy.md b/docs/en/interfaces/third-party/proxy.md index 7bce14748c8..443a229122a 100644 --- a/docs/en/interfaces/third-party/proxy.md +++ b/docs/en/interfaces/third-party/proxy.md @@ -1,38 +1,38 @@ -# Proxy Servers from Third-party Developers +# Proxy Servers from Third-party Developers {#proxy-servers-from-third-party-developers} -## chproxy +## chproxy {#chproxy} -[chproxy](https://github.com/Vertamedia/chproxy), is an http proxy and load balancer for ClickHouse database. +[chproxy](https://github.com/Vertamedia/chproxy) is an HTTP proxy and load balancer for the ClickHouse database. Features: -* Per-user routing and response caching. -* Flexible limits. -* Automatic SSL cerificate renewal. +- Per-user routing and response caching. +- Flexible limits. +- Automatic SSL certificate renewal. Implemented in Go. -## KittenHouse +## KittenHouse {#kittenhouse} -[KittenHouse](https://github.com/VKCOM/kittenhouse) is designed to be a local proxy between ClickHouse and application server in case it's impossible or inconvenient to buffer INSERT data on your application side. +[KittenHouse](https://github.com/VKCOM/kittenhouse) is designed to be a local proxy between ClickHouse and an application server in case it’s impossible or inconvenient to buffer INSERT data on your application side. Features: -* In-memory and on-disk data buffering. -* Per-table routing. -* Load-balancing and health checking. +- In-memory and on-disk data buffering. +- Per-table routing. +- Load-balancing and health checking. Implemented in Go. -## ClickHouse-Bulk +## ClickHouse-Bulk {#clickhouse-bulk} [ClickHouse-Bulk](https://github.com/nikepan/clickhouse-bulk) is a simple ClickHouse insert collector. Features: -* Group requests and send by threshold or interval. -* Multiple remote servers. -* Basic authentication. +- Groups requests and sends them by threshold or interval. +- Multiple remote servers. +- Basic authentication. Implemented in Go. diff --git a/docs/en/introduction/adopters.md b/docs/en/introduction/adopters.md index 053dbef85b9..d0066388281 100644 --- a/docs/en/introduction/adopters.md +++ b/docs/en/introduction/adopters.md @@ -1,76 +1,75 @@ -# ClickHouse Adopters +# ClickHouse Adopters {#clickhouse-adopters} !!! warning "Disclaimer" - The following list of companies using ClickHouse and their success stories is assembled from public sources, thus might differ from current reality.
We'd really appreciate if you share the story of adopting ClickHouse in your company and [add it to the list](https://github.com/ClickHouse/ClickHouse/edit/master/docs/en/introduction/adopters.md), but please make sure you won't have any NDA issues by doing so. Providing updates with publications by other companies is also useful. - -| Company | Industry | Usecase | Cluster Size | (Un)Compressed Data Size* | Reference | -| --- | --- | --- | --- | --- | --- | -| [2gis](https://2gis.ru) | Maps | Monitoring | — | — | [Talk in Russian, July 2019](https://youtu.be/58sPkXfq6nw) | -| [Aloha Browser](https://alohabrowser.com/) | Mobile App | Browser backend | — | — | [Slides in Russian, May 2019](https://github.com/yandex/clickhouse-presentations/blob/master/meetup22/aloha.pdf) | -| [Amadeus](https://amadeus.com/) | Travel | Analytics | — | — | [Press Release, April 2018](https://www.altinity.com/blog/2018/4/5/amadeus-technologies-launches-investment-and-insights-tool-based-on-machine-learning-and-strategy-algorithms) | -| [Appsflyer](https://www.appsflyer.com) | Mobile analytics | Main product | — | — | [Talk in Russian, July 2019](https://www.youtube.com/watch?v=M3wbRlcpBbY) | -| [ArenaData](https://arenadata.tech/) | Data Platform | Main product | — | — | [Slides in Russian, December 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup38/indexes.pdf) | -| [Badoo](https://badoo.com) | Dating | Timeseries | — | — | [Slides in Russian, December 2019](https://presentations.clickhouse.tech/meetup38/forecast.pdf) | -| [Benocs](https://www.benocs.com/) | Network Telemetry and Analytics | Main Product | — | — | [Slides in English, October 2017](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup9/lpm.pdf) | -| [Bloomberg](https://www.bloomberg.com/) | Finance, Media | Monitoring | 102 servers | — | [Slides, May 2018](https://www.slideshare.net/Altinity/http-analytics-for-6m-requests-per-second-using-clickhouse-by-alexander-bocharov) | -| [Bloxy](https://bloxy.info) | Blockchain | Analytics | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/4_bloxy.pptx) | -| `Dataliance/UltraPower` | Telecom | Analytics | — | — | [Slides in Chinese, January 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup12/telecom.pdf) | -| [CARTO](https://carto.com/) | Business Intelligence | Geo analytics | — | — | [Geospatial processing with Clickhouse](https://carto.com/blog/geospatial-processing-with-clickhouse/) | -| [CERN](http://public.web.cern.ch/public/) | Research | Experiment | — | — | [Press release, April 2012](https://www.yandex.com/company/press_center/press_releases/2012/2012-04-10/) | -| [Cisco](http://cisco.com/) | Networking | Traffic analysis | — | — | [Lightning talk, October 2019](https://youtu.be/-hI1vDR2oPY?t=5057) | -| [Citadel Securities](https://www.citadelsecurities.com/) | Finance | — | — | — | [Contribution, March 2019](https://github.com/ClickHouse/ClickHouse/pull/4774) | -| [Citymobil](https://city-mobil.ru) | Taxi | Analytics | — | — | [Blog Post in Russian, March 2020](https://habr.com/en/company/citymobil/blog/490660/) | -| [ContentSquare](https://contentsquare.com) | Web analytics | Main product | — | — | [Blog post in French, November 2018](http://souslecapot.net/2018/11/21/patrick-chatain-vp-engineering-chez-contentsquare-penser-davantage-amelioration-continue-que-revolution-constante/) | -| [Cloudflare](https://cloudflare.com) | CDN | Traffic 
analysis | 36 servers | — | [Blog post, May 2017](https://blog.cloudflare.com/how-cloudflare-analyzes-1m-dns-queries-per-second/), [Blog post, March 2018](https://blog.cloudflare.com/http-analytics-for-6m-requests-per-second-using-clickhouse/) | -| [Corunet](https://coru.net/) | Analytics | Main product | — | — | [Slides in English, April 2019 ](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup21/predictive_models.pdf) | -| [CraiditX 氪信](https://creditx.com) | Finance AI | Analysis | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/udf.pptx) | -| [Criteo/Storetail](https://www.criteo.com/) | Retail | Main product | — | — | [Slides in English, October 2018 ](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup18/3_storetail.pptx) | -| [Deutsche Bank](https://db.com) | Finance | BI Analytics | — | — | [Slides in English, October 2019](https://bigdatadays.ru/wp-content/uploads/2019/10/D2-H3-3_Yakunin-Goihburg.pdf) | -| [Diva-e](https://www.diva-e.com) | Digital consulting | Main Product | — | — | [Slides in English, September 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup29/ClickHouse-MeetUp-Unusual-Applications-sd-2019-09-17.pdf) | -| [Exness](https://www.exness.com) | Trading | Metrics, Logging | — | — | [Talk in Russian, May 2019](https://youtu.be/_rpU-TvSfZ8?t=3215) | -| [Geniee](https://geniee.co.jp) | Ad network | Main product | — | — | [Blog post in Japanese, July 2017](https://tech.geniee.co.jp/entry/2017/07/20/160100) | -| [HUYA](https://www.huya.com/) | Video Streaming | Analytics | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/7.%20ClickHouse万亿数据分析实践%20李本旺(sundy-li)%20虎牙.pdf) | -| [Idealista](https://www.idealista.com) | Real Estate | Analytics | — | — | [Blog Post in English, April 2019](https://clickhouse.yandex/blog/en/clickhouse-meetup-in-madrid-on-april-2-2019) | -| [Infovista](https://www.infovista.com/) | Networks | Analytics | — | — | [Slides in English, October 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup30/infovista.pdf) | -| [InnoGames](https://www.innogames.com) | Games | Metrics, Logging | — | — | [Slides in Russian, September 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup28/graphite_and_clickHouse.pdf) | -| [Integros](https://integros.com) | Platform for video services | Analytics | — | — | [Slides in Russian, May 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup22/strategies.pdf) | -| [Kodiak Data](https://www.kodiakdata.com/) | Clouds | Main product | — | — | [Slides in Engish, April 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup13/kodiak_data.pdf) | -| [Kontur](https://kontur.ru) | Software Development | Metrics | — | — | [Talk in Russian, November 2018](https://www.youtube.com/watch?v=U4u4Bd0FtrY) | -| [LifeStreet](https://lifestreet.com/) | Ad network | Main product | 75 servers (3 replicas) | 5.27 PiB | [Blog post in Russian, February 2017](https://habr.com/en/post/322620/) | -| [Mail.ru Cloud Solutions](https://mcs.mail.ru/) | Cloud services | Main product | — | — | [Running ClickHouse Instance, in Russian](https://mcs.mail.ru/help/db-create/clickhouse#) | -| [MessageBird](https://www.messagebird.com) | Telecommunications | Statistics | — | — | [Slides in English, November 
2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup20/messagebird.pdf) | -| [MGID](https://www.mgid.com/) | Ad network | Web-analytics | — | — | [Our experience in implementing analytical DBMS ClickHouse, in Russian](http://gs-studio.com/news-about-it/32777----clickhouse---c) | -| [OneAPM](https://www.oneapm.com/) | Monitorings and Data Analysis | Main product | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/8.%20clickhouse在OneAPM的应用%20杜龙.pdf) | -| [Pragma Innovation](http://www.pragma-innovation.fr/) | Telemetry and Big Data Analysis | Main product | — | — | [Slides in English, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup18/4_pragma_innovation.pdf) | -| [QINGCLOUD](https://www.qingcloud.com/) | Cloud services | Main product | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/4.%20Cloud%20%2B%20TSDB%20for%20ClickHouse%20张健%20QingCloud.pdf) | -| [Qrator](https://qrator.net) | DDoS protection | Main product | — | — | [Blog Post, March 2019](https://blog.qrator.net/en/clickhouse-ddos-mitigation_37/) | -| [Beijing PERCENT Information Technology Co., Ltd.](https://www.percent.cn/) | Analytics | Main Product | — | — | [Slides in Chinese, June 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup24/4.%20ClickHouse万亿数据双中心的设计与实践%20.pdf) | -| [Rambler](https://rambler.ru) | Internet services | Analytics | — | — | [Talk in Russian, April 2018](https://medium.com/@ramblertop/разработка-api-clickhouse-для-рамблер-топ-100-f4c7e56f3141) | -| [Tencent](https://www.tencent.com) | Messaging | Logging | — | — | [Talk in Chinese, November 2019](https://youtu.be/T-iVQRuw-QY?t=5050) | -| [Traffic Stars](https://trafficstars.com/) | AD network | — | — | — | [Slides in Russian, May 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup15/lightning/ninja.pdf) | -| [S7 Airlines](https://www.s7.ru) | Airlines | Metrics, Logging | — | — | [Talk in Russian, March 2019](https://www.youtube.com/watch?v=nwG68klRpPg&t=15s) | -| [SEMrush](https://www.semrush.com/) | Marketing | Main product | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/5_semrush.pdf) | -| [scireum GmbH](https://www.scireum.de/) | e-Commerce | Main product | — | — | [Talk in German, February 2020](https://www.youtube.com/watch?v=7QWAn5RbyR4) | -| [Sentry](https://sentry.io/) | Software developer | Backend for product | — | — | [Blog Post in English, May 2019](https://blog.sentry.io/2019/05/16/introducing-snuba-sentrys-new-search-infrastructure) | -| [SGK](http://www.sgk.gov.tr/wps/portal/sgk/tr) | Goverment Social Security | Analytics | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup35/ClickHouse%20Meetup-Ramazan%20POLAT.pdf) | -| [seo.do](https://seo.do/) | Analytics | Main product | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup35/CH%20Presentation-%20Metehan%20Çetinkaya.pdf) | -| [Sina](http://english.sina.com/index.html) | News | — | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/6.%20ClickHouse最佳实践%20高鹏_新浪.pdf) | -| [SMI2](https://smi2.ru/) | News | Analytics | — | — | [Blog Post in Russian, November 
2017](https://habr.com/ru/company/smi2/blog/314558/) | -| [Splunk](https://www.splunk.com/) | Business Analytics | Main product | — | — | [Slides in English, January 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup12/splunk.pdf) | -| [Spotify](https://www.spotify.com) | Music | Experimentation | — | — | [Slides, July 2018](https://www.slideshare.net/glebus/using-clickhouse-for-experimentation-104247173) | -| [Tencent](https://www.tencent.com) | Big Data | Data processing | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/5.%20ClickHouse大数据集群应用_李俊飞腾讯网媒事业部.pdf) | -| [Uber](https://www.uber.com) | Taxi | Logging | — | — | [Slides, February 2020](https://presentations.clickhouse.tech/meetup40/ml.pdf) | -| [VKontakte](https://vk.com) | Social Network | Statistics, Logging | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/3_vk.pdf) | -| [Wisebits](https://wisebits.com/) | IT Solutions | Analytics | — | — | [Slides in Russian, May 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup22/strategies.pdf) | -| [Xiaoxin Tech.](https://www.xiaoheiban.cn/) | Education | Common purpose | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/sync-clickhouse-with-mysql-mongodb.pptx) | -| [Ximalaya](https://www.ximalaya.com/) | Audio sharing | OLAP | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/ximalaya.pdf) | -| [Yandex Cloud](https://cloud.yandex.ru/services/managed-clickhouse) | Public Cloud | Main product | — | — | [Talk in Russian, December 2019](https://www.youtube.com/watch?v=pgnak9e_E0o) | -| [Yandex DataLens](https://cloud.yandex.ru/services/datalens) | Business Intelligence | Main product | — | — | [Slides in Russian, December 2019](https://presentations.clickhouse.tech/meetup38/datalens.pdf) | -| [Yandex Market](https://market.yandex.ru/) | e-Commerce | Metrics, Logging | — | — | [Talk in Russian, January 2019](https://youtu.be/_l1qP0DyBcA?t=478) | -| [Yandex Metrica](https://metrica.yandex.com) | Web analytics | Main product | 360 servers in one cluster, 1862 servers in one department | 66.41 PiB / 5.68 PiB | [Slides, February 2020](https://presentations.clickhouse.tech/meetup40/introduction/#13) | -| [ЦВТ](https://htc-cs.ru/) | Software Development | Metrics, Logging | — | — | [Blog Post, March 2019, in Russian](https://vc.ru/dev/62715-kak-my-stroili-monitoring-na-prometheus-clickhouse-i-elk) | -| [МКБ](https://mkb.ru/) | Bank | Web-system monitoring | — | — | [Slides in Russian, September 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup28/mkb.pdf) | -| [金数据](https://jinshuju.net) | BI Analytics | Main product | — | — | [Slides in Chinese, October 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup24/3.%20金数据数据架构调整方案Public.pdf) | + The following list of companies using ClickHouse and their success stories is assembled from public sources, thus might differ from current reality. We’d really appreciate if you share the story of adopting ClickHouse in your company and [add it to the list](https://github.com/ClickHouse/ClickHouse/edit/master/docs/en/introduction/adopters.md), but please make sure you won’t have any NDA issues by doing so. Providing updates with publications from other companies is also useful. 
+| Company | Industry | Usecase | Cluster Size | (Un)Compressed Data Size\* | Reference | +|-----------------------------------------------------------------------------|---------------------------------|-----------------------|------------------------------------------------------------|------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [2gis](https://2gis.ru) | Maps | Monitoring | — | — | [Talk in Russian, July 2019](https://youtu.be/58sPkXfq6nw) | +| [Aloha Browser](https://alohabrowser.com/) | Mobile App | Browser backend | — | — | [Slides in Russian, May 2019](https://github.com/yandex/clickhouse-presentations/blob/master/meetup22/aloha.pdf) | +| [Amadeus](https://amadeus.com/) | Travel | Analytics | — | — | [Press Release, April 2018](https://www.altinity.com/blog/2018/4/5/amadeus-technologies-launches-investment-and-insights-tool-based-on-machine-learning-and-strategy-algorithms) | +| [Appsflyer](https://www.appsflyer.com) | Mobile analytics | Main product | — | — | [Talk in Russian, July 2019](https://www.youtube.com/watch?v=M3wbRlcpBbY) | +| [ArenaData](https://arenadata.tech/) | Data Platform | Main product | — | — | [Slides in Russian, December 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup38/indexes.pdf) | +| [Badoo](https://badoo.com) | Dating | Timeseries | — | — | [Slides in Russian, December 2019](https://presentations.clickhouse.tech/meetup38/forecast.pdf) | +| [Benocs](https://www.benocs.com/) | Network Telemetry and Analytics | Main Product | — | — | [Slides in English, October 2017](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup9/lpm.pdf) | +| [Bloomberg](https://www.bloomberg.com/) | Finance, Media | Monitoring | 102 servers | — | [Slides, May 2018](https://www.slideshare.net/Altinity/http-analytics-for-6m-requests-per-second-using-clickhouse-by-alexander-bocharov) | +| [Bloxy](https://bloxy.info) | Blockchain | Analytics | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/4_bloxy.pptx) | +| `Dataliance/UltraPower` | Telecom | Analytics | — | — | [Slides in Chinese, January 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup12/telecom.pdf) | +| [CARTO](https://carto.com/) | Business Intelligence | Geo analytics | — | — | [Geospatial processing with Clickhouse](https://carto.com/blog/geospatial-processing-with-clickhouse/) | +| [CERN](http://public.web.cern.ch/public/) | Research | Experiment | — | — | [Press release, April 2012](https://www.yandex.com/company/press_center/press_releases/2012/2012-04-10/) | +| [Cisco](http://cisco.com/) | Networking | Traffic analysis | — | — | [Lightning talk, October 2019](https://youtu.be/-hI1vDR2oPY?t=5057) | +| [Citadel Securities](https://www.citadelsecurities.com/) | Finance | — | — | — | [Contribution, March 2019](https://github.com/ClickHouse/ClickHouse/pull/4774) | +| [Citymobil](https://city-mobil.ru) | Taxi | Analytics | — | — | [Blog Post in Russian, March 2020](https://habr.com/en/company/citymobil/blog/490660/) | +| [ContentSquare](https://contentsquare.com) | Web analytics | Main product | — | — | [Blog post in French, November 
2018](http://souslecapot.net/2018/11/21/patrick-chatain-vp-engineering-chez-contentsquare-penser-davantage-amelioration-continue-que-revolution-constante/) | +| [Cloudflare](https://cloudflare.com) | CDN | Traffic analysis | 36 servers | — | [Blog post, May 2017](https://blog.cloudflare.com/how-cloudflare-analyzes-1m-dns-queries-per-second/), [Blog post, March 2018](https://blog.cloudflare.com/http-analytics-for-6m-requests-per-second-using-clickhouse/) | +| [Corunet](https://coru.net/) | Analytics | Main product | — | — | [Slides in English, April 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup21/predictive_models.pdf) | +| [CraiditX 氪信](https://creditx.com) | Finance AI | Analysis | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/udf.pptx) | +| [Criteo/Storetail](https://www.criteo.com/) | Retail | Main product | — | — | [Slides in English, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup18/3_storetail.pptx) | +| [Deutsche Bank](https://db.com) | Finance | BI Analytics | — | — | [Slides in English, October 2019](https://bigdatadays.ru/wp-content/uploads/2019/10/D2-H3-3_Yakunin-Goihburg.pdf) | +| [Diva-e](https://www.diva-e.com) | Digital consulting | Main Product | — | — | [Slides in English, September 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup29/ClickHouse-MeetUp-Unusual-Applications-sd-2019-09-17.pdf) | +| [Exness](https://www.exness.com) | Trading | Metrics, Logging | — | — | [Talk in Russian, May 2019](https://youtu.be/_rpU-TvSfZ8?t=3215) | +| [Geniee](https://geniee.co.jp) | Ad network | Main product | — | — | [Blog post in Japanese, July 2017](https://tech.geniee.co.jp/entry/2017/07/20/160100) | +| [HUYA](https://www.huya.com/) | Video Streaming | Analytics | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/7.%20ClickHouse万亿数据分析实践%20李本旺(sundy-li)%20虎牙.pdf) | +| [Idealista](https://www.idealista.com) | Real Estate | Analytics | — | — | [Blog Post in English, April 2019](https://clickhouse.yandex/blog/en/clickhouse-meetup-in-madrid-on-april-2-2019) | +| [Infovista](https://www.infovista.com/) | Networks | Analytics | — | — | [Slides in English, October 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup30/infovista.pdf) | +| [InnoGames](https://www.innogames.com) | Games | Metrics, Logging | — | — | [Slides in Russian, September 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup28/graphite_and_clickHouse.pdf) | +| [Integros](https://integros.com) | Platform for video services | Analytics | — | — | [Slides in Russian, May 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup22/strategies.pdf) | +| [Kodiak Data](https://www.kodiakdata.com/) | Clouds | Main product | — | — | [Slides in Engish, April 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup13/kodiak_data.pdf) | +| [Kontur](https://kontur.ru) | Software Development | Metrics | — | — | [Talk in Russian, November 2018](https://www.youtube.com/watch?v=U4u4Bd0FtrY) | +| [LifeStreet](https://lifestreet.com/) | Ad network | Main product | 75 servers (3 replicas) | 5.27 PiB | [Blog post in Russian, February 2017](https://habr.com/en/post/322620/) | +| [Mail.ru Cloud Solutions](https://mcs.mail.ru/) | Cloud services | Main product | — | — | [Running ClickHouse Instance, in 
Russian](https://mcs.mail.ru/help/db-create/clickhouse#) | +| [MessageBird](https://www.messagebird.com) | Telecommunications | Statistics | — | — | [Slides in English, November 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup20/messagebird.pdf) | +| [MGID](https://www.mgid.com/) | Ad network | Web-analytics | — | — | [Our experience in implementing analytical DBMS ClickHouse, in Russian](http://gs-studio.com/news-about-it/32777----clickhouse---c) | +| [OneAPM](https://www.oneapm.com/) | Monitorings and Data Analysis | Main product | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/8.%20clickhouse在OneAPM的应用%20杜龙.pdf) | +| [Pragma Innovation](http://www.pragma-innovation.fr/) | Telemetry and Big Data Analysis | Main product | — | — | [Slides in English, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup18/4_pragma_innovation.pdf) | +| [QINGCLOUD](https://www.qingcloud.com/) | Cloud services | Main product | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/4.%20Cloud%20%2B%20TSDB%20for%20ClickHouse%20张健%20QingCloud.pdf) | +| [Qrator](https://qrator.net) | DDoS protection | Main product | — | — | [Blog Post, March 2019](https://blog.qrator.net/en/clickhouse-ddos-mitigation_37/) | +| [Beijing PERCENT Information Technology Co., Ltd.](https://www.percent.cn/) | Analytics | Main Product | — | — | [Slides in Chinese, June 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup24/4.%20ClickHouse万亿数据双中心的设计与实践%20.pdf) | +| [Rambler](https://rambler.ru) | Internet services | Analytics | — | — | [Talk in Russian, April 2018](https://medium.com/@ramblertop/разработка-api-clickhouse-для-рамблер-топ-100-f4c7e56f3141) | +| [Tencent](https://www.tencent.com) | Messaging | Logging | — | — | [Talk in Chinese, November 2019](https://youtu.be/T-iVQRuw-QY?t=5050) | +| [Traffic Stars](https://trafficstars.com/) | AD network | — | — | — | [Slides in Russian, May 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup15/lightning/ninja.pdf) | +| [S7 Airlines](https://www.s7.ru) | Airlines | Metrics, Logging | — | — | [Talk in Russian, March 2019](https://www.youtube.com/watch?v=nwG68klRpPg&t=15s) | +| [SEMrush](https://www.semrush.com/) | Marketing | Main product | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/5_semrush.pdf) | +| [scireum GmbH](https://www.scireum.de/) | e-Commerce | Main product | — | — | [Talk in German, February 2020](https://www.youtube.com/watch?v=7QWAn5RbyR4) | +| [Sentry](https://sentry.io/) | Software developer | Backend for product | — | — | [Blog Post in English, May 2019](https://blog.sentry.io/2019/05/16/introducing-snuba-sentrys-new-search-infrastructure) | +| [SGK](http://www.sgk.gov.tr/wps/portal/sgk/tr) | Goverment Social Security | Analytics | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup35/ClickHouse%20Meetup-Ramazan%20POLAT.pdf) | +| [seo.do](https://seo.do/) | Analytics | Main product | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup35/CH%20Presentation-%20Metehan%20Çetinkaya.pdf) | +| [Sina](http://english.sina.com/index.html) | News | — | — | — | [Slides in Chinese, October 
2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/6.%20ClickHouse最佳实践%20高鹏_新浪.pdf) | +| [SMI2](https://smi2.ru/) | News | Analytics | — | — | [Blog Post in Russian, November 2017](https://habr.com/ru/company/smi2/blog/314558/) | +| [Splunk](https://www.splunk.com/) | Business Analytics | Main product | — | — | [Slides in English, January 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup12/splunk.pdf) | +| [Spotify](https://www.spotify.com) | Music | Experimentation | — | — | [Slides, July 2018](https://www.slideshare.net/glebus/using-clickhouse-for-experimentation-104247173) | +| [Tencent](https://www.tencent.com) | Big Data | Data processing | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/5.%20ClickHouse大数据集群应用_李俊飞腾讯网媒事业部.pdf) | +| [Uber](https://www.uber.com) | Taxi | Logging | — | — | [Slides, February 2020](https://presentations.clickhouse.tech/meetup40/ml.pdf) | +| [VKontakte](https://vk.com) | Social Network | Statistics, Logging | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/3_vk.pdf) | +| [Wisebits](https://wisebits.com/) | IT Solutions | Analytics | — | — | [Slides in Russian, May 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup22/strategies.pdf) | +| [Xiaoxin Tech.](https://www.xiaoheiban.cn/) | Education | Common purpose | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/sync-clickhouse-with-mysql-mongodb.pptx) | +| [Ximalaya](https://www.ximalaya.com/) | Audio sharing | OLAP | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/ximalaya.pdf) | +| [Yandex Cloud](https://cloud.yandex.ru/services/managed-clickhouse) | Public Cloud | Main product | — | — | [Talk in Russian, December 2019](https://www.youtube.com/watch?v=pgnak9e_E0o) | +| [Yandex DataLens](https://cloud.yandex.ru/services/datalens) | Business Intelligence | Main product | — | — | [Slides in Russian, December 2019](https://presentations.clickhouse.tech/meetup38/datalens.pdf) | +| [Yandex Market](https://market.yandex.ru/) | e-Commerce | Metrics, Logging | — | — | [Talk in Russian, January 2019](https://youtu.be/_l1qP0DyBcA?t=478) | +| [Yandex Metrica](https://metrica.yandex.com) | Web analytics | Main product | 360 servers in one cluster, 1862 servers in one department | 66.41 PiB / 5.68 PiB | [Slides, February 2020](https://presentations.clickhouse.tech/meetup40/introduction/#13) | +| [ЦВТ](https://htc-cs.ru/) | Software Development | Metrics, Logging | — | — | [Blog Post, March 2019, in Russian](https://vc.ru/dev/62715-kak-my-stroili-monitoring-na-prometheus-clickhouse-i-elk) | +| [МКБ](https://mkb.ru/) | Bank | Web-system monitoring | — | — | [Slides in Russian, September 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup28/mkb.pdf) | +| [金数据](https://jinshuju.net) | BI Analytics | Main product | — | — | [Slides in Chinese, October 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup24/3.%20金数据数据架构调整方案Public.pdf) | [Original article](https://clickhouse.tech/docs/en/introduction/adopters/) diff --git a/docs/en/introduction/distinctive_features.md b/docs/en/introduction/distinctive_features.md index af4050fa3e2..bcea9fedc80 100644 --- a/docs/en/introduction/distinctive_features.md +++ 
b/docs/en/introduction/distinctive_features.md @@ -1,61 +1,61 @@ -# Distinctive Features of ClickHouse +# Distinctive Features of ClickHouse {#distinctive-features-of-clickhouse} -## True Column-Oriented DBMS +## True Column-Oriented DBMS {#true-column-oriented-dbms} -In a true column-oriented DBMS, no extra data is stored with the values. Among other things, this means that constant-length values must be supported, to avoid storing their length "number" next to the values. As an example, a billion UInt8-type values should actually consume around 1 GB uncompressed, or this will strongly affect the CPU use. It is very important to store data compactly (without any "garbage") even when uncompressed, since the speed of decompression (CPU usage) depends mainly on the volume of uncompressed data. +In a true column-oriented DBMS, no extra data is stored with the values. Among other things, this means that constant-length values must be supported, to avoid storing their length “number” next to the values. As an example, a billion UInt8-type values should actually consume around 1 GB uncompressed, or this will strongly affect the CPU use. It is very important to store data compactly (without any “garbage”) even when uncompressed, since the speed of decompression (CPU usage) depends mainly on the volume of uncompressed data. -This is worth noting because there are systems that can store values of different columns separately, but that can't effectively process analytical queries due to their optimization for other scenarios. Examples are HBase, BigTable, Cassandra, and HyperTable. In these systems, you will get throughput around a hundred thousand rows per second, but not hundreds of millions of rows per second. +This is worth noting because there are systems that can store values of different columns separately, but that can’t effectively process analytical queries due to their optimization for other scenarios. Examples are HBase, BigTable, Cassandra, and HyperTable. In these systems, you will get throughput around a hundred thousand rows per second, but not hundreds of millions of rows per second. -It's also worth noting that ClickHouse is a database management system, not a single database. ClickHouse allows creating tables and databases in runtime, loading data, and running queries without reconfiguring and restarting the server. +It’s also worth noting that ClickHouse is a database management system, not a single database. ClickHouse allows creating tables and databases in runtime, loading data, and running queries without reconfiguring and restarting the server. -## Data Compression +## Data Compression {#data-compression} Some column-oriented DBMSs (InfiniDB CE and MonetDB) do not use data compression. However, data compression does play a key role in achieving excellent performance. -## Disk Storage of Data +## Disk Storage of Data {#disk-storage-of-data} -Keeping data physically sorted by primary key makes it possible to extract data for it's specific values or value ranges with low latency, less than few dozen milliseconds. Some column-oriented DBMSs (such as SAP HANA and Google PowerDrill) can only work in RAM. This approach encourages the allocation of a larger hardware budget than is actually necessary for real-time analysis. ClickHouse is designed to work on regular hard drives, which means the cost per GB of data storage is low, but SSD and additional RAM are also fully used if available. 
+Keeping data physically sorted by primary key makes it possible to extract data for its specific values or value ranges with low latency, less than a few dozen milliseconds. Some column-oriented DBMSs (such as SAP HANA and Google PowerDrill) can only work in RAM. This approach encourages the allocation of a larger hardware budget than is actually necessary for real-time analysis. ClickHouse is designed to work on regular hard drives, which means the cost per GB of data storage is low, but SSD and additional RAM are also fully used if available. -## Parallel Processing on Multiple Cores +## Parallel Processing on Multiple Cores {#parallel-processing-on-multiple-cores} Large queries are parallelized in a natural way, taking all the necessary resources that available on the current server. -## Distributed Processing on Multiple Servers +## Distributed Processing on Multiple Servers {#distributed-processing-on-multiple-servers} Almost none of the columnar DBMSs mentioned above have support for distributed query processing. In ClickHouse, data can reside on different shards. Each shard can be a group of replicas that are used for fault tolerance. The query is processed on all the shards in parallel. This is transparent for the user. -## SQL Support +## SQL Support {#sql-support} ClickHouse supports a declarative query language based on SQL that is identical to the SQL standard in many cases. Supported queries include GROUP BY, ORDER BY, subqueries in FROM, IN, and JOIN clauses, and scalar subqueries. Dependent subqueries and window functions are not supported. -## Vector Engine +## Vector Engine {#vector-engine} Data is not only stored by columns, but is processed by vectors (parts of columns). This allows us to achieve high CPU efficiency. -## Real-time Data Updates +## Real-time Data Updates {#real-time-data-updates} ClickHouse supports tables with a primary key. In order to quickly perform queries on the range of the primary key, the data is sorted incrementally using the merge tree. Due to this, data can continually be added to the table. No locks are taken when new data is ingested. -## Index +## Index {#index} -Having a data physically sorted by primary key makes it possible to extract data for it's specific values or value ranges with low latency, less than few dozen milliseconds. +Having data physically sorted by primary key makes it possible to extract data for its specific values or value ranges with low latency, less than a few dozen milliseconds. -## Suitable for Online Queries +## Suitable for Online Queries {#suitable-for-online-queries} Low latency means that queries can be processed without delay and without trying to prepare answer in advance, right at the same moment while user interface page is loading. In other words, online. -## Support for Approximated Calculations +## Support for Approximated Calculations {#support-for-approximated-calculations} ClickHouse provides various ways to trade accuracy for performance: -1. Aggregate functions for approximated calculation of the number of distinct values, medians, and quantiles. -2. Running a query based on a part (sample) of data and getting an approximated result. In this case, proportionally less data is retrieved from the disk. -3. Running an aggregation for a limited number of random keys, instead of for all keys. Under certain conditions for key distribution in the data, this provides a reasonably accurate result while using fewer resources. +1.
Aggregate functions for approximated calculation of the number of distinct values, medians, and quantiles. +2. Running a query based on a part (sample) of data and getting an approximated result. In this case, proportionally less data is retrieved from the disk. +3. Running an aggregation for a limited number of random keys, instead of for all keys. Under certain conditions for key distribution in the data, this provides a reasonably accurate result while using fewer resources. -## Data replication and data integrity support +## Data replication and data integrity support {#data-replication-and-data-integrity-support} Uses asynchronous multimaster replication. After being written to any available replica, data is distributed to all the remaining replicas in the background. The system maintains identical data on different replicas. Recovery after most failures is performed automatically, and in complex cases — semi-automatically. diff --git a/docs/en/introduction/features_considered_disadvantages.md b/docs/en/introduction/features_considered_disadvantages.md index 6717e32bfe4..9a70c2f0f6c 100644 --- a/docs/en/introduction/features_considered_disadvantages.md +++ b/docs/en/introduction/features_considered_disadvantages.md @@ -1,7 +1,7 @@ -# ClickHouse Features that Can be Considered Disadvantages +# ClickHouse Features that Can be Considered Disadvantages {#clickhouse-features-that-can-be-considered-disadvantages} -1. No full-fledged transactions. -2. Lack of ability to modify or delete already inserted data with high rate and low latency. There are batch deletes and updates available to clean up or modify data, for example to comply with [GDPR](https://gdpr-info.eu). -3. The sparse index makes ClickHouse not really suitable for point queries retrieving single rows by their keys. +1. No full-fledged transactions. +2. Lack of ability to modify or delete already inserted data with high rate and low latency. There are batch deletes and updates available to clean up or modify data, for example to comply with [GDPR](https://gdpr-info.eu). +3. The sparse index makes ClickHouse not so suitable for point queries retrieving single rows by their keys. [Original article](https://clickhouse.tech/docs/en/introduction/features_considered_disadvantages/) diff --git a/docs/en/introduction/history.md b/docs/en/introduction/history.md index 1d35a6df683..d7b7619e398 100644 --- a/docs/en/introduction/history.md +++ b/docs/en/introduction/history.md @@ -1,42 +1,42 @@ -# ClickHouse History +# ClickHouse History {#clickhouse-history} ClickHouse was originally developed to power [Yandex.Metrica](https://metrica.yandex.com/), [the second largest web analytics platform in the world](http://w3techs.com/technologies/overview/traffic_analysis/all), and continues to be the core component of this system. With more than 13 trillion records in the database and more than 20 billion events daily, ClickHouse allows generating custom reports on the fly directly from non-aggregated data. This article briefly covers the goals of ClickHouse in the early stages of its development. -Yandex.Metrica builds customized reports on the fly based on hits and sessions, with arbitrary segments defined by the user. This often requires building complex aggregates, such as the number of unique users. New data for building a report is received in real time. +Yandex.Metrica builds customized reports on the fly based on hits and sessions, with arbitrary segments defined by the user. 
This often requires building complex aggregates, such as the number of unique users. New data for building a report is received in real-time. -As of April 2014, Yandex.Metrica was tracking about 12 billion events (page views and clicks) daily. All these events must be stored in order to build custom reports. A single query may require scanning millions of rows within a few hundred milliseconds, or hundreds of millions of rows in just a few seconds. +As of April 2014, Yandex.Metrica was tracking about 12 billion events (page views and clicks) daily. All these events must be stored to build custom reports. A single query may require scanning millions of rows within a few hundred milliseconds, or hundreds of millions of rows in just a few seconds. -## Usage in Yandex.Metrica and Other Yandex Services +## Usage in Yandex.Metrica and Other Yandex Services {#usage-in-yandex-metrica-and-other-yandex-services} ClickHouse is used for multiple purposes in Yandex.Metrica. Its main task is to build reports in online mode using non-aggregated data. It uses a cluster of 374 servers, which store over 20.3 trillion rows in the database. The volume of compressed data, without counting duplication and replication, is about 2 PB. The volume of uncompressed data (in TSV format) would be approximately 17 PB. ClickHouse is also used for: -- Storing data for Session Replay from Yandex.Metrica. -- Processing intermediate data. -- Building global reports with Analytics. -- Running queries for debugging the Yandex.Metrica engine. -- Analyzing logs from the API and the user interface. +- Storing data for Session Replay from Yandex.Metrica. +- Processing intermediate data. +- Building global reports with Analytics. +- Running queries for debugging the Yandex.Metrica engine. +- Analyzing logs from the API and the user interface. ClickHouse has at least a dozen installations in other Yandex services: in search verticals, Market, Direct, business analytics, mobile development, AdFox, personal services, and others. -## Aggregated and Non-aggregated Data +## Aggregated and Non-aggregated Data {#aggregated-and-non-aggregated-data} -There is a popular opinion that in order to effectively calculate statistics, you must aggregate data, since this reduces the volume of data. +There is a popular opinion that to effectively calculate statistics, you must aggregate data since this reduces the volume of data. But data aggregation is a very limited solution, for the following reasons: -- You must have a pre-defined list of reports the user will need. -- The user can't make custom reports. -- When aggregating a large quantity of keys, the volume of data is not reduced, and aggregation is useless. -- For a large number of reports, there are too many aggregation variations (combinatorial explosion). -- When aggregating keys with high cardinality (such as URLs), the volume of data is not reduced by much (less than twofold). -- For this reason, the volume of data with aggregation might grow instead of shrink. -- Users do not view all the reports we generate for them. A large portion of calculations are useless. -- The logical integrity of data may be violated for various aggregations. +- You must have a pre-defined list of reports the user will need. +- The user can’t make custom reports. +- When aggregating a large number of keys, the volume of data is not reduced, and aggregation is useless. +- For a large number of reports, there are too many aggregation variations (combinatorial explosion). 
+- When aggregating keys with high cardinality (such as URLs), the volume of data is not reduced by much (less than twofold). +- For this reason, the volume of data with aggregation might grow instead of shrink. +- Users do not view all the reports we generate for them. A large portion of those calculations is useless. +- The logical integrity of data may be violated for various aggregations. -If we do not aggregate anything and work with non-aggregated data, this might actually reduce the volume of calculations. +If we do not aggregate anything and work with non-aggregated data, this might reduce the volume of calculations. However, with aggregation, a significant part of the work is taken offline and completed relatively calmly. In contrast, online calculations require calculating as fast as possible, since the user is waiting for the result. @@ -46,5 +46,4 @@ OLAPServer worked well for non-aggregated data, but it had many restrictions tha To remove the limitations of OLAPServer and solve the problem of working with non-aggregated data for all reports, we developed the ClickHouse DBMS. - [Original article](https://clickhouse.tech/docs/en/introduction/history/) diff --git a/docs/en/introduction/performance.md b/docs/en/introduction/performance.md index 3e6e902bd8c..438c4d30c86 100644 --- a/docs/en/introduction/performance.md +++ b/docs/en/introduction/performance.md @@ -1,24 +1,24 @@ -# Performance +# Performance {#performance} According to internal testing results at Yandex, ClickHouse shows the best performance (both the highest throughput for long queries and the lowest latency on short queries) for comparable operating scenarios among systems of its class that were available for testing. You can view the test results on a [separate page](https://clickhouse.tech/benchmark.html). This has also been confirmed by numerous independent benchmarks. They are not difficult to find using an internet search, or you can see [our small collection of related links](https://clickhouse.tech/#independent-benchmarks). -## Throughput for a Single Large Query +## Throughput for a Single Large Query {#throughput-for-a-single-large-query} Throughput can be measured in rows per second or in megabytes per second. If the data is placed in the page cache, a query that is not too complex is processed on modern hardware at a speed of approximately 2-10 GB/s of uncompressed data on a single server (for the simplest cases, the speed may reach 30 GB/s). If data is not placed in the page cache, the speed depends on the disk subsystem and the data compression rate. For example, if the disk subsystem allows reading data at 400 MB/s, and the data compression rate is 3, the speed will be around 1.2 GB/s. To get the speed in rows per second, divide the speed in bytes per second by the total size of the columns used in the query. For example, if 10 bytes of columns are extracted, the speed will be around 100-200 million rows per second. The processing speed increases almost linearly for distributed processing, but only if the number of rows resulting from aggregation or sorting is not too large. -## Latency When Processing Short Queries +## Latency When Processing Short Queries {#latency-when-processing-short-queries} If a query uses a primary key and does not select too many rows to process (hundreds of thousands), and does not use too many columns, we can expect less than 50 milliseconds of latency (single digits of milliseconds in the best case) if data is placed in the page cache. 
Otherwise, latency is calculated from the number of seeks. If you use rotating drives, for a system that is not overloaded, the latency is calculated by this formula: seek time (10 ms) \* number of columns queried \* number of data parts. -## Throughput When Processing a Large Quantity of Short Queries +## Throughput When Processing a Large Quantity of Short Queries {#throughput-when-processing-a-large-quantity-of-short-queries} Under the same conditions, ClickHouse can handle several hundred queries per second on a single server (up to several thousand in the best case). Since this scenario is not typical for analytical DBMSs, we recommend expecting a maximum of 100 queries per second. -## Performance When Inserting Data +## Performance When Inserting Data {#performance-when-inserting-data} We recommend inserting data in packets of at least 1000 rows, or no more than a single request per second. When inserting to a MergeTree table from a tab-separated dump, the insertion speed will be from 50 to 200 MB/s. If the inserted rows are around 1 Kb in size, the speed will be from 50,000 to 200,000 rows per second. If the rows are small, the performance will be higher in rows per second (on Banner System data -> 500,000 rows per second; on Graphite data -> 1,000,000 rows per second). To improve performance, you can make multiple INSERT queries in parallel, and performance will increase linearly. diff --git a/docs/en/operations/access_rights.md b/docs/en/operations/access_rights.md index 8806aa11ebe..510a23d6130 100644 --- a/docs/en/operations/access_rights.md +++ b/docs/en/operations/access_rights.md @@ -1,10 +1,10 @@ -# Access Rights +# Access Rights {#access-rights} Users and access rights are set up in the user config. This is usually `users.xml`. Users are recorded in the `users` section. Here is a fragment of the `users.xml` file: -```xml +``` xml @@ -65,15 +65,15 @@ Users are recorded in the `users` section. Here is a fragment of the `users.xml` You can see a declaration from two users: `default` and `web`. We added the `web` user separately. -The `default` user is chosen in cases when the username is not passed. The `default` user is also used for distributed query processing, if the configuration of the server or cluster doesn't specify the `user` and `password` (see the section on the [Distributed](../operations/table_engines/distributed.md) engine). +The `default` user is chosen in cases when the username is not passed. The `default` user is also used for distributed query processing, if the configuration of the server or cluster doesn’t specify the `user` and `password` (see the section on the [Distributed](../operations/table_engines/distributed.md) engine). The user that is used for exchanging information between servers combined in a cluster must not have substantial restrictions or quotas – otherwise, distributed queries will fail. -The password is specified in clear text (not recommended) or in SHA-256. The hash isn't salted. In this regard, you should not consider these passwords as providing security against potential malicious attacks. Rather, they are necessary for protection from employees. +The password is specified in clear text (not recommended) or in SHA-256. The hash isn’t salted. In this regard, you should not consider these passwords as providing security against potential malicious attacks. Rather, they are necessary for protection from employees. A list of networks is specified that access is allowed from. 
In this example, the list of networks for both users is loaded from a separate file (`/etc/metrika.xml`) containing the `networks` substitution. Here is a fragment of it: -```xml +``` xml ... @@ -85,14 +85,13 @@ A list of networks is specified that access is allowed from. In this example, th ``` -You could define this list of networks directly in `users.xml`, or in a file in the `users.d` directory (for more information, see the section "[Configuration files](configuration_files.md#configuration_files)"). +You could define this list of networks directly in `users.xml`, or in a file in the `users.d` directory (for more information, see the section “[Configuration files](configuration_files.md#configuration_files)”). The config includes comments explaining how to open access from everywhere. For use in production, only specify `ip` elements (IP addresses and their masks), since using `host` and `host_regexp` might cause extra latency. -Next the user settings profile is specified (see the section "[Settings profiles](settings/settings_profiles.md)". You can specify the default profile, `default'`. The profile can have any name. You can specify the same profile for different users. The most important thing you can write in the settings profile is `readonly=1`, which ensures read-only access. -Then specify the quota to be used (see the section "[Quotas](quotas.md#quotas)"). You can specify the default quota: `default`. It is set in the config by default to only count resource usage, without restricting it. The quota can have any name. You can specify the same quota for different users – in this case, resource usage is calculated for each user individually. +Next, the user settings profile is specified (see the section “[Settings profiles](settings/settings_profiles.md)”). You can specify the default profile, `default`. The profile can have any name. You can specify the same profile for different users. The most important thing you can write in the settings profile is `readonly=1`, which ensures read-only access. Then specify the quota to be used (see the section “[Quotas](quotas.md#quotas)”). You can specify the default quota: `default`. It is set in the config by default to only count resource usage, without restricting it. The quota can have any name. You can specify the same quota for different users – in this case, resource usage is calculated for each user individually. In the optional `` section, you can also specify a list of databases that the user can access. By default, all databases are available to the user. You can specify the `default` database. In this case, the user will receive access to the database by default. @@ -100,8 +99,8 @@ In the optional `` section, you can also specify a list of d Access to the `system` database is always allowed (since this database is used for processing queries). -The user can get a list of all databases and tables in them by using `SHOW` queries or system tables, even if access to individual databases isn't allowed. +The user can get a list of all databases and tables in them by using `SHOW` queries or system tables, even if access to individual databases isn’t allowed. -Database access is not related to the [readonly](settings/permissions_for_queries.md#settings_readonly) setting. You can't grant full access to one database and `readonly` access to another one. +Database access is not related to the [readonly](settings/permissions_for_queries.md#settings_readonly) setting. 
You can’t grant full access to one database and `readonly` access to another one. [Original article](https://clickhouse.tech/docs/en/operations/access_rights/) diff --git a/docs/en/operations/backup.md b/docs/en/operations/backup.md index 0b04f5aa1b8..d9d9cf3f284 100644 --- a/docs/en/operations/backup.md +++ b/docs/en/operations/backup.md @@ -1,31 +1,31 @@ -# Data Backup +# Data Backup {#data-backup} -While [replication](table_engines/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [you can't just drop tables with a MergeTree-like engine containing more than 50 Gb of data](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/dbms/programs/server/config.xml#L322-L330). However, these safeguards don't cover all possible cases and can be circumvented. +While [replication](table_engines/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [you can’t just drop tables with a MergeTree-like engine containing more than 50 Gb of data](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/dbms/programs/server/config.xml#L322-L330). However, these safeguards don’t cover all possible cases and can be circumvented. In order to effectively mitigate possible human errors, you should carefully prepare a strategy for backing up and restoring your data **in advance**. -Each company has different resources available and business requirements, so there's no universal solution for ClickHouse backups and restores that will fit every situation. What works for one gigabyte of data likely won't work for tens of petabytes. There are a variety of possible approaches with their own pros and cons, which will be discussed below. It is a good idea to use several approaches instead of just one in order to compensate for their various shortcomings. +Each company has different resources available and business requirements, so there’s no universal solution for ClickHouse backups and restores that will fit every situation. What works for one gigabyte of data likely won’t work for tens of petabytes. There are a variety of possible approaches with their own pros and cons, which will be discussed below. It is a good idea to use several approaches instead of just one in order to compensate for their various shortcomings. !!! note "Note" Keep in mind that if you backed something up and never tried to restore it, chances are that restore will not work properly when you actually need it (or at least it will take longer than business can tolerate). So whatever backup approach you choose, make sure to automate the restore process as well, and practice it on a spare ClickHouse cluster regularly. 
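The drop-protection safeguard mentioned earlier is governed by a server-level setting. As a rough sketch only (assuming the default `config.xml` layout referenced in the link above, where the limit is expressed in bytes), tuning the threshold might look like this:

``` xml
<!-- Illustrative only: the ~50 GB default mentioned above, expressed in bytes.
     Setting the value to 0 would disable the protection entirely. -->
<max_table_size_to_drop>50000000000</max_table_size_to_drop>
```

Whatever value you choose, this only guards against accidentally dropping large tables; it is not a substitute for the backup approaches described below.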
-## Duplicating Source Data Somewhere Else +## Duplicating Source Data Somewhere Else {#duplicating-source-data-somewhere-else} Often data that is ingested into ClickHouse is delivered through some sort of persistent queue, such as [Apache Kafka](https://kafka.apache.org). In this case it is possible to configure an additional set of subscribers that will read the same data stream while it is being written to ClickHouse and store it in cold storage somewhere. Most companies already have some default recommended cold storage, which could be an object store or a distributed filesystem like [HDFS](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html). -## Filesystem Snapshots +## Filesystem Snapshots {#filesystem-snapshots} Some local filesystems provide snapshot functionality (for example, [ZFS](https://en.wikipedia.org/wiki/ZFS)), but they might not be the best choice for serving live queries. A possible solution is to create additional replicas with this kind of filesystem and exclude them from the [Distributed](table_engines/distributed.md) tables that are used for `SELECT` queries. Snapshots on such replicas will be out of reach of any queries that modify data. As a bonus, these replicas might have special hardware configurations with more disks attached per server, which would be cost-effective. -## clickhouse-copier +## clickhouse-copier {#clickhouse-copier} [clickhouse-copier](utils/clickhouse-copier.md) is a versatile tool that was initially created to re-shard petabyte-sized tables. It can also be used for backup and restore purposes because it reliably copies data between ClickHouse tables and clusters. For smaller volumes of data, a simple `INSERT INTO ... SELECT ...` to remote tables might work as well. -## Manipulations with Parts +## Manipulations with Parts {#manipulations-with-parts} -ClickHouse allows using the `ALTER TABLE ... FREEZE PARTITION ...` query to create a local copy of table partitions. This is implemented using hardlinks to the `/var/lib/clickhouse/shadow/` folder, so it usually does not consume extra disk space for old data. The created copies of files are not handled by ClickHouse server, so you can just leave them there: you will have a simple backup that doesn't require any additional external system, but it will still be prone to hardware issues. For this reason, it's better to remotely copy them to another location and then remove the local copies. Distributed filesystems and object stores are still a good options for this, but normal attached file servers with a large enough capacity might work as well (in this case the transfer will occur via the network filesystem or maybe [rsync](https://en.wikipedia.org/wiki/Rsync)). +ClickHouse allows using the `ALTER TABLE ... FREEZE PARTITION ...` query to create a local copy of table partitions. This is implemented using hardlinks to the `/var/lib/clickhouse/shadow/` folder, so it usually does not consume extra disk space for old data. The created copies of files are not handled by ClickHouse server, so you can just leave them there: you will have a simple backup that doesn’t require any additional external system, but it will still be prone to hardware issues. For this reason, it’s better to remotely copy them to another location and then remove the local copies. 
Distributed filesystems and object stores are still good options for this, but normal attached file servers with a large enough capacity might work as well (in this case the transfer will occur via the network filesystem or maybe [rsync](https://en.wikipedia.org/wiki/Rsync)). For more information about queries related to partition manipulations, see the [ALTER documentation](../query_language/alter.md#alter_manipulations-with-partitions). diff --git a/docs/en/operations/configuration_files.md b/docs/en/operations/configuration_files.md index 69204506c2d..f901d58029c 100644 --- a/docs/en/operations/configuration_files.md +++ b/docs/en/operations/configuration_files.md @@ -1,4 +1,4 @@ -# Configuration Files {#configuration_files} +# Configuration Files {#configuration-files} ClickHouse supports multi-file configuration management. The main server configuration file is `/etc/clickhouse-server/config.xml`. Other files must be in the `/etc/clickhouse-server/config.d` directory. @@ -13,17 +13,19 @@ If `replace` is specified, it replaces the entire element with the specified one If `remove` is specified, it deletes the element. -The config can also define "substitutions". If an element has the `incl` attribute, the corresponding substitution from the file will be used as the value. By default, the path to the file with substitutions is `/etc/metrika.xml`. This can be changed in the [include_from](server_settings/settings.md#server_settings-include_from) element in the server config. The substitution values are specified in `/yandex/substitution_name` elements in this file. If a substitution specified in `incl` does not exist, it is recorded in the log. To prevent ClickHouse from logging missing substitutions, specify the `optional="true"` attribute (for example, settings for [macros](server_settings/settings.md)). +The config can also define “substitutions”. If an element has the `incl` attribute, the corresponding substitution from the file will be used as the value. By default, the path to the file with substitutions is `/etc/metrika.xml`. This can be changed in the [include\_from](server_settings/settings.md#server_settings-include_from) element in the server config. The substitution values are specified in `/yandex/substitution_name` elements in this file. If a substitution specified in `incl` does not exist, it is recorded in the log. To prevent ClickHouse from logging missing substitutions, specify the `optional="true"` attribute (for example, settings for [macros](server_settings/settings.md)). Substitutions can also be performed from ZooKeeper. To do this, specify the attribute `from_zk = "/path/to/node"`. The element value is replaced with the contents of the node at `/path/to/node` in ZooKeeper. You can also put an entire XML subtree on the ZooKeeper node and it will be fully inserted into the source element. -The `config.xml` file can specify a separate config with user settings, profiles, and quotas. The relative path to this config is set in the 'users_config' element. By default, it is `users.xml`. If `users_config` is omitted, the user settings, profiles, and quotas are specified directly in `config.xml`. +The `config.xml` file can specify a separate config with user settings, profiles, and quotas. The relative path to this config is set in the ‘users\_config’ element. By default, it is `users.xml`. If `users_config` is omitted, the user settings, profiles, and quotas are specified directly in `config.xml`. 
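As a rough sketch of how the override and substitution mechanics described above fit together (element names such as `example_setting` are invented for illustration; only the `replace`, `remove`, `incl`, `optional` and `from_zk` attributes come from this page):

``` xml
<!-- A hypothetical override file, e.g. /etc/clickhouse-server/config.d/overrides.xml -->
<yandex>
    <!-- the 'replace' attribute makes this element replace the one from the main
         config; a 'remove' attribute would delete it instead -->
    <example_setting replace="replace">
        <child>value</child>
    </example_setting>
    <!-- 'incl' pulls the value of /yandex/some_substitution from the substitutions
         file (/etc/metrika.xml by default); 'optional' suppresses the log message
         when the substitution is missing -->
    <other_setting incl="some_substitution" optional="true"/>
    <!-- 'from_zk' takes the element value from the given ZooKeeper node instead -->
    <yet_another_setting from_zk="/path/to/node"/>
</yandex>
```

The `users.d` example just below applies the same mechanism to the user configuration.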
In addition, `users_config` may have overrides in files from the `users_config.d` directory (for example, `users.d`) and substitutions. For example, you can have separate config file for each user like this: -```bash + +``` bash $ cat /etc/clickhouse-server/users.d/alice.xml ``` -```xml + +``` xml diff --git a/docs/en/operations/index.md b/docs/en/operations/index.md index de35dfbad23..6a3d4d7975a 100644 --- a/docs/en/operations/index.md +++ b/docs/en/operations/index.md @@ -1,20 +1,20 @@ -# Operations +# Operations {#operations} ClickHouse operations manual consists of the following major sections: - - [Requirements](requirements.md) - - [Monitoring](monitoring.md) - - [Troubleshooting](troubleshooting.md) - - [Usage Recommendations](tips.md) - - [Update Procedure](update.md) - - [Access Rights](access_rights.md) - - [Data Backup](backup.md) - - [Configuration Files](configuration_files.md) - - [Quotas](quotas.md) - - [System Tables](system_tables.md) - - [Server Configuration Parameters](server_settings/index.md) - - [How To Test Your Hardware With ClickHouse](performance_test.md) - - [Settings](settings/index.md) - - [Utilities](utils/index.md) +- [Requirements](requirements.md) +- [Monitoring](monitoring.md) +- [Troubleshooting](troubleshooting.md) +- [Usage Recommendations](tips.md) +- [Update Procedure](update.md) +- [Access Rights](access_rights.md) +- [Data Backup](backup.md) +- [Configuration Files](configuration_files.md) +- [Quotas](quotas.md) +- [System Tables](system_tables.md) +- [Server Configuration Parameters](server_settings/index.md) +- [How To Test Your Hardware With ClickHouse](performance_test.md) +- [Settings](settings/index.md) +- [Utilities](utils/index.md) [Original article](https://clickhouse.tech/docs/en/operations/) diff --git a/docs/en/operations/monitoring.md b/docs/en/operations/monitoring.md index b9227cdfa1e..44afd58dd64 100644 --- a/docs/en/operations/monitoring.md +++ b/docs/en/operations/monitoring.md @@ -1,23 +1,23 @@ -# Monitoring +# Monitoring {#monitoring} You can monitor: -- Utilization of hardware resources. -- ClickHouse server metrics. +- Utilization of hardware resources. +- ClickHouse server metrics. -## Resource Utilization +## Resource Utilization {#resource-utilization} ClickHouse does not monitor the state of hardware resources by itself. It is highly recommended to set up monitoring for: -- Load and temperature on processors. +- Load and temperature on processors. You can use [dmesg](https://en.wikipedia.org/wiki/Dmesg), [turbostat](https://www.linux.org/docs/man8/turbostat.html) or other instruments. -- Utilization of storage system, RAM and network. +- Utilization of storage system, RAM and network. -## ClickHouse Server Metrics +## ClickHouse Server Metrics {#clickhouse-server-metrics} ClickHouse server has embedded instruments for self-state monitoring. @@ -25,13 +25,13 @@ To track server events use server logs. See the [logger](server_settings/setting ClickHouse collects: -- Different metrics of how the server uses computational resources. -- Common statistics on query processing. +- Different metrics of how the server uses computational resources. +- Common statistics on query processing. -You can find metrics in the [system.metrics](system_tables.md#system_tables-metrics), [system.events](system_tables.md#system_tables-events), and [system.asynchronous_metrics](system_tables.md#system_tables-asynchronous_metrics) tables. 
+You can find metrics in the [system.metrics](system_tables.md#system_tables-metrics), [system.events](system_tables.md#system_tables-events), and [system.asynchronous\_metrics](system_tables.md#system_tables-asynchronous_metrics) tables. You can configure ClickHouse to export metrics to [Graphite](https://github.com/graphite-project). See the [Graphite section](server_settings/settings.md#server_settings-graphite) in the ClickHouse server configuration file. Before configuring export of metrics, you should set up Graphite by following their official [guide](https://graphite.readthedocs.io/en/latest/install.html). Additionally, you can monitor server availability through the HTTP API. Send the `HTTP GET` request to `/ping`. If the server is available, it responds with `200 OK`. -To monitor servers in a cluster configuration, you should set the [max_replica_delay_for_distributed_queries](settings/settings.md#settings-max_replica_delay_for_distributed_queries) parameter and use the HTTP resource `/replicas_status`. A request to `/replicas_status` returns `200 OK` if the replica is available and is not delayed behind the other replicas. If a replica is delayed, it returns `503 HTTP_SERVICE_UNAVAILABLE` with information about the gap. +To monitor servers in a cluster configuration, you should set the [max\_replica\_delay\_for\_distributed\_queries](settings/settings.md#settings-max_replica_delay_for_distributed_queries) parameter and use the HTTP resource `/replicas_status`. A request to `/replicas_status` returns `200 OK` if the replica is available and is not delayed behind the other replicas. If a replica is delayed, it returns `503 HTTP_SERVICE_UNAVAILABLE` with information about the gap. diff --git a/docs/en/operations/performance/sampling_query_profiler.md b/docs/en/operations/performance/sampling_query_profiler.md index 7b453e9386e..d2ee7125c66 100644 --- a/docs/en/operations/performance/sampling_query_profiler.md +++ b/docs/en/operations/performance/sampling_query_profiler.md @@ -1,45 +1,49 @@ -# Sampling Query Profiler +# Sampling Query Profiler {#sampling-query-profiler} -ClickHouse runs sampling profiler that allows to analyze query execution. Using profiler you can find source code routines that used the most frequently during a query execution. You can trace CPU time and wall-clock time spent including idle time. +ClickHouse runs a sampling profiler that allows analyzing query execution. Using the profiler, you can find the source code routines that were used most frequently during query execution. You can trace CPU time and wall-clock time spent, including idle time. To use profiler: -- Setup the [trace_log](../server_settings/settings.md#server_settings-trace_log) section of the server configuration. +- Set up the [trace\_log](../server_settings/settings.md#server_settings-trace_log) section of the server configuration. - This section configures the [trace_log](../system_tables.md#system_tables-trace_log) system table containing the results of the profiler functioning. It is configured by default. Remember that data in this table is valid only for running server. After the server restart, ClickHouse doesn't clean up the table and all the stored virtual memory address may become invalid. + This section configures the [trace\_log](../system_tables.md#system_tables-trace_log) system table containing the results of the profiler’s operation. It is configured by default. Remember that data in this table is valid only for a running server. 
After the server restart, ClickHouse doesn’t clean up the table and all the stored virtual memory addresses may become invalid. -- Setup the [query_profiler_cpu_time_period_ns](../settings/settings.md#query_profiler_cpu_time_period_ns) or [query_profiler_real_time_period_ns](../settings/settings.md#query_profiler_real_time_period_ns) settings. Both settings can be used simultaneously. +- Set up the [query\_profiler\_cpu\_time\_period\_ns](../settings/settings.md#query_profiler_cpu_time_period_ns) or [query\_profiler\_real\_time\_period\_ns](../settings/settings.md#query_profiler_real_time_period_ns) settings. Both settings can be used simultaneously. These settings allow you to configure profiler timers. As these are the session settings, you can get different sampling frequency for the whole server, individual users or user profiles, for your interactive session, and for each individual query. -Default sampling frequency is one sample per second and both CPU and real timers are enabled. This frequency allows to collect enough information about ClickHouse cluster. At the same time, working with this frequency, profiler doesn't affect ClickHouse server's performance. If you need to profile each individual query try to use higher sampling frequency. +The default sampling frequency is one sample per second and both CPU and real timers are enabled. This frequency allows collecting enough information about a ClickHouse cluster. At the same time, working at this frequency, the profiler doesn’t affect ClickHouse server’s performance. If you need to profile each individual query, try to use a higher sampling frequency. To analyze the `trace_log` system table: -- Install the `clickhouse-common-static-dbg` package. See [Install from DEB Packages](../../getting_started/install.md#install-from-deb-packages). -- Allow introspection functions by the [allow_introspection_functions](../settings/settings.md#settings-allow_introspection_functions) setting. +- Install the `clickhouse-common-static-dbg` package. See [Install from DEB Packages](../../getting_started/install.md#install-from-deb-packages). - For security reasons introspection functions are disabled by default. +- Allow introspection functions by the [allow\_introspection\_functions](../settings/settings.md#settings-allow_introspection_functions) setting. -- Use the `addressToLine`, `addressToSymbol` and `demangle` [introspection functions](../../query_language/functions/introspection.md) to get function names and their positions in ClickHouse code. To get a profile for some query, you need to aggregate data from the `trace_log` table. You can aggregate data by individual functions or by the whole stack traces. + For security reasons, introspection functions are disabled by default. +
+- Use the `addressToLine`, `addressToSymbol` and `demangle` [introspection functions](../../query_language/functions/introspection.md) to get function names and their positions in ClickHouse code. To get a profile for some query, you need to aggregate data from the `trace_log` table. You can aggregate data by individual functions or by whole stack traces. If you need to visualize `trace_log` info, try [flamegraph](../../interfaces/third-party/gui/#clickhouse-flamegraph) and [speedscope](https://github.com/laplab/clickhouse-speedscope). - -## Example +## Example {#example} In this example we: -- Filtering `trace_log` data by a query identifier and current date. -- Aggregating by stack trace. 
-- Using introspection functions, we will get a report of: - - - Names of symbols and corresponding source code functions. - - Source code locations of these functions. +- Filtering `trace_log` data by a query identifier and the current date. -```sql -SELECT - count(), +- Aggregating by stack trace. + +- Using introspection functions, we will get a report of: + + - Names of symbols and corresponding source code functions. + - Source code locations of these functions. + + + +``` sql +SELECT + count(), arrayStringConcat(arrayMap(x -> concat(demangle(addressToSymbol(x)), '\n ', addressToLine(x)), trace), '\n') AS sym FROM system.trace_log WHERE (query_id = 'ebca3574-ad0a-400a-9cbc-dca382f5998c') AND (event_date = today()) @@ -47,561 +51,7 @@ GROUP BY trace ORDER BY count() DESC LIMIT 10 ``` -```text -Row 1: -────── -count(): 6344 -sym: StackTrace::StackTrace(ucontext_t const&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 -DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 - -read - -DB::ReadBufferFromFileDescriptor::nextImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/ReadBufferFromFileDescriptor.cpp:56 -DB::CompressedReadBufferBase::readCompressedData(unsigned long&, unsigned long&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/ReadBuffer.h:54 -DB::CompressedReadBufferFromFile::nextImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Compression/CompressedReadBufferFromFile.cpp:22 -DB::CompressedReadBufferFromFile::seek(unsigned long, unsigned long) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Compression/CompressedReadBufferFromFile.cpp:63 -DB::MergeTreeReaderStream::seekToMark(unsigned long) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp:200 -std::_Function_handler > const&), DB::MergeTreeReader::readData(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, DB::IColumn&, unsigned long, bool, unsigned long, bool)::{lambda(bool)#1}::operator()(bool) const::{lambda(std::vector > const&)#1}>::_M_invoke(std::_Any_data const&, std::vector > const&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:212 -DB::IDataType::deserializeBinaryBulkWithMultipleStreams(DB::IColumn&, unsigned long, DB::IDataType::DeserializeBinaryBulkSettings&, std::shared_ptr&) const - /usr/local/include/c++/9.1.0/bits/std_function.h:690 -DB::MergeTreeReader::readData(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, DB::IColumn&, unsigned long, bool, unsigned long, bool) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:232 -DB::MergeTreeReader::readRows(unsigned long, bool, unsigned long, DB::Block&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:111 -DB::MergeTreeRangeReader::DelayedStream::finalize(DB::Block&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:35 -DB::MergeTreeRangeReader::continueReadingChain(DB::MergeTreeRangeReader::ReadResult&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:219 -DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:487 -DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() - 
/home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 -DB::MergeTreeBaseSelectBlockInputStream::readImpl() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ExpressionBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::PartialSortingBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ParallelInputsProcessor::loop(unsigned long) - /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 -DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 -ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const - /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 -ThreadPoolImpl::worker(std::_List_iterator) - /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 -execute_native_thread_routine - /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 -start_thread - -__clone - - -Row 2: -────── -count(): 3295 -sym: StackTrace::StackTrace(ucontext_t const&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 -DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 - - -__pthread_cond_wait - -std::condition_variable::wait(std::unique_lock&) - /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/src/c++11/../../../../../gcc-9.1.0/libstdc++-v3/src/c++11/condition_variable.cc:55 -Poco::Semaphore::wait() - /home/milovidov/ClickHouse/build_gcc9/../contrib/poco/Foundation/src/Semaphore.cpp:61 -DB::UnionBlockInputStream::readImpl() - /usr/local/include/c++/9.1.0/x86_64-pc-linux-gnu/bits/gthr-default.h:748 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::MergeSortingBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Core/Block.h:90 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ExpressionBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::LimitBlockInputStream::readImpl() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::AsynchronousBlockInputStream::calculate() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -std::_Function_handler::_M_invoke(std::_Any_data const&) - /usr/local/include/c++/9.1.0/bits/atomic_base.h:551 -ThreadPoolImpl::worker(std::_List_iterator) - /usr/local/include/c++/9.1.0/x86_64-pc-linux-gnu/bits/gthr-default.h:748 
-ThreadFromGlobalPool::ThreadFromGlobalPool::scheduleImpl(std::function, int, std::optional)::{lambda()#3}>(ThreadPoolImpl::scheduleImpl(std::function, int, std::optional)::{lambda()#3}&&)::{lambda()#1}::operator()() const - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/ThreadPool.h:146 -ThreadPoolImpl::worker(std::_List_iterator) - /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 -execute_native_thread_routine - /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 -start_thread - -__clone - - -Row 3: -────── -count(): 1978 -sym: StackTrace::StackTrace(ucontext_t const&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 -DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 - - -DB::VolnitskyBase >::search(unsigned char const*, unsigned long) const - /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse -DB::MatchImpl::vector_constant(DB::PODArray, 15ul, 16ul> const&, DB::PODArray, 15ul, 16ul> const&, std::__cxx11::basic_string, std::allocator > const&, DB::PODArray, 15ul, 16ul>&) - /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse -DB::FunctionsStringSearch, DB::NameLike>::executeImpl(DB::Block&, std::vector > const&, unsigned long, unsigned long) - /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse -DB::PreparedFunctionImpl::execute(DB::Block&, std::vector > const&, unsigned long, unsigned long, bool) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Functions/IFunction.cpp:464 -DB::ExpressionAction::execute(DB::Block&, bool) const - /usr/local/include/c++/9.1.0/bits/stl_vector.h:677 -DB::ExpressionActions::execute(DB::Block&, bool) const - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Interpreters/ExpressionActions.cpp:739 -DB::MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(DB::MergeTreeRangeReader::ReadResult&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:660 -DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:546 -DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 -DB::MergeTreeBaseSelectBlockInputStream::readImpl() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ExpressionBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::PartialSortingBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ParallelInputsProcessor::loop(unsigned long) - /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 -DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 -ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, 
unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const - /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 -ThreadPoolImpl::worker(std::_List_iterator) - /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 -execute_native_thread_routine - /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 -start_thread - -__clone - - -Row 4: -────── -count(): 1913 -sym: StackTrace::StackTrace(ucontext_t const&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 -DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 - - -DB::VolnitskyBase >::search(unsigned char const*, unsigned long) const - /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse -DB::MatchImpl::vector_constant(DB::PODArray, 15ul, 16ul> const&, DB::PODArray, 15ul, 16ul> const&, std::__cxx11::basic_string, std::allocator > const&, DB::PODArray, 15ul, 16ul>&) - /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse -DB::FunctionsStringSearch, DB::NameLike>::executeImpl(DB::Block&, std::vector > const&, unsigned long, unsigned long) - /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse -DB::PreparedFunctionImpl::execute(DB::Block&, std::vector > const&, unsigned long, unsigned long, bool) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Functions/IFunction.cpp:464 -DB::ExpressionAction::execute(DB::Block&, bool) const - /usr/local/include/c++/9.1.0/bits/stl_vector.h:677 -DB::ExpressionActions::execute(DB::Block&, bool) const - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Interpreters/ExpressionActions.cpp:739 -DB::MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(DB::MergeTreeRangeReader::ReadResult&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:660 -DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:546 -DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 -DB::MergeTreeBaseSelectBlockInputStream::readImpl() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ExpressionBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::PartialSortingBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ParallelInputsProcessor::loop(unsigned long) - /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 -DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 
-ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const - /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 -ThreadPoolImpl::worker(std::_List_iterator) - /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 -execute_native_thread_routine - /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 -start_thread - -__clone - - -Row 5: -────── -count(): 1672 -sym: StackTrace::StackTrace(ucontext_t const&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 -DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 - - -DB::VolnitskyBase >::search(unsigned char const*, unsigned long) const - /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse -DB::MatchImpl::vector_constant(DB::PODArray, 15ul, 16ul> const&, DB::PODArray, 15ul, 16ul> const&, std::__cxx11::basic_string, std::allocator > const&, DB::PODArray, 15ul, 16ul>&) - /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse -DB::FunctionsStringSearch, DB::NameLike>::executeImpl(DB::Block&, std::vector > const&, unsigned long, unsigned long) - /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse -DB::PreparedFunctionImpl::execute(DB::Block&, std::vector > const&, unsigned long, unsigned long, bool) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Functions/IFunction.cpp:464 -DB::ExpressionAction::execute(DB::Block&, bool) const - /usr/local/include/c++/9.1.0/bits/stl_vector.h:677 -DB::ExpressionActions::execute(DB::Block&, bool) const - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Interpreters/ExpressionActions.cpp:739 -DB::MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(DB::MergeTreeRangeReader::ReadResult&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:660 -DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:546 -DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 -DB::MergeTreeBaseSelectBlockInputStream::readImpl() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ExpressionBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::PartialSortingBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ParallelInputsProcessor::loop(unsigned long) - /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 -DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) - 
/home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 -ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const - /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 -ThreadPoolImpl::worker(std::_List_iterator) - /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 -execute_native_thread_routine - /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 -start_thread - -__clone - - -Row 6: -────── -count(): 1531 -sym: StackTrace::StackTrace(ucontext_t const&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 -DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 - - -read - -DB::ReadBufferFromFileDescriptor::nextImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/ReadBufferFromFileDescriptor.cpp:56 -DB::CompressedReadBufferBase::readCompressedData(unsigned long&, unsigned long&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/ReadBuffer.h:54 -DB::CompressedReadBufferFromFile::nextImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Compression/CompressedReadBufferFromFile.cpp:22 -void DB::deserializeBinarySSE2<4>(DB::PODArray, 15ul, 16ul>&, DB::PODArray, 15ul, 16ul>&, DB::ReadBuffer&, unsigned long) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/ReadBuffer.h:53 -DB::DataTypeString::deserializeBinaryBulk(DB::IColumn&, DB::ReadBuffer&, unsigned long, double) const - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataTypes/DataTypeString.cpp:202 -DB::MergeTreeReader::readData(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, DB::IColumn&, unsigned long, bool, unsigned long, bool) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:232 -DB::MergeTreeReader::readRows(unsigned long, bool, unsigned long, DB::Block&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:111 -DB::MergeTreeRangeReader::DelayedStream::finalize(DB::Block&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:35 -DB::MergeTreeRangeReader::startReadingChain(unsigned long, std::vector >&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:219 -DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 -DB::MergeTreeBaseSelectBlockInputStream::readImpl() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ExpressionBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::PartialSortingBlockInputStream::readImpl() - 
/home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ParallelInputsProcessor::loop(unsigned long) - /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 -DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 -ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const - /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 -ThreadPoolImpl::worker(std::_List_iterator) - /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 -execute_native_thread_routine - /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 -start_thread - -__clone - - -Row 7: -────── -count(): 1034 -sym: StackTrace::StackTrace(ucontext_t const&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 -DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 - - -DB::VolnitskyBase >::search(unsigned char const*, unsigned long) const - /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse -DB::MatchImpl::vector_constant(DB::PODArray, 15ul, 16ul> const&, DB::PODArray, 15ul, 16ul> const&, std::__cxx11::basic_string, std::allocator > const&, DB::PODArray, 15ul, 16ul>&) - /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse -DB::FunctionsStringSearch, DB::NameLike>::executeImpl(DB::Block&, std::vector > const&, unsigned long, unsigned long) - /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse -DB::PreparedFunctionImpl::execute(DB::Block&, std::vector > const&, unsigned long, unsigned long, bool) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Functions/IFunction.cpp:464 -DB::ExpressionAction::execute(DB::Block&, bool) const - /usr/local/include/c++/9.1.0/bits/stl_vector.h:677 -DB::ExpressionActions::execute(DB::Block&, bool) const - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Interpreters/ExpressionActions.cpp:739 -DB::MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(DB::MergeTreeRangeReader::ReadResult&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:660 -DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:546 -DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 -DB::MergeTreeBaseSelectBlockInputStream::readImpl() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ExpressionBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 
-DB::PartialSortingBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ParallelInputsProcessor::loop(unsigned long) - /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 -DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 -ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const - /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 -ThreadPoolImpl::worker(std::_List_iterator) - /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 -execute_native_thread_routine - /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 -start_thread - -__clone - - -Row 8: -────── -count(): 989 -sym: StackTrace::StackTrace(ucontext_t const&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 -DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 - - -__lll_lock_wait - -pthread_mutex_lock - -DB::MergeTreeReaderStream::loadMarks() - /usr/local/include/c++/9.1.0/bits/std_mutex.h:103 -DB::MergeTreeReaderStream::MergeTreeReaderStream(std::__cxx11::basic_string, std::allocator > const&, std::__cxx11::basic_string, std::allocator > const&, unsigned long, std::vector > const&, DB::MarkCache*, bool, DB::UncompressedCache*, unsigned long, unsigned long, unsigned long, DB::MergeTreeIndexGranularityInfo const*, std::function const&, int) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp:107 -std::_Function_handler > const&), DB::MergeTreeReader::addStreams(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, std::function const&, int)::{lambda(std::vector > const&)#1}>::_M_invoke(std::_Any_data const&, std::vector > const&) - /usr/local/include/c++/9.1.0/bits/unique_ptr.h:147 -DB::MergeTreeReader::addStreams(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, std::function const&, int) - /usr/local/include/c++/9.1.0/bits/stl_vector.h:677 -DB::MergeTreeReader::MergeTreeReader(std::__cxx11::basic_string, std::allocator > const&, std::shared_ptr const&, DB::NamesAndTypesList const&, DB::UncompressedCache*, DB::MarkCache*, bool, DB::MergeTreeData const&, std::vector > const&, unsigned long, unsigned long, std::map, std::allocator >, double, std::less, std::allocator > >, std::allocator, std::allocator > const, double> > > const&, std::function const&, int) - /usr/local/include/c++/9.1.0/bits/stl_list.h:303 -DB::MergeTreeThreadSelectBlockInputStream::getNewTask() - /usr/local/include/c++/9.1.0/bits/std_function.h:259 -DB::MergeTreeBaseSelectBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:54 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ExpressionBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 
-DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::PartialSortingBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ParallelInputsProcessor::loop(unsigned long) - /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 -DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 -ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const - /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 -ThreadPoolImpl::worker(std::_List_iterator) - /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 -execute_native_thread_routine - /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 -start_thread - -__clone - - -Row 9: -─────── -count(): 779 -sym: StackTrace::StackTrace(ucontext_t const&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 -DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 - - -void DB::deserializeBinarySSE2<4>(DB::PODArray, 15ul, 16ul>&, DB::PODArray, 15ul, 16ul>&, DB::ReadBuffer&, unsigned long) - /usr/local/lib/gcc/x86_64-pc-linux-gnu/9.1.0/include/emmintrin.h:727 -DB::DataTypeString::deserializeBinaryBulk(DB::IColumn&, DB::ReadBuffer&, unsigned long, double) const - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataTypes/DataTypeString.cpp:202 -DB::MergeTreeReader::readData(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, DB::IColumn&, unsigned long, bool, unsigned long, bool) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:232 -DB::MergeTreeReader::readRows(unsigned long, bool, unsigned long, DB::Block&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:111 -DB::MergeTreeRangeReader::DelayedStream::finalize(DB::Block&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:35 -DB::MergeTreeRangeReader::startReadingChain(unsigned long, std::vector >&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:219 -DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 -DB::MergeTreeBaseSelectBlockInputStream::readImpl() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ExpressionBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 
-DB::PartialSortingBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ParallelInputsProcessor::loop(unsigned long) - /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 -DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 -ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const - /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 -ThreadPoolImpl::worker(std::_List_iterator) - /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 -execute_native_thread_routine - /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 -start_thread - -__clone - - -Row 10: -─────── -count(): 666 -sym: StackTrace::StackTrace(ucontext_t const&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 -DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 - - -void DB::deserializeBinarySSE2<4>(DB::PODArray, 15ul, 16ul>&, DB::PODArray, 15ul, 16ul>&, DB::ReadBuffer&, unsigned long) - /usr/local/lib/gcc/x86_64-pc-linux-gnu/9.1.0/include/emmintrin.h:727 -DB::DataTypeString::deserializeBinaryBulk(DB::IColumn&, DB::ReadBuffer&, unsigned long, double) const - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataTypes/DataTypeString.cpp:202 -DB::MergeTreeReader::readData(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, DB::IColumn&, unsigned long, bool, unsigned long, bool) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:232 -DB::MergeTreeReader::readRows(unsigned long, bool, unsigned long, DB::Block&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:111 -DB::MergeTreeRangeReader::DelayedStream::finalize(DB::Block&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:35 -DB::MergeTreeRangeReader::startReadingChain(unsigned long, std::vector >&) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:219 -DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 -DB::MergeTreeBaseSelectBlockInputStream::readImpl() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ExpressionBlockInputStream::readImpl() - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::PartialSortingBlockInputStream::readImpl() - 
/home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 -DB::IBlockInputStream::read() - /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 -DB::ParallelInputsProcessor::loop(unsigned long) - /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 -DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) - /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 -ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const - /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 -ThreadPoolImpl::worker(std::_List_iterator) - /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 -execute_native_thread_routine - /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 -start_thread - -__clone -``` +``` text +{% include "operations/performance/sampling_query_profiler_example_result.txt" %} +``` diff --git a/docs/en/operations/performance/sampling_query_profiler_example_result.txt b/docs/en/operations/performance/sampling_query_profiler_example_result.txt new file mode 100644 index 00000000000..8e4e0e0fd70 --- /dev/null +++ b/docs/en/operations/performance/sampling_query_profiler_example_result.txt @@ -0,0 +1,556 @@ +Row 1: +────── +count(): 6344 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +read + +DB::ReadBufferFromFileDescriptor::nextImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/ReadBufferFromFileDescriptor.cpp:56 +DB::CompressedReadBufferBase::readCompressedData(unsigned long&, unsigned long&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/ReadBuffer.h:54 +DB::CompressedReadBufferFromFile::nextImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Compression/CompressedReadBufferFromFile.cpp:22 +DB::CompressedReadBufferFromFile::seek(unsigned long, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Compression/CompressedReadBufferFromFile.cpp:63 +DB::MergeTreeReaderStream::seekToMark(unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp:200 +std::_Function_handler > const&), DB::MergeTreeReader::readData(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, DB::IColumn&, unsigned long, bool, unsigned long, bool)::{lambda(bool)#1}::operator()(bool) const::{lambda(std::vector > const&)#1}>::_M_invoke(std::_Any_data const&, std::vector > const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:212 +DB::IDataType::deserializeBinaryBulkWithMultipleStreams(DB::IColumn&, unsigned long, DB::IDataType::DeserializeBinaryBulkSettings&, std::shared_ptr&) const + /usr/local/include/c++/9.1.0/bits/std_function.h:690 +DB::MergeTreeReader::readData(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, DB::IColumn&, unsigned long, bool, unsigned long, bool) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:232 
+DB::MergeTreeReader::readRows(unsigned long, bool, unsigned long, DB::Block&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:111 +DB::MergeTreeRangeReader::DelayedStream::finalize(DB::Block&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:35 +DB::MergeTreeRangeReader::continueReadingChain(DB::MergeTreeRangeReader::ReadResult&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:219 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:487 +DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 2: +────── +count(): 3295 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +__pthread_cond_wait + +std::condition_variable::wait(std::unique_lock&) + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/src/c++11/../../../../../gcc-9.1.0/libstdc++-v3/src/c++11/condition_variable.cc:55 +Poco::Semaphore::wait() + /home/milovidov/ClickHouse/build_gcc9/../contrib/poco/Foundation/src/Semaphore.cpp:61 +DB::UnionBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/x86_64-pc-linux-gnu/bits/gthr-default.h:748 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Core/Block.h:90 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 
+DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::LimitBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::AsynchronousBlockInputStream::calculate() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +std::_Function_handler::_M_invoke(std::_Any_data const&) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:551 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/x86_64-pc-linux-gnu/bits/gthr-default.h:748 +ThreadFromGlobalPool::ThreadFromGlobalPool::scheduleImpl(std::function, int, std::optional)::{lambda()#3}>(ThreadPoolImpl::scheduleImpl(std::function, int, std::optional)::{lambda()#3}&&)::{lambda()#1}::operator()() const + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/ThreadPool.h:146 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 3: +────── +count(): 1978 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +DB::VolnitskyBase >::search(unsigned char const*, unsigned long) const + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::MatchImpl::vector_constant(DB::PODArray, 15ul, 16ul> const&, DB::PODArray, 15ul, 16ul> const&, std::__cxx11::basic_string, std::allocator > const&, DB::PODArray, 15ul, 16ul>&) + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::FunctionsStringSearch, DB::NameLike>::executeImpl(DB::Block&, std::vector > const&, unsigned long, unsigned long) + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::PreparedFunctionImpl::execute(DB::Block&, std::vector > const&, unsigned long, unsigned long, bool) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Functions/IFunction.cpp:464 +DB::ExpressionAction::execute(DB::Block&, bool) const + /usr/local/include/c++/9.1.0/bits/stl_vector.h:677 +DB::ExpressionActions::execute(DB::Block&, bool) const + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Interpreters/ExpressionActions.cpp:739 +DB::MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(DB::MergeTreeRangeReader::ReadResult&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:660 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:546 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + 
/home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 4: +────── +count(): 1913 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +DB::VolnitskyBase >::search(unsigned char const*, unsigned long) const + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::MatchImpl::vector_constant(DB::PODArray, 15ul, 16ul> const&, DB::PODArray, 15ul, 16ul> const&, std::__cxx11::basic_string, std::allocator > const&, DB::PODArray, 15ul, 16ul>&) + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::FunctionsStringSearch, DB::NameLike>::executeImpl(DB::Block&, std::vector > const&, unsigned long, unsigned long) + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::PreparedFunctionImpl::execute(DB::Block&, std::vector > const&, unsigned long, unsigned long, bool) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Functions/IFunction.cpp:464 +DB::ExpressionAction::execute(DB::Block&, bool) const + /usr/local/include/c++/9.1.0/bits/stl_vector.h:677 +DB::ExpressionActions::execute(DB::Block&, bool) const + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Interpreters/ExpressionActions.cpp:739 +DB::MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(DB::MergeTreeRangeReader::ReadResult&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:660 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:546 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 
+DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 5: +────── +count(): 1672 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +DB::VolnitskyBase >::search(unsigned char const*, unsigned long) const + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::MatchImpl::vector_constant(DB::PODArray, 15ul, 16ul> const&, DB::PODArray, 15ul, 16ul> const&, std::__cxx11::basic_string, std::allocator > const&, DB::PODArray, 15ul, 16ul>&) + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::FunctionsStringSearch, DB::NameLike>::executeImpl(DB::Block&, std::vector > const&, unsigned long, unsigned long) + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::PreparedFunctionImpl::execute(DB::Block&, std::vector > const&, unsigned long, unsigned long, bool) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Functions/IFunction.cpp:464 +DB::ExpressionAction::execute(DB::Block&, bool) const + /usr/local/include/c++/9.1.0/bits/stl_vector.h:677 +DB::ExpressionActions::execute(DB::Block&, bool) const + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Interpreters/ExpressionActions.cpp:739 +DB::MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(DB::MergeTreeRangeReader::ReadResult&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:660 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:546 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + 
/usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 6: +────── +count(): 1531 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +read + +DB::ReadBufferFromFileDescriptor::nextImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/ReadBufferFromFileDescriptor.cpp:56 +DB::CompressedReadBufferBase::readCompressedData(unsigned long&, unsigned long&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/ReadBuffer.h:54 +DB::CompressedReadBufferFromFile::nextImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Compression/CompressedReadBufferFromFile.cpp:22 +void DB::deserializeBinarySSE2<4>(DB::PODArray, 15ul, 16ul>&, DB::PODArray, 15ul, 16ul>&, DB::ReadBuffer&, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/ReadBuffer.h:53 +DB::DataTypeString::deserializeBinaryBulk(DB::IColumn&, DB::ReadBuffer&, unsigned long, double) const + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataTypes/DataTypeString.cpp:202 +DB::MergeTreeReader::readData(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, DB::IColumn&, unsigned long, bool, unsigned long, bool) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:232 +DB::MergeTreeReader::readRows(unsigned long, bool, unsigned long, DB::Block&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:111 +DB::MergeTreeRangeReader::DelayedStream::finalize(DB::Block&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:35 +DB::MergeTreeRangeReader::startReadingChain(unsigned long, std::vector >&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:219 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + 
/usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 7: +────── +count(): 1034 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +DB::VolnitskyBase >::search(unsigned char const*, unsigned long) const + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::MatchImpl::vector_constant(DB::PODArray, 15ul, 16ul> const&, DB::PODArray, 15ul, 16ul> const&, std::__cxx11::basic_string, std::allocator > const&, DB::PODArray, 15ul, 16ul>&) + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::FunctionsStringSearch, DB::NameLike>::executeImpl(DB::Block&, std::vector > const&, unsigned long, unsigned long) + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::PreparedFunctionImpl::execute(DB::Block&, std::vector > const&, unsigned long, unsigned long, bool) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Functions/IFunction.cpp:464 +DB::ExpressionAction::execute(DB::Block&, bool) const + /usr/local/include/c++/9.1.0/bits/stl_vector.h:677 +DB::ExpressionActions::execute(DB::Block&, bool) const + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Interpreters/ExpressionActions.cpp:739 +DB::MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(DB::MergeTreeRangeReader::ReadResult&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:660 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:546 +DB::MergeTreeRangeReader::read(unsigned long, 
std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 8: +────── +count(): 989 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +__lll_lock_wait + +pthread_mutex_lock + +DB::MergeTreeReaderStream::loadMarks() + /usr/local/include/c++/9.1.0/bits/std_mutex.h:103 +DB::MergeTreeReaderStream::MergeTreeReaderStream(std::__cxx11::basic_string, std::allocator > const&, std::__cxx11::basic_string, std::allocator > const&, unsigned long, std::vector > const&, DB::MarkCache*, bool, DB::UncompressedCache*, unsigned long, unsigned long, unsigned long, DB::MergeTreeIndexGranularityInfo const*, std::function const&, int) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp:107 +std::_Function_handler > const&), DB::MergeTreeReader::addStreams(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, std::function const&, int)::{lambda(std::vector > const&)#1}>::_M_invoke(std::_Any_data const&, std::vector > const&) + /usr/local/include/c++/9.1.0/bits/unique_ptr.h:147 +DB::MergeTreeReader::addStreams(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, std::function const&, int) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:677 +DB::MergeTreeReader::MergeTreeReader(std::__cxx11::basic_string, std::allocator > const&, std::shared_ptr const&, DB::NamesAndTypesList const&, DB::UncompressedCache*, DB::MarkCache*, bool, DB::MergeTreeData const&, std::vector > const&, unsigned long, unsigned long, std::map, std::allocator >, double, std::less, 
std::allocator > >, std::allocator, std::allocator > const, double> > > const&, std::function const&, int) + /usr/local/include/c++/9.1.0/bits/stl_list.h:303 +DB::MergeTreeThreadSelectBlockInputStream::getNewTask() + /usr/local/include/c++/9.1.0/bits/std_function.h:259 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:54 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 9: +─────── +count(): 779 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +void DB::deserializeBinarySSE2<4>(DB::PODArray, 15ul, 16ul>&, DB::PODArray, 15ul, 16ul>&, DB::ReadBuffer&, unsigned long) + /usr/local/lib/gcc/x86_64-pc-linux-gnu/9.1.0/include/emmintrin.h:727 +DB::DataTypeString::deserializeBinaryBulk(DB::IColumn&, DB::ReadBuffer&, unsigned long, double) const + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataTypes/DataTypeString.cpp:202 +DB::MergeTreeReader::readData(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, DB::IColumn&, unsigned long, bool, unsigned long, bool) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:232 +DB::MergeTreeReader::readRows(unsigned long, bool, unsigned long, DB::Block&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:111 +DB::MergeTreeRangeReader::DelayedStream::finalize(DB::Block&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:35 +DB::MergeTreeRangeReader::startReadingChain(unsigned long, std::vector >&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:219 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeRangeReader::read(unsigned long, 
std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 10: +─────── +count(): 666 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +void DB::deserializeBinarySSE2<4>(DB::PODArray, 15ul, 16ul>&, DB::PODArray, 15ul, 16ul>&, DB::ReadBuffer&, unsigned long) + /usr/local/lib/gcc/x86_64-pc-linux-gnu/9.1.0/include/emmintrin.h:727 +DB::DataTypeString::deserializeBinaryBulk(DB::IColumn&, DB::ReadBuffer&, unsigned long, double) const + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataTypes/DataTypeString.cpp:202 +DB::MergeTreeReader::readData(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, DB::IColumn&, unsigned long, bool, unsigned long, bool) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:232 +DB::MergeTreeReader::readRows(unsigned long, bool, unsigned long, DB::Block&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:111 +DB::MergeTreeRangeReader::DelayedStream::finalize(DB::Block&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:35 +DB::MergeTreeRangeReader::startReadingChain(unsigned long, std::vector >&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:219 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 
+DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone diff --git a/docs/en/operations/performance_test.md b/docs/en/operations/performance_test.md index dfdabe08395..3413bc87346 100644 --- a/docs/en/operations/performance_test.md +++ b/docs/en/operations/performance_test.md @@ -1,76 +1,75 @@ -# How To Test Your Hardware With ClickHouse +# How To Test Your Hardware With ClickHouse {#how-to-test-your-hardware-with-clickhouse} With this instruction you can run basic ClickHouse performance test on any server without installation of ClickHouse packages. -\1. Go to "commits" page: [https://github.com/ClickHouse/ClickHouse/commits/master](https://github.com/ClickHouse/ClickHouse/commits/master) +1. Go to “commits” page: https://github.com/ClickHouse/ClickHouse/commits/master -\2. Click on the first green check mark or red cross with green "ClickHouse Build Check" and click on the "Details" link near "ClickHouse Build Check". +2. Click on the first green check mark or red cross with green “ClickHouse Build Check” and click on the “Details” link near “ClickHouse Build Check”. -\3. Copy the link to "clickhouse" binary for amd64 or aarch64. +3. Copy the link to “clickhouse” binary for amd64 or aarch64. -\4. ssh to the server and download it with wget: -``` -# For amd64: -wget https://clickhouse-builds.s3.yandex.net/0/00ba767f5d2a929394ea3be193b1f79074a1c4bc/1578163263_binary/clickhouse -# For aarch64: -wget https://clickhouse-builds.s3.yandex.net/0/00ba767f5d2a929394ea3be193b1f79074a1c4bc/1578161264_binary/clickhouse -# Then do: -chmod a+x clickhouse -``` +4. ssh to the server and download it with wget: -\5. 
Download configs: -``` -wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/programs/server/config.xml -wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/programs/server/users.xml -mkdir config.d -wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/programs/server/config.d/path.xml -O config.d/path.xml -wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/programs/server/config.d/log_to_console.xml -O config.d/log_to_console.xml -``` + -\6. Download benchmark files: -``` -wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/benchmark/clickhouse/benchmark-new.sh -chmod a+x benchmark-new.sh -wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/benchmark/clickhouse/queries.sql -``` + # For amd64: + wget https://clickhouse-builds.s3.yandex.net/0/00ba767f5d2a929394ea3be193b1f79074a1c4bc/1578163263_binary/clickhouse + # For aarch64: + wget https://clickhouse-builds.s3.yandex.net/0/00ba767f5d2a929394ea3be193b1f79074a1c4bc/1578161264_binary/clickhouse + # Then do: + chmod a+x clickhouse -\7. Download test data: +1. Download configs: -According to the instruction: + -[https://clickhouse.tech/docs/en/getting_started/example_datasets/metrica/](https://clickhouse.yandex/docs/en/getting_started/example_datasets/metrica/) + wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/programs/server/config.xml + wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/programs/server/users.xml + mkdir config.d + wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/programs/server/config.d/path.xml -O config.d/path.xml + wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/programs/server/config.d/log_to_console.xml -O config.d/log_to_console.xml -("hits" table containing 100 million rows) +1. Download benchmark files: -``` -wget https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_100m_obfuscated_v1.tar.xz -tar xvf hits_100m_obfuscated_v1.tar.xz -C . -mv hits_100m_obfuscated_v1/* . -``` + -\8. Run the server: -``` -./clickhouse server -``` + wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/benchmark/clickhouse/benchmark-new.sh + chmod a+x benchmark-new.sh + wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/benchmark/clickhouse/queries.sql -\9. Check the data: +1. Download test data according to the [Yandex.Metrica dataset](../getting_started/example_datasets/metrica.md) instruction (“hits” table containing 100 million rows). -ssh to the server in another terminal -``` -./clickhouse client --query "SELECT count() FROM hits_100m_obfuscated" -100000000 -``` + -\10. Edit the benchmark-new.sh, change "clickhouse-client" to "./clickhouse client" and add "--max_memory_usage 100000000000" parameter. -``` -mcedit benchmark-new.sh -``` + wget https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_100m_obfuscated_v1.tar.xz + tar xvf hits_100m_obfuscated_v1.tar.xz -C . + mv hits_100m_obfuscated_v1/* . -\11. Run the benchmark: -``` -./benchmark-new.sh hits_100m_obfuscated -``` +1. Run the server: -\12. Send the numbers and the info about your hardware configuration to clickhouse-feedback@yandex-team.com + -All the results are published here: [https://clickhouse.tech/benchmark_hardware.html](https://clickhouse.yandex/benchmark_hardware.html) + ./clickhouse server + +1. 
Check the data: ssh to the server in another terminal + + + + ./clickhouse client --query "SELECT count() FROM hits_100m_obfuscated" + 100000000 + +1. Edit the benchmark-new.sh, change “clickhouse-client” to “./clickhouse client” and add “--max\_memory\_usage 100000000000” parameter. + + + + mcedit benchmark-new.sh + +1. Run the benchmark: + + + + ./benchmark-new.sh hits_100m_obfuscated + +1. Send the numbers and the info about your hardware configuration to clickhouse-feedback@yandex-team.com + +All the results are published here: https://clickhouse.tech/benchmark\_hardware.html diff --git a/docs/en/operations/quotas.md b/docs/en/operations/quotas.md index 608e7f39034..51825788f9a 100644 --- a/docs/en/operations/quotas.md +++ b/docs/en/operations/quotas.md @@ -1,18 +1,18 @@ # Quotas {#quotas} Quotas allow you to limit resource usage over a period of time, or simply track the use of resources. -Quotas are set up in the user config. This is usually 'users.xml'. +Quotas are set up in the user config. This is usually ‘users.xml’. -The system also has a feature for limiting the complexity of a single query. See the section "Restrictions on query complexity"). +The system also has a feature for limiting the complexity of a single query. See the section “Restrictions on query complexity”). In contrast to query complexity restrictions, quotas: -- Place restrictions on a set of queries that can be run over a period of time, instead of limiting a single query. -- Account for resources spent on all remote servers for distributed query processing. +- Place restrictions on a set of queries that can be run over a period of time, instead of limiting a single query. +- Account for resources spent on all remote servers for distributed query processing. -Let's look at the section of the 'users.xml' file that defines quotas. +Let’s look at the section of the ‘users.xml’ file that defines quotas. -```xml +``` xml @@ -35,7 +35,7 @@ Let's look at the section of the 'users.xml' file that defines quotas. By default, the quota just tracks resource consumption for each hour, without limiting usage. The resource consumption calculated for each interval is output to the server log after each request. -```xml +``` xml @@ -61,7 +61,7 @@ The resource consumption calculated for each interval is output to the server lo ``` -For the 'statbox' quota, restrictions are set for every hour and for every 24 hours (86,400 seconds). The time interval is counted starting from an implementation-defined fixed moment in time. In other words, the 24-hour interval doesn't necessarily begin at midnight. +For the ‘statbox’ quota, restrictions are set for every hour and for every 24 hours (86,400 seconds). The time interval is counted starting from an implementation-defined fixed moment in time. In other words, the 24-hour interval doesn’t necessarily begin at midnight. When the interval ends, all collected values are cleared. For the next hour, the quota calculation starts over. @@ -79,28 +79,27 @@ Here are the amounts that can be restricted: If the limit is exceeded for at least one time interval, an exception is thrown with a text about which restriction was exceeded, for which interval, and when the new interval begins (when queries can be sent again). -Quotas can use the "quota key" feature in order to report on resources for multiple keys independently. Here is an example of this: +Quotas can use the “quota key” feature in order to report on resources for multiple keys independently. 
Here is an example of this: -```xml +``` xml - ``` -The quota is assigned to users in the 'users' section of the config. See the section "Access rights". +The quota is assigned to users in the ‘users’ section of the config. See the section “Access rights”. -For distributed query processing, the accumulated amounts are stored on the requestor server. So if the user goes to another server, the quota there will "start over". +For distributed query processing, the accumulated amounts are stored on the requestor server. So if the user goes to another server, the quota there will “start over”. When the server is restarted, quotas are reset. - [Original article](https://clickhouse.tech/docs/en/operations/quotas/) diff --git a/docs/en/operations/requirements.md b/docs/en/operations/requirements.md index cfe6e28a5fe..df4ea287cbc 100644 --- a/docs/en/operations/requirements.md +++ b/docs/en/operations/requirements.md @@ -1,53 +1,53 @@ -# Requirements +# Requirements {#requirements} -## CPU +## CPU {#cpu} -For installation from prebuilt deb packages, use a CPU with x86_64 architecture and support for SSE 4.2 instructions. To run ClickHouse with processors that do not support SSE 4.2 or have AArch64 or PowerPC64LE architecture, you should build ClickHouse from sources. +For installation from prebuilt deb packages, use a CPU with x86\_64 architecture and support for SSE 4.2 instructions. To run ClickHouse with processors that do not support SSE 4.2 or have AArch64 or PowerPC64LE architecture, you should build ClickHouse from sources. ClickHouse implements parallel data processing and uses all the hardware resources available. When choosing a processor, take into account that ClickHouse works more efficiently at configurations with a large number of cores but a lower clock rate than at configurations with fewer cores and a higher clock rate. For example, 16 cores with 2600 MHz is preferable to 8 cores with 3600 MHz. Use of **Turbo Boost** and **hyper-threading** technologies is recommended. It significantly improves performance with a typical load. -## RAM +## RAM {#ram} We recommend to use a minimum of 4GB of RAM in order to perform non-trivial queries. The ClickHouse server can run with a much smaller amount of RAM, but it requires memory for processing queries. The required volume of RAM depends on: - - The complexity of queries. - - The amount of data that is processed in queries. +- The complexity of queries. +- The amount of data that is processed in queries. To calculate the required volume of RAM, you should estimate the size of temporary data for [GROUP BY](../query_language/select.md#select-group-by-clause), [DISTINCT](../query_language/select.md#select-distinct), [JOIN](../query_language/select.md#select-join) and other operations you use. ClickHouse can use external memory for temporary data. See [GROUP BY in External Memory](../query_language/select.md#select-group-by-in-external-memory) for details. -## Swap File +## Swap File {#swap-file} Disable the swap file for production environments. -## Storage Subsystem +## Storage Subsystem {#storage-subsystem} You need to have 2GB of free disk space to install ClickHouse. The volume of storage required for your data should be calculated separately. Assessment should include: -- Estimation of the data volume. +- Estimation of the data volume. You can take a sample of the data and get the average size of a row from it. Then multiply the value by the number of rows you plan to store. -- The data compression coefficient. 
+- The data compression coefficient. To estimate the data compression coefficient, load a sample of your data into ClickHouse and compare the actual size of the data with the size of the table stored. For example, clickstream data is usually compressed by 6-10 times. To calculate the final volume of data to be stored, apply the compression coefficient to the estimated data volume. If you plan to store data in several replicas, then multiply the estimated volume by the number of replicas. -## Network +## Network {#network} If possible, use networks of 10G or higher class. The network bandwidth is critical for processing distributed queries with a large amount of intermediate data. In addition, network speed affects replication processes. -## Software +## Software {#software} ClickHouse is developed for the Linux family of operating systems. The recommended Linux distribution is Ubuntu. The `tzdata` package should be installed in the system. diff --git a/docs/en/operations/server_settings/index.md b/docs/en/operations/server_settings/index.md index 39f3a5bb6d4..c02947be247 100644 --- a/docs/en/operations/server_settings/index.md +++ b/docs/en/operations/server_settings/index.md @@ -1,12 +1,11 @@ -# Server configuration parameters {#server_settings} +# Server configuration parameters {#server-settings} This section contains descriptions of server settings that cannot be changed at the session or query level. These settings are stored in the `config.xml` file on the ClickHouse server. -Other settings are described in the "[Settings](../settings/index.md#settings)" section. +Other settings are described in the “[Settings](../settings/index.md#settings)” section. Before studying the settings, read the [Configuration files](../configuration_files.md#configuration_files) section and note the use of substitutions (the `incl` and `optional` attributes). - [Original article](https://clickhouse.tech/docs/en/operations/server_settings/) diff --git a/docs/en/operations/server_settings/settings.md b/docs/en/operations/server_settings/settings.md index 1e48b374711..d67a25bfa36 100644 --- a/docs/en/operations/server_settings/settings.md +++ b/docs/en/operations/server_settings/settings.md @@ -1,31 +1,29 @@ -# Server settings +# Server Settings {#server-settings} - -## builtin_dictionaries_reload_interval +## builtin\_dictionaries\_reload\_interval {#builtin-dictionaries-reload-interval} The interval in seconds before reloading built-in dictionaries. -ClickHouse reloads built-in dictionaries every x seconds. This makes it possible to edit dictionaries "on the fly" without restarting the server. +ClickHouse reloads built-in dictionaries every x seconds. This makes it possible to edit dictionaries “on the fly” without restarting the server. Default value: 3600. **Example** -```xml +``` xml 3600 ``` - ## compression {#server-settings-compression} Data compression settings for [MergeTree](../table_engines/mergetree.md)-engine tables. -!!! warning - Don't use it if you have just started using ClickHouse. +!!! warning "Warning" + Don’t use it if you have just started using ClickHouse. Configuration template: -```xml +``` xml ... @@ -38,22 +36,22 @@ Configuration template: `` fields: -- `min_part_size` – The minimum size of a data part. -- `min_part_size_ratio` – The ratio of the data part size to the table size. -- `method` – Compression method. Acceptable values: `lz4` or `zstd`. +- `min_part_size` – The minimum size of a data part. +- `min_part_size_ratio` – The ratio of the data part size to the table size. 
+- `method` – Compression method. Acceptable values: `lz4` or `zstd`. You can configure multiple `` sections. Actions when conditions are met: -- If a data part matches a condition set, ClickHouse uses the specified compression method. -- If a data part matches multiple condition sets, ClickHouse uses the first matched condition set. +- If a data part matches a condition set, ClickHouse uses the specified compression method. +- If a data part matches multiple condition sets, ClickHouse uses the first matched condition set. If no conditions met for a data part, ClickHouse uses the `lz4` compression. **Example** -```xml +``` xml 10000000000 @@ -63,8 +61,7 @@ If no conditions met for a data part, ClickHouse uses the `lz4` compression. ``` - -## default_database +## default\_database {#default-database} The default database. @@ -72,12 +69,11 @@ To get a list of databases, use the [SHOW DATABASES](../../query_language/show.m **Example** -```xml +``` xml default ``` - -## default_profile +## default\_profile {#default-profile} Default settings profile. @@ -85,30 +81,28 @@ Settings profiles are located in the file specified in the parameter `user_confi **Example** -```xml +``` xml default ``` - -## dictionaries_config {#server_settings-dictionaries_config} +## dictionaries\_config {#server-settings-dictionaries-config} The path to the config file for external dictionaries. Path: -- Specify the absolute path or the path relative to the server config file. -- The path can contain wildcards \* and ?. +- Specify the absolute path or the path relative to the server config file. +- The path can contain wildcards \* and ?. -See also "[External dictionaries](../../query_language/dicts/external_dicts.md)". +See also “[External dictionaries](../../query_language/dicts/external_dicts.md)”. **Example** -```xml +``` xml *_dictionary.xml ``` - -## dictionaries_lazy_load {#server_settings-dictionaries_lazy_load} +## dictionaries\_lazy\_load {#server-settings-dictionaries-lazy-load} Lazy loading of dictionaries. @@ -120,43 +114,42 @@ The default is `true`. **Example** -```xml +``` xml true ``` - -## format_schema_path {#server_settings-format_schema_path} +## format\_schema\_path {#server-settings-format-schema-path} The path to the directory with the schemes for the input data, such as schemas for the [CapnProto](../../interfaces/formats.md#capnproto) format. **Example** -```xml +``` xml format_schemas/ ``` -## graphite {#server_settings-graphite} +## graphite {#server-settings-graphite} Sending data to [Graphite](https://github.com/graphite-project). Settings: -- host – The Graphite server. -- port – The port on the Graphite server. -- interval – The interval for sending, in seconds. -- timeout – The timeout for sending data, in seconds. -- root_path – Prefix for keys. -- metrics – Sending data from the [system.metrics](../system_tables.md#system_tables-metrics) table. -- events – Sending deltas data accumulated for the time period from the [system.events](../system_tables.md#system_tables-events) table. -- events_cumulative – Sending cumulative data from the [system.events](../system_tables.md#system_tables-events) table. -- asynchronous_metrics – Sending data from the [system.asynchronous_metrics](../system_tables.md#system_tables-asynchronous_metrics) table. +- host – The Graphite server. +- port – The port on the Graphite server. +- interval – The interval for sending, in seconds. +- timeout – The timeout for sending data, in seconds. +- root\_path – Prefix for keys. 
+- metrics – Sending data from the [system.metrics](../system_tables.md#system_tables-metrics) table. +- events – Sending deltas data accumulated for the time period from the [system.events](../system_tables.md#system_tables-events) table. +- events\_cumulative – Sending cumulative data from the [system.events](../system_tables.md#system_tables-events) table. +- asynchronous\_metrics – Sending data from the [system.asynchronous\_metrics](../system_tables.md#system_tables-asynchronous_metrics) table. You can configure multiple `` clauses. For instance, you can use this for sending different data at different intervals. **Example** -```xml +``` xml localhost 42000 @@ -170,8 +163,7 @@ You can configure multiple `` clauses. For instance, you can use this ``` - -## graphite_rollup {#server_settings-graphite_rollup} +## graphite\_rollup {#server-settings-graphite-rollup} Settings for thinning data for Graphite. @@ -179,7 +171,7 @@ For more details, see [GraphiteMergeTree](../table_engines/graphitemergetree.md) **Example** -```xml +``` xml max @@ -199,64 +191,60 @@ For more details, see [GraphiteMergeTree](../table_engines/graphitemergetree.md) ``` - -## http_port/https_port +## http\_port/https\_port {#http-porthttps-port} The port for connecting to the server over HTTP(s). If `https_port` is specified, [openSSL](#server_settings-openssl) must be configured. -If `http_port` is specified, the openSSL configuration is ignored even if it is set. +If `http_port` is specified, the OpenSSL configuration is ignored even if it is set. **Example** -```xml +``` xml 0000 ``` - -## http_server_default_response {#server_settings-http_server_default_response} +## http\_server\_default\_response {#server-settings-http-server-default-response} The page that is shown by default when you access the ClickHouse HTTP(s) server. -Default value is "Ok." (with a line feed at the end) +The default value is “Ok.” (with a line feed at the end) **Example** -Opens `https://tabix.io/` when accessing ` http://localhost: http_port`. +Opens `https://tabix.io/` when accessing `http://localhost: http_port`. -```xml +``` xml
]]>
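<!-- the "]]>" above closes the CDATA section that holds the raw HTML of the default response page; per the example description, it opens https://tabix.io/ -->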
``` -## include_from {#server_settings-include_from} +## include\_from {#server-settings-include-from} The path to the file with substitutions. -For more information, see the section "[Configuration files](../configuration_files.md#configuration_files)". +For more information, see the section “[Configuration files](../configuration_files.md#configuration_files)”. **Example** -```xml +``` xml /etc/metrica.xml ``` - -## interserver_http_port +## interserver\_http\_port {#interserver-http-port} Port for exchanging data between ClickHouse servers. **Example** -```xml +``` xml 9009 ``` +## interserver\_http\_host {#interserver-http-host} -## interserver_http_host - -The host name that can be used by other servers to access this server. +The hostname that can be used by other servers to access this server. If omitted, it is defined in the same way as the `hostname-f` command. @@ -264,68 +252,65 @@ Useful for breaking away from a specific network interface. **Example** -```xml +``` xml example.yandex.ru ``` -## interserver_http_credentials {#server-settings-interserver_http_credentials} +## interserver\_http\_credentials {#server-settings-interserver-http-credentials} -The username and password used to authenticate during [replication](../table_engines/replication.md) with the Replicated* engines. These credentials are used only for communication between replicas and are unrelated to credentials for ClickHouse clients. The server is checking these credentials for connecting replicas and use the same credentials when connecting to other replicas. So, these credentials should be set the same for all replicas in a cluster. +The username and password used to authenticate during [replication](../table_engines/replication.md) with the Replicated\* engines. These credentials are used only for communication between replicas and are unrelated to credentials for ClickHouse clients. The server is checking these credentials for connecting replicas and use the same credentials when connecting to other replicas. So, these credentials should be set the same for all replicas in a cluster. By default, the authentication is not used. This section contains the following parameters: -- `user` — username. -- `password` — password. +- `user` — username. +- `password` — password. **Example** -```xml +``` xml admin 222 ``` - -## keep_alive_timeout +## keep\_alive\_timeout {#keep-alive-timeout} The number of seconds that ClickHouse waits for incoming requests before closing the connection. Defaults to 3 seconds. **Example** -```xml +``` xml 3 ``` - -## listen_host {#server_settings-listen_host} +## listen\_host {#server-settings-listen-host} Restriction on hosts that requests can come from. If you want the server to answer all of them, specify `::`. Examples: -```xml +``` xml ::1 127.0.0.1 ``` - -## logger {#server_settings-logger} +## logger {#server-settings-logger} Logging settings. Keys: -- level – Logging level. Acceptable values: ``trace``, ``debug``, ``information``, ``warning``, ``error``. -- log – The log file. Contains all the entries according to `level`. -- errorlog – Error log file. -- size – Size of the file. Applies to ``log``and``errorlog``. Once the file reaches ``size``, ClickHouse archives and renames it, and creates a new log file in its place. -- count – The number of archived log files that ClickHouse stores. +- level – Logging level. Acceptable values: `trace`, `debug`, `information`, `warning`, `error`. +- log – The log file. Contains all the entries according to `level`. 
+- errorlog – Error log file. +- size – Size of the file. Applies to `log`and`errorlog`. Once the file reaches `size`, ClickHouse archives and renames it, and creates a new log file in its place. +- count – The number of archived log files that ClickHouse stores. **Example** -```xml +``` xml trace /var/log/clickhouse-server/clickhouse-server.log @@ -337,7 +322,7 @@ Keys: Writing to the syslog is also supported. Config example: -```xml +``` xml 1 @@ -351,30 +336,28 @@ Writing to the syslog is also supported. Config example: Keys: -- use_syslog — Required setting if you want to write to the syslog. -- address — The host[:port] of syslogd. If omitted, the local daemon is used. -- hostname — Optional. The name of the host that logs are sent from. -- facility — [The syslog facility keyword](https://en.wikipedia.org/wiki/Syslog#Facility) in uppercase letters with the "LOG_" prefix: (``LOG_USER``, ``LOG_DAEMON``, ``LOG_LOCAL3``, and so on). -Default value: ``LOG_USER`` if ``address`` is specified, ``LOG_DAEMON otherwise.`` -- format – Message format. Possible values: ``bsd`` and ``syslog.`` +- use\_syslog — Required setting if you want to write to the syslog. +- address — The host\[:port\] of syslogd. If omitted, the local daemon is used. +- hostname — Optional. The name of the host that logs are sent from. +- facility — [The syslog facility keyword](https://en.wikipedia.org/wiki/Syslog#Facility) in uppercase letters with the “LOG\_” prefix: (`LOG_USER`, `LOG_DAEMON`, `LOG_LOCAL3`, and so on). + Default value: `LOG_USER` if `address` is specified, `LOG_DAEMON otherwise.` +- format – Message format. Possible values: `bsd` and `syslog.` - -## macros +## macros {#macros} Parameter substitutions for replicated tables. Can be omitted if replicated tables are not used. -For more information, see the section "[Creating replicated tables](../../operations/table_engines/replication.md)". +For more information, see the section “[Creating replicated tables](../../operations/table_engines/replication.md)”. **Example** -```xml +``` xml ``` - -## mark_cache_size {#server-mark-cache-size} +## mark\_cache\_size {#server-mark-cache-size} Approximate size (in bytes) of the cache of marks used by table engines of the [MergeTree](../table_engines/mergetree.md) family. @@ -382,53 +365,49 @@ The cache is shared for the server and memory is allocated as needed. The cache **Example** -```xml +``` xml 5368709120 ``` - -## max_concurrent_queries +## max\_concurrent\_queries {#max-concurrent-queries} The maximum number of simultaneously processed requests. **Example** -```xml +``` xml 100 ``` - -## max_connections +## max\_connections {#max-connections} The maximum number of inbound connections. **Example** -```xml +``` xml 4096 ``` - -## max_open_files +## max\_open\_files {#max-open-files} The maximum number of open files. By default: `maximum`. -We recommend using this option in Mac OS X, since the `getrlimit()` function returns an incorrect value. +We recommend using this option in Mac OS X since the `getrlimit()` function returns an incorrect value. **Example** -```xml +``` xml 262144 ``` - -## max_table_size_to_drop +## max\_table\_size\_to\_drop {#max-table-size-to-drop} Restriction on deleting tables. -If the size of a [MergeTree](../table_engines/mergetree.md) table exceeds `max_table_size_to_drop` (in bytes), you can't delete it using a DROP query. +If the size of a [MergeTree](../table_engines/mergetree.md) table exceeds `max_table_size_to_drop` (in bytes), you can’t delete it using a DROP query. 
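For illustration only, a minimal `config.xml` sketch that raises the limit (the 200 GB value below is a placeholder, not a recommendation):

``` xml
<!-- permit DROP queries for MergeTree tables up to roughly 200 GB; the value is in bytes and purely illustrative -->
<max_table_size_to_drop>200000000000</max_table_size_to_drop>
```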
If you still need to delete the table without restarting the ClickHouse server, create the `/flags/force_drop_table` file and run the DROP query. @@ -438,12 +417,11 @@ The value 0 means that you can delete all tables without any restrictions. **Example** -```xml +``` xml 0 ``` - -## merge_tree {#server_settings-merge_tree} +## merge\_tree {#server-settings-merge-tree} Fine tuning for tables in the [MergeTree](../table_engines/mergetree.md). @@ -451,14 +429,13 @@ For more information, see the MergeTreeSettings.h header file. **Example** -```xml +``` xml 5 ``` - -## openSSL {#server_settings-openssl} +## openSSL {#server-settings-openssl} SSL client/server configuration. @@ -466,30 +443,30 @@ Support for SSL is provided by the `libpoco` library. The interface is described Keys for server/client settings: -- privateKeyFile – The path to the file with the secret key of the PEM certificate. The file may contain a key and certificate at the same time. -- certificateFile – The path to the client/server certificate file in PEM format. You can omit it if `privateKeyFile` contains the certificate. -- caConfig – The path to the file or directory that contains trusted root certificates. -- verificationMode – The method for checking the node's certificates. Details are in the description of the [Context](https://github.com/ClickHouse-Extras/poco/blob/master/NetSSL_OpenSSL/include/Poco/Net/Context.h) class. Possible values: ``none``, ``relaxed``, ``strict``, ``once``. -- verificationDepth – The maximum length of the verification chain. Verification will fail if the certificate chain length exceeds the set value. -- loadDefaultCAFile – Indicates that built-in CA certificates for OpenSSL will be used. Acceptable values: `true`, `false`. | -- cipherList – Supported OpenSSL encryptions. For example: `ALL:!ADH:!LOW:!EXP:!MD5:@STRENGTH`. -- cacheSessions – Enables or disables caching sessions. Must be used in combination with ``sessionIdContext``. Acceptable values: `true`, `false`. -- sessionIdContext – A unique set of random characters that the server appends to each generated identifier. The length of the string must not exceed ``SSL_MAX_SSL_SESSION_ID_LENGTH``. This parameter is always recommended, since it helps avoid problems both if the server caches the session and if the client requested caching. Default value: ``${application.name}``. -- sessionCacheSize – The maximum number of sessions that the server caches. Default value: 1024\*20. 0 – Unlimited sessions. -- sessionTimeout – Time for caching the session on the server. -- extendedVerification – Automatically extended verification of certificates after the session ends. Acceptable values: `true`, `false`. -- requireTLSv1 – Require a TLSv1 connection. Acceptable values: `true`, `false`. -- requireTLSv1_1 – Require a TLSv1.1 connection. Acceptable values: `true`, `false`. -- requireTLSv1 – Require a TLSv1.2 connection. Acceptable values: `true`, `false`. -- fips – Activates OpenSSL FIPS mode. Supported if the library's OpenSSL version supports FIPS. -- privateKeyPassphraseHandler – Class (PrivateKeyPassphraseHandler subclass) that requests the passphrase for accessing the private key. For example: ````, ``KeyFileHandler``, ``test``, ````. -- invalidCertificateHandler – Class (subclass of CertificateHandler) for verifying invalid certificates. For example: `` ConsoleCertificateHandler `` . -- disableProtocols – Protocols that are not allowed to use. -- preferServerCiphers – Preferred server ciphers on the client. 
+- privateKeyFile – The path to the file with the secret key of the PEM certificate. The file may contain a key and certificate at the same time. +- certificateFile – The path to the client/server certificate file in PEM format. You can omit it if `privateKeyFile` contains the certificate. +- caConfig – The path to the file or directory that contains trusted root certificates. +- verificationMode – The method for checking the node’s certificates. Details are in the description of the [Context](https://github.com/ClickHouse-Extras/poco/blob/master/NetSSL_OpenSSL/include/Poco/Net/Context.h) class. Possible values: `none`, `relaxed`, `strict`, `once`. +- verificationDepth – The maximum length of the verification chain. Verification will fail if the certificate chain length exceeds the set value. +- loadDefaultCAFile – Indicates that built-in CA certificates for OpenSSL will be used. Acceptable values: `true`, `false`. \| +- cipherList – Supported OpenSSL encryptions. For example: `ALL:!ADH:!LOW:!EXP:!MD5:@STRENGTH`. +- cacheSessions – Enables or disables caching sessions. Must be used in combination with `sessionIdContext`. Acceptable values: `true`, `false`. +- sessionIdContext – A unique set of random characters that the server appends to each generated identifier. The length of the string must not exceed `SSL_MAX_SSL_SESSION_ID_LENGTH`. This parameter is always recommended since it helps avoid problems both if the server caches the session and if the client requested caching. Default value: `${application.name}`. +- sessionCacheSize – The maximum number of sessions that the server caches. Default value: 1024\*20. 0 – Unlimited sessions. +- sessionTimeout – Time for caching the session on the server. +- extendedVerification – Automatically extended verification of certificates after the session ends. Acceptable values: `true`, `false`. +- requireTLSv1 – Require a TLSv1 connection. Acceptable values: `true`, `false`. +- requireTLSv1\_1 – Require a TLSv1.1 connection. Acceptable values: `true`, `false`. +- requireTLSv1 – Require a TLSv1.2 connection. Acceptable values: `true`, `false`. +- fips – Activates OpenSSL FIPS mode. Supported if the library’s OpenSSL version supports FIPS. +- privateKeyPassphraseHandler – Class (PrivateKeyPassphraseHandler subclass) that requests the passphrase for accessing the private key. For example: ``, `KeyFileHandler`, `test`, ``. +- invalidCertificateHandler – Class (a subclass of CertificateHandler) for verifying invalid certificates. For example: ` ConsoleCertificateHandler ` . +- disableProtocols – Protocols that are not allowed to use. +- preferServerCiphers – Preferred server ciphers on the client. **Example of settings:** -```xml +``` xml @@ -517,23 +494,22 @@ Keys for server/client settings: ``` - -## part_log {#server_settings-part-log} +## part\_log {#server-settings-part-log} Logging events that are associated with [MergeTree](../table_engines/mergetree.md). For instance, adding or merging data. You can use the log to simulate merge algorithms and compare their characteristics. You can visualize the merge process. -Queries are logged in the [system.part_log](../system_tables.md#system_tables-part-log) table, not in a separate file. You can configure the name of this table in the `table` parameter (see below). +Queries are logged in the [system.part\_log](../system_tables.md#system_tables-part-log) table, not in a separate file. You can configure the name of this table in the `table` parameter (see below). 
Use the following parameters to configure logging: -- `database` – Name of the database. -- `table` – Name of the system table. -- `partition_by` – Sets a [custom partitioning key](../../operations/table_engines/custom_partitioning_key.md). -- `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table. +- `database` – Name of the database. +- `table` – Name of the system table. +- `partition_by` – Sets a [custom partitioning key](../../operations/table_engines/custom_partitioning_key.md). +- `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table. **Example** -```xml +``` xml system
part_log
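<!-- the values shown ("system", "part_log") correspond to the database and table parameters described above -->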
@@ -542,39 +518,37 @@ Use the following parameters to configure logging: ``` - -## path {#server_settings-path} +## path {#server-settings-path} The path to the directory containing data. -!!! note +!!! note "Note" The trailing slash is mandatory. **Example** -```xml +``` xml /var/lib/clickhouse/ ``` +## query\_log {#server-settings-query-log} -## query_log {#server_settings-query-log} +Setting for logging queries received with the [log\_queries=1](../settings/settings.md) setting. -Setting for logging queries received with the [log_queries=1](../settings/settings.md) setting. - -Queries are logged in the [system.query_log](../system_tables.md#system_tables-query_log) table, not in a separate file. You can change the name of the table in the `table` parameter (see below). +Queries are logged in the [system.query\_log](../system_tables.md#system_tables-query_log) table, not in a separate file. You can change the name of the table in the `table` parameter (see below). Use the following parameters to configure logging: -- `database` – Name of the database. -- `table` – Name of the system table the queries will be logged in. -- `partition_by` – Sets a [custom partitioning key](../../operations/table_engines/custom_partitioning_key.md) for a table. -- `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table. +- `database` – Name of the database. +- `table` – Name of the system table the queries will be logged in. +- `partition_by` – Sets a [custom partitioning key](../../operations/table_engines/custom_partitioning_key.md) for a table. +- `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table. -If the table doesn't exist, ClickHouse will create it. If the structure of the query log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically. +If the table doesn’t exist, ClickHouse will create it. If the structure of the query log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically. **Example** -```xml +``` xml system query_log
@@ -583,24 +557,24 @@ If the table doesn't exist, ClickHouse will create it. If the structure of the q
``` -## query_thread_log {#server_settings-query-thread-log} +## query\_thread\_log {#server-settings-query-thread-log} -Setting for logging threads of queries received with the [log_query_threads=1](../settings/settings.md#settings-log-query-threads) setting. +Setting for logging threads of queries received with the [log\_query\_threads=1](../settings/settings.md#settings-log-query-threads) setting. -Queries are logged in the [system.query_thread_log](../system_tables.md#system_tables-query-thread-log) table, not in a separate file. You can change the name of the table in the `table` parameter (see below). +Queries are logged in the [system.query\_thread\_log](../system_tables.md#system_tables-query-thread-log) table, not in a separate file. You can change the name of the table in the `table` parameter (see below). Use the following parameters to configure logging: -- `database` – Name of the database. -- `table` – Name of the system table the queries will be logged in. -- `partition_by` – Sets a [custom partitioning key](../../operations/table_engines/custom_partitioning_key.md) for a system table. -- `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table. +- `database` – Name of the database. +- `table` – Name of the system table the queries will be logged in. +- `partition_by` – Sets a [custom partitioning key](../../operations/table_engines/custom_partitioning_key.md) for a system table. +- `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table. -If the table doesn't exist, ClickHouse will create it. If the structure of the query thread log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically. +If the table doesn’t exist, ClickHouse will create it. If the structure of the query thread log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically. **Example** -```xml +``` xml system query_thread_log
@@ -609,20 +583,20 @@ If the table doesn't exist, ClickHouse will create it. If the structure of the q
``` -## trace_log {#server_settings-trace_log} +## trace\_log {#server-settings-trace-log} -Settings for the [trace_log](../system_tables.md#system_tables-trace_log) system table operation. +Settings for the [trace\_log](../system_tables.md#system_tables-trace_log) system table operation. Parameters: -- `database` — Database for storing a table. -- `table` — Table name. -- `partition_by` — [Custom partitioning key](../../operations/table_engines/custom_partitioning_key.md) for a system table. -- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table. +- `database` — Database for storing a table. +- `table` — Table name. +- `partition_by` — [Custom partitioning key](../../operations/table_engines/custom_partitioning_key.md) for a system table. +- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table. The default server configuration file `config.xml` contains the following settings section: -```xml +``` xml system trace_log
@@ -631,16 +605,16 @@ The default server configuration file `config.xml` contains the following settin
``` -## query_masking_rules +## query\_masking\_rules {#query-masking-rules} Regexp-based rules, which will be applied to queries as well as all log messages before storing them in server logs, -`system.query_log`, `system.text_log`, `system.processes` table, and in logs sent to client. That allows preventing -sensitive data leakage from SQL queries (like names / emails / personal -identifiers / credit card numbers etc) to logs. +`system.query_log`, `system.text_log`, `system.processes` table, and in logs sent to the client. That allows preventing +sensitive data leakage from SQL queries (like names, emails, personal +identifiers or credit card numbers) to logs. **Example** -```xml +``` xml hide SSN @@ -655,55 +629,54 @@ Config fields: - `regexp` - RE2 compatible regular expression (mandatory) - `replace` - substitution string for sensitive data (optional, by default - six asterisks) -The masking rules are applied on whole query (to prevent leaks of sensitive data from malformed / non parsable queries). +The masking rules are applied to the whole query (to prevent leaks of sensitive data from malformed / non-parsable queries). -`system.events` table have counter `QueryMaskingRulesMatch` which have overall number of query masking rules matches. +`system.events` table have counter `QueryMaskingRulesMatch` which have an overall number of query masking rules matches. -For distributed queries each server have to be configured separately, otherwise subquries passed to other +For distributed queries each server have to be configured separately, otherwise, subqueries passed to other nodes will be stored without masking. -## remote_servers {#server_settings_remote_servers} +## remote\_servers {#server-settings-remote-servers} Configuration of clusters used by the [Distributed](../../operations/table_engines/distributed.md) table engine and by the `cluster` table function. **Example** -```xml +``` xml ``` -For the value of the `incl` attribute, see the section "[Configuration files](../configuration_files.md#configuration_files)". +For the value of the `incl` attribute, see the section “[Configuration files](../configuration_files.md#configuration_files)”. **See Also** -- [skip_unavailable_shards](../settings/settings.md#settings-skip_unavailable_shards) +- [skip\_unavailable\_shards](../settings/settings.md#settings-skip_unavailable_shards) -## timezone {#server_settings-timezone} +## timezone {#server-settings-timezone} -The server's time zone. +The server’s time zone. -Specified as an IANA identifier for the UTC time zone or geographic location (for example, Africa/Abidjan). +Specified as an IANA identifier for the UTC timezone or geographic location (for example, Africa/Abidjan). -The time zone is necessary for conversions between String and DateTime formats when DateTime fields are output to text format (printed on the screen or in a file), and when getting DateTime from a string. In addition, the time zone is used in functions that work with the time and date if they didn't receive the time zone in the input parameters. +The time zone is necessary for conversions between String and DateTime formats when DateTime fields are output to text format (printed on the screen or in a file), and when getting DateTime from a string. Besides, the time zone is used in functions that work with the time and date if they didn’t receive the time zone in the input parameters. 
**Example** -```xml +``` xml Europe/Moscow ``` - -## tcp_port {#server_settings-tcp_port} +## tcp\_port {#server-settings-tcp-port} Port for communicating with clients over the TCP protocol. **Example** -```xml +``` xml 9000 ``` -## tcp_port_secure {#server_settings-tcp_port_secure} +## tcp\_port\_secure {#server-settings-tcp-port-secure} TCP port for secure communication with clients. Use it with [OpenSSL](#server_settings-openssl) settings. @@ -713,11 +686,11 @@ Positive integer. **Default value** -```xml +``` xml 9440 ``` -## mysql_port {#server_settings-mysql_port} +## mysql\_port {#server-settings-mysql-port} Port for communicating with clients over MySQL protocol. @@ -727,77 +700,74 @@ Positive integer. Example -```xml +``` xml 9004 ``` -## tmp_path {#server-settings-tmp_path} +## tmp\_path {#server-settings-tmp-path} Path to temporary data for processing large queries. -!!! note +!!! note "Note" The trailing slash is mandatory. **Example** -```xml +``` xml /var/lib/clickhouse/tmp/ ``` - -## tmp_policy {#server-settings-tmp_policy} +## tmp\_policy {#server-settings-tmp-policy} Policy from [`storage_configuration`](../table_engines/mergetree.md#table_engine-mergetree-multiple-volumes) to store temporary files. If not set [`tmp_path`](#server-settings-tmp_path) is used, otherwise it is ignored. -!!! note +!!! note "Note" - `move_factor` is ignored - - `keep_free_space_bytes` is ignored - - `max_data_part_size_bytes` is ignored - - you must have exactly one volume in that policy +- `keep_free_space_bytes` is ignored +- `max_data_part_size_bytes` is ignored +- you must have exactly one volume in that policy -## uncompressed_cache_size {#server-settings-uncompressed_cache_size} +## uncompressed\_cache\_size {#server-settings-uncompressed-cache-size} Cache size (in bytes) for uncompressed data used by table engines from the [MergeTree](../table_engines/mergetree.md). -There is one shared cache for the server. Memory is allocated on demand. The cache is used if the option [use_uncompressed_cache](../settings/settings.md#setting-use_uncompressed_cache) is enabled. +There is one shared cache for the server. Memory is allocated on demand. The cache is used if the option [use\_uncompressed\_cache](../settings/settings.md#setting-use_uncompressed_cache) is enabled. The uncompressed cache is advantageous for very short queries in individual cases. **Example** -```xml +``` xml 8589934592 ``` -## user_files_path {#server_settings-user_files_path} +## user\_files\_path {#server-settings-user-files-path} The directory with user files. Used in the table function [file()](../../query_language/table_functions/file.md). **Example** -```xml +``` xml /var/lib/clickhouse/user_files/ ``` - -## users_config +## users\_config {#users-config} Path to the file that contains: -- User configurations. -- Access rights. -- Settings profiles. -- Quota settings. +- User configurations. +- Access rights. +- Settings profiles. +- Quota settings. **Example** -```xml +``` xml users.xml ``` - -## zookeeper {#server-settings_zookeeper} +## zookeeper {#server-settings-zookeeper} Contains settings that allow ClickHouse to interact with a [ZooKeeper](http://zookeeper.apache.org/) cluster. @@ -805,26 +775,28 @@ ClickHouse uses ZooKeeper for storing metadata of replicas when using replicated This section contains the following parameters: -- `node` — ZooKeeper endpoint. You can set multiple endpoints. +- `node` — ZooKeeper endpoint. You can set multiple endpoints. 
For example: -```xml + + +``` xml example_host 2181 ``` - The `index` attribute specifies the node order when trying to connect to the ZooKeeper cluster. + The `index` attribute specifies the node order when trying to connect to the ZooKeeper cluster. -- `session_timeout` — Maximum timeout for the client session in milliseconds. -- `root` — The [znode](http://zookeeper.apache.org/doc/r3.5.5/zookeeperOver.html#Nodes+and+ephemeral+nodes) that is used as the root for znodes used by the ClickHouse server. Optional. -- `identity` — User and password, that can be required by ZooKeeper to give access to requested znodes. Optional. +- `session_timeout` — Maximum timeout for the client session in milliseconds. +- `root` — The [znode](http://zookeeper.apache.org/doc/r3.5.5/zookeeperOver.html#Nodes+and+ephemeral+nodes) that is used as the root for znodes used by the ClickHouse server. Optional. +- `identity` — User and password, that can be required by ZooKeeper to give access to requested znodes. Optional. **Example configuration** -```xml +``` xml example1 @@ -845,45 +817,45 @@ This section contains the following parameters: **See Also** -- [Replication](../../operations/table_engines/replication.md) -- [ZooKeeper Programmer's Guide](http://zookeeper.apache.org/doc/current/zookeeperProgrammers.html) +- [Replication](../../operations/table_engines/replication.md) +- [ZooKeeper Programmer’s Guide](http://zookeeper.apache.org/doc/current/zookeeperProgrammers.html) -## use_minimalistic_part_header_in_zookeeper {#server-settings-use_minimalistic_part_header_in_zookeeper} +## use\_minimalistic\_part\_header\_in\_zookeeper {#server-settings-use-minimalistic-part-header-in-zookeeper} Storage method for data part headers in ZooKeeper. This setting only applies to the `MergeTree` family. It can be specified: -- Globally in the [merge_tree](#server_settings-merge_tree) section of the `config.xml` file. +- Globally in the [merge\_tree](#server_settings-merge_tree) section of the `config.xml` file. - ClickHouse uses the setting for all the tables on the server. You can change the setting at any time. Existing tables change their behavior when the setting changes. + ClickHouse uses the setting for all the tables on the server. You can change the setting at any time. Existing tables change their behaviour when the setting changes. -- For each individual table. +- For each table. - When creating a table, specify the corresponding [engine setting](../table_engines/mergetree.md#table_engine-mergetree-creating-a-table). The behavior of an existing table with this setting does not change, even if the global setting changes. + When creating a table, specify the corresponding [engine setting](../table_engines/mergetree.md#table_engine-mergetree-creating-a-table). The behaviour of an existing table with this setting does not change, even if the global setting changes. **Possible values** -- 0 — Functionality is turned off. -- 1 — Functionality is turned on. +- 0 — Functionality is turned off. +- 1 — Functionality is turned on. If `use_minimalistic_part_header_in_zookeeper = 1`, then [replicated](../table_engines/replication.md) tables store the headers of the data parts compactly using a single `znode`. If the table contains many columns, this storage method significantly reduces the volume of the data stored in Zookeeper. -!!! attention - After applying `use_minimalistic_part_header_in_zookeeper = 1`, you can't downgrade the ClickHouse server to a version that doesn't support this setting. 
Be careful when upgrading ClickHouse on servers in a cluster. Don't upgrade all the servers at once. It is safer to test new versions of ClickHouse in a test environment, or on just a few servers of a cluster. +!!! attention "Attention" + After applying `use_minimalistic_part_header_in_zookeeper = 1`, you can’t downgrade the ClickHouse server to a version that doesn’t support this setting. Be careful when upgrading ClickHouse on servers in a cluster. Don’t upgrade all the servers at once. It is safer to test new versions of ClickHouse in a test environment, or on just a few servers of a cluster. - Data part headers already stored with this setting can't be restored to their previous (non-compact) representation. + Data part headers already stored with this setting can't be restored to their previous (non-compact) representation. **Default value:** 0. -## disable_internal_dns_cache {#server-settings-disable_internal_dns_cache} +## disable\_internal\_dns\_cache {#server-settings-disable-internal-dns-cache} Disables the internal DNS cache. Recommended for operating ClickHouse in systems with frequently changing infrastructure such as Kubernetes. **Default value:** 0. -## dns_cache_update_period {#server-settings-dns_cache_update_period} +## dns\_cache\_update\_period {#server-settings-dns-cache-update-period} The period of updating IP addresses stored in the ClickHouse internal DNS cache (in seconds). The update is performed asynchronously, in a separate system thread. diff --git a/docs/en/operations/settings/constraints_on_settings.md b/docs/en/operations/settings/constraints_on_settings.md index 12419092acb..3fd21c26aa7 100644 --- a/docs/en/operations/settings/constraints_on_settings.md +++ b/docs/en/operations/settings/constraints_on_settings.md @@ -1,9 +1,9 @@ -# Constraints on Settings +# Constraints on Settings {#constraints-on-settings} The constraints on settings can be defined in the `profiles` section of the `user.xml` configuration file and prohibit users from changing some of the settings with the `SET` query. -The constraints are defined as following: +The constraints are defined as the following: -```xml +``` xml @@ -25,12 +25,12 @@ The constraints are defined as following: ``` -If user tries to violate the constraints an exception is thrown and the setting isn't actually changed. -There are supported three types of constraints: `min`, `max`, `readonly`. The `min` and `max` constraints specify upper and lower boundaries for a numeric setting and can be used in combination. The `readonly` constraint specify that the user cannot change the corresponding setting at all. +If the user tries to violate the constraints an exception is thrown and the setting isn’t changed. +There are supported three types of constraints: `min`, `max`, `readonly`. The `min` and `max` constraints specify upper and lower boundaries for a numeric setting and can be used in combination. The `readonly` constraint specifies that the user cannot change the corresponding setting at all. **Example:** Let `users.xml` includes lines: -```xml +``` xml 10000000000 @@ -51,18 +51,18 @@ There are supported three types of constraints: `min`, `max`, `readonly`. The `m The following queries all throw exceptions: -```sql +``` sql SET max_memory_usage=20000000001; SET max_memory_usage=4999999999; SET force_index_by_date=1; ``` -```text +``` text Code: 452, e.displayText() = DB::Exception: Setting max_memory_usage should not be greater than 20000000000. 
Code: 452, e.displayText() = DB::Exception: Setting max_memory_usage should not be less than 5000000000. Code: 452, e.displayText() = DB::Exception: Setting force_index_by_date should not be changed. ``` -**Note:** the `default` profile has a special handling: all the constraints defined for the `default` profile become the default constraints, so they restrict all the users until they're overriden explicitly for these users. +**Note:** the `default` profile has special handling: all the constraints defined for the `default` profile become the default constraints, so they restrict all the users until they’re overridden explicitly for these users. [Original article](https://clickhouse.tech/docs/en/operations/settings/constraints_on_settings/) diff --git a/docs/en/operations/settings/index.md b/docs/en/operations/settings/index.md index 3dd02876955..7a8fb2fef5f 100644 --- a/docs/en/operations/settings/index.md +++ b/docs/en/operations/settings/index.md @@ -5,20 +5,20 @@ Settings are configured in layers, so each subsequent layer redefines the previo Ways to configure settings, in order of priority: -- Settings in the `users.xml` server configuration file. +- Settings in the `users.xml` server configuration file. Set in the element ``. -- Session settings. +- Session settings. Send `SET setting=value` from the ClickHouse console client in interactive mode. -Similarly, you can use ClickHouse sessions in the HTTP protocol. To do this, you need to specify the `session_id` HTTP parameter. + Similarly, you can use ClickHouse sessions in the HTTP protocol. To do this, you need to specify the `session_id` HTTP parameter. -- Query settings. - - When starting the ClickHouse console client in non-interactive mode, set the startup parameter `--setting=value`. - - When using the HTTP API, pass CGI parameters (`URL?setting_1=value&setting_2=value...`). +- Query settings. + + - When starting the ClickHouse console client in non-interactive mode, set the startup parameter `--setting=value`. + - When using the HTTP API, pass CGI parameters (`URL?setting_1=value&setting_2=value...`). Settings that can only be made in the server config file are not covered in this section. - [Original article](https://clickhouse.tech/docs/en/operations/settings/) diff --git a/docs/en/operations/settings/permissions_for_queries.md b/docs/en/operations/settings/permissions_for_queries.md index e6dcd490e97..c5763a66138 100644 --- a/docs/en/operations/settings/permissions_for_queries.md +++ b/docs/en/operations/settings/permissions_for_queries.md @@ -1,33 +1,33 @@ -# Permissions for queries {#permissions_for_queries} +# Permissions for Queries {#permissions-for-queries} Queries in ClickHouse can be divided into several types: -1. Read data queries: `SELECT`, `SHOW`, `DESCRIBE`, `EXISTS`. -1. Write data queries: `INSERT`, `OPTIMIZE`. -1. Change settings queries: `SET`, `USE`. -1. [DDL](https://en.wikipedia.org/wiki/Data_definition_language) queries: `CREATE`, `ALTER`, `RENAME`, `ATTACH`, `DETACH`, `DROP` `TRUNCATE`. -1. `KILL QUERY`. +1. Read data queries: `SELECT`, `SHOW`, `DESCRIBE`, `EXISTS`. +2. Write data queries: `INSERT`, `OPTIMIZE`. +3. Change settings query: `SET`, `USE`. +4. [DDL](https://en.wikipedia.org/wiki/Data_definition_language) queries: `CREATE`, `ALTER`, `RENAME`, `ATTACH`, `DETACH`, `DROP` `TRUNCATE`. +5. `KILL QUERY`. The following settings regulate user permissions by the type of query: -- [readonly](#settings_readonly) — Restricts permissions for all types of queries except DDL queries. 
-- [allow_ddl](#settings_allow_ddl) — Restricts permissions for DDL queries. +- [readonly](#settings_readonly) — Restricts permissions for all types of queries except DDL queries. +- [allow\_ddl](#settings_allow_ddl) — Restricts permissions for DDL queries. `KILL QUERY` can be performed with any settings. -## readonly {#settings_readonly} +## readonly {#settings-readonly} -Restricts permissions for read data, write data and change settings queries. +Restricts permissions for reading data, write data and change settings queries. See how the queries are divided into types [above](#permissions_for_queries). Possible values: -- 0 — All queries are allowed. -- 1 — Only read data queries are allowed. -- 2 — Read data and change settings queries are allowed. +- 0 — All queries are allowed. +- 1 — Only read data queries are allowed. +- 2 — Read data and change settings queries are allowed. -After setting `readonly = 1`, the user can't change `readonly` and `allow_ddl` settings in the current session. +After setting `readonly = 1`, the user can’t change `readonly` and `allow_ddl` settings in the current session. When using the `GET` method in the [HTTP interface](../../interfaces/http.md), `readonly = 1` is set automatically. To modify data, use the `POST` method. @@ -36,7 +36,7 @@ from changing only specific settings, for details see [constraints on settings]( Default value: 0 -## allow_ddl {#settings_allow_ddl} +## allow\_ddl {#settings-allow-ddl} Allows or denies [DDL](https://en.wikipedia.org/wiki/Data_definition_language) queries. @@ -44,10 +44,10 @@ See how the queries are divided into types [above](#permissions_for_queries). Possible values: -- 0 — DDL queries are not allowed. -- 1 — DDL queries are allowed. +- 0 — DDL queries are not allowed. +- 1 — DDL queries are allowed. -You can't execute `SET allow_ddl = 1` if `allow_ddl = 0` for the current session. +You can’t execute `SET allow_ddl = 1` if `allow_ddl = 0` for the current session. Default value: 1 diff --git a/docs/en/operations/settings/query_complexity.md b/docs/en/operations/settings/query_complexity.md index 5b9db828d03..2af79218d2d 100644 --- a/docs/en/operations/settings/query_complexity.md +++ b/docs/en/operations/settings/query_complexity.md @@ -1,31 +1,31 @@ -# Restrictions on query complexity +# Restrictions on Query Complexity {#restrictions-on-query-complexity} Restrictions on query complexity are part of the settings. -They are used in order to provide safer execution from the user interface. +They are used to provide safer execution from the user interface. Almost all the restrictions only apply to `SELECT`. For distributed query processing, restrictions are applied on each server separately. -ClickHouse checks the restrictions for data parts, not for each row. It means that you can exceed the value of restriction with a size of the data part. +ClickHouse checks the restrictions for data parts, not for each row. It means that you can exceed the value of restriction with the size of the data part. -Restrictions on the "maximum amount of something" can take the value 0, which means "unrestricted". -Most restrictions also have an 'overflow_mode' setting, meaning what to do when the limit is exceeded. -It can take one of two values: `throw` or `break`. Restrictions on aggregation (group_by_overflow_mode) also have the value `any`. +Restrictions on the “maximum amount of something” can take the value 0, which means “unrestricted”. 
+Most restrictions also have an ‘overflow\_mode’ setting, meaning what to do when the limit is exceeded. +It can take one of two values: `throw` or `break`. Restrictions on aggregation (group\_by\_overflow\_mode) also have the value `any`. `throw` – Throw an exception (default). `break` – Stop executing the query and return the partial result, as if the source data ran out. -`any (only for group_by_overflow_mode)` – Continuing aggregation for the keys that got into the set, but don't add new keys to the set. +`any (only for group_by_overflow_mode)` – Continuing aggregation for the keys that got into the set, but don’t add new keys to the set. -## max_memory_usage {#settings_max_memory_usage} +## max\_memory\_usage {#settings-max-memory-usage} The maximum amount of RAM to use for running a query on a single server. In the default configuration file, the maximum is 10 GB. -The setting doesn't consider the volume of available memory or the total volume of memory on the machine. +The setting doesn’t consider the volume of available memory or the total volume of memory on the machine. The restriction applies to a single query within a single server. You can use `SHOW PROCESSLIST` to see the current memory consumption for each query. -In addition, the peak memory consumption is tracked for each query and written to the log. +Besides, the peak memory consumption is tracked for each query and written to the log. Memory usage is not monitored for the states of certain aggregate functions. @@ -33,87 +33,87 @@ Memory usage is not fully tracked for states of the aggregate functions `min`, ` Memory consumption is also restricted by the parameters `max_memory_usage_for_user` and `max_memory_usage_for_all_queries`. -## max_memory_usage_for_user +## max\_memory\_usage\_for\_user {#max-memory-usage-for-user} -The maximum amount of RAM to use for running a user's queries on a single server. +The maximum amount of RAM to use for running a user’s queries on a single server. Default values are defined in [Settings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Core/Settings.h#L288). By default, the amount is not restricted (`max_memory_usage_for_user = 0`). -See also the description of [max_memory_usage](#settings_max_memory_usage). +See also the description of [max\_memory\_usage](#settings_max_memory_usage). -## max_memory_usage_for_all_queries +## max\_memory\_usage\_for\_all\_queries {#max-memory-usage-for-all-queries} The maximum amount of RAM to use for running all queries on a single server. Default values are defined in [Settings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Core/Settings.h#L289). By default, the amount is not restricted (`max_memory_usage_for_all_queries = 0`). -See also the description of [max_memory_usage](#settings_max_memory_usage). +See also the description of [max\_memory\_usage](#settings_max_memory_usage). -## max_rows_to_read +## max\_rows\_to\_read {#max-rows-to-read} The following restrictions can be checked on each block (instead of on each row). That is, the restrictions can be broken a little. When running a query in multiple threads, the following restrictions apply to each thread separately. -Maximum number of rows that can be read from a table when running a query. +A maximum number of rows that can be read from a table when running a query. -## max_bytes_to_read +## max\_bytes\_to\_read {#max-bytes-to-read} -Maximum number of bytes (uncompressed data) that can be read from a table when running a query. 
+A maximum number of bytes (uncompressed data) that can be read from a table when running a query. -## read_overflow_mode +## read\_overflow\_mode {#read-overflow-mode} -What to do when the volume of data read exceeds one of the limits: 'throw' or 'break'. By default, throw. +What to do when the volume of data read exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. -## max_rows_to_group_by {#settings-max_rows_to_group_by} +## max\_rows\_to\_group\_by {#settings-max-rows-to-group-by} -Maximum number of unique keys received from aggregation. This setting lets you limit memory consumption when aggregating. +A maximum number of unique keys received from aggregation. This setting lets you limit memory consumption when aggregating. -## group_by_overflow_mode +## group\_by\_overflow\_mode {#group-by-overflow-mode} -What to do when the number of unique keys for aggregation exceeds the limit: 'throw', 'break', or 'any'. By default, throw. -Using the 'any' value lets you run an approximation of GROUP BY. The quality of this approximation depends on the statistical nature of the data. +What to do when the number of unique keys for aggregation exceeds the limit: ‘throw’, ‘break’, or ‘any’. By default, throw. +Using the ‘any’ value lets you run an approximation of GROUP BY. The quality of this approximation depends on the statistical nature of the data. -## max_bytes_before_external_group_by {#settings-max_bytes_before_external_group_by} +## max\_bytes\_before\_external\_group\_by {#settings-max-bytes-before-external-group-by} Enables or disables execution of `GROUP BY` clauses in external memory. See [GROUP BY in external memory](../../query_language/select.md#select-group-by-in-external-memory). Possible values: -- Maximum volume of RAM (in bytes) that can be used by the single [GROUP BY](../../query_language/select.md#select-group-by-clause) operation. -- 0 — `GROUP BY` in external memory disabled. +- Maximum volume of RAM (in bytes) that can be used by the single [GROUP BY](../../query_language/select.md#select-group-by-clause) operation. +- 0 — `GROUP BY` in external memory disabled. Default value: 0. -## max_rows_to_sort +## max\_rows\_to\_sort {#max-rows-to-sort} -Maximum number of rows before sorting. This allows you to limit memory consumption when sorting. +A maximum number of rows before sorting. This allows you to limit memory consumption when sorting. -## max_bytes_to_sort +## max\_bytes\_to\_sort {#max-bytes-to-sort} -Maximum number of bytes before sorting. +A maximum number of bytes before sorting. -## sort_overflow_mode +## sort\_overflow\_mode {#sort-overflow-mode} -What to do if the number of rows received before sorting exceeds one of the limits: 'throw' or 'break'. By default, throw. +What to do if the number of rows received before sorting exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. -## max_result_rows {#setting-max_result_rows} +## max\_result\_rows {#setting-max-result-rows} Limit on the number of rows in the result. Also checked for subqueries, and on remote servers when running parts of a distributed query. -## max_result_bytes +## max\_result\_bytes {#max-result-bytes} Limit on the number of bytes in the result. The same as the previous setting. -## result_overflow_mode +## result\_overflow\_mode {#result-overflow-mode} -What to do if the volume of the result exceeds one of the limits: 'throw' or 'break'. By default, throw. +What to do if the volume of the result exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. 
-Using 'break' is similar to using LIMIT. `Break` interrupts execution only at the block level. This means that amount of returned rows is greater than [max_result_rows](#setting-max_result_rows), multiple of [max_block_size](settings.md#setting-max_block_size) and depends on [max_threads](settings.md#settings-max_threads). +Using ‘break’ is similar to using LIMIT. `Break` interrupts execution only at the block level. This means that amount of returned rows is greater than [max\_result\_rows](#setting-max_result_rows), multiple of [max\_block\_size](settings.md#setting-max_block_size) and depends on [max\_threads](settings.md#settings-max_threads). Example: -```sql -SET max_threads = 3, max_block_size = 3333; +``` sql +SET max_threads = 3, max_block_size = 3333; SET max_result_rows = 3334, result_overflow_mode = 'break'; SELECT * @@ -123,166 +123,165 @@ FORMAT Null; Result: -```text +``` text 6666 rows in set. ... ``` -## max_execution_time +## max\_execution\_time {#max-execution-time} Maximum query execution time in seconds. At this time, it is not checked for one of the sorting stages, or when merging and finalizing aggregate functions. -## timeout_overflow_mode +## timeout\_overflow\_mode {#timeout-overflow-mode} -What to do if the query is run longer than 'max_execution_time': 'throw' or 'break'. By default, throw. +What to do if the query is run longer than ‘max\_execution\_time’: ‘throw’ or ‘break’. By default, throw. -## min_execution_speed +## min\_execution\_speed {#min-execution-speed} -Minimal execution speed in rows per second. Checked on every data block when 'timeout_before_checking_execution_speed' expires. If the execution speed is lower, an exception is thrown. +Minimal execution speed in rows per second. Checked on every data block when ‘timeout\_before\_checking\_execution\_speed’ expires. If the execution speed is lower, an exception is thrown. -## min_execution_speed_bytes +## min\_execution\_speed\_bytes {#min-execution-speed-bytes} -Minimum number of execution bytes per second. Checked on every data block when 'timeout_before_checking_execution_speed' expires. If the execution speed is lower, an exception is thrown. +A minimum number of execution bytes per second. Checked on every data block when ‘timeout\_before\_checking\_execution\_speed’ expires. If the execution speed is lower, an exception is thrown. -## max_execution_speed +## max\_execution\_speed {#max-execution-speed} -Maximum number of execution rows per second. Checked on every data block when 'timeout_before_checking_execution_speed' expires. If the execution speed is high, the execution speed will be reduced. +A maximum number of execution rows per second. Checked on every data block when ‘timeout\_before\_checking\_execution\_speed’ expires. If the execution speed is high, the execution speed will be reduced. -## max_execution_speed_bytes +## max\_execution\_speed\_bytes {#max-execution-speed-bytes} -Maximum number of execution bytes per second. Checked on every data block when 'timeout_before_checking_execution_speed' expires. If the execution speed is high, the execution speed will be reduced. +A maximum number of execution bytes per second. Checked on every data block when ‘timeout\_before\_checking\_execution\_speed’ expires. If the execution speed is high, the execution speed will be reduced. 
-## timeout_before_checking_execution_speed +## timeout\_before\_checking\_execution\_speed {#timeout-before-checking-execution-speed} -Checks that execution speed is not too slow (no less than 'min_execution_speed'), after the specified time in seconds has expired. +Checks that execution speed is not too slow (no less than ‘min\_execution\_speed’), after the specified time in seconds has expired. -## max_columns_to_read +## max\_columns\_to\_read {#max-columns-to-read} -Maximum number of columns that can be read from a table in a single query. If a query requires reading a greater number of columns, it throws an exception. +A maximum number of columns that can be read from a table in a single query. If a query requires reading a greater number of columns, it throws an exception. -## max_temporary_columns +## max\_temporary\_columns {#max-temporary-columns} -Maximum number of temporary columns that must be kept in RAM at the same time when running a query, including constant columns. If there are more temporary columns than this, it throws an exception. +A maximum number of temporary columns that must be kept in RAM at the same time when running a query, including constant columns. If there are more temporary columns than this, it throws an exception. -## max_temporary_non_const_columns +## max\_temporary\_non\_const\_columns {#max-temporary-non-const-columns} -The same thing as 'max_temporary_columns', but without counting constant columns. +The same thing as ‘max\_temporary\_columns’, but without counting constant columns. Note that constant columns are formed fairly often when running a query, but they require approximately zero computing resources. -## max_subquery_depth +## max\_subquery\_depth {#max-subquery-depth} Maximum nesting depth of subqueries. If subqueries are deeper, an exception is thrown. By default, 100. -## max_pipeline_depth +## max\_pipeline\_depth {#max-pipeline-depth} Maximum pipeline depth. Corresponds to the number of transformations that each data block goes through during query processing. Counted within the limits of a single server. If the pipeline depth is greater, an exception is thrown. By default, 1000. -## max_ast_depth +## max\_ast\_depth {#max-ast-depth} Maximum nesting depth of a query syntactic tree. If exceeded, an exception is thrown. -At this time, it isn't checked during parsing, but only after parsing the query. That is, a syntactic tree that is too deep can be created during parsing, but the query will fail. By default, 1000. +At this time, it isn’t checked during parsing, but only after parsing the query. That is, a syntactic tree that is too deep can be created during parsing, but the query will fail. By default, 1000. -## max_ast_elements +## max\_ast\_elements {#max-ast-elements} -Maximum number of elements in a query syntactic tree. If exceeded, an exception is thrown. +A maximum number of elements in a query syntactic tree. If exceeded, an exception is thrown. In the same way as the previous setting, it is checked only after parsing the query. By default, 50,000. -## max_rows_in_set +## max\_rows\_in\_set {#max-rows-in-set} -Maximum number of rows for a data set in the IN clause created from a subquery. +A maximum number of rows for a data set in the IN clause created from a subquery. -## max_bytes_in_set +## max\_bytes\_in\_set {#max-bytes-in-set} -Maximum number of bytes (uncompressed data) used by a set in the IN clause created from a subquery. 
+A maximum number of bytes (uncompressed data) used by a set in the IN clause created from a subquery. -## set_overflow_mode +## set\_overflow\_mode {#set-overflow-mode} -What to do when the amount of data exceeds one of the limits: 'throw' or 'break'. By default, throw. +What to do when the amount of data exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. -## max_rows_in_distinct +## max\_rows\_in\_distinct {#max-rows-in-distinct} -Maximum number of different rows when using DISTINCT. +A maximum number of different rows when using DISTINCT. -## max_bytes_in_distinct +## max\_bytes\_in\_distinct {#max-bytes-in-distinct} -Maximum number of bytes used by a hash table when using DISTINCT. +A maximum number of bytes used by a hash table when using DISTINCT. -## distinct_overflow_mode +## distinct\_overflow\_mode {#distinct-overflow-mode} -What to do when the amount of data exceeds one of the limits: 'throw' or 'break'. By default, throw. +What to do when the amount of data exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. -## max_rows_to_transfer +## max\_rows\_to\_transfer {#max-rows-to-transfer} -Maximum number of rows that can be passed to a remote server or saved in a temporary table when using GLOBAL IN. +A maximum number of rows that can be passed to a remote server or saved in a temporary table when using GLOBAL IN. -## max_bytes_to_transfer +## max\_bytes\_to\_transfer {#max-bytes-to-transfer} -Maximum number of bytes (uncompressed data) that can be passed to a remote server or saved in a temporary table when using GLOBAL IN. +A maximum number of bytes (uncompressed data) that can be passed to a remote server or saved in a temporary table when using GLOBAL IN. -## transfer_overflow_mode +## transfer\_overflow\_mode {#transfer-overflow-mode} -What to do when the amount of data exceeds one of the limits: 'throw' or 'break'. By default, throw. +What to do when the amount of data exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. -## max_rows_in_join {#settings-max_rows_in_join} +## max\_rows\_in\_join {#settings-max-rows-in-join} Limits the number of rows in the hash table that is used when joining tables. -This settings applies to [SELECT ... JOIN](../../query_language/select.md#select-join) operations and the [Join](../table_engines/join.md) table engine. +This settings applies to [SELECT … JOIN](../../query_language/select.md#select-join) operations and the [Join](../table_engines/join.md) table engine. If a query contains multiple joins, ClickHouse checks this setting for every intermediate result. -ClickHouse can proceed with different actions when the limit is reached. Use the [join_overflow_mode](#settings-join_overflow_mode) setting to choose the action. +ClickHouse can proceed with different actions when the limit is reached. Use the [join\_overflow\_mode](#settings-join_overflow_mode) setting to choose the action. Possible values: -- Positive integer. -- 0 — Unlimited number of rows. +- Positive integer. +- 0 — Unlimited number of rows. Default value: 0. -## max_bytes_in_join {#settings-max_bytes_in_join} +## max\_bytes\_in\_join {#settings-max-bytes-in-join} Limits the size in bytes of the hash table used when joining tables. -This settings applies to [SELECT ... JOIN](../../query_language/select.md#select-join) operations and [Join table engine](../table_engines/join.md). +This settings applies to [SELECT … JOIN](../../query_language/select.md#select-join) operations and [Join table engine](../table_engines/join.md). 
If the query contains joins, ClickHouse checks this setting for every intermediate result. -ClickHouse can proceed with different actions when the limit is reached. Use [join_overflow_mode](#settings-join_overflow_mode) settings to choose the action. +ClickHouse can proceed with different actions when the limit is reached. Use [join\_overflow\_mode](#settings-join_overflow_mode) settings to choose the action. Possible values: -- Positive integer. -- 0 — Memory control is disabled. +- Positive integer. +- 0 — Memory control is disabled. Default value: 0. -## join_overflow_mode {#settings-join_overflow_mode} +## join\_overflow\_mode {#settings-join-overflow-mode} Defines what action ClickHouse performs when any of the following join limits is reached: -- [max_bytes_in_join](#settings-max_bytes_in_join) -- [max_rows_in_join](#settings-max_rows_in_join) +- [max\_bytes\_in\_join](#settings-max_bytes_in_join) +- [max\_rows\_in\_join](#settings-max_rows_in_join) Possible values: -- `THROW` — ClickHouse throws an exception and breaks operation. -- `BREAK` — ClickHouse breaks operation and doesn't throw an exception. +- `THROW` — ClickHouse throws an exception and breaks operation. +- `BREAK` — ClickHouse breaks operation and doesn’t throw an exception. Default value: `THROW`. **See Also** -- [JOIN clause](../../query_language/select.md#select-join) -- [Join table engine](../table_engines/join.md) +- [JOIN clause](../../query_language/select.md#select-join) +- [Join table engine](../table_engines/join.md) - -## max_partitions_per_insert_block +## max\_partitions\_per\_insert\_block {#max-partitions-per-insert-block} Limits the maximum number of partitions in a single inserted block. -- Positive integer. -- 0 — Unlimited number of partitions. +- Positive integer. +- 0 — Unlimited number of partitions. Default value: 100. @@ -290,6 +289,6 @@ Default value: 100. When inserting data, ClickHouse calculates the number of partitions in the inserted block. If the number of partitions is more than `max_partitions_per_insert_block`, ClickHouse throws an exception with the following text: -> "Too many partitions for single INSERT block (more than " + toString(max_parts) + "). The limit is controlled by 'max_partitions_per_insert_block' setting. Large number of partitions is a common misconception. It will lead to severe negative performance impact, including slow server startup, slow INSERT queries and slow SELECT queries. Recommended total number of partitions for a table is under 1000..10000. Please note, that partitioning is not intended to speed up SELECT queries (ORDER BY key is sufficient to make range queries fast). Partitions are intended for data manipulation (DROP PARTITION, etc)." +> “Too many partitions for single INSERT block (more than” + toString(max\_parts) + “). The limit is controlled by ‘max\_partitions\_per\_insert\_block’ setting. A large number of partitions is a common misconception. It will lead to severe negative performance impact, including slow server startup, slow INSERT queries and slow SELECT queries. Recommended total number of partitions for a table is under 1000..10000. Please note, that partitioning is not intended to speed up SELECT queries (ORDER BY key is sufficient to make range queries fast). 
Partitions are intended for data manipulation (DROP PARTITION, etc).” [Original article](https://clickhouse.tech/docs/en/operations/settings/query_complexity/) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index b10553cbe77..a1c7e5e7f39 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1,27 +1,26 @@ -# Settings +# Settings {#settings} - -## distributed_product_mode +## distributed\_product\_mode {#distributed-product-mode} Changes the behavior of [distributed subqueries](../../query_language/select.md). -ClickHouse applies this setting when the query contains the product of distributed tables, i.e. when the query for a distributed table contains a non-GLOBAL subquery for the distributed table. +ClickHouse applies this setting when the query contains the product of distributed tables, i.e. when the query for a distributed table contains a non-GLOBAL subquery for the distributed table. Restrictions: -- Only applied for IN and JOIN subqueries. -- Only if the FROM section uses a distributed table containing more than one shard. -- If the subquery concerns a distributed table containing more than one shard. -- Not used for a table-valued [remote](../../query_language/table_functions/remote.md) function. +- Only applied for IN and JOIN subqueries. +- Only if the FROM section uses a distributed table containing more than one shard. +- If the subquery concerns a distributed table containing more than one shard. +- Not used for a table-valued [remote](../../query_language/table_functions/remote.md) function. Possible values: -- `deny` — Default value. Prohibits using these types of subqueries (returns the "Double-distributed in/JOIN subqueries is denied" exception). -- `local` — Replaces the database and table in the subquery with local ones for the destination server (shard), leaving the normal `IN`/`JOIN.` -- `global` — Replaces the `IN`/`JOIN` query with `GLOBAL IN`/`GLOBAL JOIN.` -- `allow` — Allows the use of these types of subqueries. +- `deny` — Default value. Prohibits using these types of subqueries (returns the “Double-distributed in/JOIN subqueries is denied” exception). +- `local` — Replaces the database and table in the subquery with local ones for the destination server (shard), leaving the normal `IN`/`JOIN.` +- `global` — Replaces the `IN`/`JOIN` query with `GLOBAL IN`/`GLOBAL JOIN.` +- `allow` — Allows the use of these types of subqueries. -## enable_optimize_predicate_expression +## enable\_optimize\_predicate\_expression {#enable-optimize-predicate-expression} Turns on predicate pushdown in `SELECT` queries. @@ -29,8 +28,8 @@ Predicate pushdown may significantly reduce network traffic for distributed quer Possible values: -- 0 — Disabled. -- 1 — Enabled. +- 0 — Disabled. +- 1 — Enabled. Default value: 1. @@ -38,14 +37,14 @@ Usage Consider the following queries: -1. `SELECT count() FROM test_table WHERE date = '2018-10-10'` -2. `SELECT count() FROM (SELECT * FROM test_table) WHERE date = '2018-10-10'` +1. `SELECT count() FROM test_table WHERE date = '2018-10-10'` +2. `SELECT count() FROM (SELECT * FROM test_table) WHERE date = '2018-10-10'` -If `enable_optimize_predicate_expression = 1`, then the execution time of these queries is equal, because ClickHouse applies `WHERE` to the subquery when processing it. +If `enable_optimize_predicate_expression = 1`, then the execution time of these queries is equal because ClickHouse applies `WHERE` to the subquery when processing it. 
If `enable_optimize_predicate_expression = 0`, then the execution time of the second query is much longer, because the `WHERE` clause applies to all the data after the subquery finishes. -## fallback_to_stale_replicas_for_distributed_queries {#settings-fallback_to_stale_replicas_for_distributed_queries} +## fallback\_to\_stale\_replicas\_for\_distributed\_queries {#settings-fallback-to-stale-replicas-for-distributed-queries} Forces a query to an out-of-date replica if updated data is not available. See [Replication](../table_engines/replication.md). @@ -55,34 +54,33 @@ Used when performing `SELECT` from a distributed table that points to replicated By default, 1 (enabled). -## force_index_by_date {#settings-force_index_by_date} +## force\_index\_by\_date {#settings-force-index-by-date} -Disables query execution if the index can't be used by date. +Disables query execution if the index can’t be used by date. Works with tables in the MergeTree family. -If `force_index_by_date=1`, ClickHouse checks whether the query has a date key condition that can be used for restricting data ranges. If there is no suitable condition, it throws an exception. However, it does not check whether the condition actually reduces the amount of data to read. For example, the condition `Date != ' 2000-01-01 '` is acceptable even when it matches all the data in the table (i.e., running the query requires a full scan). For more information about ranges of data in MergeTree tables, see [MergeTree](../table_engines/mergetree.md). +If `force_index_by_date=1`, ClickHouse checks whether the query has a date key condition that can be used for restricting data ranges. If there is no suitable condition, it throws an exception. However, it does not check whether the condition reduces the amount of data to read. For example, the condition `Date != ' 2000-01-01 '` is acceptable even when it matches all the data in the table (i.e., running the query requires a full scan). For more information about ranges of data in MergeTree tables, see [MergeTree](../table_engines/mergetree.md). - -## force_primary_key +## force\_primary\_key {#force-primary-key} Disables query execution if indexing by the primary key is not possible. Works with tables in the MergeTree family. -If `force_primary_key=1`, ClickHouse checks to see if the query has a primary key condition that can be used for restricting data ranges. If there is no suitable condition, it throws an exception. However, it does not check whether the condition actually reduces the amount of data to read. For more information about data ranges in MergeTree tables, see [MergeTree](../table_engines/mergetree.md). +If `force_primary_key=1`, ClickHouse checks to see if the query has a primary key condition that can be used for restricting data ranges. If there is no suitable condition, it throws an exception. However, it does not check whether the condition reduces the amount of data to read. For more information about data ranges in MergeTree tables, see [MergeTree](../table_engines/mergetree.md). -## format_schema +## format\_schema {#format-schema} -This parameter is useful when you are using formats that require a schema definition, such as [Cap'n Proto](https://capnproto.org/) or [Protobuf](https://developers.google.com/protocol-buffers/). The value depends on the format. +This parameter is useful when you are using formats that require a schema definition, such as [Cap’n Proto](https://capnproto.org/) or [Protobuf](https://developers.google.com/protocol-buffers/). 
The value depends on the format. -## fsync_metadata +## fsync\_metadata {#fsync-metadata} Enables or disables [fsync](http://pubs.opengroup.org/onlinepubs/9699919799/functions/fsync.html) when writing `.sql` files. Enabled by default. It makes sense to disable it if the server has millions of tiny tables that are constantly being created and destroyed. -## enable_http_compression {#settings-enable_http_compression} +## enable\_http\_compression {#settings-enable-http-compression} Enables or disables data compression in the response to an HTTP request. @@ -90,21 +88,20 @@ For more information, read the [HTTP interface description](../../interfaces/htt Possible values: -- 0 — Disabled. -- 1 — Enabled. +- 0 — Disabled. +- 1 — Enabled. Default value: 0. -## http_zlib_compression_level {#settings-http_zlib_compression_level} +## http\_zlib\_compression\_level {#settings-http-zlib-compression-level} -Sets the level of data compression in the response to an HTTP request if [enable_http_compression = 1](#settings-enable_http_compression). +Sets the level of data compression in the response to an HTTP request if [enable\_http\_compression = 1](#settings-enable_http_compression). Possible values: Numbers from 1 to 9. Default value: 3. - -## http_native_compression_disable_checksumming_on_decompress {#settings-http_native_compression_disable_checksumming_on_decompress} +## http\_native\_compression\_disable\_checksumming\_on\_decompress {#settings-http-native-compression-disable-checksumming-on-decompress} Enables or disables checksum verification when decompressing the HTTP POST data from the client. Used only for ClickHouse native compression format (not used with `gzip` or `deflate`). @@ -112,12 +109,12 @@ For more information, read the [HTTP interface description](../../interfaces/htt Possible values: -- 0 — Disabled. -- 1 — Enabled. +- 0 — Disabled. +- 1 — Enabled. Default value: 0. -## send_progress_in_http_headers {#settings-send_progress_in_http_headers} +## send\_progress\_in\_http\_headers {#settings-send-progress-in-http-headers} Enables or disables `X-ClickHouse-Progress` HTTP response headers in `clickhouse-server` responses. @@ -125,23 +122,23 @@ For more information, read the [HTTP interface description](../../interfaces/htt Possible values: -- 0 — Disabled. -- 1 — Enabled. +- 0 — Disabled. +- 1 — Enabled. Default value: 0. -## max_http_get_redirects {#setting-max_http_get_redirects} +## max\_http\_get\_redirects {#setting-max-http-get-redirects} Limits the maximum number of HTTP GET redirect hops for [URL](../table_engines/url.md)-engine tables. The setting applies to both types of tables: those created by the [CREATE TABLE](../../query_language/create/#create-table-query) query and by the [url](../../query_language/table_functions/url.md) table function. Possible values: -- Any positive integer number of hops. -- 0 — No hops allowed. +- Any positive integer number of hops. +- 0 — No hops allowed. Default value: 0. -## input_format_allow_errors_num {#settings-input_format_allow_errors_num} +## input\_format\_allow\_errors\_num {#settings-input-format-allow-errors-num} Sets the maximum number of acceptable errors when reading from text formats (CSV, TSV, etc.). @@ -153,7 +150,7 @@ If an error occurred while reading rows but the error counter is still less than If both `input_format_allow_errors_num` and `input_format_allow_errors_ratio` are exceeded, ClickHouse throws an exception. 
-## input_format_allow_errors_ratio {#settings-input_format_allow_errors_ratio} +## input\_format\_allow\_errors\_ratio {#settings-input-format-allow-errors-ratio} Sets the maximum percentage of errors allowed when reading from text formats (CSV, TSV, etc.). The percentage of errors is set as a floating-point number between 0 and 1. @@ -166,18 +163,17 @@ If an error occurred while reading rows but the error counter is still less than If both `input_format_allow_errors_num` and `input_format_allow_errors_ratio` are exceeded, ClickHouse throws an exception. +## input\_format\_values\_interpret\_expressions {#settings-input-format-values-interpret-expressions} -## input_format_values_interpret_expressions {#settings-input_format_values_interpret_expressions} - -Enables or disables the full SQL parser if the fast stream parser can't parse the data. This setting is used only for the [Values](../../interfaces/formats.md#data-format-values) format at the data insertion. For more information about syntax parsing, see the [Syntax](../../query_language/syntax.md) section. +Enables or disables the full SQL parser if the fast stream parser can’t parse the data. This setting is used only for the [Values](../../interfaces/formats.md#data-format-values) format at the data insertion. For more information about syntax parsing, see the [Syntax](../../query_language/syntax.md) section. Possible values: -- 0 — Disabled. +- 0 — Disabled. In this case, you must provide formatted data. See the [Formats](../../interfaces/formats.md) section. -- 1 — Enabled. +- 1 — Enabled. In this case, you can use an SQL expression as a value, but data insertion is much slower this way. If you insert only formatted data, then ClickHouse behaves as if the setting value is 0. @@ -187,64 +183,65 @@ Example of Use Insert the [DateTime](../../data_types/datetime.md) type value with the different settings. -```sql +``` sql SET input_format_values_interpret_expressions = 0; INSERT INTO datetime_t VALUES (now()) ``` -```text +``` text Exception on client: Code: 27. DB::Exception: Cannot parse input: expected ) before: now()): (at row 1) ``` -```sql +``` sql SET input_format_values_interpret_expressions = 1; INSERT INTO datetime_t VALUES (now()) ``` -```text +``` text Ok. ``` The last query is equivalent to the following: -```sql +``` sql SET input_format_values_interpret_expressions = 0; INSERT INTO datetime_t SELECT now() ``` -```text +``` text Ok. ``` -## input_format_values_deduce_templates_of_expressions {#settings-input_format_values_deduce_templates_of_expressions} +## input\_format\_values\_deduce\_templates\_of\_expressions {#settings-input-format-values-deduce-templates-of-expressions} -Enables or disables template deduction for an SQL expressions in [Values](../../interfaces/formats.md#data-format-values) format. It allows to parse and interpret expressions in `Values` much faster if expressions in consecutive rows have the same structure. ClickHouse will try to deduce template of an expression, parse the following rows using this template and evaluate the expression on batch of successfully parsed rows. For the following query: +Enables or disables template deduction for an SQL expressions in [Values](../../interfaces/formats.md#data-format-values) format. It allows to parse and interpret expressions in `Values` much faster if expressions in consecutive rows have the same structure. 
ClickHouse will try to deduce template of an expression, parse the following rows using this template and evaluate the expression on a batch of successfully parsed rows. For the following query: -```sql +``` sql INSERT INTO test VALUES (lower('Hello')), (lower('world')), (lower('INSERT')), (upper('Values')), ... ``` -- if `input_format_values_interpret_expressions=1` and `format_values_deduce_templates_of_expressions=0` expressions will be interpreted separately for each row (this is very slow for large number of rows) -- if `input_format_values_interpret_expressions=0` and `format_values_deduce_templates_of_expressions=1` expressions in the first, second and third rows will be parsed using template `lower(String)` and interpreted together, expression is the forth row will be parsed with another template (`upper(String)`) -- if `input_format_values_interpret_expressions=1` and `format_values_deduce_templates_of_expressions=1` - the same as in previous case, but also allows fallback to interpreting expressions separately if it's not possible to deduce template. +- if `input_format_values_interpret_expressions=1` and `format_values_deduce_templates_of_expressions=0` expressions will be interpreted separately for each row (this is very slow for large number of rows) +- if `input_format_values_interpret_expressions=0` and `format_values_deduce_templates_of_expressions=1` expressions in the first, second and third rows will be parsed using template `lower(String)` and interpreted together, expression is the forth row will be parsed with another template (`upper(String)`) +- if `input_format_values_interpret_expressions=1` and `format_values_deduce_templates_of_expressions=1` - the same as in previous case, but also allows fallback to interpreting expressions separately if it’s not possible to deduce template. Enabled by default. -## input_format_values_accurate_types_of_literals {#settings-input_format_values_accurate_types_of_literals} +## input\_format\_values\_accurate\_types\_of\_literals {#settings-input-format-values-accurate-types-of-literals} This setting is used only when `input_format_values_deduce_templates_of_expressions = 1`. It can happen, that expressions for some column have the same structure, but contain numeric literals of different types, e.g -```sql + +``` sql (..., abs(0), ...), -- UInt64 literal (..., abs(3.141592654), ...), -- Float64 literal (..., abs(-1), ...), -- Int64 literal ``` -When this setting is enabled, ClickHouse will check actual type of literal and will use expression template of the corresponding type. In some cases it may significantly slow down expression evaluation in `Values`. -When disabled, ClickHouse may use more general type for some literals (e.g. `Float64` or `Int64` instead of `UInt64` for `42`), but it may cause overflow and precision issues. +When this setting is enabled, ClickHouse will check the actual type of literal and will use an expression template of the corresponding type. In some cases, it may significantly slow down expression evaluation in `Values`. +When disabled, ClickHouse may use more general type for some literals (e.g. `Float64` or `Int64` instead of `UInt64` for `42`), but it may cause overflow and precision issues. Enabled by default. 
-## input_format_defaults_for_omitted_fields {#session_settings-input_format_defaults_for_omitted_fields} +## input\_format\_defaults\_for\_omitted\_fields {#session-settings-input-format-defaults-for-omitted-fields} When performing `INSERT` queries, replace omitted input column values with default values of the respective columns. This option only applies to [JSONEachRow](../../interfaces/formats.md#jsoneachrow), [CSV](../../interfaces/formats.md#csv) and [TabSeparated](../../interfaces/formats.md#tabseparated) formats. @@ -253,62 +250,61 @@ When performing `INSERT` queries, replace omitted input column values with defau Possible values: -- 0 — Disabled. -- 1 — Enabled. +- 0 — Disabled. +- 1 — Enabled. Default value: 1. -## input_format_tsv_empty_as_default {#settings-input_format_tsv_empty_as_default} +## input\_format\_tsv\_empty\_as\_default {#settings-input-format-tsv-empty-as-default} When enabled, replace empty input fields in TSV with default values. For complex default expressions `input_format_defaults_for_omitted_fields` must be enabled too. Disabled by default. -## input_format_null_as_default {#settings-input_format_null_as_default} +## input\_format\_null\_as\_default {#settings-input-format-null-as-default} -Enables or disables using default values if input data contain `NULL`, but data type of corresponding column in not `Nullable(T)` (for text input formats). +Enables or disables using default values if input data contain `NULL`, but data type of the corresponding column in not `Nullable(T)` (for text input formats). - -## input_format_skip_unknown_fields {#settings-input_format_skip_unknown_fields} +## input\_format\_skip\_unknown\_fields {#settings-input-format-skip-unknown-fields} Enables or disables skipping insertion of extra data. -When writing data, ClickHouse throws an exception if input data contain columns that do not exist in the target table. If skipping is enabled, ClickHouse doesn't insert extra data and doesn't throw an exception. +When writing data, ClickHouse throws an exception if input data contain columns that do not exist in the target table. If skipping is enabled, ClickHouse doesn’t insert extra data and doesn’t throw an exception. Supported formats: -- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) -- [CSVWithNames](../../interfaces/formats.md#csvwithnames) -- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames) -- [TSKV](../../interfaces/formats.md#tskv) +- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) +- [CSVWithNames](../../interfaces/formats.md#csvwithnames) +- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames) +- [TSKV](../../interfaces/formats.md#tskv) Possible values: -- 0 — Disabled. -- 1 — Enabled. +- 0 — Disabled. +- 1 — Enabled. Default value: 0. -## input_format_import_nested_json {#settings-input_format_import_nested_json} +## input\_format\_import\_nested\_json {#settings-input-format-import-nested-json} Enables or disables the insertion of JSON data with nested objects. Supported formats: -- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) +- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) Possible values: -- 0 — Disabled. -- 1 — Enabled. +- 0 — Disabled. +- 1 — Enabled. Default value: 0. See also: -- [Usage of Nested Structures](../../interfaces/formats.md#jsoneachrow-nested) with the `JSONEachRow` format. +- [Usage of Nested Structures](../../interfaces/formats.md#jsoneachrow-nested) with the `JSONEachRow` format. 
-## input_format_with_names_use_header {#settings-input_format_with_names_use_header} +## input\_format\_with\_names\_use\_header {#settings-input-format-with-names-use-header} Enables or disables checking the column order when inserting data. @@ -316,29 +312,29 @@ To improve insert performance, we recommend disabling this check if you are sure Supported formats: -- [CSVWithNames](../../interfaces/formats.md#csvwithnames) -- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames) +- [CSVWithNames](../../interfaces/formats.md#csvwithnames) +- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames) Possible values: -- 0 — Disabled. -- 1 — Enabled. +- 0 — Disabled. +- 1 — Enabled. Default value: 1. -## date_time_input_format {#settings-date_time_input_format} +## date\_time\_input\_format {#settings-date-time-input-format} -Allows to choose a parser of text representation of date and time. +Allows choosing a parser of the text representation of date and time. -The setting doesn't apply to [date and time functions](../../query_language/functions/date_time_functions.md). +The setting doesn’t apply to [date and time functions](../../query_language/functions/date_time_functions.md). Possible values: -- `'best_effort'` — Enables extended parsing. +- `'best_effort'` — Enables extended parsing. ClickHouse can parse the basic `YYYY-MM-DD HH:MM:SS` format and all [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) date and time formats. For example, `'2018-06-08T01:02:03.000Z'`. -- `'basic'` — Use basic parser. +- `'basic'` — Use basic parser. ClickHouse can parse only the basic `YYYY-MM-DD HH:MM:SS` format. For example, `'2019-08-20 10:18:56'`. @@ -346,143 +342,142 @@ Default value: `'basic'`. See also: -- [DateTime data type.](../../data_types/datetime.md) -- [Functions for working with dates and times.](../../query_language/functions/date_time_functions.md) +- [DateTime data type.](../../data_types/datetime.md) +- [Functions for working with dates and times.](../../query_language/functions/date_time_functions.md) -## join_default_strictness {#settings-join_default_strictness} +## join\_default\_strictness {#settings-join-default-strictness} Sets default strictness for [JOIN clauses](../../query_language/select.md#select-join). Possible values: -- `ALL` — If the right table has several matching rows, ClickHouse creates a [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product) from matching rows. This is the normal `JOIN` behavior from standard SQL. -- `ANY` — If the right table has several matching rows, only the first one found is joined. If the right table has only one matching row, the results of `ANY` and `ALL` are the same. -- `ASOF` — For joining sequences with an uncertain match. -- `Empty string` — If `ALL` or `ANY` is not specified in the query, ClickHouse throws an exception. +- `ALL` — If the right table has several matching rows, ClickHouse creates a [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product) from matching rows. This is the normal `JOIN` behaviour from standard SQL. +- `ANY` — If the right table has several matching rows, only the first one found is joined. If the right table has only one matching row, the results of `ANY` and `ALL` are the same. +- `ASOF` — For joining sequences with an uncertain match. +- `Empty string` — If `ALL` or `ANY` is not specified in the query, ClickHouse throws an exception. Default value: `ALL`. 
-## join_any_take_last_row {#settings-join_any_take_last_row} +## join\_any\_take\_last\_row {#settings-join-any-take-last-row} -Changes behavior of join operations with `ANY` strictness. +Changes behaviour of join operations with `ANY` strictness. !!! warning "Attention" This setting applies only for `JOIN` operations with [Join](../table_engines/join.md) engine tables. Possible values: -- 0 — If the right table has more than one matching row, only the first one found is joined. -- 1 — If the right table has more than one matching row, only the last one found is joined. +- 0 — If the right table has more than one matching row, only the first one found is joined. +- 1 — If the right table has more than one matching row, only the last one found is joined. Default value: 0. See also: -- [JOIN clause](../../query_language/select.md#select-join) -- [Join table engine](../table_engines/join.md) -- [join_default_strictness](#settings-join_default_strictness) +- [JOIN clause](../../query_language/select.md#select-join) +- [Join table engine](../table_engines/join.md) +- [join\_default\_strictness](#settings-join_default_strictness) -## join_use_nulls {#join_use_nulls} +## join\_use\_nulls {#join-use-nulls} Sets the type of [JOIN](../../query_language/select.md) behavior. When merging tables, empty cells may appear. ClickHouse fills them differently based on this setting. Possible values: -- 0 — The empty cells are filled with the default value of the corresponding field type. -- 1 — `JOIN` behaves the same way as in standard SQL. The type of the corresponding field is converted to [Nullable](../../data_types/nullable.md#data_type-nullable), and empty cells are filled with [NULL](../../query_language/syntax.md). +- 0 — The empty cells are filled with the default value of the corresponding field type. +- 1 — `JOIN` behaves the same way as in standard SQL. The type of the corresponding field is converted to [Nullable](../../data_types/nullable.md#data_type-nullable), and empty cells are filled with [NULL](../../query_language/syntax.md). Default value: 0. -## max_block_size {#setting-max_block_size} +## max\_block\_size {#setting-max-block-size} -In ClickHouse, data is processed by blocks (sets of column parts). The internal processing cycles for a single block are efficient enough, but there are noticeable expenditures on each block. The `max_block_size` setting is a recommendation for what size of block (in number of rows) to load from tables. The block size shouldn't be too small, so that the expenditures on each block are still noticeable, but not too large, so that the query with LIMIT that is completed after the first block is processed quickly. The goal is to avoid consuming too much memory when extracting a large number of columns in multiple threads, and to preserve at least some cache locality. +In ClickHouse, data is processed by blocks (sets of column parts). The internal processing cycles for a single block are efficient enough, but there are noticeable expenditures on each block. The `max_block_size` setting is a recommendation for what size of the block (in a count of rows) to load from tables. The block size shouldn’t be too small, so that the expenditures on each block are still noticeable, but not too large so that the query with LIMIT that is completed after the first block is processed quickly. The goal is to avoid consuming too much memory when extracting a large number of columns in multiple threads and to preserve at least some cache locality. Default value: 65,536. 
Blocks the size of `max_block_size` are not always loaded from the table. If it is obvious that less data needs to be retrieved, a smaller block is processed. -## preferred_block_size_bytes +## preferred\_block\_size\_bytes {#preferred-block-size-bytes} Used for the same purpose as `max_block_size`, but it sets the recommended block size in bytes by adapting it to the number of rows in the block. However, the block size cannot be more than `max_block_size` rows. By default: 1,000,000. It only works when reading from MergeTree engines. -## merge_tree_min_rows_for_concurrent_read {#setting-merge_tree_min_rows_for_concurrent_read} +## merge\_tree\_min\_rows\_for\_concurrent\_read {#setting-merge-tree-min-rows-for-concurrent-read} If the number of rows to be read from a file of a [MergeTree](../table_engines/mergetree.md) table exceeds `merge_tree_min_rows_for_concurrent_read` then ClickHouse tries to perform a concurrent reading from this file on several threads. Possible values: -- Any positive integer. +- Any positive integer. Default value: 163840. -## merge_tree_min_bytes_for_concurrent_read {#setting-merge_tree_min_bytes_for_concurrent_read} +## merge\_tree\_min\_bytes\_for\_concurrent\_read {#setting-merge-tree-min-bytes-for-concurrent-read} If the number of bytes to read from one file of a [MergeTree](../table_engines/mergetree.md)-engine table exceeds `merge_tree_min_bytes_for_concurrent_read`, then ClickHouse tries to concurrently read from this file in several threads. Possible value: -- Any positive integer. +- Any positive integer. Default value: 251658240. -## merge_tree_min_rows_for_seek {#setting-merge_tree_min_rows_for_seek} +## merge\_tree\_min\_rows\_for\_seek {#setting-merge-tree-min-rows-for-seek} -If the distance between two data blocks to be read in one file is less than `merge_tree_min_rows_for_seek` rows, then ClickHouse does not seek through the file, but reads the data sequentially. +If the distance between two data blocks to be read in one file is less than `merge_tree_min_rows_for_seek` rows, then ClickHouse does not seek through the file but reads the data sequentially. Possible values: -- Any positive integer. +- Any positive integer. Default value: 0. -## merge_tree_min_bytes_for_seek {#setting-merge_tree_min_bytes_for_seek} +## merge\_tree\_min\_bytes\_for\_seek {#setting-merge-tree-min-bytes-for-seek} -If the distance between two data blocks to be read in one file is less than `merge_tree_min_bytes_for_seek` bytes, then ClickHouse sequentially reads range of file that contains both blocks, thus avoiding extra seek. +If the distance between two data blocks to be read in one file is less than `merge_tree_min_bytes_for_seek` bytes, then ClickHouse sequentially reads a range of file that contains both blocks, thus avoiding extra seek. Possible values: -- Any positive integer. +- Any positive integer. Default value: 0. +## merge\_tree\_coarse\_index\_granularity {#setting-merge-tree-coarse-index-granularity} -## merge_tree_coarse_index_granularity {#setting-merge_tree_coarse_index_granularity} - -When searching data, ClickHouse checks the data marks in the index file. If ClickHouse finds that required keys are in some range, it divides this range into `merge_tree_coarse_index_granularity` subranges and searches the required keys there recursively. +When searching for data, ClickHouse checks the data marks in the index file. 
If ClickHouse finds that required keys are in some range, it divides this range into `merge_tree_coarse_index_granularity` subranges and searches the required keys there recursively. Possible values: -- Any positive even integer. +- Any positive even integer. Default value: 8. -## merge_tree_max_rows_to_use_cache {#setting-merge_tree_max_rows_to_use_cache} +## merge\_tree\_max\_rows\_to\_use\_cache {#setting-merge-tree-max-rows-to-use-cache} -If ClickHouse should read more than `merge_tree_max_rows_to_use_cache` rows in one query, it doesn't use the cache of uncompressed blocks. +If ClickHouse should read more than `merge_tree_max_rows_to_use_cache` rows in one query, it doesn’t use the cache of uncompressed blocks. -The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. This setting protects the cache from trashing by queries that read a large amount of data. The [uncompressed_cache_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. +The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. This setting protects the cache from trashing by queries that read a large amount of data. The [uncompressed\_cache\_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. Possible values: -- Any positive integer. +- Any positive integer. Default value: 128 ✕ 8192. -## merge_tree_max_bytes_to_use_cache {#setting-merge_tree_max_bytes_to_use_cache} +## merge\_tree\_max\_bytes\_to\_use\_cache {#setting-merge-tree-max-bytes-to-use-cache} -If ClickHouse should read more than `merge_tree_max_bytes_to_use_cache` bytes in one query, it doesn't use the cache of uncompressed blocks. +If ClickHouse should read more than `merge_tree_max_bytes_to_use_cache` bytes in one query, it doesn’t use the cache of uncompressed blocks. -The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. This setting protects the cache from trashing by queries that read a large amount of data. The [uncompressed_cache_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. +The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. This setting protects the cache from trashing by queries that read a large amount of data. The [uncompressed\_cache\_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. Possible value: -- Any positive integer. +- Any positive integer. Default value: 2013265920. -## min_bytes_to_use_direct_io {#settings-min_bytes_to_use_direct_io} +## min\_bytes\_to\_use\_direct\_io {#settings-min-bytes-to-use-direct-io} The minimum data volume required for using direct I/O access to the storage disk. @@ -490,48 +485,48 @@ ClickHouse uses this setting when reading data from tables. If the total storage Possible values: -- 0 — Direct I/O is disabled. -- Positive integer. +- 0 — Direct I/O is disabled. +- Positive integer. Default value: 0. 
-## log_queries {#settings-log-queries} +## log\_queries {#settings-log-queries} Setting up query logging. -Queries sent to ClickHouse with this setup are logged according to the rules in the [query_log](../server_settings/settings.md#server_settings-query-log) server configuration parameter. +Queries sent to ClickHouse with this setup are logged according to the rules in the [query\_log](../server_settings/settings.md#server_settings-query-log) server configuration parameter. Example: -```text +``` text log_queries=1 ``` -## log_query_threads {#settings-log-query-threads} +## log\_query\_threads {#settings-log-query-threads} Setting up query threads logging. -Queries' threads runned by ClickHouse with this setup are logged according to the rules in the [query_thread_log](../server_settings/settings.md#server_settings-query-thread-log) server configuration parameter. +Queries’ threads runned by ClickHouse with this setup are logged according to the rules in the [query\_thread\_log](../server_settings/settings.md#server_settings-query-thread-log) server configuration parameter. Example: -```text +``` text log_query_threads=1 ``` -## max_insert_block_size {#settings-max_insert_block_size} +## max\_insert\_block\_size {#settings-max-insert-block-size} The size of blocks to form for insertion into a table. This setting only applies in cases when the server forms the blocks. For example, for an INSERT via the HTTP interface, the server parses the data format and forms blocks of the specified size. -But when using clickhouse-client, the client parses the data itself, and the 'max_insert_block_size' setting on the server doesn't affect the size of the inserted blocks. -The setting also doesn't have a purpose when using INSERT SELECT, since data is inserted using the same blocks that are formed after SELECT. +But when using clickhouse-client, the client parses the data itself, and the ‘max\_insert\_block\_size’ setting on the server doesn’t affect the size of the inserted blocks. +The setting also doesn’t have a purpose when using INSERT SELECT, since data is inserted using the same blocks that are formed after SELECT. Default value: 1,048,576. -The default is slightly more than `max_block_size`. The reason for this is because certain table engines (`*MergeTree`) form a data part on the disk for each inserted block, which is a fairly large entity. Similarly, `*MergeTree` tables sort data during insertion, and a large enough block size allows sorting more data in RAM. +The default is slightly more than `max_block_size`. The reason for this is because certain table engines (`*MergeTree`) form a data part on the disk for each inserted block, which is a fairly large entity. Similarly, `*MergeTree` tables sort data during insertion and a large enough block size allow sorting more data in RAM. -## max_replica_delay_for_distributed_queries {#settings-max_replica_delay_for_distributed_queries} +## max\_replica\_delay\_for\_distributed\_queries {#settings-max-replica-delay-for-distributed-queries} Disables lagging replicas for distributed queries. See [Replication](../../operations/table_engines/replication.md). @@ -541,88 +536,87 @@ Default value: 300. Used when performing `SELECT` from a distributed table that points to replicated tables. -## max_threads {#settings-max_threads} +## max\_threads {#settings-max-threads} -The maximum number of query processing threads, excluding threads for retrieving data from remote servers (see the 'max_distributed_connections' parameter). 
+The maximum number of query processing threads, excluding threads for retrieving data from remote servers (see the ‘max\_distributed\_connections’ parameter). This parameter applies to threads that perform the same stages of the query processing pipeline in parallel. -For example, when reading from a table, if it is possible to evaluate expressions with functions, filter with WHERE and pre-aggregate for GROUP BY in parallel using at least 'max_threads' number of threads, then 'max_threads' are used. +For example, when reading from a table, if it is possible to evaluate expressions with functions, filter with WHERE and pre-aggregate for GROUP BY in parallel using at least ‘max\_threads’ number of threads, then ‘max\_threads’ are used. Default value: the number of physical CPU cores. If less than one SELECT query is normally run on a server at a time, set this parameter to a value slightly less than the actual number of processor cores. -For queries that are completed quickly because of a LIMIT, you can set a lower 'max_threads'. For example, if the necessary number of entries are located in every block and max_threads = 8, then 8 blocks are retrieved, although it would have been enough to read just one. +For queries that are completed quickly because of a LIMIT, you can set a lower ‘max\_threads’. For example, if the necessary number of entries are located in every block and max\_threads = 8, then 8 blocks are retrieved, although it would have been enough to read just one. The smaller the `max_threads` value, the less memory is consumed. -## max_insert_threads {#settings-max_insert_threads} +## max\_insert\_threads {#settings-max-insert-threads} The maximum number of threads to execute the `INSERT SELECT` query. Possible values: -- 0 (or 1) — `INSERT SELECT` no parallel execution. -- Positive integer. Bigger than 1. +- 0 (or 1) — `INSERT SELECT` no parallel execution. +- Positive integer. Bigger than 1. Default value: 0. -Parallel `INSERT SELECT` has effect only if the `SELECT` part is executed in parallel, see [max_threads](#settings-max_threads) setting. +Parallel `INSERT SELECT` has effect only if the `SELECT` part is executed in parallel, see [max\_threads](#settings-max_threads) setting. Higher values will lead to higher memory usage. -## max_compress_block_size +## max\_compress\_block\_size {#max-compress-block-size} -The maximum size of blocks of uncompressed data before compressing for writing to a table. By default, 1,048,576 (1 MiB). If the size is reduced, the compression rate is significantly reduced, the compression and decompression speed increases slightly due to cache locality, and memory consumption is reduced. There usually isn't any reason to change this setting. +The maximum size of blocks of uncompressed data before compressing for writing to a table. By default, 1,048,576 (1 MiB). If the size is reduced, the compression rate is significantly reduced, the compression and decompression speed increases slightly due to cache locality, and memory consumption is reduced. There usually isn’t any reason to change this setting. -Don't confuse blocks for compression (a chunk of memory consisting of bytes) with blocks for query processing (a set of rows from a table). +Don’t confuse blocks for compression (a chunk of memory consisting of bytes) with blocks for query processing (a set of rows from a table). -## min_compress_block_size +## min\_compress\_block\_size {#min-compress-block-size} -For [MergeTree](../table_engines/mergetree.md)" tables. 
In order to reduce latency when processing queries, a block is compressed when writing the next mark if its size is at least 'min_compress_block_size'. By default, 65,536. +For [MergeTree](../table_engines/mergetree.md)" tables. In order to reduce latency when processing queries, a block is compressed when writing the next mark if its size is at least ‘min\_compress\_block\_size’. By default, 65,536. -The actual size of the block, if the uncompressed data is less than 'max_compress_block_size', is no less than this value and no less than the volume of data for one mark. +The actual size of the block, if the uncompressed data is less than ‘max\_compress\_block\_size’, is no less than this value and no less than the volume of data for one mark. -Let's look at an example. Assume that 'index_granularity' was set to 8192 during table creation. +Let’s look at an example. Assume that ‘index\_granularity’ was set to 8192 during table creation. -We are writing a UInt32-type column (4 bytes per value). When writing 8192 rows, the total will be 32 KB of data. Since min_compress_block_size = 65,536, a compressed block will be formed for every two marks. +We are writing a UInt32-type column (4 bytes per value). When writing 8192 rows, the total will be 32 KB of data. Since min\_compress\_block\_size = 65,536, a compressed block will be formed for every two marks. -We are writing a URL column with the String type (average size of 60 bytes per value). When writing 8192 rows, the average will be slightly less than 500 KB of data. Since this is more than 65,536, a compressed block will be formed for each mark. In this case, when reading data from the disk in the range of a single mark, extra data won't be decompressed. +We are writing a URL column with the String type (average size of 60 bytes per value). When writing 8192 rows, the average will be slightly less than 500 KB of data. Since this is more than 65,536, a compressed block will be formed for each mark. In this case, when reading data from the disk in the range of a single mark, extra data won’t be decompressed. -There usually isn't any reason to change this setting. +There usually isn’t any reason to change this setting. -## max_query_size {#settings-max_query_size} +## max\_query\_size {#settings-max-query-size} The maximum part of a query that can be taken to RAM for parsing with the SQL parser. The INSERT query also contains data for INSERT that is processed by a separate stream parser (that consumes O(1) RAM), which is not included in this restriction. Default value: 256 KiB. -## interactive_delay +## interactive\_delay {#interactive-delay} -The interval in microseconds for checking whether request execution has been canceled and sending the progress. +The interval in microseconds for checking whether request execution has been cancelled and sending the progress. -Default value: 100,000 (checks for canceling and sends the progress ten times per second). +Default value: 100,000 (checks for cancelling and sends the progress ten times per second). -## connect_timeout, receive_timeout, send_timeout +## connect\_timeout, receive\_timeout, send\_timeout {#connect-timeout-receive-timeout-send-timeout} Timeouts in seconds on the socket used for communicating with the client. Default value: 10, 300, 300. -## cancel_http_readonly_queries_on_client_close +## cancel\_http\_readonly\_queries\_on\_client\_close {#cancel-http-readonly-queries-on-client-close} -Cancels HTTP readonly queries (e.g. 
SELECT) when a client closes the connection without waiting for response. +Cancels HTTP read-only queries (e.g. SELECT) when a client closes the connection without waiting for the response. Default value: 0 - -## poll_interval +## poll\_interval {#poll-interval} Lock in a wait loop for the specified number of seconds. Default value: 10. -## max_distributed_connections +## max\_distributed\_connections {#max-distributed-connections} The maximum number of simultaneous connections with remote servers for distributed processing of a single query to a single Distributed table. We recommend setting a value no less than the number of servers in the cluster. @@ -630,184 +624,181 @@ Default value: 1024. The following parameters are only used when creating Distributed tables (and when launching a server), so there is no reason to change them at runtime. -## distributed_connections_pool_size +## distributed\_connections\_pool\_size {#distributed-connections-pool-size} The maximum number of simultaneous connections with remote servers for distributed processing of all queries to a single Distributed table. We recommend setting a value no less than the number of servers in the cluster. Default value: 1024. -## connect_timeout_with_failover_ms +## connect\_timeout\_with\_failover\_ms {#connect-timeout-with-failover-ms} -The timeout in milliseconds for connecting to a remote server for a Distributed table engine, if the 'shard' and 'replica' sections are used in the cluster definition. +The timeout in milliseconds for connecting to a remote server for a Distributed table engine, if the ‘shard’ and ‘replica’ sections are used in the cluster definition. If unsuccessful, several attempts are made to connect to various replicas. Default value: 50. -## connections_with_failover_max_tries +## connections\_with\_failover\_max\_tries {#connections-with-failover-max-tries} The maximum number of connection attempts with each replica for the Distributed table engine. Default value: 3. -## extremes +## extremes {#extremes} Whether to count extreme values (the minimums and maximums in columns of a query result). Accepts 0 or 1. By default, 0 (disabled). -For more information, see the section "Extreme values". +For more information, see the section “Extreme values”. -## use_uncompressed_cache {#setting-use_uncompressed_cache} +## use\_uncompressed\_cache {#setting-use-uncompressed-cache} Whether to use a cache of uncompressed blocks. Accepts 0 or 1. By default, 0 (disabled). -Using the uncompressed cache (only for tables in the MergeTree family) can significantly reduce latency and increase throughput when working with a large number of short queries. Enable this setting for users who send frequent short requests. Also pay attention to the [uncompressed_cache_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) configuration parameter (only set in the config file) – the size of uncompressed cache blocks. By default, it is 8 GiB. The uncompressed cache is filled in as needed and the least-used data is automatically deleted. +Using the uncompressed cache (only for tables in the MergeTree family) can significantly reduce latency and increase throughput when working with a large number of short queries. Enable this setting for users who send frequent short requests. Also pay attention to the [uncompressed\_cache\_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) configuration parameter (only set in the config file) – the size of uncompressed cache blocks. 
By default, it is 8 GiB. The uncompressed cache is filled in as needed and the least-used data is automatically deleted. -For queries that read at least a somewhat large volume of data (one million rows or more), the uncompressed cache is disabled automatically in order to save space for truly small queries. This means that you can keep the 'use_uncompressed_cache' setting always set to 1. +For queries that read at least a somewhat large volume of data (one million rows or more), the uncompressed cache is disabled automatically to save space for truly small queries. This means that you can keep the ‘use\_uncompressed\_cache’ setting always set to 1. -## replace_running_query +## replace\_running\_query {#replace-running-query} -When using the HTTP interface, the 'query_id' parameter can be passed. This is any string that serves as the query identifier. -If a query from the same user with the same 'query_id' already exists at this time, the behavior depends on the 'replace_running_query' parameter. +When using the HTTP interface, the ‘query\_id’ parameter can be passed. This is any string that serves as the query identifier. +If a query from the same user with the same ‘query\_id’ already exists at this time, the behaviour depends on the ‘replace\_running\_query’ parameter. -`0` (default) – Throw an exception (don't allow the query to run if a query with the same 'query_id' is already running). +`0` (default) – Throw an exception (don’t allow the query to run if a query with the same ‘query\_id’ is already running). `1` – Cancel the old query and start running the new one. -Yandex.Metrica uses this parameter set to 1 for implementing suggestions for segmentation conditions. After entering the next character, if the old query hasn't finished yet, it should be canceled. +Yandex.Metrica uses this parameter set to 1 for implementing suggestions for segmentation conditions. After entering the next character, if the old query hasn’t finished yet, it should be cancelled. +## stream\_flush\_interval\_ms {#stream-flush-interval-ms} -## stream_flush_interval_ms - -Works for tables with streaming in the case of a timeout, or when a thread generates [max_insert_block_size](#settings-max_insert_block_size) rows. +Works for tables with streaming in the case of a timeout, or when a thread generates [max\_insert\_block\_size](#settings-max_insert_block_size) rows. The default value is 7500. The smaller the value, the more often data is flushed into the table. Setting the value too low leads to poor performance. - -## load_balancing {#settings-load_balancing} +## load\_balancing {#settings-load-balancing} Specifies the algorithm of replicas selection that is used for distributed query processing. ClickHouse supports the following algorithms of choosing replicas: -- [Random](#load_balancing-random) (by default) -- [Nearest hostname](#load_balancing-nearest_hostname) -- [In order](#load_balancing-in_order) -- [First or random](#load_balancing-first_or_random) +- [Random](#load_balancing-random) (by default) +- [Nearest hostname](#load_balancing-nearest_hostname) +- [In order](#load_balancing-in_order) +- [First or random](#load_balancing-first_or_random) -### Random (by default) {#load_balancing-random} +### Random (by default) {#load-balancing-random} -```sql +``` sql load_balancing = random ``` -The number of errors is counted for each replica. The query is sent to the replica with the fewest errors, and if there are several of these, to any one of them. +The number of errors is counted for each replica. 
The query is sent to the replica with the fewest errors, and if there are several of these, to any one of them. Disadvantages: Server proximity is not accounted for; if the replicas have different data, you will also get different data. -### Nearest Hostname {#load_balancing-nearest_hostname} +### Nearest Hostname {#load-balancing-nearest-hostname} -```sql +``` sql load_balancing = nearest_hostname ``` -The number of errors is counted for each replica. Every 5 minutes, the number of errors is integrally divided by 2. Thus, the number of errors is calculated for a recent time with exponential smoothing. If there is one replica with a minimal number of errors (i.e. errors occurred recently on the other replicas), the query is sent to it. If there are multiple replicas with the same minimal number of errors, the query is sent to the replica with a host name that is most similar to the server's host name in the config file (for the number of different characters in identical positions, up to the minimum length of both host names). +The number of errors is counted for each replica. Every 5 minutes, the number of errors is integrally divided by 2. Thus, the number of errors is calculated for a recent time with exponential smoothing. If there is one replica with a minimal number of errors (i.e. errors occurred recently on the other replicas), the query is sent to it. If there are multiple replicas with the same minimal number of errors, the query is sent to the replica with a hostname that is most similar to the server’s hostname in the config file (for the number of different characters in identical positions, up to the minimum length of both hostnames). For instance, example01-01-1 and example01-01-2.yandex.ru are different in one position, while example01-01-1 and example01-02-2 differ in two places. -This method might seem primitive, but it doesn't require external data about network topology, and it doesn't compare IP addresses, which would be complicated for our IPv6 addresses. +This method might seem primitive, but it doesn’t require external data about network topology, and it doesn’t compare IP addresses, which would be complicated for our IPv6 addresses. Thus, if there are equivalent replicas, the closest one by name is preferred. We can also assume that when sending a query to the same server, in the absence of failures, a distributed query will also go to the same servers. So even if different data is placed on the replicas, the query will return mostly the same results. -### In Order {#load_balancing-in_order} +### In Order {#load-balancing-in-order} -```sql +``` sql load_balancing = in_order ``` -Replicas with the same number of errors are accessed in the same order as they are specified in configuration. +Replicas with the same number of errors are accessed in the same order as they are specified in the configuration. This method is appropriate when you know exactly which replica is preferable. +### First or Random {#load-balancing-first-or-random} -### First or Random {#load_balancing-first_or_random} - -```sql +``` sql load_balancing = first_or_random ``` -This algorithm chooses the first replica in the set or a random replica if the first is unavailable. It's effective in cross-replication topology setups, but useless in other configurations. +This algorithm chooses the first replica in the set or a random replica if the first is unavailable. It’s effective in cross-replication topology setups, but useless in other configurations. 
-The `first_or_random` algorithm solves the problem of the `in_order` algorithm. With `in_order`, if one replica goes down, the next one gets a double load while the remaining replicas handle the usual amount of traffic. When using the `first_or_random` algorithm, load is evenly distributed among replicas that are still available. +The `first_or_random` algorithm solves the problem of the `in_order` algorithm. With `in_order`, if one replica goes down, the next one gets a double load while the remaining replicas handle the usual amount of traffic. When using the `first_or_random` algorithm, the load is evenly distributed among replicas that are still available. -## prefer_localhost_replica {#settings-prefer_localhost_replica} +## prefer\_localhost\_replica {#settings-prefer-localhost-replica} Enables/disables preferable using the localhost replica when processing distributed queries. Possible values: -- 1 — ClickHouse always sends a query to the localhost replica if it exists. -- 0 — ClickHouse uses the balancing strategy specified by the [load_balancing](#settings-load_balancing) setting. +- 1 — ClickHouse always sends a query to the localhost replica if it exists. +- 0 — ClickHouse uses the balancing strategy specified by the [load\_balancing](#settings-load_balancing) setting. Default value: 1. !!! warning "Warning" - Disable this setting if you use [max_parallel_replicas](#settings-max_parallel_replicas). + Disable this setting if you use [max\_parallel\_replicas](#settings-max_parallel_replicas). -## totals_mode +## totals\_mode {#totals-mode} -How to calculate TOTALS when HAVING is present, as well as when max_rows_to_group_by and group_by_overflow_mode = 'any' are present. -See the section "WITH TOTALS modifier". +How to calculate TOTALS when HAVING is present, as well as when max\_rows\_to\_group\_by and group\_by\_overflow\_mode = ‘any’ are present. +See the section “WITH TOTALS modifier”. -## totals_auto_threshold +## totals\_auto\_threshold {#totals-auto-threshold} The threshold for `totals_mode = 'auto'`. -See the section "WITH TOTALS modifier". +See the section “WITH TOTALS modifier”. -## max_parallel_replicas {#settings-max_parallel_replicas} +## max\_parallel\_replicas {#settings-max-parallel-replicas} The maximum number of replicas for each shard when executing a query. For consistency (to get different parts of the same data split), this option only works when the sampling key is set. Replica lag is not controlled. -## compile +## compile {#compile} Enable compilation of queries. By default, 0 (disabled). -Compilation is only used for part of the query-processing pipeline: for the first stage of aggregation (GROUP BY). +The compilation is only used for part of the query-processing pipeline: for the first stage of aggregation (GROUP BY). If this portion of the pipeline was compiled, the query may run faster due to deployment of short cycles and inlining aggregate function calls. The maximum performance improvement (up to four times faster in rare cases) is seen for queries with multiple simple aggregate functions. Typically, the performance gain is insignificant. In very rare cases, it may slow down query execution. -## min_count_to_compile +## min\_count\_to\_compile {#min-count-to-compile} How many times to potentially use a compiled chunk of code before running compilation. By default, 3. For testing, the value can be set to 0: compilation runs synchronously and the query waits for the end of the compilation process before continuing execution. 
For all other cases, use values starting with 1. Compilation normally takes about 5-10 seconds. If the value is 1 or more, compilation occurs asynchronously in a separate thread. The result will be used as soon as it is ready, including queries that are currently running. Compiled code is required for each different combination of aggregate functions used in the query and the type of keys in the GROUP BY clause. -The results of compilation are saved in the build directory in the form of .so files. There is no restriction on the number of compilation results, since they don't use very much space. Old results will be used after server restarts, except in the case of a server upgrade – in this case, the old results are deleted. +The results of the compilation are saved in the build directory in the form of .so files. There is no restriction on the number of compilation results since they don’t use very much space. Old results will be used after server restarts, except in the case of a server upgrade – in this case, the old results are deleted. -## output_format_json_quote_64bit_integers {#session_settings-output_format_json_quote_64bit_integers} +## output\_format\_json\_quote\_64bit\_integers {#session-settings-output-format-json-quote-64bit-integers} If the value is true, integers appear in quotes when using JSON\* Int64 and UInt64 formats (for compatibility with most JavaScript implementations); otherwise, integers are output without the quotes. -## format_csv_delimiter {#settings-format_csv_delimiter} +## format\_csv\_delimiter {#settings-format-csv-delimiter} The character interpreted as a delimiter in the CSV data. By default, the delimiter is `,`. -## input_format_csv_unquoted_null_literal_as_null {#settings-input_format_csv_unquoted_null_literal_as_null} +## input\_format\_csv\_unquoted\_null\_literal\_as\_null {#settings-input-format-csv-unquoted-null-literal-as-null} For the CSV input format, enables or disables parsing of unquoted `NULL` as a literal (synonym for `\N`). -## output_format_csv_crlf_end_of_line {#settings-output_format_csv_crlf_end_of_line} +## output\_format\_csv\_crlf\_end\_of\_line {#settings-output-format-csv-crlf-end-of-line} -Use DOS/Windows style line separator (CRLF) in CSV instead of Unix style (LF). +Use DOS/Windows-style line separator (CRLF) in CSV instead of Unix style (LF). -## output_format_tsv_crlf_end_of_line {#settings-output_format_tsv_crlf_end_of_line} +## output\_format\_tsv\_crlf\_end\_of\_line {#settings-output-format-tsv-crlf-end-of-line} -Use DOC/Windows style line separator (CRLF) in TSV instead of Unix style (LF). +Use DOS/Windows-style line separator (CRLF) in TSV instead of Unix style (LF). -## insert_quorum {#settings-insert_quorum} +## insert\_quorum {#settings-insert-quorum} -Enables quorum writes. +Enables quorum writes. -- If `insert_quorum < 2`, the quorum writes are disabled. -- If `insert_quorum >= 2`, the quorum writes are enabled. +- If `insert_quorum < 2`, the quorum writes are disabled. +- If `insert_quorum >= 2`, the quorum writes are enabled. Default value: 0. @@ -817,38 +808,37 @@ Quorum writes All the replicas in the quorum are consistent, i.e., they contain data from all previous `INSERT` queries. The `INSERT` sequence is linearized. -When reading the data written from the `insert_quorum`, you can use the [select_sequential_consistency](#settings-select_sequential_consistency) option. 
+When reading the data written from the `insert_quorum`, you can use the [select\_sequential\_consistency](#settings-select_sequential_consistency) option. ClickHouse generates an exception -- If the number of available replicas at the time of the query is less than the `insert_quorum`. -- At an attempt to write data when the previous block has not yet been inserted in the `insert_quorum` of replicas. This situation may occur if the user tries to perform an `INSERT` before the previous one with the `insert_quorum` is completed. +- If the number of available replicas at the time of the query is less than the `insert_quorum`. +- At an attempt to write data when the previous block has not yet been inserted in the `insert_quorum` of replicas. This situation may occur if the user tries to perform an `INSERT` before the previous one with the `insert_quorum` is completed. See also: -- [insert_quorum_timeout](#settings-insert_quorum_timeout) -- [select_sequential_consistency](#settings-select_sequential_consistency) +- [insert\_quorum\_timeout](#settings-insert_quorum_timeout) +- [select\_sequential\_consistency](#settings-select_sequential_consistency) -## insert_quorum_timeout {#settings-insert_quorum_timeout} +## insert\_quorum\_timeout {#settings-insert-quorum-timeout} -Quorum write timeout in seconds. If the timeout has passed and no write has taken place yet, ClickHouse will generate an exception and the client must repeat the query to write the same block to the same or any other replica. +Quorum write timeout in seconds. If the timeout has passed and no write has taken place yet, ClickHouse will generate an exception and the client must repeat the query to write the same block to the same or any other replica. Default value: 60 seconds. See also: -- [insert_quorum](#settings-insert_quorum) -- [select_sequential_consistency](#settings-select_sequential_consistency) +- [insert\_quorum](#settings-insert_quorum) +- [select\_sequential\_consistency](#settings-select_sequential_consistency) - -## select_sequential_consistency {#settings-select_sequential_consistency} +## select\_sequential\_consistency {#settings-select-sequential-consistency} Enables or disables sequential consistency for `SELECT` queries: Possible values: -- 0 — Disabled. -- 1 — Enabled. +- 0 — Disabled. +- 1 — Enabled. Default value: 0. @@ -858,245 +848,245 @@ When sequential consistency is enabled, ClickHouse allows the client to execute See also: -- [insert_quorum](#settings-insert_quorum) -- [insert_quorum_timeout](#settings-insert_quorum_timeout) +- [insert\_quorum](#settings-insert_quorum) +- [insert\_quorum\_timeout](#settings-insert_quorum_timeout) -## insert_deduplicate {#settings-insert_deduplicate} +## insert\_deduplicate {#settings-insert-deduplicate} -Enables or disables block deduplication of `INSERT` (for Replicated* tables). +Enables or disables block deduplication of `INSERT` (for Replicated\* tables). Possible values: -- 0 — Disabled. -- 1 — Enabled. +- 0 — Disabled. +- 1 — Enabled. Default value: 1. -By default, blocks inserted into replicated tables by the `INSERT` statement are deduplicated (see [Data Replication] (../ table_engines/replication.md). +By default, blocks inserted into replicated tables by the `INSERT` statement are deduplicated (see [Data Replication](../table_engines/replication.md)). 
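For instance, deduplication can be toggled for the current session with a `SET` query; this is a minimal sketch and assumes the target table uses a Replicated\* engine:

``` sql
-- Deduplication of inserted blocks is enabled by default for Replicated* tables;
-- turning it off means repeated identical INSERTs are no longer skipped.
SET insert_deduplicate = 0;

-- Restore the default behaviour.
SET insert_deduplicate = 1;
```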
-## deduplicate_blocks_in_dependent_materialized_views {#settings-deduplicate_blocks_in_dependent_materialized_views} +## deduplicate\_blocks\_in\_dependent\_materialized\_views {#settings-deduplicate-blocks-in-dependent-materialized-views} -Enables or disables the deduplication check for materialized views that receive data from Replicated* tables. +Enables or disables the deduplication check for materialized views that receive data from Replicated\* tables. Possible values: - 0 — Disabled. - 1 — Enabled. + 0 — Disabled. + 1 — Enabled. Default value: 0. Usage -By default, deduplication is not performed for materialized views, but is done upstream, in the source table. -If an INSERTed block is skipped due to deduplication in the source table, there will be no insertion into attached materialized views. This behavior exists to enable insertion of highly aggregated data into materialized views, for cases where inserted blocks are the same after materialized view aggregation but derived from different INSERTs into the source table. -At the same time, this behavior "breaks" `INSERT` idempotency. If an `INSERT` into the main table was successful and `INSERT` into a materialized view failed (e.g. because of communication failure with Zookeeper) a client will get an error and can retry the operation. However, the materialized view won't receive the second insert because it will be discarded by deduplication in the main (source) table. The setting `deduplicate_blocks_in_dependent_materialized_views` allows to change this behavior. On retry a materialized view will receive the repeat insert and will perform deduplication check by itself, -ignoring check result for the source table, and will insert rows lost because of first failure. +By default, deduplication is not performed for materialized views but is done upstream, in the source table. +If an INSERTed block is skipped due to deduplication in the source table, there will be no insertion into attached materialized views. This behaviour exists to enable insertion of highly aggregated data into materialized views, for cases where inserted blocks are the same after materialized view aggregation but derived from different INSERTs into the source table. +At the same time, this behaviour “breaks” `INSERT` idempotency. If an `INSERT` into the main table was successful and `INSERT` into a materialized view failed (e.g. because of communication failure with Zookeeper) a client will get an error and can retry the operation. However, the materialized view won’t receive the second insert because it will be discarded by deduplication in the main (source) table. The setting `deduplicate_blocks_in_dependent_materialized_views` allows for changing this behaviour. On retry, a materialized view will receive the repeat insert and will perform deduplication check by itself, +ignoring check result for the source table, and will insert rows lost because of the first failure. + +## max\_network\_bytes {#settings-max-network-bytes} -## max_network_bytes {#settings-max_network_bytes} Limits the data volume (in bytes) that is received or transmitted over the network when executing a query. This setting applies to every individual query. Possible values: -- Positive integer. -- 0 — Data volume control is disabled. +- Positive integer. +- 0 — Data volume control is disabled. Default value: 0. 
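For example, a cap can be applied to the current session with a `SET` query; the threshold below is only an illustrative value:

``` sql
-- Limit every query in this session to roughly 10 GB of network traffic.
SET max_network_bytes = 10000000000;
```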
-## max_network_bandwidth {#settings-max_network_bandwidth} +## max\_network\_bandwidth {#settings-max-network-bandwidth} Limits the speed of the data exchange over the network in bytes per second. This setting applies to every query. Possible values: -- Positive integer. -- 0 — Bandwidth control is disabled. +- Positive integer. +- 0 — Bandwidth control is disabled. Default value: 0. -## max_network_bandwidth_for_user {#settings-max_network_bandwidth_for_user} +## max\_network\_bandwidth\_for\_user {#settings-max-network-bandwidth-for-user} Limits the speed of the data exchange over the network in bytes per second. This setting applies to all concurrently running queries performed by a single user. Possible values: -- Positive integer. -- 0 — Control of the data speed is disabled. +- Positive integer. +- 0 — Control of the data speed is disabled. Default value: 0. -## max_network_bandwidth_for_all_users {#settings-max_network_bandwidth_for_all_users} +## max\_network\_bandwidth\_for\_all\_users {#settings-max-network-bandwidth-for-all-users} Limits the speed that data is exchanged at over the network in bytes per second. This setting applies to all concurrently running queries on the server. Possible values: -- Positive integer. -- 0 — Control of the data speed is disabled. +- Positive integer. +- 0 — Control of the data speed is disabled. Default value: 0. -## count_distinct_implementation {#settings-count_distinct_implementation} +## count\_distinct\_implementation {#settings-count-distinct-implementation} -Specifies which of the `uniq*` functions should be used to perform the [COUNT(DISTINCT ...)](../../query_language/agg_functions/reference.md#agg_function-count) construction. +Specifies which of the `uniq*` functions should be used to perform the [COUNT(DISTINCT …)](../../query_language/agg_functions/reference.md#agg_function-count) construction. Possible values: -- [uniq](../../query_language/agg_functions/reference.md#agg_function-uniq) -- [uniqCombined](../../query_language/agg_functions/reference.md#agg_function-uniqcombined) -- [uniqCombined64](../../query_language/agg_functions/reference.md#agg_function-uniqcombined64) -- [uniqHLL12](../../query_language/agg_functions/reference.md#agg_function-uniqhll12) -- [uniqExact](../../query_language/agg_functions/reference.md#agg_function-uniqexact) +- [uniq](../../query_language/agg_functions/reference.md#agg_function-uniq) +- [uniqCombined](../../query_language/agg_functions/reference.md#agg_function-uniqcombined) +- [uniqCombined64](../../query_language/agg_functions/reference.md#agg_function-uniqcombined64) +- [uniqHLL12](../../query_language/agg_functions/reference.md#agg_function-uniqhll12) +- [uniqExact](../../query_language/agg_functions/reference.md#agg_function-uniqexact) Default value: `uniqExact`. -## skip_unavailable_shards {#settings-skip_unavailable_shards} +## skip\_unavailable\_shards {#settings-skip-unavailable-shards} Enables or disables silently skipping of unavailable shards. Shard is considered unavailable if all its replicas are unavailable. A replica is unavailable in the following cases: -- ClickHouse can't connect to replica for any reason. +- ClickHouse can’t connect to replica for any reason. When connecting to a replica, ClickHouse performs several attempts. If all these attempts fail, the replica is considered unavailable. -- Replica can't be resolved through DNS. +- Replica can’t be resolved through DNS. 
- If replica's hostname can't be resolved through DNS, it can indicate the following situations: + If replica’s hostname can’t be resolved through DNS, it can indicate the following situations: - - Replica's host has no DNS record. It can occur in systems with dynamic DNS, for example, [Kubernetes](https://kubernetes.io), where nodes can be unresolvable during downtime, and this is not an error. + - Replica’s host has no DNS record. It can occur in systems with dynamic DNS, for example, [Kubernetes](https://kubernetes.io), where nodes can be unresolvable during downtime, and this is not an error. - - Configuration error. ClickHouse configuration file contains a wrong hostname. + - Configuration error. ClickHouse configuration file contains a wrong hostname. Possible values: -- 1 — skipping enabled. +- 1 — skipping enabled. - If a shard is unavailable, ClickHouse returns a result based on partial data and doesn't report node availability issues. + If a shard is unavailable, ClickHouse returns a result based on partial data and doesn’t report node availability issues. -- 0 — skipping disabled. +- 0 — skipping disabled. If a shard is unavailable, ClickHouse throws an exception. Default value: 0. -## optimize_skip_unused_shards {#settings-optimize_skip_unused_shards} +## optimize\_skip\_unused\_shards {#settings-optimize-skip-unused-shards} -Enables or disables skipping of unused shards for SELECT queries that has sharding key condition in PREWHERE/WHERE (assumes that the data is distributed by sharding key, otherwise do nothing). +Enables or disables skipping of unused shards for SELECT queries that have a sharding key condition in PREWHERE/WHERE (assumes that the data is distributed by the sharding key, otherwise the setting does nothing). Default value: 0 -## force_optimize_skip_unused_shards {#settings-force_optimize_skip_unused_shards} +## force\_optimize\_skip\_unused\_shards {#settings-force-optimize-skip-unused-shards} Enables or disables query execution if [`optimize_skip_unused_shards`](#settings-optimize_skip_unused_shards) is enabled and skipping of unused shards is not possible. If the skipping is not possible and the setting is enabled, an exception will be thrown. Possible values: -- 0 - Disabled (do not throws) -- 1 - Disable query execution only if the table has sharding key -- 2 - Disable query execution regardless sharding key is defined for the table +- 0 - Disabled (does not throw) +- 1 - Disable query execution only if the table has a sharding key +- 2 - Disable query execution regardless of whether a sharding key is defined for the table Default value: 0 -## optimize_throw_if_noop {#setting-optimize_throw_if_noop} +## optimize\_throw\_if\_noop {#setting-optimize-throw-if-noop} -Enables or disables throwing an exception if an [OPTIMIZE](../../query_language/misc.md#misc_operations-optimize) query didn't perform a merge. +Enables or disables throwing an exception if an [OPTIMIZE](../../query_language/misc.md#misc_operations-optimize) query didn’t perform a merge. -By default, `OPTIMIZE` returns successfully even if it didn't do anything. This setting lets you differentiate these situations and get the reason in an exception message. +By default, `OPTIMIZE` returns successfully even if it didn’t do anything. This setting lets you differentiate these situations and get the reason in an exception message. Possible values: -- 1 — Throwing an exception is enabled. -- 0 — Throwing an exception is disabled. +- 1 — Throwing an exception is enabled. +- 0 — Throwing an exception is disabled. Default value: 0. 
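A minimal sketch of the difference (the table name here is hypothetical):

``` sql
SET optimize_throw_if_noop = 1;

-- If no merge is performed, this statement now fails with an exception
-- explaining the reason, instead of silently returning Ok.
OPTIMIZE TABLE test.visits;
```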
+## distributed\_replica\_error\_half\_life {#settings-distributed-replica-error-half-life} -## distributed_replica_error_half_life {#settings-distributed_replica_error_half_life} +- Type: seconds +- Default value: 60 seconds -- Type: seconds -- Default value: 60 seconds - -Controls how fast errors in distributed tables are zeroed. If a replica is unavailabe for some time, accumulates 5 errors, and distributed_replica_error_half_life is set to 1 second, then the replica is considered normal 3 seconds after last error. +Controls how fast errors in distributed tables are zeroed. If a replica is unavailable for some time, accumulates 5 errors, and distributed\_replica\_error\_half\_life is set to 1 second, then the replica is considered normal 3 seconds after last error. See also: -- [Table engine Distributed](../../operations/table_engines/distributed.md) -- [distributed_replica_error_cap](#settings-distributed_replica_error_cap) +- [Table engine Distributed](../../operations/table_engines/distributed.md) +- [distributed\_replica\_error\_cap](#settings-distributed_replica_error_cap) -## distributed_replica_error_cap {#settings-distributed_replica_error_cap} +## distributed\_replica\_error\_cap {#settings-distributed-replica-error-cap} -- Type: unsigned int -- Default value: 1000 +- Type: unsigned int +- Default value: 1000 Error count of each replica is capped at this value, preventing a single replica from accumulating too many errors. See also: -- [Table engine Distributed](../../operations/table_engines/distributed.md) -- [distributed_replica_error_half_life](#settings-distributed_replica_error_half_life) +- [Table engine Distributed](../../operations/table_engines/distributed.md) +- [distributed\_replica\_error\_half\_life](#settings-distributed_replica_error_half_life) -## distributed_directory_monitor_sleep_time_ms {#distributed_directory_monitor_sleep_time_ms} +## distributed\_directory\_monitor\_sleep\_time\_ms {#distributed-directory-monitor-sleep-time-ms} Base interval for the [Distributed](../table_engines/distributed.md) table engine to send data. The actual interval grows exponentially in the event of errors. Possible values: -- Positive integer number of milliseconds. +- A positive integer number of milliseconds. Default value: 100 milliseconds. -## distributed_directory_monitor_max_sleep_time_ms {#distributed_directory_monitor_max_sleep_time_ms} +## distributed\_directory\_monitor\_max\_sleep\_time\_ms {#distributed-directory-monitor-max-sleep-time-ms} -Maximum interval for the [Distributed](../table_engines/distributed.md) table engine to send data. Limits exponential growth of the interval set in the [distributed_directory_monitor_sleep_time_ms](#distributed_directory_monitor_sleep_time_ms) setting. +Maximum interval for the [Distributed](../table_engines/distributed.md) table engine to send data. Limits exponential growth of the interval set in the [distributed\_directory\_monitor\_sleep\_time\_ms](#distributed_directory_monitor_sleep_time_ms) setting. Possible values: -- Positive integer number of milliseconds. +- A positive integer number of milliseconds. Default value: 30000 milliseconds (30 seconds). -## distributed_directory_monitor_batch_inserts {#distributed_directory_monitor_batch_inserts} +## distributed\_directory\_monitor\_batch\_inserts {#distributed-directory-monitor-batch-inserts} Enables/disables sending of inserted data in batches. 
-When batch sending is enabled, the [Distributed](../table_engines/distributed.md) table engine tries to send multiple files of inserted data in one operation instead of sending them separately. Batch sending improves cluster performance by better utilizing server and network resources. +When batch sending is enabled, the [Distributed](../table_engines/distributed.md) table engine tries to send multiple files of inserted data in one operation instead of sending them separately. Batch sending improves cluster performance by better-utilizing server and network resources. Possible values: -- 1 — Enabled. -- 0 — Disabled. +- 1 — Enabled. +- 0 — Disabled. Default value: 0. -## os_thread_priority {#setting-os_thread_priority} +## os\_thread\_priority {#setting-os-thread-priority} Sets the priority ([nice](https://en.wikipedia.org/wiki/Nice_(Unix))) for threads that execute queries. The OS scheduler considers this priority when choosing the next thread to run on each available CPU core. !!! warning "Warning" - To use this setting, you need to set the `CAP_SYS_NICE` capability. The `clickhouse-server` package sets it up during installation. Some virtual environments don't allow you to set the `CAP_SYS_NICE` capability. In this case, `clickhouse-server` shows a message about it at the start. + To use this setting, you need to set the `CAP_SYS_NICE` capability. The `clickhouse-server` package sets it up during installation. Some virtual environments don’t allow you to set the `CAP_SYS_NICE` capability. In this case, `clickhouse-server` shows a message about it at the start. Possible values: -- You can set values in the range `[-20, 19]`. +- You can set values in the range `[-20, 19]`. -Lower values mean higher priority. Threads with low `nice` priority values are executed more frequently than threads with high values. High values are preferable for long running non-interactive queries because it allows them to quickly give up resources in favor of short interactive queries when they arrive. +Lower values mean higher priority. Threads with low `nice` priority values are executed more frequently than threads with high values. High values are preferable for long-running non-interactive queries because it allows them to quickly give up resources in favour of short interactive queries when they arrive. Default value: 0. -## query_profiler_real_time_period_ns {#query_profiler_real_time_period_ns} +## query\_profiler\_real\_time\_period\_ns {#query-profiler-real-time-period-ns} Sets the period for a real clock timer of the [query profiler](../../operations/performance/sampling_query_profiler.md). Real clock timer counts wall-clock time. Possible values: -- Positive integer number, in nanoseconds. +- Positive integer number, in nanoseconds. Recommended values: - - 10000000 (100 times a second) nanoseconds and less for single queries. - - 1000000000 (once a second) for cluster-wide profiling. + - 10000000 (100 times a second) nanoseconds and less for single queries. + - 1000000000 (once a second) for cluster-wide profiling. -- 0 for turning off the timer. +- 0 for turning off the timer. Type: [UInt64](../../data_types/int_uint.md). @@ -1104,22 +1094,22 @@ Default value: 1000000000 nanoseconds (once a second). 
See also: -- System table [trace_log](../system_tables.md#system_tables-trace_log) +- System table [trace\_log](../system_tables.md#system_tables-trace_log) -## query_profiler_cpu_time_period_ns {#query_profiler_cpu_time_period_ns} +## query\_profiler\_cpu\_time\_period\_ns {#query-profiler-cpu-time-period-ns} Sets the period for a CPU clock timer of the [query profiler](../../operations/performance/sampling_query_profiler.md). This timer counts only CPU time. Possible values: -- Positive integer number of nanoseconds. +- A positive integer number of nanoseconds. Recommended values: - - 10000000 (100 times a second) nanosecods and more for for single queries. - - 1000000000 (once a second) for cluster-wide profiling. + - 10000000 (100 times a second) nanoseconds and more for single queries. + - 1000000000 (once a second) for cluster-wide profiling. -- 0 for turning off the timer. +- 0 for turning off the timer. Type: [UInt64](../../data_types/int_uint.md). @@ -1127,39 +1117,39 @@ Default value: 1000000000 nanoseconds. See also: -- System table [trace_log](../system_tables.md#system_tables-trace_log) +- System table [trace\_log](../system_tables.md#system_tables-trace_log) -## allow_introspection_functions {#settings-allow_introspection_functions} +## allow\_introspection\_functions {#settings-allow-introspection-functions} Enables or disables [introspection functions](../../query_language/functions/introspection.md) for query profiling. Possible values: -- 1 — Introspection functions enabled. -- 0 — Introspection functions disabled. +- 1 — Introspection functions enabled. +- 0 — Introspection functions disabled. Default value: 0. **See Also** -- [Sampling Query Profiler](../performance/sampling_query_profiler.md) -- System table [trace_log](../system_tables.md#system_tables-trace_log) +- [Sampling Query Profiler](../performance/sampling_query_profiler.md) +- System table [trace\_log](../system_tables.md#system_tables-trace_log) -## input_format_parallel_parsing +## input\_format\_parallel\_parsing {#input-format-parallel-parsing} -- Type: bool -- Default value: True +- Type: bool +- Default value: True Enables order-preserving parallel parsing of data formats. Supported only for TSV, TSKV, CSV and JSONEachRow formats. -## min_chunk_bytes_for_parallel_parsing +## min\_chunk\_bytes\_for\_parallel\_parsing {#min-chunk-bytes-for-parallel-parsing} -- Type: unsigned int -- Default value: 1 MiB +- Type: unsigned int +- Default value: 1 MiB The minimum chunk size in bytes that each thread will parse in parallel. -## output_format_avro_codec {#settings-output_format_avro_codec} +## output\_format\_avro\_codec {#settings-output-format-avro-codec} Sets the compression codec used for output Avro file. @@ -1167,13 +1157,13 @@ Type: string Possible values: -- `null` — No compression -- `deflate` — Compress with Deflate (zlib) -- `snappy` — Compress with [Snappy](https://google.github.io/snappy/) +- `null` — No compression +- `deflate` — Compress with Deflate (zlib) +- `snappy` — Compress with [Snappy](https://google.github.io/snappy/) Default value: `snappy` (if available) or `deflate`. -## output_format_avro_sync_interval {#settings-output_format_avro_sync_interval} +## output\_format\_avro\_sync\_interval {#settings-output-format-avro-sync-interval} Sets minimum data size (in bytes) between synchronization markers for output Avro file. 
@@ -1183,7 +1173,7 @@ Possible values: 32 (32 bytes) - 1073741824 (1 GiB) Default value: 32768 (32 KiB) -## format_avro_schema_registry_url {#settings-format_avro_schema_registry_url} +## format\_avro\_schema\_registry\_url {#settings-format-avro-schema-registry-url} Sets Confluent Schema Registry URL to use with [AvroConfluent](../../interfaces/formats.md#data-format-avro-confluent) format @@ -1191,5 +1181,4 @@ Type: URL Default value: Empty - [Original article](https://clickhouse.tech/docs/en/operations/settings/settings/) diff --git a/docs/en/operations/settings/settings_profiles.md b/docs/en/operations/settings/settings_profiles.md index 604206bf9da..09976eda14e 100644 --- a/docs/en/operations/settings/settings_profiles.md +++ b/docs/en/operations/settings/settings_profiles.md @@ -1,5 +1,4 @@ - -# Settings profiles +# Settings Profiles {#settings-profiles} A settings profile is a collection of settings grouped under the same name. Each ClickHouse user has a profile. To apply all the settings in a profile, set the `profile` setting. @@ -16,7 +15,7 @@ Settings profiles are declared in the user config file. This is usually `users.x Example: -```xml +``` xml @@ -62,5 +61,4 @@ The example specifies two profiles: `default` and `web`. The `default` profile h Settings profiles can inherit from each other. To use inheritance, indicate one or multiple `profile` settings before the other settings that are listed in the profile. In case when one setting is defined in different profiles, the latest defined is used. - [Original article](https://clickhouse.tech/docs/en/operations/settings/settings_profiles/) diff --git a/docs/en/operations/settings/settings_users.md b/docs/en/operations/settings/settings_users.md index c30786759e6..08bd5a4c4e7 100644 --- a/docs/en/operations/settings/settings_users.md +++ b/docs/en/operations/settings/settings_users.md @@ -1,10 +1,10 @@ -# User settings +# User Settings {#user-settings} The `users` section of the `user.xml` configuration file contains user settings. Structure of the `users` section: -```xml +``` xml @@ -31,59 +31,55 @@ Structure of the `users` section: ``` -### user_name/password +### user\_name/password {#user-namepassword} Password can be specified in plaintext or in SHA256 (hex format). -- To assign a password in plaintext (**not recommended**), place it in a `password` element. +- To assign a password in plaintext (**not recommended**), place it in a `password` element. For example, `qwerty`. The password can be left blank. -- To assign a password using its SHA256 hash, place it in a `password_sha256_hex` element. +- To assign a password using its SHA256 hash, place it in a `password_sha256_hex` element. For example, `65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5`. Example of how to generate a password from shell: - ``` - PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-' - ``` + PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-' The first line of the result is the password. The second line is the corresponding SHA256 hash. -- For compatibility with MySQL clients, password can be specified in double SHA1 hash. Place it in `password_double_sha1_hex` element. +- For compatibility with MySQL clients, password can be specified in double SHA1 hash. Place it in `password_double_sha1_hex` element. For example, `08b4a0f1de6ad37da17359e592c8d74788a83eb0`. 
Example of how to generate a password from shell: - ``` - PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-' - ``` + PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-' The first line of the result is the password. The second line is the corresponding double SHA1 hash. -### user_name/networks +### user\_name/networks {#user-namenetworks} List of networks from which the user can connect to the ClickHouse server. Each element of the list can have one of the following forms: -- `` — IP address or network mask. +- `` — IP address or network mask. Examples: `213.180.204.3`, `10.0.0.1/8`, `10.0.0.1/255.255.255.0`, `2a02:6b8::3`, `2a02:6b8::3/64`, `2a02:6b8::3/ffff:ffff:ffff:ffff::`. -- `` — Hostname. +- `` — Hostname. Example: `example01.host.ru`. To check access, a DNS query is performed, and all returned IP addresses are compared to the peer address. -- `` — Regular expression for hostnames. +- `` — Regular expression for hostnames. Example: `^example\d\d-\d\d-\d\.host\.ru$` @@ -95,33 +91,32 @@ All results of DNS requests are cached until the server restarts. To open access for a user from any network, specify: -```xml +``` xml ::/0 ``` !!! warning "Warning" - It's insecure to open access from any network unless you have a firewall properly configured or the server is not directly connected to Internet. - + It’s insecure to open access from any network unless you have a firewall properly configured or the server is not directly connected to the Internet. To open access only from localhost, specify: -```xml +``` xml ::1 127.0.0.1 ``` -### user_name/profile +### user\_name/profile {#user-nameprofile} You can assign a settings profile for the user. Settings profiles are configured in a separate section of the `users.xml` file. For more information, see [Profiles of Settings](settings_profiles.md). -### user_name/quota +### user\_name/quota {#user-namequota} Quotas allow you to track or limit resource usage over a period of time. Quotas are configured in the `quotas` section of the `users.xml` configuration file. You can assign a quota set for the user. For a detailed description of quotas configuration, see [Quotas](../quotas.md#quotas). -### user_name/databases +### user\_name/databases {#user-namedatabases} In this section, you can limit rows that are returned by ClickHouse for `SELECT` queries made by the current user, thus implementing basic row-level security. @@ -129,7 +124,7 @@ In this section, you can limit rows that are returned by ClickHouse for The following configuration ensures that user `user1` can only see the rows of `table1` as the result of `SELECT` queries, where the value of the `id` field is 1000. -```xml +``` xml diff --git a/docs/en/operations/system_tables.md b/docs/en/operations/system_tables.md index 0eb1b8d67f6..6ff95dcfa03 100644 --- a/docs/en/operations/system_tables.md +++ b/docs/en/operations/system_tables.md @@ -1,27 +1,27 @@ -# System tables +# System tables {#system-tables} -System tables are used for implementing part of the system's functionality, and for providing access to information about how the system is working. -You can't delete a system table (but you can perform DETACH). -System tables don't have files with data on the disk or files with metadata. The server creates all the system tables when it starts. 
+System tables are used for implementing part of the system’s functionality, and for providing access to information about how the system is working. +You can’t delete a system table (but you can perform DETACH). +System tables don’t have files with data on the disk or files with metadata. The server creates all the system tables when it starts. System tables are read-only. -They are located in the 'system' database. +They are located in the ‘system’ database. -## system.asynchronous_metrics {#system_tables-asynchronous_metrics} +## system.asynchronous\_metrics {#system-tables-asynchronous-metrics} Contains metrics that are calculated periodically in the background. For example, the amount of RAM in use. Columns: -- `metric` ([String](../data_types/string.md)) — Metric name. -- `value` ([Float64](../data_types/float.md)) — Metric value. +- `metric` ([String](../data_types/string.md)) — Metric name. +- `value` ([Float64](../data_types/float.md)) — Metric value. **Example** -```sql +``` sql SELECT * FROM system.asynchronous_metrics LIMIT 10 ``` -```text +``` text ┌─metric──────────────────────────────────┬──────value─┐ │ jemalloc.background_thread.run_interval │ 0 │ │ jemalloc.background_thread.num_runs │ 0 │ @@ -38,38 +38,37 @@ SELECT * FROM system.asynchronous_metrics LIMIT 10 **See Also** -- [Monitoring](monitoring.md) — Base concepts of ClickHouse monitoring. -- [system.metrics](#system_tables-metrics) — Contains instantly calculated metrics. -- [system.events](#system_tables-events) — Contains a number of events that have occurred. -- [system.metric_log](#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` и `system.events`. +- [Monitoring](monitoring.md) — Base concepts of ClickHouse monitoring. +- [system.metrics](#system_tables-metrics) — Contains instantly calculated metrics. +- [system.events](#system_tables-events) — Contains a number of events that have occurred. +- [system.metric\_log](#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` и `system.events`. -## system.clusters +## system.clusters {#system-clusters} Contains information about clusters available in the config file and the servers in them. Columns: -- `cluster` (String) — The cluster name. -- `shard_num` (UInt32) — The shard number in the cluster, starting from 1. -- `shard_weight` (UInt32) — The relative weight of the shard when writing data. -- `replica_num` (UInt32) — The replica number in the shard, starting from 1. -- `host_name` (String) — The host name, as specified in the config. -- `host_address` (String) — The host IP address obtained from DNS. -- `port` (UInt16) — The port to use for connecting to the server. -- `user` (String) — The name of the user for connecting to the server. -- `errors_count` (UInt32) - number of times this host failed to reach replica. -- `estimated_recovery_time` (UInt32) - seconds left until replica error count is zeroed and it is considered to be back to normal. - +- `cluster` (String) — The cluster name. +- `shard_num` (UInt32) — The shard number in the cluster, starting from 1. +- `shard_weight` (UInt32) — The relative weight of the shard when writing data. +- `replica_num` (UInt32) — The replica number in the shard, starting from 1. +- `host_name` (String) — The host name, as specified in the config. +- `host_address` (String) — The host IP address obtained from DNS. +- `port` (UInt16) — The port to use for connecting to the server. 
+- `user` (String) — The name of the user for connecting to the server. +- `errors_count` (UInt32) - number of times this host failed to reach replica. +- `estimated_recovery_time` (UInt32) - seconds left until replica error count is zeroed and it is considered to be back to normal. Please note that `errors_count` is updated once per query to the cluster, but `estimated_recovery_time` is recalculated on-demand. So there could be a case of non-zero `errors_count` and zero `estimated_recovery_time`, that next query will zero `errors_count` and try to use replica as if it has no errors. **See also** -- [Table engine Distributed](table_engines/distributed.md) -- [distributed_replica_error_cap setting](settings/settings.md#settings-distributed_replica_error_cap) -- [distributed_replica_error_half_life setting](settings/settings.md#settings-distributed_replica_error_half_life) +- [Table engine Distributed](table_engines/distributed.md) +- [distributed\_replica\_error\_cap setting](settings/settings.md#settings-distributed_replica_error_cap) +- [distributed\_replica\_error\_half\_life setting](settings/settings.md#settings-distributed_replica_error_half_life) -## system.columns +## system.columns {#system-columns} Contains information about columns in all the tables. @@ -77,36 +76,36 @@ You can use this table to get information similar to the [DESCRIBE TABLE](../que The `system.columns` table contains the following columns (the column type is shown in brackets): -- `database` (String) — Database name. -- `table` (String) — Table name. -- `name` (String) — Column name. -- `type` (String) — Column type. -- `default_kind` (String) — Expression type (`DEFAULT`, `MATERIALIZED`, `ALIAS`) for the default value, or an empty string if it is not defined. -- `default_expression` (String) — Expression for the default value, or an empty string if it is not defined. -- `data_compressed_bytes` (UInt64) — The size of compressed data, in bytes. -- `data_uncompressed_bytes` (UInt64) — The size of decompressed data, in bytes. -- `marks_bytes` (UInt64) — The size of marks, in bytes. -- `comment` (String) — Comment on the column, or an empty string if it is not defined. -- `is_in_partition_key` (UInt8) — Flag that indicates whether the column is in the partition expression. -- `is_in_sorting_key` (UInt8) — Flag that indicates whether the column is in the sorting key expression. -- `is_in_primary_key` (UInt8) — Flag that indicates whether the column is in the primary key expression. -- `is_in_sampling_key` (UInt8) — Flag that indicates whether the column is in the sampling key expression. +- `database` (String) — Database name. +- `table` (String) — Table name. +- `name` (String) — Column name. +- `type` (String) — Column type. +- `default_kind` (String) — Expression type (`DEFAULT`, `MATERIALIZED`, `ALIAS`) for the default value, or an empty string if it is not defined. +- `default_expression` (String) — Expression for the default value, or an empty string if it is not defined. +- `data_compressed_bytes` (UInt64) — The size of compressed data, in bytes. +- `data_uncompressed_bytes` (UInt64) — The size of decompressed data, in bytes. +- `marks_bytes` (UInt64) — The size of marks, in bytes. +- `comment` (String) — Comment on the column, or an empty string if it is not defined. +- `is_in_partition_key` (UInt8) — Flag that indicates whether the column is in the partition expression. +- `is_in_sorting_key` (UInt8) — Flag that indicates whether the column is in the sorting key expression. 
+- `is_in_primary_key` (UInt8) — Flag that indicates whether the column is in the primary key expression. +- `is_in_sampling_key` (UInt8) — Flag that indicates whether the column is in the sampling key expression. -## system.contributors {#system_contributors} +## system.contributors {#system-contributors} Contains information about contributors. All contributors are listed in random order. The order is random at query execution time. Columns: -- `name` (String) — Contributor (author) name from git log. +- `name` (String) — Contributor (author) name from git log. **Example** -```sql +``` sql SELECT * FROM system.contributors LIMIT 10 ``` -```text + +``` text ┌─name─────────────┐ │ Olga Khvostikova │ │ Max Vetrov │ @@ -123,64 +122,65 @@ SELECT * FROM system.contributors LIMIT 10 To find yourself in the table, use a query: -```sql +``` sql SELECT * FROM system.contributors WHERE name='Olga Khvostikova' ``` -```text + +``` text ┌─name─────────────┐ │ Olga Khvostikova │ └──────────────────┘ ``` -## system.databases +## system.databases {#system-databases} -This table contains a single String column called 'name' – the name of a database. +This table contains a single String column called ‘name’ – the name of a database. Each database that the server knows about has a corresponding entry in the table. This system table is used for implementing the `SHOW DATABASES` query. -## system.detached_parts {#system_tables-detached_parts} +## system.detached\_parts {#system-tables-detached-parts} -Contains information about detached parts of [MergeTree](table_engines/mergetree.md) tables. The `reason` column specifies why the part was detached. For user-detached parts, the reason is empty. Such parts can be attached with [ALTER TABLE ATTACH PARTITION|PART](../query_language/query_language/alter/#alter_attach-partition) command. For the description of other columns, see [system.parts](#system_tables-parts). If part name is invalid, values of some columns may be `NULL`. Such parts can be deleted with [ALTER TABLE DROP DETACHED PART](../query_language/query_language/alter/#alter_drop-detached). +Contains information about detached parts of [MergeTree](table_engines/mergetree.md) tables. The `reason` column specifies why the part was detached. For user-detached parts, the reason is empty. Such parts can be attached with the [ALTER TABLE ATTACH PARTITION\|PART](../query_language/query_language/alter/#alter_attach-partition) command. For the description of other columns, see [system.parts](#system_tables-parts). If the part name is invalid, values of some columns may be `NULL`. Such parts can be deleted with [ALTER TABLE DROP DETACHED PART](../query_language/query_language/alter/#alter_drop-detached). -## system.dictionaries +## system.dictionaries {#system-dictionaries} Contains information about external dictionaries. Columns: -- `name` (String) — Dictionary name. -- `type` (String) — Dictionary type: Flat, Hashed, Cache. -- `origin` (String) — Path to the configuration file that describes the dictionary. -- `attribute.names` (Array(String)) — Array of attribute names provided by the dictionary. -- `attribute.types` (Array(String)) — Corresponding array of attribute types that are provided by the dictionary. -- `has_hierarchy` (UInt8) — Whether the dictionary is hierarchical. -- `bytes_allocated` (UInt64) — The amount of RAM the dictionary uses. -- `hit_rate` (Float64) — For cache dictionaries, the percentage of uses for which the value was in the cache. 
-- `element_count` (UInt64) — The number of items stored in the dictionary. -- `load_factor` (Float64) — The percentage filled in the dictionary (for a hashed dictionary, the percentage filled in the hash table). -- `creation_time` (DateTime) — The time when the dictionary was created or last successfully reloaded. -- `last_exception` (String) — Text of the error that occurs when creating or reloading the dictionary if the dictionary couldn't be created. -- `source` (String) — Text describing the data source for the dictionary. +- `name` (String) — Dictionary name. +- `type` (String) — Dictionary type: Flat, Hashed, Cache. +- `origin` (String) — Path to the configuration file that describes the dictionary. +- `attribute.names` (Array(String)) — Array of attribute names provided by the dictionary. +- `attribute.types` (Array(String)) — Corresponding array of attribute types that are provided by the dictionary. +- `has_hierarchy` (UInt8) — Whether the dictionary is hierarchical. +- `bytes_allocated` (UInt64) — The amount of RAM the dictionary uses. +- `hit_rate` (Float64) — For cache dictionaries, the percentage of uses for which the value was in the cache. +- `element_count` (UInt64) — The number of items stored in the dictionary. +- `load_factor` (Float64) — The percentage filled in the dictionary (for a hashed dictionary, the percentage filled in the hash table). +- `creation_time` (DateTime) — The time when the dictionary was created or last successfully reloaded. +- `last_exception` (String) — Text of the error that occurs when creating or reloading the dictionary if the dictionary couldn’t be created. +- `source` (String) — Text describing the data source for the dictionary. Note that the amount of memory used by the dictionary is not proportional to the number of items stored in it. So for flat and cached dictionaries, all the memory cells are pre-assigned, regardless of how full the dictionary actually is. -## system.events {#system_tables-events} +## system.events {#system-tables-events} Contains information about the number of events that have occurred in the system. For example, in the table, you can find how many `SELECT` queries were processed since the ClickHouse server started. Columns: -- `event` ([String](../data_types/string.md)) — Event name. -- `value` ([UInt64](../data_types/int_uint.md)) — Number of events occurred. -- `description` ([String](../data_types/string.md)) — Event description. +- `event` ([String](../data_types/string.md)) — Event name. +- `value` ([UInt64](../data_types/int_uint.md)) — Number of events occurred. +- `description` ([String](../data_types/string.md)) — Event description. **Example** -```sql +``` sql SELECT * FROM system.events LIMIT 5 ``` -```text +``` text ┌─event─────────────────────────────────┬─value─┬─description────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ Query │ 12 │ Number of queries to be interpreted and potentially executed. Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries. │ │ SelectQuery │ 8 │ Same as Query, but only for SELECT queries. 
│ @@ -192,76 +192,75 @@ SELECT * FROM system.events LIMIT 5 **See Also** -- [system.asynchronous_metrics](#system_tables-asynchronous_metrics) — Contains periodically calculated metrics. -- [system.metrics](#system_tables-metrics) — Contains instantly calculated metrics. -- [system.metric_log](#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` и `system.events`. -- [Monitoring](monitoring.md) — Base concepts of ClickHouse monitoring. +- [system.asynchronous\_metrics](#system_tables-asynchronous_metrics) — Contains periodically calculated metrics. +- [system.metrics](#system_tables-metrics) — Contains instantly calculated metrics. +- [system.metric\_log](#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` и `system.events`. +- [Monitoring](monitoring.md) — Base concepts of ClickHouse monitoring. -## system.functions +## system.functions {#system-functions} Contains information about normal and aggregate functions. Columns: -- `name`(`String`) – The name of the function. -- `is_aggregate`(`UInt8`) — Whether the function is aggregate. +- `name`(`String`) – The name of the function. +- `is_aggregate`(`UInt8`) — Whether the function is aggregate. -## system.graphite_retentions +## system.graphite\_retentions {#system-graphite-retentions} -Contains information about parameters [graphite_rollup](server_settings/settings.md#server_settings-graphite_rollup) which are used in tables with [\*GraphiteMergeTree](table_engines/graphitemergetree.md) engines. +Contains information about parameters [graphite\_rollup](server_settings/settings.md#server_settings-graphite_rollup) which are used in tables with [\*GraphiteMergeTree](table_engines/graphitemergetree.md) engines. Columns: -- `config_name` (String) - `graphite_rollup` parameter name. -- `regexp` (String) - A pattern for the metric name. -- `function` (String) - The name of the aggregating function. -- `age` (UInt64) - The minimum age of the data in seconds. -- `precision` (UInt64) - How precisely to define the age of the data in seconds. -- `priority` (UInt16) - Pattern priority. -- `is_default` (UInt8) - Whether the pattern is the default. -- `Tables.database` (Array(String)) - Array of names of database tables that use the `config_name` parameter. -- `Tables.table` (Array(String)) - Array of table names that use the `config_name` parameter. +- `config_name` (String) - `graphite_rollup` parameter name. +- `regexp` (String) - A pattern for the metric name. +- `function` (String) - The name of the aggregating function. +- `age` (UInt64) - The minimum age of the data in seconds. +- `precision` (UInt64) - How precisely to define the age of the data in seconds. +- `priority` (UInt16) - Pattern priority. +- `is_default` (UInt8) - Whether the pattern is the default. +- `Tables.database` (Array(String)) - Array of names of database tables that use the `config_name` parameter. +- `Tables.table` (Array(String)) - Array of table names that use the `config_name` parameter. - -## system.merges +## system.merges {#system-merges} Contains information about merges and part mutations currently in process for tables in the MergeTree family. Columns: -- `database` (String) — The name of the database the table is in. -- `table` (String) — Table name. -- `elapsed` (Float64) — The time elapsed (in seconds) since the merge started. -- `progress` (Float64) — The percentage of completed work from 0 to 1. -- `num_parts` (UInt64) — The number of pieces to be merged. 
-- `result_part_name` (String) — The name of the part that will be formed as the result of merging. -- `is_mutation` (UInt8) - 1 if this process is a part mutation. -- `total_size_bytes_compressed` (UInt64) — The total size of the compressed data in the merged chunks. -- `total_size_marks` (UInt64) — The total number of marks in the merged parts. -- `bytes_read_uncompressed` (UInt64) — Number of bytes read, uncompressed. -- `rows_read` (UInt64) — Number of rows read. -- `bytes_written_uncompressed` (UInt64) — Number of bytes written, uncompressed. -- `rows_written` (UInt64) — Number of rows written. +- `database` (String) — The name of the database the table is in. +- `table` (String) — Table name. +- `elapsed` (Float64) — The time elapsed (in seconds) since the merge started. +- `progress` (Float64) — The percentage of completed work from 0 to 1. +- `num_parts` (UInt64) — The number of pieces to be merged. +- `result_part_name` (String) — The name of the part that will be formed as the result of merging. +- `is_mutation` (UInt8) - 1 if this process is a part mutation. +- `total_size_bytes_compressed` (UInt64) — The total size of the compressed data in the merged chunks. +- `total_size_marks` (UInt64) — The total number of marks in the merged parts. +- `bytes_read_uncompressed` (UInt64) — Number of bytes read, uncompressed. +- `rows_read` (UInt64) — Number of rows read. +- `bytes_written_uncompressed` (UInt64) — Number of bytes written, uncompressed. +- `rows_written` (UInt64) — Number of rows written. -## system.metrics {#system_tables-metrics} +## system.metrics {#system-tables-metrics} Contains metrics which can be calculated instantly, or have a current value. For example, the number of simultaneously processed queries or the current replica delay. This table is always up to date. Columns: -- `metric` ([String](../data_types/string.md)) — Metric name. -- `value` ([Int64](../data_types/int_uint.md)) — Metric value. -- `description` ([String](../data_types/string.md)) — Metric description. +- `metric` ([String](../data_types/string.md)) — Metric name. +- `value` ([Int64](../data_types/int_uint.md)) — Metric value. +- `description` ([String](../data_types/string.md)) — Metric description. The list of supported metrics you can find in the [dbms/src/Common/CurrentMetrics.cpp](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Common/CurrentMetrics.cpp) source file of ClickHouse. **Example** -```sql +``` sql SELECT * FROM system.metrics LIMIT 10 ``` -```text +``` text ┌─metric─────────────────────┬─value─┬─description──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ Query │ 1 │ Number of executing queries │ │ Merge │ 0 │ Number of executing background merges │ @@ -278,16 +277,17 @@ SELECT * FROM system.metrics LIMIT 10 **See Also** -- [system.asynchronous_metrics](#system_tables-asynchronous_metrics) — Contains periodically calculated metrics. -- [system.events](#system_tables-events) — Contains a number of events that occurred. -- [system.metric_log](#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` и `system.events`. -- [Monitoring](monitoring.md) — Base concepts of ClickHouse monitoring. +- [system.asynchronous\_metrics](#system_tables-asynchronous_metrics) — Contains periodically calculated metrics. +- [system.events](#system_tables-events) — Contains a number of events that occurred. 
+- [system.metric\_log](#system_tables-metric_log) — Contains a history of metrics values from tables `system.metrics` и `system.events`. +- [Monitoring](monitoring.md) — Base concepts of ClickHouse monitoring. -## system.metric_log {#system_tables-metric_log} +## system.metric\_log {#system-tables-metric-log} Contains history of metrics values from tables `system.metrics` and `system.events`, periodically flushed to disk. To turn on metrics history collection on `system.metric_log`, create `/etc/clickhouse-server/config.d/metric_log.xml` with following content: -```xml + +``` xml system @@ -300,12 +300,11 @@ To turn on metrics history collection on `system.metric_log`, create `/etc/click **Example** -```sql +``` sql SELECT * FROM system.metric_log LIMIT 1 FORMAT Vertical; ``` - -```text +``` text Row 1: ────── event_date: 2020-02-18 @@ -329,34 +328,34 @@ CurrentMetric_PartMutation: 0 CurrentMetric_ReplicatedFetch: 0 CurrentMetric_ReplicatedSend: 0 CurrentMetric_ReplicatedChecks: 0 -... +... ``` **See also** -- [system.asynchronous_metrics](#system_tables-asynchronous_metrics) — Contains periodically calculated metrics. -- [system.events](#system_tables-events) — Contains a number of events that occurred. -- [system.metrics](#system_tables-metrics) — Contains instantly calculated metrics. -- [Monitoring](monitoring.md) — Base concepts of ClickHouse monitoring. +- [system.asynchronous\_metrics](#system_tables-asynchronous_metrics) — Contains periodically calculated metrics. +- [system.events](#system_tables-events) — Contains a number of events that occurred. +- [system.metrics](#system_tables-metrics) — Contains instantly calculated metrics. +- [Monitoring](monitoring.md) — Base concepts of ClickHouse monitoring. -## system.numbers +## system.numbers {#system-numbers} -This table contains a single UInt64 column named 'number' that contains almost all the natural numbers starting from zero. +This table contains a single UInt64 column named ‘number’ that contains almost all the natural numbers starting from zero. You can use this table for tests, or if you need to do a brute force search. Reads from this table are not parallelized. -## system.numbers_mt +## system.numbers\_mt {#system-numbers-mt} -The same as 'system.numbers' but reads are parallelized. The numbers can be returned in any order. +The same as ‘system.numbers’ but reads are parallelized. The numbers can be returned in any order. Used for tests. -## system.one +## system.one {#system-one} -This table contains a single row with a single 'dummy' UInt8 column containing the value 0. -This table is used if a SELECT query doesn't specify the FROM clause. +This table contains a single row with a single ‘dummy’ UInt8 column containing the value 0. +This table is used if a SELECT query doesn’t specify the FROM clause. This is similar to the DUAL table found in other DBMSs. -## system.parts {#system_tables-parts} +## system.parts {#system-tables-parts} Contains information about parts of [MergeTree](table_engines/mergetree.md) tables. @@ -364,308 +363,342 @@ Each row describes one data part. Columns: -- `partition` (String) – The partition name. To learn what a partition is, see the description of the [ALTER](../query_language/alter.md#query_language_queries_alter) query. +- `partition` (String) – The partition name. To learn what a partition is, see the description of the [ALTER](../query_language/alter.md#query_language_queries_alter) query. Formats: - - `YYYYMM` for automatic partitioning by month. 
- - `any_string` when partitioning manually. + - `YYYYMM` for automatic partitioning by month. + - `any_string` when partitioning manually. -- `name` (`String`) – Name of the data part. -- `active` (`UInt8`) – Flag that indicates whether the data part is active. If a data part is active, it's used in a table. Otherwise, it's deleted. Inactive data parts remain after merging. -- `marks` (`UInt64`) – The number of marks. To get the approximate number of rows in a data part, multiply `marks` by the index granularity (usually 8192) (this hint doesn't work for adaptive granularity). -- `rows` (`UInt64`) – The number of rows. -- `bytes_on_disk` (`UInt64`) – Total size of all the data part files in bytes. -- `data_compressed_bytes` (`UInt64`) – Total size of compressed data in the data part. All the auxiliary files (for example, files with marks) are not included. -- `data_uncompressed_bytes` (`UInt64`) – Total size of uncompressed data in the data part. All the auxiliary files (for example, files with marks) are not included. -- `marks_bytes` (`UInt64`) – The size of the file with marks. -- `modification_time` (`DateTime`) – The time the directory with the data part was modified. This usually corresponds to the time of data part creation.| -- `remove_time` (`DateTime`) – The time when the data part became inactive. -- `refcount` (`UInt32`) – The number of places where the data part is used. A value greater than 2 indicates that the data part is used in queries or merges. -- `min_date` (`Date`) – The minimum value of the date key in the data part. -- `max_date` (`Date`) – The maximum value of the date key in the data part. -- `min_time` (`DateTime`) – The minimum value of the date and time key in the data part. -- `max_time`(`DateTime`) – The maximum value of the date and time key in the data part. -- `partition_id` (`String`) – ID of the partition. -- `min_block_number` (`UInt64`) – The minimum number of data parts that make up the current part after merging. -- `max_block_number` (`UInt64`) – The maximum number of data parts that make up the current part after merging. -- `level` (`UInt32`) – Depth of the merge tree. Zero means that the current part was created by insert rather than by merging other parts. -- `data_version` (`UInt64`) – Number that is used to determine which mutations should be applied to the data part (mutations with a version higher than `data_version`). -- `primary_key_bytes_in_memory` (`UInt64`) – The amount of memory (in bytes) used by primary key values. -- `primary_key_bytes_in_memory_allocated` (`UInt64`) – The amount of memory (in bytes) reserved for primary key values. -- `is_frozen` (`UInt8`) – Flag that shows that a partition data backup exists. 1, the backup exists. 0, the backup doesn't exist. For more details, see [FREEZE PARTITION](../query_language/alter.md#alter_freeze-partition) -- `database` (`String`) – Name of the database. -- `table` (`String`) – Name of the table. -- `engine` (`String`) – Name of the table engine without parameters. -- `path` (`String`) – Absolute path to the folder with data part files. -- `disk` (`String`) – Name of a disk that stores the data part. -- `hash_of_all_files` (`String`) – [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) of compressed files. -- `hash_of_uncompressed_files` (`String`) – [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) of uncompressed files (files with marks, index file etc.). 
-- `uncompressed_hash_of_compressed_files` (`String`) – [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) of data in the compressed files as if they were uncompressed. -- `bytes` (`UInt64`) – Alias for `bytes_on_disk`. -- `marks_size` (`UInt64`) – Alias for `marks_bytes`. +- `name` (`String`) – Name of the data part. +- `active` (`UInt8`) – Flag that indicates whether the data part is active. If a data part is active, it’s used in a table. Otherwise, it’s deleted. Inactive data parts remain after merging. -## system.part_log {#system_tables-part-log} +- `marks` (`UInt64`) – The number of marks. To get the approximate number of rows in a data part, multiply `marks` by the index granularity (usually 8192) (this hint doesn’t work for adaptive granularity). -The `system.part_log` table is created only if the [part_log](server_settings/settings.md#server_settings-part-log) server setting is specified. +- `rows` (`UInt64`) – The number of rows. + +- `bytes_on_disk` (`UInt64`) – Total size of all the data part files in bytes. + +- `data_compressed_bytes` (`UInt64`) – Total size of compressed data in the data part. All the auxiliary files (for example, files with marks) are not included. + +- `data_uncompressed_bytes` (`UInt64`) – Total size of uncompressed data in the data part. All the auxiliary files (for example, files with marks) are not included. + +- `marks_bytes` (`UInt64`) – The size of the file with marks. + +- `modification_time` (`DateTime`) – The time the directory with the data part was modified. This usually corresponds to the time of data part creation.\| + +- `remove_time` (`DateTime`) – The time when the data part became inactive. + +- `refcount` (`UInt32`) – The number of places where the data part is used. A value greater than 2 indicates that the data part is used in queries or merges. + +- `min_date` (`Date`) – The minimum value of the date key in the data part. + +- `max_date` (`Date`) – The maximum value of the date key in the data part. + +- `min_time` (`DateTime`) – The minimum value of the date and time key in the data part. + +- `max_time`(`DateTime`) – The maximum value of the date and time key in the data part. + +- `partition_id` (`String`) – ID of the partition. + +- `min_block_number` (`UInt64`) – The minimum number of data parts that make up the current part after merging. + +- `max_block_number` (`UInt64`) – The maximum number of data parts that make up the current part after merging. + +- `level` (`UInt32`) – Depth of the merge tree. Zero means that the current part was created by insert rather than by merging other parts. + +- `data_version` (`UInt64`) – Number that is used to determine which mutations should be applied to the data part (mutations with a version higher than `data_version`). + +- `primary_key_bytes_in_memory` (`UInt64`) – The amount of memory (in bytes) used by primary key values. + +- `primary_key_bytes_in_memory_allocated` (`UInt64`) – The amount of memory (in bytes) reserved for primary key values. + +- `is_frozen` (`UInt8`) – Flag that shows that a partition data backup exists. 1, the backup exists. 0, the backup doesn’t exist. For more details, see [FREEZE PARTITION](../query_language/alter.md#alter_freeze-partition) + +- `database` (`String`) – Name of the database. + +- `table` (`String`) – Name of the table. + +- `engine` (`String`) – Name of the table engine without parameters. + +- `path` (`String`) – Absolute path to the folder with data part files. 
+ +- `disk` (`String`) – Name of a disk that stores the data part. + +- `hash_of_all_files` (`String`) – [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) of compressed files. + +- `hash_of_uncompressed_files` (`String`) – [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) of uncompressed files (files with marks, index file etc.). + +- `uncompressed_hash_of_compressed_files` (`String`) – [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) of data in the compressed files as if they were uncompressed. + +- `bytes` (`UInt64`) – Alias for `bytes_on_disk`. + +- `marks_size` (`UInt64`) – Alias for `marks_bytes`. + +## system.part\_log {#system-tables-part-log} + +The `system.part_log` table is created only if the [part\_log](server_settings/settings.md#server_settings-part-log) server setting is specified. This table contains information about events that occurred with [data parts](table_engines/custom_partitioning_key.md) in the [MergeTree](table_engines/mergetree.md) family tables, such as adding or merging data. The `system.part_log` table contains the following columns: -- `event_type` (Enum) — Type of the event that occurred with the data part. Can have one of the following values: - - `NEW_PART` — Inserting of a new data part. - - `MERGE_PARTS` — Merging of data parts. - - `DOWNLOAD_PART` — Downloading a data part. - - `REMOVE_PART` — Removing or detaching a data part using [DETACH PARTITION](../query_language/alter.md#alter_detach-partition). - - `MUTATE_PART` — Mutating of a data part. - - `MOVE_PART` — Moving the data part from the one disk to another one. -- `event_date` (Date) — Event date. -- `event_time` (DateTime) — Event time. -- `duration_ms` (UInt64) — Duration. -- `database` (String) — Name of the database the data part is in. -- `table` (String) — Name of the table the data part is in. -- `part_name` (String) — Name of the data part. -- `partition_id` (String) — ID of the partition that the data part was inserted to. The column takes the 'all' value if the partitioning is by `tuple()`. -- `rows` (UInt64) — The number of rows in the data part. -- `size_in_bytes` (UInt64) — Size of the data part in bytes. -- `merged_from` (Array(String)) — An array of names of the parts which the current part was made up from (after the merge). -- `bytes_uncompressed` (UInt64) — Size of uncompressed bytes. -- `read_rows` (UInt64) — The number of rows was read during the merge. -- `read_bytes` (UInt64) — The number of bytes was read during the merge. -- `error` (UInt16) — The code number of the occurred error. -- `exception` (String) — Text message of the occurred error. +- `event_type` (Enum) — Type of the event that occurred with the data part. Can have one of the following values: + - `NEW_PART` — Inserting of a new data part. + - `MERGE_PARTS` — Merging of data parts. + - `DOWNLOAD_PART` — Downloading a data part. + - `REMOVE_PART` — Removing or detaching a data part using [DETACH PARTITION](../query_language/alter.md#alter_detach-partition). + - `MUTATE_PART` — Mutating of a data part. + - `MOVE_PART` — Moving the data part from the one disk to another one. +- `event_date` (Date) — Event date. +- `event_time` (DateTime) — Event time. +- `duration_ms` (UInt64) — Duration. +- `database` (String) — Name of the database the data part is in. +- `table` (String) — Name of the table the data part is in. +- `part_name` (String) — Name of the data part. 
+- `partition_id` (String) — ID of the partition that the data part was inserted to. The column takes the ‘all’ value if the partitioning is by `tuple()`. +- `rows` (UInt64) — The number of rows in the data part. +- `size_in_bytes` (UInt64) — Size of the data part in bytes. +- `merged_from` (Array(String)) — An array of names of the parts which the current part was made up from (after the merge). +- `bytes_uncompressed` (UInt64) — Size of uncompressed bytes. +- `read_rows` (UInt64) — The number of rows was read during the merge. +- `read_bytes` (UInt64) — The number of bytes was read during the merge. +- `error` (UInt16) — The code number of the occurred error. +- `exception` (String) — Text message of the occurred error. The `system.part_log` table is created after the first inserting data to the `MergeTree` table. -## system.processes {#system_tables-processes} +## system.processes {#system-tables-processes} This system table is used for implementing the `SHOW PROCESSLIST` query. Columns: -- `user` (String) – The user who made the query. Keep in mind that for distributed processing, queries are sent to remote servers under the `default` user. The field contains the username for a specific query, not for a query that this query initiated. -- `address` (String) – The IP address the request was made from. The same for distributed processing. To track where a distributed query was originally made from, look at `system.processes` on the query requestor server. -- `elapsed` (Float64) – The time in seconds since request execution started. -- `rows_read` (UInt64) – The number of rows read from the table. For distributed processing, on the requestor server, this is the total for all remote servers. -- `bytes_read` (UInt64) – The number of uncompressed bytes read from the table. For distributed processing, on the requestor server, this is the total for all remote servers. -- `total_rows_approx` (UInt64) – The approximation of the total number of rows that should be read. For distributed processing, on the requestor server, this is the total for all remote servers. It can be updated during request processing, when new sources to process become known. -- `memory_usage` (UInt64) – Amount of RAM the request uses. It might not include some types of dedicated memory. See the [max_memory_usage](../operations/settings/query_complexity.md#settings_max_memory_usage) setting. -- `query` (String) – The query text. For `INSERT`, it doesn't include the data to insert. -- `query_id` (String) – Query ID, if defined. +- `user` (String) – The user who made the query. Keep in mind that for distributed processing, queries are sent to remote servers under the `default` user. The field contains the username for a specific query, not for a query that this query initiated. +- `address` (String) – The IP address the request was made from. The same for distributed processing. To track where a distributed query was originally made from, look at `system.processes` on the query requestor server. +- `elapsed` (Float64) – The time in seconds since request execution started. +- `rows_read` (UInt64) – The number of rows read from the table. For distributed processing, on the requestor server, this is the total for all remote servers. +- `bytes_read` (UInt64) – The number of uncompressed bytes read from the table. For distributed processing, on the requestor server, this is the total for all remote servers. +- `total_rows_approx` (UInt64) – The approximation of the total number of rows that should be read. 
For distributed processing, on the requestor server, this is the total for all remote servers. It can be updated during request processing, when new sources to process become known. +- `memory_usage` (UInt64) – Amount of RAM the request uses. It might not include some types of dedicated memory. See the [max\_memory\_usage](../operations/settings/query_complexity.md#settings_max_memory_usage) setting. +- `query` (String) – The query text. For `INSERT`, it doesn’t include the data to insert. +- `query_id` (String) – Query ID, if defined. -## system.text_log {#system_tables-text_log} +## system.text\_log {#system-tables-text-log} Contains logging entries. Logging level which goes to this table can be limited with `text_log.level` server setting. Columns: -- `event_date` (`Date`) - Date of the entry. -- `event_time` (`DateTime`) - Time of the entry. -- `microseconds` (`UInt32`) - Microseconds of the entry. -- `thread_name` (String) — Name of the thread from which the logging was done. -- `thread_id` (UInt64) — OS thread ID. -- `level` (`Enum8`) - Entry level. - - `'Fatal' = 1` - - `'Critical' = 2` - - `'Error' = 3` - - `'Warning' = 4` - - `'Notice' = 5` - - `'Information' = 6` - - `'Debug' = 7` - - `'Trace' = 8` -- `query_id` (`String`) - ID of the query. -- `logger_name` (`LowCardinality(String)`) - Name of the logger (i.e. `DDLWorker`) -- `message` (`String`) - The message itself. -- `revision` (`UInt32`) - ClickHouse revision. -- `source_file` (`LowCardinality(String)`) - Source file from which the logging was done. -- `source_line` (`UInt64`) - Source line from which the logging was done. +- `event_date` (`Date`) - Date of the entry. +- `event_time` (`DateTime`) - Time of the entry. +- `microseconds` (`UInt32`) - Microseconds of the entry. +- `thread_name` (String) — Name of the thread from which the logging was done. +- `thread_id` (UInt64) — OS thread ID. +- `level` (`Enum8`) - Entry level. + - `'Fatal' = 1` + - `'Critical' = 2` + - `'Error' = 3` + - `'Warning' = 4` + - `'Notice' = 5` + - `'Information' = 6` + - `'Debug' = 7` + - `'Trace' = 8` +- `query_id` (`String`) - ID of the query. +- `logger_name` (`LowCardinality(String)`) - Name of the logger (i.e. `DDLWorker`) +- `message` (`String`) - The message itself. +- `revision` (`UInt32`) - ClickHouse revision. +- `source_file` (`LowCardinality(String)`) - Source file from which the logging was done. +- `source_line` (`UInt64`) - Source line from which the logging was done. - -## system.query_log {#system_tables-query_log} +## system.query\_log {#system-tables-query-log} Contains information about execution of queries. For each query, you can see processing start time, duration of processing, error messages and other information. !!! note "Note" - The table doesn't contain input data for `INSERT` queries. + The table doesn’t contain input data for `INSERT` queries. -ClickHouse creates this table only if the [query_log](server_settings/settings.md#server_settings-query-log) server parameter is specified. This parameter sets the logging rules, such as the logging interval or the name of the table the queries will be logged in. +ClickHouse creates this table only if the [query\_log](server_settings/settings.md#server_settings-query-log) server parameter is specified. This parameter sets the logging rules, such as the logging interval or the name of the table the queries will be logged in. -To enable query logging, set the [log_queries](settings/settings.md#settings-log-queries) parameter to 1. 
For details, see the [Settings](settings/settings.md) section. +To enable query logging, set the [log\_queries](settings/settings.md#settings-log-queries) parameter to 1. For details, see the [Settings](settings/settings.md) section. The `system.query_log` table registers two kinds of queries: -1. Initial queries that were run directly by the client. -2. Child queries that were initiated by other queries (for distributed query execution). For these types of queries, information about the parent queries is shown in the `initial_*` columns. +1. Initial queries that were run directly by the client. +2. Child queries that were initiated by other queries (for distributed query execution). For these types of queries, information about the parent queries is shown in the `initial_*` columns. Columns: -- `type` (`Enum8`) — Type of event that occurred when executing the query. Values: - - `'QueryStart' = 1` — Successful start of query execution. - - `'QueryFinish' = 2` — Successful end of query execution. - - `'ExceptionBeforeStart' = 3` — Exception before the start of query execution. - - `'ExceptionWhileProcessing' = 4` — Exception during the query execution. -- `event_date` (Date) — Query starting date. -- `event_time` (DateTime) — Query starting time. -- `query_start_time` (DateTime) — Start time of query execution. -- `query_duration_ms` (UInt64) — Duration of query execution. -- `read_rows` (UInt64) — Number of read rows. -- `read_bytes` (UInt64) — Number of read bytes. -- `written_rows` (UInt64) — For `INSERT` queries, the number of written rows. For other queries, the column value is 0. -- `written_bytes` (UInt64) — For `INSERT` queries, the number of written bytes. For other queries, the column value is 0. -- `result_rows` (UInt64) — Number of rows in the result. -- `result_bytes` (UInt64) — Number of bytes in the result. -- `memory_usage` (UInt64) — Memory consumption by the query. -- `query` (String) — Query string. -- `exception` (String) — Exception message. -- `stack_trace` (String) — Stack trace (a list of methods called before the error occurred). An empty string, if the query is completed successfully. -- `is_initial_query` (UInt8) — Query type. Possible values: - - 1 — Query was initiated by the client. - - 0 — Query was initiated by another query for distributed query execution. -- `user` (String) — Name of the user who initiated the current query. -- `query_id` (String) — ID of the query. -- `address` (IPv6) — IP address that was used to make the query. -- `port` (UInt16) — The client port that was used to make the query. -- `initial_user` (String) — Name of the user who ran the initial query (for distributed query execution). -- `initial_query_id` (String) — ID of the initial query (for distributed query execution). -- `initial_address` (IPv6) — IP address that the parent query was launched from. -- `initial_port` (UInt16) — The client port that was used to make the parent query. -- `interface` (UInt8) — Interface that the query was initiated from. Possible values: - - 1 — TCP. - - 2 — HTTP. -- `os_user` (String) — OS's username who runs [clickhouse-client](../interfaces/cli.md). -- `client_hostname` (String) — Hostname of the client machine where the [clickhouse-client](../interfaces/cli.md) or another TCP client is run. -- `client_name` (String) — The [clickhouse-client](../interfaces/cli.md) or another TCP client name. -- `client_revision` (UInt32) — Revision of the [clickhouse-client](../interfaces/cli.md) or another TCP client. 
-- `client_version_major` (UInt32) — Major version of the [clickhouse-client](../interfaces/cli.md) or another TCP client. -- `client_version_minor` (UInt32) — Minor version of the [clickhouse-client](../interfaces/cli.md) or another TCP client. -- `client_version_patch` (UInt32) — Patch component of the [clickhouse-client](../interfaces/cli.md) or another TCP client version. -- `http_method` (UInt8) — HTTP method that initiated the query. Possible values: - - 0 — The query was launched from the TCP interface. - - 1 — `GET` method was used. - - 2 — `POST` method was used. -- `http_user_agent` (String) — The `UserAgent` header passed in the HTTP request. -- `quota_key` (String) — The "quota key" specified in the [quotas](quotas.md) setting (see `keyed`). -- `revision` (UInt32) — ClickHouse revision. -- `thread_numbers` (Array(UInt32)) — Number of threads that are participating in query execution. -- `ProfileEvents.Names` (Array(String)) — Counters that measure different metrics. The description of them could be found in the table [system.events](#system_tables-events) -- `ProfileEvents.Values` (Array(UInt64)) — Values of metrics that are listed in the `ProfileEvents.Names` column. -- `Settings.Names` (Array(String)) — Names of settings that were changed when the client ran the query. To enable logging changes to settings, set the `log_query_settings` parameter to 1. -- `Settings.Values` (Array(String)) — Values of settings that are listed in the `Settings.Names` column. +- `type` (`Enum8`) — Type of event that occurred when executing the query. Values: + - `'QueryStart' = 1` — Successful start of query execution. + - `'QueryFinish' = 2` — Successful end of query execution. + - `'ExceptionBeforeStart' = 3` — Exception before the start of query execution. + - `'ExceptionWhileProcessing' = 4` — Exception during the query execution. +- `event_date` (Date) — Query starting date. +- `event_time` (DateTime) — Query starting time. +- `query_start_time` (DateTime) — Start time of query execution. +- `query_duration_ms` (UInt64) — Duration of query execution. +- `read_rows` (UInt64) — Number of read rows. +- `read_bytes` (UInt64) — Number of read bytes. +- `written_rows` (UInt64) — For `INSERT` queries, the number of written rows. For other queries, the column value is 0. +- `written_bytes` (UInt64) — For `INSERT` queries, the number of written bytes. For other queries, the column value is 0. +- `result_rows` (UInt64) — Number of rows in the result. +- `result_bytes` (UInt64) — Number of bytes in the result. +- `memory_usage` (UInt64) — Memory consumption by the query. +- `query` (String) — Query string. +- `exception` (String) — Exception message. +- `stack_trace` (String) — Stack trace (a list of methods called before the error occurred). An empty string, if the query is completed successfully. +- `is_initial_query` (UInt8) — Query type. Possible values: + - 1 — Query was initiated by the client. + - 0 — Query was initiated by another query for distributed query execution. +- `user` (String) — Name of the user who initiated the current query. +- `query_id` (String) — ID of the query. +- `address` (IPv6) — IP address that was used to make the query. +- `port` (UInt16) — The client port that was used to make the query. +- `initial_user` (String) — Name of the user who ran the initial query (for distributed query execution). +- `initial_query_id` (String) — ID of the initial query (for distributed query execution). +- `initial_address` (IPv6) — IP address that the parent query was launched from. 
+- `initial_port` (UInt16) — The client port that was used to make the parent query. +- `interface` (UInt8) — Interface that the query was initiated from. Possible values: + - 1 — TCP. + - 2 — HTTP. +- `os_user` (String) — OS’s username who runs [clickhouse-client](../interfaces/cli.md). +- `client_hostname` (String) — Hostname of the client machine where the [clickhouse-client](../interfaces/cli.md) or another TCP client is run. +- `client_name` (String) — The [clickhouse-client](../interfaces/cli.md) or another TCP client name. +- `client_revision` (UInt32) — Revision of the [clickhouse-client](../interfaces/cli.md) or another TCP client. +- `client_version_major` (UInt32) — Major version of the [clickhouse-client](../interfaces/cli.md) or another TCP client. +- `client_version_minor` (UInt32) — Minor version of the [clickhouse-client](../interfaces/cli.md) or another TCP client. +- `client_version_patch` (UInt32) — Patch component of the [clickhouse-client](../interfaces/cli.md) or another TCP client version. +- `http_method` (UInt8) — HTTP method that initiated the query. Possible values: + - 0 — The query was launched from the TCP interface. + - 1 — `GET` method was used. + - 2 — `POST` method was used. +- `http_user_agent` (String) — The `UserAgent` header passed in the HTTP request. +- `quota_key` (String) — The “quota key” specified in the [quotas](quotas.md) setting (see `keyed`). +- `revision` (UInt32) — ClickHouse revision. +- `thread_numbers` (Array(UInt32)) — Number of threads that are participating in query execution. +- `ProfileEvents.Names` (Array(String)) — Counters that measure different metrics. The description of them could be found in the table [system.events](#system_tables-events) +- `ProfileEvents.Values` (Array(UInt64)) — Values of metrics that are listed in the `ProfileEvents.Names` column. +- `Settings.Names` (Array(String)) — Names of settings that were changed when the client ran the query. To enable logging changes to settings, set the `log_query_settings` parameter to 1. +- `Settings.Values` (Array(String)) — Values of settings that are listed in the `Settings.Names` column. Each query creates one or two rows in the `query_log` table, depending on the status of the query: -1. If the query execution is successful, two events with types 1 and 2 are created (see the `type` column). -2. If an error occurred during query processing, two events with types 1 and 4 are created. -3. If an error occurred before launching the query, a single event with type 3 is created. +1. If the query execution is successful, two events with types 1 and 2 are created (see the `type` column). +2. If an error occurred during query processing, two events with types 1 and 4 are created. +3. If an error occurred before launching the query, a single event with type 3 is created. -By default, logs are added to the table at intervals of 7.5 seconds. You can set this interval in the [query_log](server_settings/settings.md#server_settings-query-log) server setting (see the `flush_interval_milliseconds` parameter). To flush the logs forcibly from the memory buffer into the table, use the `SYSTEM FLUSH LOGS` query. +By default, logs are added to the table at intervals of 7.5 seconds. You can set this interval in the [query\_log](server_settings/settings.md#server_settings-query-log) server setting (see the `flush_interval_milliseconds` parameter). To flush the logs forcibly from the memory buffer into the table, use the `SYSTEM FLUSH LOGS` query. 
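+For instance, recently failed queries can be inspected by filtering on the `type` column described above. This is only a sketch; the one-day window and the `LIMIT` are arbitrary illustrative choices:
+
+``` sql
+SELECT
+    event_time,
+    query_duration_ms,
+    exception,
+    query
+FROM system.query_log
+WHERE type = 'ExceptionWhileProcessing'
+    AND event_date >= today() - 1
+ORDER BY event_time DESC
+LIMIT 10
+```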
When the table is deleted manually, it will be automatically created on the fly. Note that all the previous logs will be deleted. -!!! note - The storage period for logs is unlimited. Logs aren't automatically deleted from the table. You need to organize the removal of outdated logs yourself. +!!! note "Note" + The storage period for logs is unlimited. Logs aren’t automatically deleted from the table. You need to organize the removal of outdated logs yourself. -You can specify an arbitrary partitioning key for the `system.query_log` table in the [query_log](server_settings/settings.md#server_settings-query-log) server setting (see the `partition_by` parameter). +You can specify an arbitrary partitioning key for the `system.query_log` table in the [query\_log](server_settings/settings.md#server_settings-query-log) server setting (see the `partition_by` parameter). -## system.query_thread_log {#system_tables-query-thread-log} +## system.query\_thread\_log {#system-tables-query-thread-log} The table contains information about each query execution thread. -ClickHouse creates this table only if the [query_thread_log](server_settings/settings.md#server_settings-query-thread-log) server parameter is specified. This parameter sets the logging rules, such as the logging interval or the name of the table the queries will be logged in. +ClickHouse creates this table only if the [query\_thread\_log](server_settings/settings.md#server_settings-query-thread-log) server parameter is specified. This parameter sets the logging rules, such as the logging interval or the name of the table the queries will be logged in. -To enable query logging, set the [log_query_threads](settings/settings.md#settings-log-query-threads) parameter to 1. For details, see the [Settings](settings/settings.md) section. +To enable query logging, set the [log\_query\_threads](settings/settings.md#settings-log-query-threads) parameter to 1. For details, see the [Settings](settings/settings.md) section. Columns: -- `event_date` (Date) — the date when the thread has finished execution of the query. -- `event_time` (DateTime) — the date and time when the thread has finished execution of the query. -- `query_start_time` (DateTime) — Start time of query execution. -- `query_duration_ms` (UInt64) — Duration of query execution. -- `read_rows` (UInt64) — Number of read rows. -- `read_bytes` (UInt64) — Number of read bytes. -- `written_rows` (UInt64) — For `INSERT` queries, the number of written rows. For other queries, the column value is 0. -- `written_bytes` (UInt64) — For `INSERT` queries, the number of written bytes. For other queries, the column value is 0. -- `memory_usage` (Int64) — The difference between the amount of allocated and freed memory in context of this thread. -- `peak_memory_usage` (Int64) — The maximum difference between the amount of allocated and freed memory in context of this thread. -- `thread_name` (String) — Name of the thread. -- `thread_number` (UInt32) — Internal thread ID. -- `os_thread_id` (Int32) — OS thread ID. -- `master_thread_id` (UInt64) — OS initial ID of initial thread. -- `query` (String) — Query string. -- `is_initial_query` (UInt8) — Query type. Possible values: - - 1 — Query was initiated by the client. - - 0 — Query was initiated by another query for distributed query execution. -- `user` (String) — Name of the user who initiated the current query. -- `query_id` (String) — ID of the query. -- `address` (IPv6) — IP address that was used to make the query. 
-- `port` (UInt16) — The client port that was used to make the query. -- `initial_user` (String) — Name of the user who ran the initial query (for distributed query execution). -- `initial_query_id` (String) — ID of the initial query (for distributed query execution). -- `initial_address` (IPv6) — IP address that the parent query was launched from. -- `initial_port` (UInt16) — The client port that was used to make the parent query. -- `interface` (UInt8) — Interface that the query was initiated from. Possible values: - - 1 — TCP. - - 2 — HTTP. -- `os_user` (String) — OS's username who runs [clickhouse-client](../interfaces/cli.md). -- `client_hostname` (String) — Hostname of the client machine where the [clickhouse-client](../interfaces/cli.md) or another TCP client is run. -- `client_name` (String) — The [clickhouse-client](../interfaces/cli.md) or another TCP client name. -- `client_revision` (UInt32) — Revision of the [clickhouse-client](../interfaces/cli.md) or another TCP client. -- `client_version_major` (UInt32) — Major version of the [clickhouse-client](../interfaces/cli.md) or another TCP client. -- `client_version_minor` (UInt32) — Minor version of the [clickhouse-client](../interfaces/cli.md) or another TCP client. -- `client_version_patch` (UInt32) — Patch component of the [clickhouse-client](../interfaces/cli.md) or another TCP client version. -- `http_method` (UInt8) — HTTP method that initiated the query. Possible values: - - 0 — The query was launched from the TCP interface. - - 1 — `GET` method was used. - - 2 — `POST` method was used. -- `http_user_agent` (String) — The `UserAgent` header passed in the HTTP request. -- `quota_key` (String) — The "quota key" specified in the [quotas](quotas.md) setting (see `keyed`). -- `revision` (UInt32) — ClickHouse revision. -- `ProfileEvents.Names` (Array(String)) — Counters that measure different metrics for this thread. The description of them could be found in the table [system.events](#system_tables-events) -- `ProfileEvents.Values` (Array(UInt64)) — Values of metrics for this thread that are listed in the `ProfileEvents.Names` column. +- `event_date` (Date) — the date when the thread has finished execution of the query. +- `event_time` (DateTime) — the date and time when the thread has finished execution of the query. +- `query_start_time` (DateTime) — Start time of query execution. +- `query_duration_ms` (UInt64) — Duration of query execution. +- `read_rows` (UInt64) — Number of read rows. +- `read_bytes` (UInt64) — Number of read bytes. +- `written_rows` (UInt64) — For `INSERT` queries, the number of written rows. For other queries, the column value is 0. +- `written_bytes` (UInt64) — For `INSERT` queries, the number of written bytes. For other queries, the column value is 0. +- `memory_usage` (Int64) — The difference between the amount of allocated and freed memory in context of this thread. +- `peak_memory_usage` (Int64) — The maximum difference between the amount of allocated and freed memory in context of this thread. +- `thread_name` (String) — Name of the thread. +- `thread_number` (UInt32) — Internal thread ID. +- `os_thread_id` (Int32) — OS thread ID. +- `master_thread_id` (UInt64) — OS initial ID of initial thread. +- `query` (String) — Query string. +- `is_initial_query` (UInt8) — Query type. Possible values: + - 1 — Query was initiated by the client. + - 0 — Query was initiated by another query for distributed query execution. +- `user` (String) — Name of the user who initiated the current query. 
+- `query_id` (String) — ID of the query. +- `address` (IPv6) — IP address that was used to make the query. +- `port` (UInt16) — The client port that was used to make the query. +- `initial_user` (String) — Name of the user who ran the initial query (for distributed query execution). +- `initial_query_id` (String) — ID of the initial query (for distributed query execution). +- `initial_address` (IPv6) — IP address that the parent query was launched from. +- `initial_port` (UInt16) — The client port that was used to make the parent query. +- `interface` (UInt8) — Interface that the query was initiated from. Possible values: + - 1 — TCP. + - 2 — HTTP. +- `os_user` (String) — OS’s username who runs [clickhouse-client](../interfaces/cli.md). +- `client_hostname` (String) — Hostname of the client machine where the [clickhouse-client](../interfaces/cli.md) or another TCP client is run. +- `client_name` (String) — The [clickhouse-client](../interfaces/cli.md) or another TCP client name. +- `client_revision` (UInt32) — Revision of the [clickhouse-client](../interfaces/cli.md) or another TCP client. +- `client_version_major` (UInt32) — Major version of the [clickhouse-client](../interfaces/cli.md) or another TCP client. +- `client_version_minor` (UInt32) — Minor version of the [clickhouse-client](../interfaces/cli.md) or another TCP client. +- `client_version_patch` (UInt32) — Patch component of the [clickhouse-client](../interfaces/cli.md) or another TCP client version. +- `http_method` (UInt8) — HTTP method that initiated the query. Possible values: + - 0 — The query was launched from the TCP interface. + - 1 — `GET` method was used. + - 2 — `POST` method was used. +- `http_user_agent` (String) — The `UserAgent` header passed in the HTTP request. +- `quota_key` (String) — The “quota key” specified in the [quotas](quotas.md) setting (see `keyed`). +- `revision` (UInt32) — ClickHouse revision. +- `ProfileEvents.Names` (Array(String)) — Counters that measure different metrics for this thread. The description of them could be found in the table [system.events](#system_tables-events) +- `ProfileEvents.Values` (Array(UInt64)) — Values of metrics for this thread that are listed in the `ProfileEvents.Names` column. -By default, logs are added to the table at intervals of 7.5 seconds. You can set this interval in the [query_thread_log](server_settings/settings.md#server_settings-query-thread-log) server setting (see the `flush_interval_milliseconds` parameter). To flush the logs forcibly from the memory buffer into the table, use the `SYSTEM FLUSH LOGS` query. +By default, logs are added to the table at intervals of 7.5 seconds. You can set this interval in the [query\_thread\_log](server_settings/settings.md#server_settings-query-thread-log) server setting (see the `flush_interval_milliseconds` parameter). To flush the logs forcibly from the memory buffer into the table, use the `SYSTEM FLUSH LOGS` query. When the table is deleted manually, it will be automatically created on the fly. Note that all the previous logs will be deleted. -!!! note - The storage period for logs is unlimited. Logs aren't automatically deleted from the table. You need to organize the removal of outdated logs yourself. +!!! note "Note" + The storage period for logs is unlimited. Logs aren’t automatically deleted from the table. You need to organize the removal of outdated logs yourself. 
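+To see how the work of a single query was spread across threads, you can filter this table by `query_id` and inspect the per-thread counters listed above. A minimal sketch (the `query_id` value is a placeholder for a real identifier):
+
+``` sql
+SELECT
+    thread_number,
+    read_rows,
+    peak_memory_usage
+FROM system.query_thread_log
+WHERE query_id = 'your-query-id'
+ORDER BY peak_memory_usage DESC
+```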
-You can specify an arbitrary partitioning key for the `system.query_thread_log` table in the [query_thread_log](server_settings/settings.md#server_settings-query-thread-log) server setting (see the `partition_by` parameter). +You can specify an arbitrary partitioning key for the `system.query_thread_log` table in the [query\_thread\_log](server_settings/settings.md#server_settings-query-thread-log) server setting (see the `partition_by` parameter). -## system.trace_log {#system_tables-trace_log} +## system.trace\_log {#system-tables-trace-log} Contains stack traces collected by the sampling query profiler. -ClickHouse creates this table when the [trace_log](server_settings/settings.md#server_settings-trace_log) server configuration section is set. Also the [query_profiler_real_time_period_ns](settings/settings.md#query_profiler_real_time_period_ns) and [query_profiler_cpu_time_period_ns](settings/settings.md#query_profiler_cpu_time_period_ns) settings should be set. +ClickHouse creates this table when the [trace\_log](server_settings/settings.md#server_settings-trace_log) server configuration section is set. Also the [query\_profiler\_real\_time\_period\_ns](settings/settings.md#query_profiler_real_time_period_ns) and [query\_profiler\_cpu\_time\_period\_ns](settings/settings.md#query_profiler_cpu_time_period_ns) settings should be set. To analyze logs, use the `addressToLine`, `addressToSymbol` and `demangle` introspection functions. Columns: -- `event_date`([Date](../data_types/date.md)) — Date of sampling moment. -- `event_time`([DateTime](../data_types/datetime.md)) — Timestamp of sampling moment. -- `revision`([UInt32](../data_types/int_uint.md)) — ClickHouse server build revision. +- `event_date`([Date](../data_types/date.md)) — Date of sampling moment. + +- `event_time`([DateTime](../data_types/datetime.md)) — Timestamp of sampling moment. + +- `revision`([UInt32](../data_types/int_uint.md)) — ClickHouse server build revision. When connecting to server by `clickhouse-client`, you see the string similar to `Connected to ClickHouse server version 19.18.1 revision 54429.`. This field contains the `revision`, but not the `version` of a server. -- `timer_type`([Enum8](../data_types/enum.md)) — Timer type: +- `timer_type`([Enum8](../data_types/enum.md)) — Timer type: - - `Real` represents wall-clock time. - - `CPU` represents CPU time. + - `Real` represents wall-clock time. + - `CPU` represents CPU time. -- `thread_number`([UInt32](../data_types/int_uint.md)) — Thread identifier. -- `query_id`([String](../data_types/string.md)) — Query identifier that can be used to get details about a query that was running from the [query_log](#system_tables-query_log) system table. -- `trace`([Array(UInt64)](../data_types/array.md)) — Stack trace at the moment of sampling. Each element is a virtual memory address inside ClickHouse server process. +- `thread_number`([UInt32](../data_types/int_uint.md)) — Thread identifier. + +- `query_id`([String](../data_types/string.md)) — Query identifier that can be used to get details about a query that was running from the [query\_log](#system_tables-query_log) system table. + +- `trace`([Array(UInt64)](../data_types/array.md)) — Stack trace at the moment of sampling. Each element is a virtual memory address inside ClickHouse server process. 
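+The introspection functions mentioned above can turn the addresses in `trace` into readable stack traces. A minimal sketch, assuming the `allow_introspection_functions` setting is enabled for the user running the query:
+
+``` sql
+SELECT
+    count() AS samples,
+    arrayStringConcat(arrayMap(x -> demangle(addressToSymbol(x)), trace), '\n') AS stack
+FROM system.trace_log
+WHERE event_date = today()
+GROUP BY trace
+ORDER BY samples DESC
+LIMIT 5
+```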
**Example** -```sql +``` sql SELECT * FROM system.trace_log LIMIT 1 \G ``` -```text +``` text Row 1: ────── event_date: 2019-11-15 @@ -677,7 +710,7 @@ query_id: acc4d61f-5bd1-4a3e-bc91-2180be37c915 trace: [94222141367858,94222152240175,94222152325351,94222152329944,94222152330796,94222151449980,94222144088167,94222151682763,94222144088167,94222151682763,94222144088167,94222144058283,94222144059248,94222091840750,94222091842302,94222091831228,94222189631488,140509950166747,140509942945935] ``` -## system.replicas {#system_tables-replicas} +## system.replicas {#system-tables-replicas} Contains information and status for replicated tables residing on the local server. This table can be used for monitoring. The table contains a row for every Replicated\* table. @@ -691,7 +724,7 @@ WHERE table = 'visits' FORMAT Vertical ``` -```text +``` text Row 1: ────── database: merge @@ -728,46 +761,46 @@ active_replicas: 2 Columns: -- `database` (`String`) - Database name -- `table` (`String`) - Table name -- `engine` (`String`) - Table engine name -- `is_leader` (`UInt8`) - Whether the replica is the leader. -Only one replica at a time can be the leader. The leader is responsible for selecting background merges to perform. -Note that writes can be performed to any replica that is available and has a session in ZK, regardless of whether it is a leader. -- `can_become_leader` (`UInt8`) - Whether the replica can be elected as a leader. -- `is_readonly` (`UInt8`) - Whether the replica is in read-only mode. -This mode is turned on if the config doesn't have sections with ZooKeeper, if an unknown error occurred when reinitializing sessions in ZooKeeper, and during session reinitialization in ZooKeeper. -- `is_session_expired` (`UInt8`) - the session with ZooKeeper has expired. Basically the same as `is_readonly`. -- `future_parts` (`UInt32`) - The number of data parts that will appear as the result of INSERTs or merges that haven't been done yet. -- `parts_to_check` (`UInt32`) - The number of data parts in the queue for verification. A part is put in the verification queue if there is suspicion that it might be damaged. -- `zookeeper_path` (`String`) - Path to table data in ZooKeeper. -- `replica_name` (`String`) - Replica name in ZooKeeper. Different replicas of the same table have different names. -- `replica_path` (`String`) - Path to replica data in ZooKeeper. The same as concatenating 'zookeeper_path/replicas/replica_path'. -- `columns_version` (`Int32`) - Version number of the table structure. Indicates how many times ALTER was performed. If replicas have different versions, it means some replicas haven't made all of the ALTERs yet. -- `queue_size` (`UInt32`) - Size of the queue for operations waiting to be performed. Operations include inserting blocks of data, merges, and certain other actions. It usually coincides with `future_parts`. -- `inserts_in_queue` (`UInt32`) - Number of inserts of blocks of data that need to be made. Insertions are usually replicated fairly quickly. If this number is large, it means something is wrong. -- `merges_in_queue` (`UInt32`) - The number of merges waiting to be made. Sometimes merges are lengthy, so this value may be greater than zero for a long time. -- `part_mutations_in_queue` (`UInt32`) - The number of mutations waiting to be made. -- `queue_oldest_time` (`DateTime`) - If `queue_size` greater than 0, shows when the oldest operation was added to the queue. 
-- `inserts_oldest_time` (`DateTime`) - See `queue_oldest_time` -- `merges_oldest_time` (`DateTime`) - See `queue_oldest_time` -- `part_mutations_oldest_time` (`DateTime`) - See `queue_oldest_time` +- `database` (`String`) - Database name +- `table` (`String`) - Table name +- `engine` (`String`) - Table engine name +- `is_leader` (`UInt8`) - Whether the replica is the leader. + Only one replica at a time can be the leader. The leader is responsible for selecting background merges to perform. + Note that writes can be performed to any replica that is available and has a session in ZK, regardless of whether it is a leader. +- `can_become_leader` (`UInt8`) - Whether the replica can be elected as a leader. +- `is_readonly` (`UInt8`) - Whether the replica is in read-only mode. + This mode is turned on if the config doesn’t have sections with ZooKeeper, if an unknown error occurred when reinitializing sessions in ZooKeeper, and during session reinitialization in ZooKeeper. +- `is_session_expired` (`UInt8`) - the session with ZooKeeper has expired. Basically the same as `is_readonly`. +- `future_parts` (`UInt32`) - The number of data parts that will appear as the result of INSERTs or merges that haven’t been done yet. +- `parts_to_check` (`UInt32`) - The number of data parts in the queue for verification. A part is put in the verification queue if there is suspicion that it might be damaged. +- `zookeeper_path` (`String`) - Path to table data in ZooKeeper. +- `replica_name` (`String`) - Replica name in ZooKeeper. Different replicas of the same table have different names. +- `replica_path` (`String`) - Path to replica data in ZooKeeper. The same as concatenating ‘zookeeper\_path/replicas/replica\_path’. +- `columns_version` (`Int32`) - Version number of the table structure. Indicates how many times ALTER was performed. If replicas have different versions, it means some replicas haven’t made all of the ALTERs yet. +- `queue_size` (`UInt32`) - Size of the queue for operations waiting to be performed. Operations include inserting blocks of data, merges, and certain other actions. It usually coincides with `future_parts`. +- `inserts_in_queue` (`UInt32`) - Number of inserts of blocks of data that need to be made. Insertions are usually replicated fairly quickly. If this number is large, it means something is wrong. +- `merges_in_queue` (`UInt32`) - The number of merges waiting to be made. Sometimes merges are lengthy, so this value may be greater than zero for a long time. +- `part_mutations_in_queue` (`UInt32`) - The number of mutations waiting to be made. +- `queue_oldest_time` (`DateTime`) - If `queue_size` greater than 0, shows when the oldest operation was added to the queue. +- `inserts_oldest_time` (`DateTime`) - See `queue_oldest_time` +- `merges_oldest_time` (`DateTime`) - See `queue_oldest_time` +- `part_mutations_oldest_time` (`DateTime`) - See `queue_oldest_time` The next 4 columns have a non-zero value only where there is an active session with ZK. -- `log_max_index` (`UInt64`) - Maximum entry number in the log of general activity. -- `log_pointer` (`UInt64`) - Maximum entry number in the log of general activity that the replica copied to its execution queue, plus one. If `log_pointer` is much smaller than `log_max_index`, something is wrong. -- `last_queue_update` (`DateTime`) - When the queue was updated last time. -- `absolute_delay` (`UInt64`) - How big lag in seconds the current replica has. -- `total_replicas` (`UInt8`) - The total number of known replicas of this table. 
-- `active_replicas` (`UInt8`) - The number of replicas of this table that have a session in ZooKeeper (i.e., the number of functioning replicas). +- `log_max_index` (`UInt64`) - Maximum entry number in the log of general activity. +- `log_pointer` (`UInt64`) - Maximum entry number in the log of general activity that the replica copied to its execution queue, plus one. If `log_pointer` is much smaller than `log_max_index`, something is wrong. +- `last_queue_update` (`DateTime`) - When the queue was updated last time. +- `absolute_delay` (`UInt64`) - How big lag in seconds the current replica has. +- `total_replicas` (`UInt8`) - The total number of known replicas of this table. +- `active_replicas` (`UInt8`) - The number of replicas of this table that have a session in ZooKeeper (i.e., the number of functioning replicas). If you request all the columns, the table may work a bit slowly, since several reads from ZooKeeper are made for each row. -If you don't request the last 4 columns (log_max_index, log_pointer, total_replicas, active_replicas), the table works quickly. +If you don’t request the last 4 columns (log\_max\_index, log\_pointer, total\_replicas, active\_replicas), the table works quickly. For example, you can check that everything is working correctly like this: -```sql +``` sql SELECT database, table, @@ -797,29 +830,28 @@ WHERE OR active_replicas < total_replicas ``` -If this query doesn't return anything, it means that everything is fine. +If this query doesn’t return anything, it means that everything is fine. -## system.settings +## system.settings {#system-settings} Contains information about settings that are currently in use. I.e. used for executing the query you are using to read from the system.settings table. Columns: -- `name` (String) — Setting name. -- `value` (String) — Setting value. -- `changed` (UInt8) — Whether the setting was explicitly defined in the config or explicitly changed. - +- `name` (String) — Setting name. +- `value` (String) — Setting value. +- `changed` (UInt8) — Whether the setting was explicitly defined in the config or explicitly changed. Example: -```sql +``` sql SELECT * FROM system.settings WHERE changed ``` -```text +``` text ┌─name───────────────────┬─value───────┬─changed─┐ │ max_threads │ 8 │ 1 │ │ use_uncompressed_cache │ 0 │ 1 │ @@ -828,29 +860,29 @@ WHERE changed └────────────────────────┴─────────────┴─────────┘ ``` -## system.table_engines +## system.table\_engines {#system-table-engines} Contains description of table engines supported by server and their feature support information. This table contains the following columns (the column type is shown in brackets): -- `name` (String) — The name of table engine. -- `supports_settings` (UInt8) — Flag that indicates if table engine supports `SETTINGS` clause. -- `supports_skipping_indices` (UInt8) — Flag that indicates if table engine supports [skipping indices](table_engines/mergetree/#table_engine-mergetree-data_skipping-indexes). -- `supports_ttl` (UInt8) — Flag that indicates if table engine supports [TTL](table_engines/mergetree/#table_engine-mergetree-ttl). -- `supports_sort_order` (UInt8) — Flag that indicates if table engine supports clauses `PARTITION_BY`, `PRIMARY_KEY`, `ORDER_BY` and `SAMPLE_BY`. -- `supports_replication` (UInt8) — Flag that indicates if table engine supports [data replication](table_engines/replication/). -- `supports_duduplication` (UInt8) — Flag that indicates if table engine supports data deduplication. +- `name` (String) — The name of table engine. 
+- `supports_settings` (UInt8) — Flag that indicates if table engine supports `SETTINGS` clause.
+- `supports_skipping_indices` (UInt8) — Flag that indicates if table engine supports [skipping indices](table_engines/mergetree/#table_engine-mergetree-data_skipping-indexes).
+- `supports_ttl` (UInt8) — Flag that indicates if table engine supports [TTL](table_engines/mergetree/#table_engine-mergetree-ttl).
+- `supports_sort_order` (UInt8) — Flag that indicates if table engine supports clauses `PARTITION_BY`, `PRIMARY_KEY`, `ORDER_BY` and `SAMPLE_BY`.
+- `supports_replication` (UInt8) — Flag that indicates if table engine supports [data replication](table_engines/replication/).
+- `supports_deduplication` (UInt8) — Flag that indicates if table engine supports data deduplication.

Example:

-```sql
+``` sql
SELECT *
FROM system.table_engines
WHERE name in ('Kafka', 'MergeTree', 'ReplicatedCollapsingMergeTree')
```

-```text
+``` text
┌─name──────────────────────────┬─supports_settings─┬─supports_skipping_indices─┬─supports_sort_order─┬─supports_ttl─┬─supports_replication─┬─supports_deduplication─┐
│ Kafka │ 1 │ 0 │ 0 │ 0 │ 0 │ 0 │
│ MergeTree │ 1 │ 1 │ 1 │ 1 │ 0 │ 0 │
@@ -860,71 +892,70 @@ WHERE name in ('Kafka', 'MergeTree', 'ReplicatedCollapsingMergeTree')

**See also**

-- MergeTree family [query clauses](table_engines/mergetree.md#mergetree-query-clauses)
-- Kafka [settings](table_engines/kafka.md#table_engine-kafka-creating-a-table)
-- Join [settings](table_engines/join.md#join-limitations-and-settings)
+- MergeTree family [query clauses](table_engines/mergetree.md#mergetree-query-clauses)
+- Kafka [settings](table_engines/kafka.md#table_engine-kafka-creating-a-table)
+- Join [settings](table_engines/join.md#join-limitations-and-settings)
-
-## system.tables
+## system.tables {#system-tables}

Contains metadata of each table that the server knows about. Detached tables are not shown in `system.tables`.

This table contains the following columns (the column type is shown in brackets):

-- `database` (String) — The name of the database the table is in.
-- `name` (String) — Table name.
-- `engine` (String) — Table engine name (without parameters).
-- `is_temporary` (UInt8) - Flag that indicates whether the table is temporary.
-- `data_path` (String) - Path to the table data in the file system.
-- `metadata_path` (String) - Path to the table metadata in the file system.
-- `metadata_modification_time` (DateTime) - Time of latest modification of the table metadata.
-- `dependencies_database` (Array(String)) - Database dependencies.
-- `dependencies_table` (Array(String)) - Table dependencies ([MaterializedView](table_engines/materializedview.md) tables based on the current table).
-- `create_table_query` (String) - The query that was used to create the table.
-- `engine_full` (String) - Parameters of the table engine.
-- `partition_key` (String) - The partition key expression specified in the table.
-- `sorting_key` (String) - The sorting key expression specified in the table.
-- `primary_key` (String) - The primary key expression specified in the table.
-- `sampling_key` (String) - The sampling key expression specified in the table.
+- `database` (String) — The name of the database the table is in.
+- `name` (String) — Table name.
+- `engine` (String) — Table engine name (without parameters).
+- `is_temporary` (UInt8) - Flag that indicates whether the table is temporary.
+- `data_path` (String) - Path to the table data in the file system.
+- `metadata_path` (String) - Path to the table metadata in the file system. +- `metadata_modification_time` (DateTime) - Time of latest modification of the table metadata. +- `dependencies_database` (Array(String)) - Database dependencies. +- `dependencies_table` (Array(String)) - Table dependencies ([MaterializedView](table_engines/materializedview.md) tables based on the current table). +- `create_table_query` (String) - The query that was used to create the table. +- `engine_full` (String) - Parameters of the table engine. +- `partition_key` (String) - The partition key expression specified in the table. +- `sorting_key` (String) - The sorting key expression specified in the table. +- `primary_key` (String) - The primary key expression specified in the table. +- `sampling_key` (String) - The sampling key expression specified in the table. The `system.tables` table is used in `SHOW TABLES` query implementation. -## system.zookeeper +## system.zookeeper {#system-zookeeper} The table does not exist if ZooKeeper is not configured. Allows reading data from the ZooKeeper cluster defined in the config. -The query must have a 'path' equality condition in the WHERE clause. This is the path in ZooKeeper for the children that you want to get data for. +The query must have a ‘path’ equality condition in the WHERE clause. This is the path in ZooKeeper for the children that you want to get data for. The query `SELECT * FROM system.zookeeper WHERE path = '/clickhouse'` outputs data for all children on the `/clickhouse` node. -To output data for all root nodes, write path = '/'. -If the path specified in 'path' doesn't exist, an exception will be thrown. +To output data for all root nodes, write path = ‘/’. +If the path specified in ‘path’ doesn’t exist, an exception will be thrown. Columns: -- `name` (String) — The name of the node. -- `path` (String) — The path to the node. -- `value` (String) — Node value. -- `dataLength` (Int32) — Size of the value. -- `numChildren` (Int32) — Number of descendants. -- `czxid` (Int64) — ID of the transaction that created the node. -- `mzxid` (Int64) — ID of the transaction that last changed the node. -- `pzxid` (Int64) — ID of the transaction that last deleted or added descendants. -- `ctime` (DateTime) — Time of node creation. -- `mtime` (DateTime) — Time of the last modification of the node. -- `version` (Int32) — Node version: the number of times the node was changed. -- `cversion` (Int32) — Number of added or removed descendants. -- `aversion` (Int32) — Number of changes to the ACL. -- `ephemeralOwner` (Int64) — For ephemeral nodes, the ID of the session that owns this node. +- `name` (String) — The name of the node. +- `path` (String) — The path to the node. +- `value` (String) — Node value. +- `dataLength` (Int32) — Size of the value. +- `numChildren` (Int32) — Number of descendants. +- `czxid` (Int64) — ID of the transaction that created the node. +- `mzxid` (Int64) — ID of the transaction that last changed the node. +- `pzxid` (Int64) — ID of the transaction that last deleted or added descendants. +- `ctime` (DateTime) — Time of node creation. +- `mtime` (DateTime) — Time of the last modification of the node. +- `version` (Int32) — Node version: the number of times the node was changed. +- `cversion` (Int32) — Number of added or removed descendants. +- `aversion` (Int32) — Number of changes to the ACL. +- `ephemeralOwner` (Int64) — For ephemeral nodes, the ID of the session that owns this node. 
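For instance, to list the root nodes mentioned above, a minimal sketch of such a query (any of the columns listed here can be selected):

``` sql
SELECT name, value, numChildren
FROM system.zookeeper
WHERE path = '/'
```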
Example: -```sql +``` sql SELECT * FROM system.zookeeper WHERE path = '/clickhouse/tables/01-08/visits/replicas' FORMAT Vertical ``` -```text +``` text Row 1: ────── name: example01-08-1.yandex.ru @@ -960,58 +991,56 @@ pzxid: 987021252247 path: /clickhouse/tables/01-08/visits/replicas ``` -## system.mutations {#system_tables-mutations} +## system.mutations {#system-tables-mutations} The table contains information about [mutations](../query_language/alter.md#alter-mutations) of MergeTree tables and their progress. Each mutation command is represented by a single row. The table has the following columns: **database**, **table** - The name of the database and table to which the mutation was applied. -**mutation_id** - The ID of the mutation. For replicated tables these IDs correspond to znode names in the `/mutations/` directory in ZooKeeper. For unreplicated tables the IDs correspond to file names in the data directory of the table. +**mutation\_id** - The ID of the mutation. For replicated tables these IDs correspond to znode names in the `/mutations/` directory in ZooKeeper. For unreplicated tables the IDs correspond to file names in the data directory of the table. **command** - The mutation command string (the part of the query after `ALTER TABLE [db.]table`). -**create_time** - When this mutation command was submitted for execution. +**create\_time** - When this mutation command was submitted for execution. -**block_numbers.partition_id**, **block_numbers.number** - A nested column. For mutations of replicated tables, it contains one record for each partition: the partition ID and the block number that was acquired by the mutation (in each partition, only parts that contain blocks with numbers less than the block number acquired by the mutation in that partition will be mutated). In non-replicated tables, block numbers in all partitions form a single sequence. This means that for mutations of non-replicated tables, the column will contain one record with a single block number acquired by the mutation. +**block\_numbers.partition\_id**, **block\_numbers.number** - A nested column. For mutations of replicated tables, it contains one record for each partition: the partition ID and the block number that was acquired by the mutation (in each partition, only parts that contain blocks with numbers less than the block number acquired by the mutation in that partition will be mutated). In non-replicated tables, block numbers in all partitions form a single sequence. This means that for mutations of non-replicated tables, the column will contain one record with a single block number acquired by the mutation. -**parts_to_do** - The number of data parts that need to be mutated for the mutation to finish. +**parts\_to\_do** - The number of data parts that need to be mutated for the mutation to finish. -**is_done** - Is the mutation done? Note that even if `parts_to_do = 0` it is possible that a mutation of a replicated table is not done yet because of a long-running INSERT that will create a new data part that will need to be mutated. +**is\_done** - Is the mutation done? Note that even if `parts_to_do = 0` it is possible that a mutation of a replicated table is not done yet because of a long-running INSERT that will create a new data part that will need to be mutated. If there were problems with mutating some parts, the following columns contain additional information: -**latest_failed_part** - The name of the most recent part that could not be mutated. 
+**latest\_failed\_part** - The name of the most recent part that could not be mutated.

-**latest_fail_time** - The time of the most recent part mutation failure.
+**latest\_fail\_time** - The time of the most recent part mutation failure.

-**latest_fail_reason** - The exception message that caused the most recent part mutation failure.
+**latest\_fail\_reason** - The exception message that caused the most recent part mutation failure.

-
-## system.disks {#system_tables-disks}
+## system.disks {#system-tables-disks}

Contains information about disks defined in the [server configuration](table_engines/mergetree.md#table_engine-mergetree-multiple-volumes_configure).

Columns:

-- `name` ([String](../data_types/string.md)) — Name of a disk in the server configuration.
-- `path` ([String](../data_types/string.md)) — Path to the mount point in the file system.
-- `free_space` ([UInt64](../data_types/int_uint.md)) — Free space on disk in bytes.
-- `total_space` ([UInt64](../data_types/int_uint.md)) — Disk volume in bytes.
-- `keep_free_space` ([UInt64](../data_types/int_uint.md)) — Amount of disk space that should stay free on disk in bytes. Defined in the `keep_free_space_bytes` parameter of disk configuration.
+- `name` ([String](../data_types/string.md)) — Name of a disk in the server configuration.
+- `path` ([String](../data_types/string.md)) — Path to the mount point in the file system.
+- `free_space` ([UInt64](../data_types/int_uint.md)) — Free space on disk in bytes.
+- `total_space` ([UInt64](../data_types/int_uint.md)) — Disk volume in bytes.
+- `keep_free_space` ([UInt64](../data_types/int_uint.md)) — Amount of disk space that should stay free on disk in bytes. Defined in the `keep_free_space_bytes` parameter of the disk configuration.

-
-## system.storage_policies {#system_tables-storage_policies}
+## system.storage\_policies {#system-tables-storage-policies}

Contains information about storage policies and volumes defined in the [server configuration](table_engines/mergetree.md#table_engine-mergetree-multiple-volumes_configure).

Columns:

-- `policy_name` ([String](../data_types/string.md)) — Name of the storage policy.
-- `volume_name` ([String](../data_types/string.md)) — Volume name defined in the storage policy.
-- `volume_priority` ([UInt64](../data_types/int_uint.md)) — Volume order number in the configuration.
-- `disks` ([Array(String)](../data_types/array.md)) — Disk names, defined in the storage policy.
-- `max_data_part_size` ([UInt64](../data_types/int_uint.md)) — Maximum size of a data part that can be stored on volume disks (0 — no limit).
-- `move_factor` ([Float64](../data_types/float.md)) — Ratio of free disk space. When the ratio exceeds the value of configuration parameter, ClickHouse start to move data to the next volume in order.
+- `policy_name` ([String](../data_types/string.md)) — Name of the storage policy.
+- `volume_name` ([String](../data_types/string.md)) — Volume name defined in the storage policy.
+- `volume_priority` ([UInt64](../data_types/int_uint.md)) — Volume order number in the configuration.
+- `disks` ([Array(String)](../data_types/array.md)) — Disk names defined in the storage policy.
+- `max_data_part_size` ([UInt64](../data_types/int_uint.md)) — Maximum size of a data part that can be stored on volume disks (0 — no limit).
+- `move_factor` ([Float64](../data_types/float.md)) — Ratio of free disk space. When the ratio exceeds the value of the configuration parameter, ClickHouse starts to move data to the next volume in order.
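A quick way to check which policies and volumes the server actually loaded from the configuration is to query the table directly, for example (a minimal sketch):

``` sql
SELECT
    policy_name,
    volume_name,
    volume_priority,
    disks,
    max_data_part_size,
    move_factor
FROM system.storage_policies
ORDER BY policy_name, volume_priority
```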
If the storage policy contains more then one volume, then information for each volume is stored in the individual row of the table. diff --git a/docs/en/operations/table_engines/aggregatingmergetree.md b/docs/en/operations/table_engines/aggregatingmergetree.md index f45ded4067c..63cda964a16 100644 --- a/docs/en/operations/table_engines/aggregatingmergetree.md +++ b/docs/en/operations/table_engines/aggregatingmergetree.md @@ -1,5 +1,4 @@ - -# AggregatingMergeTree +# AggregatingMergeTree {#aggregatingmergetree} The engine inherits from [MergeTree](mergetree.md#table_engines-mergetree), altering the logic for data parts merging. ClickHouse replaces all rows with the same primary key (or more accurately, with the same [sorting key](mergetree.md)) with a single row (within a one data part) that stores a combination of states of aggregate functions. @@ -9,9 +8,9 @@ The engine processes all columns with [AggregateFunction](../../data_types/neste It is appropriate to use `AggregatingMergeTree` if it reduces the number of rows by orders. -## Creating a Table +## Creating a Table {#creating-a-table} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -31,12 +30,14 @@ For a description of request parameters, see [request description](../../query_l When creating a `AggregatingMergeTree` table the same [clauses](mergetree.md) are required, as when creating a `MergeTree` table. -
Deprecated Method for Creating a Table +
-!!! attention +Deprecated Method for Creating a Table + +!!! attention "Attention" Do not use this method in new projects and, if possible, switch the old projects to the method described above. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -48,18 +49,18 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] All of the parameters have the same meaning as in `MergeTree`.
-## SELECT and INSERT +## SELECT and INSERT {#select-and-insert} To insert data, use [INSERT SELECT](../../query_language/insert_into.md) query with aggregate -State- functions. When selecting data from `AggregatingMergeTree` table, use `GROUP BY` clause and the same aggregate functions as when inserting data, but using `-Merge` suffix. -In the results of `SELECT` query the values of `AggregateFunction` type have implementation-specific binary representation for all of the ClickHouse output formats. If dump data into, for example, `TabSeparated` format with `SELECT` query then this dump can be loaded back using `INSERT` query. +In the results of `SELECT` query, the values of `AggregateFunction` type have implementation-specific binary representation for all of the ClickHouse output formats. If dump data into, for example, `TabSeparated` format with `SELECT` query then this dump can be loaded back using `INSERT` query. -## Example of an Aggregated Materialized View +## Example of an Aggregated Materialized View {#example-of-an-aggregated-materialized-view} `AggregatingMergeTree` materialized view that watches the `test.visits` table: -```sql +``` sql CREATE MATERIALIZED VIEW test.basic ENGINE = AggregatingMergeTree() PARTITION BY toYYYYMM(StartDate) ORDER BY (CounterID, StartDate) AS SELECT @@ -71,9 +72,9 @@ FROM test.visits GROUP BY CounterID, StartDate; ``` -Inserting of data into the `test.visits` table. +Inserting data into the `test.visits` table. -```sql +``` sql INSERT INTO test.visits ... ``` @@ -81,7 +82,7 @@ The data are inserted in both the table and view `test.basic` that will perform To get the aggregated data, we need to execute a query such as `SELECT ... GROUP BY ...` from the view `test.basic`: -```sql +``` sql SELECT StartDate, sumMerge(Visits) AS Visits, diff --git a/docs/en/operations/table_engines/buffer.md b/docs/en/operations/table_engines/buffer.md index f574fcb9f74..2f0bd5d1bb3 100644 --- a/docs/en/operations/table_engines/buffer.md +++ b/docs/en/operations/table_engines/buffer.md @@ -1,23 +1,23 @@ -# Buffer +# Buffer {#buffer} Buffers the data to write in RAM, periodically flushing it to another table. During the read operation, data is read from the buffer and the other table simultaneously. -```sql +``` sql Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes) ``` Engine parameters: -- `database` – Database name. Instead of the database name, you can use a constant expression that returns a string. -- `table` – Table to flush data to. -- `num_layers` – Parallelism layer. Physically, the table will be represented as `num_layers` of independent buffers. Recommended value: 16. -- `min_time`, `max_time`, `min_rows`, `max_rows`, `min_bytes`, and `max_bytes` – Conditions for flushing data from the buffer. +- `database` – Database name. Instead of the database name, you can use a constant expression that returns a string. +- `table` – Table to flush data to. +- `num_layers` – Parallelism layer. Physically, the table will be represented as `num_layers` of independent buffers. Recommended value: 16. +- `min_time`, `max_time`, `min_rows`, `max_rows`, `min_bytes`, and `max_bytes` – Conditions for flushing data from the buffer. Data is flushed from the buffer and written to the destination table if all the `min*` conditions or at least one `max*` condition are met. -- `min_time`, `max_time` – Condition for the time in seconds from the moment of the first write to the buffer. 
-- `min_rows`, `max_rows` – Condition for the number of rows in the buffer. -- `min_bytes`, `max_bytes` – Condition for the number of bytes in the buffer. +- `min_time`, `max_time` – Condition for the time in seconds from the moment of the first write to the buffer. +- `min_rows`, `max_rows` – Condition for the number of rows in the buffer. +- `min_bytes`, `max_bytes` – Condition for the number of bytes in the buffer. During the write operation, data is inserted to a `num_layers` number of random buffers. Or, if the data part to insert is large enough (greater than `max_rows` or `max_bytes`), it is written directly to the destination table, omitting the buffer. @@ -25,11 +25,11 @@ The conditions for flushing the data are calculated separately for each of the ` Example: -```sql +``` sql CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10, 100, 10000, 1000000, 10000000, 100000000) ``` -Creating a 'merge.hits_buffer' table with the same structure as 'merge.hits' and using the Buffer engine. When writing to this table, data is buffered in RAM and later written to the 'merge.hits' table. 16 buffers are created. The data in each of them is flushed if either 100 seconds have passed, or one million rows have been written, or 100 MB of data have been written; or if simultaneously 10 seconds have passed and 10,000 rows and 10 MB of data have been written. For example, if just one row has been written, after 100 seconds it will be flushed, no matter what. But if many rows have been written, the data will be flushed sooner. +Creating a ‘merge.hits\_buffer’ table with the same structure as ‘merge.hits’ and using the Buffer engine. When writing to this table, data is buffered in RAM and later written to the ‘merge.hits’ table. 16 buffers are created. The data in each of them is flushed if either 100 seconds have passed, or one million rows have been written, or 100 MB of data have been written; or if simultaneously 10 seconds have passed and 10,000 rows and 10 MB of data have been written. For example, if just one row has been written, after 100 seconds it will be flushed, no matter what. But if many rows have been written, the data will be flushed sooner. When the server is stopped, with DROP TABLE or DETACH TABLE, buffer data is also flushed to the destination table. @@ -38,10 +38,10 @@ You can set empty strings in single quotation marks for the database and table n When reading from a Buffer table, data is processed both from the buffer and from the destination table (if there is one). Note that the Buffer tables does not support an index. In other words, data in the buffer is fully scanned, which might be slow for large buffers. (For data in a subordinate table, the index that it supports will be used.) -If the set of columns in the Buffer table doesn't match the set of columns in a subordinate table, a subset of columns that exist in both tables is inserted. +If the set of columns in the Buffer table doesn’t match the set of columns in a subordinate table, a subset of columns that exist in both tables is inserted. -If the types don't match for one of the columns in the Buffer table and a subordinate table, an error message is entered in the server log and the buffer is cleared. -The same thing happens if the subordinate table doesn't exist when the buffer is flushed. +If the types don’t match for one of the columns in the Buffer table and a subordinate table, an error message is entered in the server log and the buffer is cleared. 
+The same thing happens if the subordinate table doesn’t exist when the buffer is flushed. If you need to run ALTER for a subordinate table and the Buffer table, we recommend first deleting the Buffer table, running ALTER for the subordinate table, then creating the Buffer table again. @@ -51,15 +51,14 @@ FINAL and SAMPLE do not work correctly for Buffer tables. These conditions are p When adding data to a Buffer, one of the buffers is locked. This causes delays if a read operation is simultaneously being performed from the table. -Data that is inserted to a Buffer table may end up in the subordinate table in a different order and in different blocks. Because of this, a Buffer table is difficult to use for writing to a CollapsingMergeTree correctly. To avoid problems, you can set 'num_layers' to 1. +Data that is inserted to a Buffer table may end up in the subordinate table in a different order and in different blocks. Because of this, a Buffer table is difficult to use for writing to a CollapsingMergeTree correctly. To avoid problems, you can set ‘num\_layers’ to 1. -If the destination table is replicated, some expected characteristics of replicated tables are lost when writing to a Buffer table. The random changes to the order of rows and sizes of data parts cause data deduplication to quit working, which means it is not possible to have a reliable 'exactly once' write to replicated tables. +If the destination table is replicated, some expected characteristics of replicated tables are lost when writing to a Buffer table. The random changes to the order of rows and sizes of data parts cause data deduplication to quit working, which means it is not possible to have a reliable ‘exactly once’ write to replicated tables. Due to these disadvantages, we can only recommend using a Buffer table in rare cases. -A Buffer table is used when too many INSERTs are received from a large number of servers over a unit of time and data can't be buffered before insertion, which means the INSERTs can't run fast enough. - -Note that it doesn't make sense to insert data one row at a time, even for Buffer tables. This will only produce a speed of a few thousand rows per second, while inserting larger blocks of data can produce over a million rows per second (see the section "Performance"). +A Buffer table is used when too many INSERTs are received from a large number of servers over a unit of time and data can’t be buffered before insertion, which means the INSERTs can’t run fast enough. +Note that it doesn’t make sense to insert data one row at a time, even for Buffer tables. This will only produce a speed of a few thousand rows per second, while inserting larger blocks of data can produce over a million rows per second (see the section “Performance”). [Original article](https://clickhouse.tech/docs/en/operations/table_engines/buffer/) diff --git a/docs/en/operations/table_engines/collapsingmergetree.md b/docs/en/operations/table_engines/collapsingmergetree.md index 9c50dd959ed..ad396ac62bd 100644 --- a/docs/en/operations/table_engines/collapsingmergetree.md +++ b/docs/en/operations/table_engines/collapsingmergetree.md @@ -1,14 +1,14 @@ -# CollapsingMergeTree {#table_engine-collapsingmergetree} +# CollapsingMergeTree {#table-engine-collapsingmergetree} The engine inherits from [MergeTree](mergetree.md) and adds the logic of rows collapsing to data parts merge algorithm. 
`CollapsingMergeTree` asynchronously deletes (collapses) pairs of rows if all of the fields in a sorting key (`ORDER BY`) are equivalent excepting the particular field `Sign` which can have `1` and `-1` values. Rows without a pair are kept. For more details see the [Collapsing](#table_engine-collapsingmergetree-collapsing) section of the document. -The engine may significantly reduce the volume of storage and increase efficiency of `SELECT` query as a consequence. +The engine may significantly reduce the volume of storage and increase the efficiency of `SELECT` query as a consequence. -## Creating a Table +## Creating a Table {#creating-a-table} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -25,7 +25,7 @@ For a description of query parameters, see [query description](../../query_langu **CollapsingMergeTree Parameters** -- `sign` — Name of the column with the type of row: `1` is a "state" row, `-1` is a "cancel" row. +- `sign` — Name of the column with the type of row: `1` is a “state” row, `-1` is a “cancel” row. Column data type — `Int8`. @@ -33,12 +33,14 @@ For a description of query parameters, see [query description](../../query_langu When creating a `CollapsingMergeTree` table, the same [query clauses](mergetree.md#table_engine-mergetree-creating-a-table) are required, as when creating a `MergeTree` table. -
Deprecated Method for Creating a Table +
-!!! attention +Deprecated Method for Creating a Table + +!!! attention "Attention" Do not use this method in new projects and, if possible, switch the old projects to the method described above. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -49,23 +51,23 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] All of the parameters excepting `sign` have the same meaning as in `MergeTree`. -- `sign` — Name of the column with the type of row: `1` — "state" row, `-1` — "cancel" row. +- `sign` — Name of the column with the type of row: `1` — “state” row, `-1` — “cancel” row. Column Data Type — `Int8`. +
+## Collapsing {#table-engine-collapsingmergetree-collapsing} -## Collapsing {#table_engine-collapsingmergetree-collapsing} - -### Data +### Data {#data} Consider the situation where you need to save continually changing data for some object. It sounds logical to have one row for an object and update it at any change, but update operation is expensive and slow for DBMS because it requires rewriting of the data in the storage. If you need to write data quickly, update not acceptable, but you can write the changes of an object sequentially as follows. -Use the particular column `Sign`. If `Sign = 1` it means that the row is a state of an object, let's call it "state" row. If `Sign = -1` it means the cancellation of the state of an object with the same attributes, let's call it "cancel" row. +Use the particular column `Sign`. If `Sign = 1` it means that the row is a state of an object, let’s call it “state” row. If `Sign = -1` it means the cancellation of the state of an object with the same attributes, let’s call it “cancel” row. -For example, we want to calculate how much pages users checked at some site and how long they were there. At some moment of time we write the following row with the state of user activity: +For example, we want to calculate how much pages users checked at some site and how long they were there. At some moment we write the following row with the state of user activity: -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┘ @@ -73,20 +75,20 @@ For example, we want to calculate how much pages users checked at some site and At some moment later we register the change of user activity and write it with the following two rows. -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ -1 │ │ 4324182021466249494 │ 6 │ 185 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┘ ``` -The first row cancels the previous state of the object (user). It should copy the sorting key fields of the canceled state excepting `Sign`. +The first row cancels the previous state of the object (user). It should copy the sorting key fields of the cancelled state excepting `Sign`. The second row contains the current state. As we need only the last state of user activity, the rows -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ │ 4324182021466249494 │ 5 │ 146 │ -1 │ @@ -99,39 +101,42 @@ Why we need 2 rows for each change read in the [Algorithm](#table_engine-collaps **Peculiar properties of such approach** -1. The program that writes the data should remember the state of an object to be able to cancel it. "Cancel" string should contain copies of the sorting key fields of the "state" string and the opposite `Sign`. It increases the initial size of storage but allows to write the data quickly. -2. Long growing arrays in columns reduce the efficiency of the engine due to load for writing. The more straightforward data, the higher efficiency. -3. The `SELECT` results depend strongly on the consistency of object changes history. Be accurate when preparing data for inserting. You can get unpredictable results in inconsistent data, for example, negative values for non-negative metrics such as session depth. +1. The program that writes the data should remember the state of an object to be able to cancel it. 
“Cancel” string should contain copies of the sorting key fields of the “state” string and the opposite `Sign`. It increases the initial size of storage but allows to write the data quickly. +2. Long growing arrays in columns reduce the efficiency of the engine due to load for writing. The more straightforward data, the higher the efficiency. +3. The `SELECT` results depend strongly on the consistency of object changes history. Be accurate when preparing data for inserting. You can get unpredictable results in inconsistent data, for example, negative values for non-negative metrics such as session depth. -### Algorithm {#table_engine-collapsingmergetree-collapsing-algorithm} +### Algorithm {#table-engine-collapsingmergetree-collapsing-algorithm} -When ClickHouse merges data parts, each group of consecutive rows with the same sorting key (`ORDER BY`) is reduced to not more than two rows, one with `Sign = 1` ("state" row) and another with `Sign = -1` ("cancel" row). In other words, entries collapse. +When ClickHouse merges data parts, each group of consecutive rows with the same sorting key (`ORDER BY`) is reduced to not more than two rows, one with `Sign = 1` (“state” row) and another with `Sign = -1` (“cancel” row). In other words, entries collapse. For each resulting data part ClickHouse saves: - 1. The first "cancel" and the last "state" rows, if the number of "state" and "cancel" rows matches and the last row is a "state" row. - 2. The last "state" row, if there is more "state" rows than "cancel" rows. - 3. The first "cancel" row, if there is more "cancel" rows than "state" rows. - 4. None of the rows, in all other cases. +1. The first “cancel” and the last “state” rows, if the number of “state” and “cancel” rows matches and the last row is a “state” row. - In addition when there is at least 2 more "state" rows than "cancel" rows, or at least 2 more "cancel" rows then "state" rows, the merge continues, but ClickHouse treats this situation as a logical error and records it in the server log. This error can occur if the same data were inserted more than once. +2. The last “state” row, if there are more “state” rows than “cancel” rows. + +3. The first “cancel” row, if there are more “cancel” rows than “state” rows. + +4. None of the rows, in all other cases. + +Also when there are at least 2 more “state” rows than “cancel” rows, or at least 2 more “cancel” rows then “state” rows, the merge continues, but ClickHouse treats this situation as a logical error and records it in the server log. This error can occur if the same data were inserted more than once. Thus, collapsing should not change the results of calculating statistics. Changes gradually collapsed so that in the end only the last state of almost every object left. -The `Sign` is required because the merging algorithm doesn't guarantee that all of the rows with the same sorting key will be in the same resulting data part and even on the same physical server. ClickHouse process `SELECT` queries with multiple threads, and it can not predict the order of rows in the result. The aggregation is required if there is a need to get completely "collapsed" data from `CollapsingMergeTree` table. +The `Sign` is required because the merging algorithm doesn’t guarantee that all of the rows with the same sorting key will be in the same resulting data part and even on the same physical server. ClickHouse process `SELECT` queries with multiple threads, and it can not predict the order of rows in the result. 
The aggregation is required if there is a need to get completely “collapsed” data from `CollapsingMergeTree` table. To finalize collapsing, write a query with `GROUP BY` clause and aggregate functions that account for the sign. For example, to calculate quantity, use `sum(Sign)` instead of `count()`. To calculate the sum of something, use `sum(Sign * x)` instead of `sum(x)`, and so on, and also add `HAVING sum(Sign) > 0`. -The aggregates `count`, `sum` and `avg` could be calculated this way. The aggregate `uniq` could be calculated if an object has at least one state not collapsed. The aggregates `min` and `max` could not be calculated because `CollapsingMergeTree` does not save values history of the collapsed states. +The aggregates `count`, `sum` and `avg` could be calculated this way. The aggregate `uniq` could be calculated if an object has at least one state not collapsed. The aggregates `min` and `max` could not be calculated because `CollapsingMergeTree` does not save the values history of the collapsed states. If you need to extract data without aggregation (for example, to check whether rows are present whose newest values match certain conditions), you can use the `FINAL` modifier for the `FROM` clause. This approach is significantly less efficient. -## Example of use +## Example of use {#example-of-use} Example data: -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ │ 4324182021466249494 │ 5 │ 146 │ -1 │ @@ -141,7 +146,7 @@ Example data: Creation of the table: -```sql +``` sql CREATE TABLE UAct ( UserID UInt64, @@ -155,10 +160,11 @@ ORDER BY UserID Insertion of the data: -```sql +``` sql INSERT INTO UAct VALUES (4324182021466249494, 5, 146, 1) ``` -```sql + +``` sql INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1),(4324182021466249494, 6, 185, 1) ``` @@ -166,11 +172,11 @@ We use two `INSERT` queries to create two different data parts. If we insert the Getting the data: -```sql +``` sql SELECT * FROM UAct ``` -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ -1 │ │ 4324182021466249494 │ 6 │ 185 │ 1 │ @@ -182,11 +188,11 @@ SELECT * FROM UAct What do we see and where is collapsing? -With two `INSERT` queries, we created 2 data parts. The `SELECT` query was performed in 2 threads, and we got a random order of rows. Collapsing not occurred because there was no merge of the data parts yet. ClickHouse merges data part in an unknown moment of time which we can not predict. +With two `INSERT` queries, we created 2 data parts. The `SELECT` query was performed in 2 threads, and we got a random order of rows. Collapsing not occurred because there was no merge of the data parts yet. ClickHouse merges data part in an unknown moment which we can not predict. Thus we need aggregation: -```sql +``` sql SELECT UserID, sum(PageViews * Sign) AS PageViews, @@ -195,7 +201,8 @@ FROM UAct GROUP BY UserID HAVING sum(Sign) > 0 ``` -```text + +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┐ │ 4324182021466249494 │ 6 │ 185 │ └─────────────────────┴───────────┴──────────┘ @@ -203,22 +210,23 @@ HAVING sum(Sign) > 0 If we do not need aggregation and want to force collapsing, we can use `FINAL` modifier for `FROM` clause. 
-```sql +``` sql SELECT * FROM UAct FINAL ``` -```text + +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 6 │ 185 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┘ ``` -This way of selecting the data is very inefficient. Don't use it for big tables. +This way of selecting the data is very inefficient. Don’t use it for big tables. -## Example of another approach +## Example of another approach {#example-of-another-approach} Example data: -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ │ 4324182021466249494 │ -5 │ -146 │ -1 │ @@ -226,9 +234,9 @@ Example data: └─────────────────────┴───────────┴──────────┴──────┘ ``` -The idea is that merges take into account only key fields. And in the "Cancel" line we can specify negative values that equalize the previous version of the row when summing without using the Sign column. For this approach, it is necessary to change the data type `PageViews`,` Duration` to store negative values of UInt8 -> Int16. +The idea is that merges take into account only key fields. And in the “Cancel” line we can specify negative values that equalize the previous version of the row when summing without using the Sign column. For this approach, it is necessary to change the data type `PageViews`,`Duration` to store negative values of UInt8 -\> Int16. -```sql +``` sql CREATE TABLE UAct ( UserID UInt64, @@ -240,20 +248,23 @@ ENGINE = CollapsingMergeTree(Sign) ORDER BY UserID ``` -Let's test the approach: -```sql +Let’s test the approach: + +``` sql insert into UAct values(4324182021466249494, 5, 146, 1); insert into UAct values(4324182021466249494, -5, -146, -1); insert into UAct values(4324182021466249494, 6, 185, 1); select * from UAct final; // avoid using final in production (just for a test or small tables) ``` -```text + +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 6 │ 185 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┘ ``` -```sql + +``` sql SELECT UserID, sum(PageViews) AS PageViews, @@ -265,25 +276,27 @@ GROUP BY UserID │ 4324182021466249494 │ 6 │ 185 │ └─────────────────────┴───────────┴──────────┘ ``` -```sqk + +``` sqk select count() FROM UAct ``` -```text + +``` text ┌─count()─┐ │ 3 │ └─────────┘ ``` -```sql + +``` sql optimize table UAct final; select * FROM UAct ``` -```text + +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 6 │ 185 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┘ ``` - - [Original article](https://clickhouse.tech/docs/en/operations/table_engines/collapsingmergetree/) diff --git a/docs/en/operations/table_engines/custom_partitioning_key.md b/docs/en/operations/table_engines/custom_partitioning_key.md index d5e80e30f7e..a22149b6b9b 100644 --- a/docs/en/operations/table_engines/custom_partitioning_key.md +++ b/docs/en/operations/table_engines/custom_partitioning_key.md @@ -1,12 +1,12 @@ -# Custom Partitioning Key +# Custom Partitioning Key {#custom-partitioning-key} Partitioning is available for the [MergeTree](mergetree.md) family tables (including [replicated](replication.md) tables). [Materialized views](materializedview.md) based on MergeTree tables support partitioning, as well. -A partition is a logical combination of records in a table by a specified criterion. You can set a partition by an arbitrary criterion, such as by month, by day, or by event type. 
Each partition is stored separately in order to simplify manipulations of this data. When accessing the data, ClickHouse uses the smallest subset of partitions possible. +A partition is a logical combination of records in a table by a specified criterion. You can set a partition by an arbitrary criterion, such as by month, by day, or by event type. Each partition is stored separately to simplify manipulations of this data. When accessing the data, ClickHouse uses the smallest subset of partitions possible. The partition is specified in the `PARTITION BY expr` clause when [creating a table](mergetree.md#table_engine-mergetree-creating-a-table). The partition key can be any expression from the table columns. For example, to specify partitioning by month, use the expression `toYYYYMM(date_column)`: -```sql +``` sql CREATE TABLE visits ( VisitDate Date, @@ -20,7 +20,7 @@ ORDER BY Hour; The partition key can also be a tuple of expressions (similar to the [primary key](mergetree.md#primary-keys-and-indexes-in-queries)). For example: -```sql +``` sql ENGINE = ReplicatedCollapsingMergeTree('/clickhouse/tables/name', 'replica1', Sign) PARTITION BY (toMonday(StartDate), EventType) ORDER BY (CounterID, StartDate, intHash32(UserID)); @@ -30,12 +30,12 @@ In this example, we set partitioning by the event types that occurred during the When inserting new data to a table, this data is stored as a separate part (chunk) sorted by the primary key. In 10-15 minutes after inserting, the parts of the same partition are merged into the entire part. -!!! info - A merge only works for data parts that have the same value for the partitioning expression. This means **you shouldn't make overly granular partitions** (more than about a thousand partitions). Otherwise, the `SELECT` query performs poorly because of an unreasonably large number of files in the file system and open file descriptors. +!!! info "Info" + A merge only works for data parts that have the same value for the partitioning expression. This means **you shouldn’t make overly granular partitions** (more than about a thousand partitions). Otherwise, the `SELECT` query performs poorly because of an unreasonably large number of files in the file system and open file descriptors. -Use the [system.parts](../system_tables.md#system_tables-parts) table to view the table parts and partitions. For example, let's assume that we have a `visits` table with partitioning by month. Let's perform the `SELECT` query for the `system.parts` table: +Use the [system.parts](../system_tables.md#system_tables-parts) table to view the table parts and partitions. For example, let’s assume that we have a `visits` table with partitioning by month. Let’s perform the `SELECT` query for the `system.parts` table: -```sql +``` sql SELECT partition, name, @@ -44,7 +44,7 @@ FROM system.parts WHERE table = 'visits' ``` -```text +``` text ┌─partition─┬─name───────────┬─active─┐ │ 201901 │ 201901_1_3_1 │ 0 │ │ 201901 │ 201901_1_9_2 │ 1 │ @@ -56,29 +56,29 @@ WHERE table = 'visits' └───────────┴────────────────┴────────┘ ``` -The `partition` column contains the names of the partitions. There are two partitions in this example: `201901` and `201902`. You can use this column value to specify the partition name in [ALTER ... PARTITION](#alter_manipulations-with-partitions) queries. +The `partition` column contains the names of the partitions. There are two partitions in this example: `201901` and `201902`. 
You can use this column value to specify the partition name in [ALTER … PARTITION](#alter_manipulations-with-partitions) queries. The `name` column contains the names of the partition data parts. You can use this column to specify the name of the part in the [ALTER ATTACH PART](#alter_attach-partition) query. -Let's break down the name of the first part: `201901_1_3_1`: +Let’s break down the name of the first part: `201901_1_3_1`: -- `201901` is the partition name. -- `1` is the minimum number of the data block. -- `3` is the maximum number of the data block. -- `1` is the chunk level (the depth of the merge tree it is formed from). +- `201901` is the partition name. +- `1` is the minimum number of the data block. +- `3` is the maximum number of the data block. +- `1` is the chunk level (the depth of the merge tree it is formed from). -!!! info +!!! info "Info" The parts of old-type tables have the name: `20190117_20190123_2_2_0` (minimum date - maximum date - minimum block number - maximum block number - level). The `active` column shows the status of the part. `1` is active; `0` is inactive. The inactive parts are, for example, source parts remaining after merging to a larger part. The corrupted data parts are also indicated as inactive. As you can see in the example, there are several separated parts of the same partition (for example, `201901_1_3_1` and `201901_1_9_2`). This means that these parts are not merged yet. ClickHouse merges the inserted parts of data periodically, approximately 15 minutes after inserting. In addition, you can perform a non-scheduled merge using the [OPTIMIZE](../../query_language/misc.md#misc_operations-optimize) query. Example: -```sql +``` sql OPTIMIZE TABLE visits PARTITION 201902; ``` -```text +``` text ┌─partition─┬─name───────────┬─active─┐ │ 201901 │ 201901_1_3_1 │ 0 │ │ 201901 │ 201901_1_9_2 │ 1 │ @@ -95,7 +95,7 @@ Inactive parts will be deleted approximately 10 minutes after merging. Another way to view a set of parts and partitions is to go into the directory of the table: `/var/lib/clickhouse/data///`. For example: -```bash +``` bash /var/lib/clickhouse/data/default/visits$ ls -l total 40 drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 201901_1_3_1 @@ -109,11 +109,11 @@ drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 12:09 201902_4_6_1 drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 detached ``` -The folders '201901_1_1_0', '201901_1_7_1' and so on are the directories of the parts. Each part relates to a corresponding partition and contains data just for a certain month (the table in this example has partitioning by month). +The folders ‘201901\_1\_1\_0’, ‘201901\_1\_7\_1’ and so on are the directories of the parts. Each part relates to a corresponding partition and contains data just for a certain month (the table in this example has partitioning by month). The `detached` directory contains parts that were detached from the table using the [DETACH](#alter_detach-partition) query. The corrupted parts are also moved to this directory, instead of being deleted. The server does not use the parts from the `detached` directory. You can add, delete, or modify the data in this directory at any time – the server will not know about this until you run the [ATTACH](../../query_language/alter.md#alter_attach-partition) query. -Note that on the operating server, you cannot manually change the set of parts or their data on the file system, since the server will not know about it. 
For non-replicated tables, you can do this when the server is stopped, but it isn't recommended. For replicated tables, the set of parts cannot be changed in any case. +Note that on the operating server, you cannot manually change the set of parts or their data on the file system, since the server will not know about it. For non-replicated tables, you can do this when the server is stopped, but it isn’t recommended. For replicated tables, the set of parts cannot be changed in any case. ClickHouse allows you to perform operations with the partitions: delete them, copy from one table to another, or create a backup. See the list of all operations in the section [Manipulations With Partitions and Parts](../../query_language/alter.md#alter_manipulations-with-partitions). diff --git a/docs/en/operations/table_engines/dictionary.md b/docs/en/operations/table_engines/dictionary.md index c3de79f69cf..18b5dfabd78 100644 --- a/docs/en/operations/table_engines/dictionary.md +++ b/docs/en/operations/table_engines/dictionary.md @@ -1,11 +1,10 @@ - -# Dictionary +# Dictionary {#dictionary} The `Dictionary` engine displays the [dictionary](../../query_language/dicts/external_dicts.md) data as a ClickHouse table. As an example, consider a dictionary of `products` with the following configuration: -```xml +``` xml products @@ -38,7 +37,7 @@ As an example, consider a dictionary of `products` with the following configurat Query the dictionary data: -```sql +``` sql SELECT name, type, @@ -52,42 +51,40 @@ FROM system.dictionaries WHERE name = 'products' ``` -```text +``` text ┌─name─────┬─type─┬─key────┬─attribute.names─┬─attribute.types─┬─bytes_allocated─┬─element_count─┬─source──────────┐ │ products │ Flat │ UInt64 │ ['title'] │ ['String'] │ 23065376 │ 175032 │ ODBC: .products │ └──────────┴──────┴────────┴─────────────────┴─────────────────┴─────────────────┴───────────────┴─────────────────┘ ``` -You can use the [dictGet*](../../query_language/functions/ext_dict_functions.md#ext_dict_functions) function to get the dictionary data in this format. +You can use the [dictGet\*](../../query_language/functions/ext_dict_functions.md#ext_dict_functions) function to get the dictionary data in this format. -This view isn't helpful when you need to get raw data, or when performing a `JOIN` operation. For these cases, you can use the `Dictionary` engine, which displays the dictionary data in a table. +This view isn’t helpful when you need to get raw data, or when performing a `JOIN` operation. For these cases, you can use the `Dictionary` engine, which displays the dictionary data in a table. Syntax: -```sql +``` sql CREATE TABLE %table_name% (%fields%) engine = Dictionary(%dictionary_name%)` ``` Usage example: -```sql +``` sql create table products (product_id UInt64, title String) Engine = Dictionary(products); ``` -``` -Ok -``` -Take a look at what's in the table. + Ok -```sql +Take a look at what’s in the table. 
+ +``` sql select * from products limit 1; ``` -```text +``` text ┌────product_id─┬─title───────────┐ │ 152689 │ Some item │ └───────────────┴─────────────────┘ ``` - [Original article](https://clickhouse.tech/docs/en/operations/table_engines/dictionary/) diff --git a/docs/en/operations/table_engines/distributed.md b/docs/en/operations/table_engines/distributed.md index 39b02310575..4caa7d6abb3 100644 --- a/docs/en/operations/table_engines/distributed.md +++ b/docs/en/operations/table_engines/distributed.md @@ -1,38 +1,42 @@ +# Distributed {#distributed} -# Distributed - -**The Distributed engine does not store data itself**, but allows distributed query processing on multiple servers. +**Tables with the Distributed engine do not store any data by themselves**, but allow distributed query processing on multiple servers. Reading is automatically parallelized. During a read, the table indexes on remote servers are used, if there are any. The Distributed engine accepts parameters: -- the cluster name in the server's config file -- the name of a remote database -- the name of a remote table -- (optionally) sharding key -- (optionally) policy name, it will be used to store temporary files for async send +- the cluster name in the server’s config file - See also: - - `insert_distributed_sync` setting - - [MergeTree](mergetree.md#table_engine-mergetree-multiple-volumes) for the examples +- the name of a remote database + +- the name of a remote table + +- (optionally) sharding key + +- (optionally) policy name, it will be used to store temporary files for async send + + See also: + + - `insert_distributed_sync` setting + - [MergeTree](mergetree.md#table_engine-mergetree-multiple-volumes) for the examples Example: -```sql +``` sql Distributed(logs, default, hits[, sharding_key[, policy_name]]) ``` -Data will be read from all servers in the 'logs' cluster, from the default.hits table located on every server in the cluster. -Data is not only read, but is partially processed on the remote servers (to the extent that this is possible). +Data will be read from all servers in the ‘logs’ cluster, from the default.hits table located on every server in the cluster. +Data is not only read but is partially processed on the remote servers (to the extent that this is possible). For example, for a query with GROUP BY, data will be aggregated on remote servers, and the intermediate states of aggregate functions will be sent to the requestor server. Then data will be further aggregated. Instead of the database name, you can use a constant expression that returns a string. For example: currentDatabase(). -logs – The cluster name in the server's config file. +logs – The cluster name in the server’s config file. Clusters are set like this: -```xml +``` xml @@ -66,80 +70,78 @@ Clusters are set like this: ``` -Here a cluster is defined with the name 'logs' that consists of two shards, each of which contains two replicas. +Here a cluster is defined with the name ‘logs’ that consists of two shards, each of which contains two replicas. Shards refer to the servers that contain different parts of the data (in order to read all the data, you must access all the shards). Replicas are duplicating servers (in order to read all the data, you can access the data on any one of the replicas). Cluster names must not contain dots. The parameters `host`, `port`, and optionally `user`, `password`, `secure`, `compression` are specified for each server: +- `host` – The address of the remote server.
You can use either the domain or the IPv4 or IPv6 address. If you specify the domain, the server makes a DNS request when it starts, and the result is stored as long as the server is running. If the DNS request fails, the server doesn’t start. If you change the DNS record, restart the server. +- `port` – The TCP port for messenger activity (‘tcp\_port’ in the config, usually set to 9000). Do not confuse it with http\_port. +- `user` – Name of the user for connecting to a remote server. Default value: default. This user must have access to connect to the specified server. Access is configured in the users.xml file. For more information, see the section [Access rights](../../operations/access_rights.md). +- `password` – The password for connecting to a remote server (not masked). Default value: empty string. +- `secure` - Use ssl for connection, usually you also should define `port` = 9440. Server should listen on 9440 and have correct certificates. +- `compression` - Use data compression. Default value: true. -: - `host` – The address of the remote server. You can use either the domain or the IPv4 or IPv6 address. If you specify the domain, the server makes a DNS request when it starts, and the result is stored as long as the server is running. If the DNS request fails, the server doesn't start. If you change the DNS record, restart the server. -- `port` – The TCP port for messenger activity ('tcp_port' in the config, usually set to 9000). Do not confuse it with http_port. -- `user` – Name of the user for connecting to a remote server. Default value: default. This user must have access to connect to the specified server. Access is configured in the users.xml file. For more information, see the section [Access rights](../../operations/access_rights.md). -- `password` – The password for connecting to a remote server (not masked). Default value: empty string. -- `secure` - Use ssl for connection, usually you also should define `port` = 9440. Server should listen on 9440 and have correct certificates. -- `compression` - Use data compression. Default value: true. - -When specifying replicas, one of the available replicas will be selected for each of the shards when reading. You can configure the algorithm for load balancing (the preference for which replica to access) – see the [load_balancing](../settings/settings.md#settings-load_balancing) setting. +When specifying replicas, one of the available replicas will be selected for each of the shards when reading. You can configure the algorithm for load balancing (the preference for which replica to access) – see the [load\_balancing](../settings/settings.md#settings-load_balancing) setting. If the connection with the server is not established, there will be an attempt to connect with a short timeout. If the connection failed, the next replica will be selected, and so on for all the replicas. If the connection attempt failed for all the replicas, the attempt will be repeated the same way, several times. -This works in favor of resiliency, but does not provide complete fault tolerance: a remote server might accept the connection, but might not work, or work poorly. +This works in favour of resiliency, but does not provide complete fault tolerance: a remote server might accept the connection, but might not work, or work poorly. You can specify just one of the shards (in this case, query processing should be called remote, rather than distributed) or up to any number of shards. In each shard, you can specify from one to any number of replicas. 
You can specify a different number of replicas for each shard. You can specify as many clusters as you wish in the configuration. -To view your clusters, use the 'system.clusters' table. +To view your clusters, use the ‘system.clusters’ table. -The Distributed engine allows working with a cluster like a local server. However, the cluster is inextensible: you must write its configuration in the server config file (even better, for all the cluster's servers). +The Distributed engine allows working with a cluster like a local server. However, the cluster is inextensible: you must write its configuration in the server config file (even better, for all the cluster’s servers). -There is no support for Distributed tables that look at other Distributed tables (except in cases when a Distributed table only has one shard). As an alternative, make the Distributed table look at the "final" tables. +There is no support for Distributed tables that look at other Distributed tables (except in cases when a Distributed table only has one shard). As an alternative, make the Distributed table look at the “final” tables. -The Distributed engine requires writing clusters to the config file. Clusters from the config file are updated on the fly, without restarting the server. If you need to send a query to an unknown set of shards and replicas each time, you don't need to create a Distributed table – use the 'remote' table function instead. See the section [Table functions](../../query_language/table_functions/index.md). +The Distributed engine requires writing clusters to the config file. Clusters from the config file are updated on the fly, without restarting the server. If you need to send a query to an unknown set of shards and replicas each time, you don’t need to create a Distributed table – use the ‘remote’ table function instead. See the section [Table functions](../../query_language/table_functions/index.md). There are two methods for writing data to a cluster: -First, you can define which servers to write which data to and perform the write directly on each shard. In other words, perform INSERT in the tables that the distributed table "looks at". This is the most flexible solution as you can use any sharding scheme, which could be non-trivial due to the requirements of the subject area. This is also the most optimal solution, since data can be written to different shards completely independently. +First, you can define which servers to write which data to and perform the write directly on each shard. In other words, perform INSERT in the tables that the distributed table “looks at”. This is the most flexible solution as you can use any sharding scheme, which could be non-trivial due to the requirements of the subject area. This is also the most optimal solution since data can be written to different shards completely independently. -Second, you can perform INSERT in a Distributed table. In this case, the table will distribute the inserted data across servers itself. In order to write to a Distributed table, it must have a sharding key set (the last parameter). In addition, if there is only one shard, the write operation works without specifying the sharding key, since it doesn't mean anything in this case. +Second, you can perform INSERT in a Distributed table. In this case, the table will distribute the inserted data across the servers itself. In order to write to a Distributed table, it must have a sharding key set (the last parameter). 
In addition, if there is only one shard, the write operation works without specifying the sharding key, since it doesn’t mean anything in this case. Each shard can have a weight defined in the config file. By default, the weight is equal to one. Data is distributed across shards in the amount proportional to the shard weight. For example, if there are two shards and the first has a weight of 9 while the second has a weight of 10, the first will be sent 9 / 19 parts of the rows, and the second will be sent 10 / 19. -Each shard can have the 'internal_replication' parameter defined in the config file. +Each shard can have the ‘internal\_replication’ parameter defined in the config file. -If this parameter is set to 'true', the write operation selects the first healthy replica and writes data to it. Use this alternative if the Distributed table "looks at" replicated tables. In other words, if the table where data will be written is going to replicate them itself. +If this parameter is set to ‘true’, the write operation selects the first healthy replica and writes data to it. Use this alternative if the Distributed table “looks at” replicated tables. In other words, if the table where data will be written is going to replicate them itself. -If it is set to 'false' (the default), data is written to all replicas. In essence, this means that the Distributed table replicates data itself. This is worse than using replicated tables, because the consistency of replicas is not checked, and over time they will contain slightly different data. +If it is set to ‘false’ (the default), data is written to all replicas. In essence, this means that the Distributed table replicates data itself. This is worse than using replicated tables, because the consistency of replicas is not checked, and over time they will contain slightly different data. -To select the shard that a row of data is sent to, the sharding expression is analyzed, and its remainder is taken from dividing it by the total weight of the shards. The row is sent to the shard that corresponds to the half-interval of the remainders from 'prev_weight' to 'prev_weights + weight', where 'prev_weights' is the total weight of the shards with the smallest number, and 'weight' is the weight of this shard. For example, if there are two shards, and the first has a weight of 9 while the second has a weight of 10, the row will be sent to the first shard for the remainders from the range \[0, 9), and to the second for the remainders from the range \[9, 19). +To select the shard that a row of data is sent to, the sharding expression is analyzed, and its remainder is taken from dividing it by the total weight of the shards. The row is sent to the shard that corresponds to the half-interval of the remainders from ‘prev\_weight’ to ‘prev\_weights + weight’, where ‘prev\_weights’ is the total weight of the shards with the smallest number, and ‘weight’ is the weight of this shard. For example, if there are two shards, and the first has a weight of 9 while the second has a weight of 10, the row will be sent to the first shard for the remainders from the range \[0, 9), and to the second for the remainders from the range \[9, 19). -The sharding expression can be any expression from constants and table columns that returns an integer. 
For example, you can use the expression 'rand()' for random distribution of data, or 'UserID' for distribution by the remainder from dividing the user's ID (then the data of a single user will reside on a single shard, which simplifies running IN and JOIN by users). If one of the columns is not distributed evenly enough, you can wrap it in a hash function: intHash64(UserID). +The sharding expression can be any expression from constants and table columns that returns an integer. For example, you can use the expression ‘rand()’ for random distribution of data, or ‘UserID’ for distribution by the remainder from dividing the user’s ID (then the data of a single user will reside on a single shard, which simplifies running IN and JOIN by users). If one of the columns is not distributed evenly enough, you can wrap it in a hash function: intHash64(UserID). -A simple remainder from division is a limited solution for sharding and isn't always appropriate. It works for medium and large volumes of data (dozens of servers), but not for very large volumes of data (hundreds of servers or more). In the latter case, use the sharding scheme required by the subject area, rather than using entries in Distributed tables. +A simple remainder from division is a limited solution for sharding and isn’t always appropriate. It works for medium and large volumes of data (dozens of servers), but not for very large volumes of data (hundreds of servers or more). In the latter case, use the sharding scheme required by the subject area, rather than using entries in Distributed tables. -SELECT queries are sent to all the shards, and work regardless of how data is distributed across the shards (they can be distributed completely randomly). When you add a new shard, you don't have to transfer the old data to it. You can write new data with a heavier weight – the data will be distributed slightly unevenly, but queries will work correctly and efficiently. +SELECT queries are sent to all the shards and work regardless of how data is distributed across the shards (they can be distributed completely randomly). When you add a new shard, you don’t have to transfer the old data to it. You can write new data with a heavier weight – the data will be distributed slightly unevenly, but queries will work correctly and efficiently. You should be concerned about the sharding scheme in the following cases: -- Queries are used that require joining data (IN or JOIN) by a specific key. If data is sharded by this key, you can use local IN or JOIN instead of GLOBAL IN or GLOBAL JOIN, which is much more efficient. -- A large number of servers is used (hundreds or more) with a large number of small queries (queries of individual clients - websites, advertisers, or partners). In order for the small queries to not affect the entire cluster, it makes sense to locate data for a single client on a single shard. Alternatively, as we've done in Yandex.Metrica, you can set up bi-level sharding: divide the entire cluster into "layers", where a layer may consist of multiple shards. Data for a single client is located on a single layer, but shards can be added to a layer as necessary, and data is randomly distributed within them. Distributed tables are created for each layer, and a single shared distributed table is created for global queries. +- Queries are used that require joining data (IN or JOIN) by a specific key. If data is sharded by this key, you can use local IN or JOIN instead of GLOBAL IN or GLOBAL JOIN, which is much more efficient.
+- A large number of servers is used (hundreds or more) with a large number of small queries (queries of individual clients - websites, advertisers, or partners). In order for the small queries to not affect the entire cluster, it makes sense to locate data for a single client on a single shard. Alternatively, as we’ve done in Yandex.Metrica, you can set up bi-level sharding: divide the entire cluster into “layers”, where a layer may consist of multiple shards. Data for a single client is located on a single layer, but shards can be added to a layer as necessary, and data is randomly distributed within them. Distributed tables are created for each layer, and a single shared distributed table is created for global queries. -Data is written asynchronously. When inserted in the table, the data block is just written to the local file system. The data is sent to the remote servers in the background as soon as possible. The period for sending data is managed by the [distributed_directory_monitor_sleep_time_ms](../settings/settings.md#distributed_directory_monitor_sleep_time_ms) and [distributed_directory_monitor_max_sleep_time_ms](../settings/settings.md#distributed_directory_monitor_max_sleep_time_ms) settings. The `Distributed` engine sends each file with inserted data separately, but you can enable batch sending of files with the [distributed_directory_monitor_batch_inserts](../settings/settings.md#distributed_directory_monitor_batch_inserts) setting. This setting improves cluster performance by better utilizing local server and network resources. You should check whether data is sent successfully by checking the list of files (data waiting to be sent) in the table directory: `/var/lib/clickhouse/data/database/table/`. +Data is written asynchronously. When inserted in the table, the data block is just written to the local file system. The data is sent to the remote servers in the background as soon as possible. The period for sending data is managed by the [distributed\_directory\_monitor\_sleep\_time\_ms](../settings/settings.md#distributed_directory_monitor_sleep_time_ms) and [distributed\_directory\_monitor\_max\_sleep\_time\_ms](../settings/settings.md#distributed_directory_monitor_max_sleep_time_ms) settings. The `Distributed` engine sends each file with inserted data separately, but you can enable batch sending of files with the [distributed\_directory\_monitor\_batch\_inserts](../settings/settings.md#distributed_directory_monitor_batch_inserts) setting. This setting improves cluster performance by better utilizing local server and network resources. You should check whether data is sent successfully by checking the list of files (data waiting to be sent) in the table directory: `/var/lib/clickhouse/data/database/table/`. -If the server ceased to exist or had a rough restart (for example, after a device failure) after an INSERT to a Distributed table, the inserted data might be lost. If a damaged data part is detected in the table directory, it is transferred to the 'broken' subdirectory and no longer used. +If the server ceased to exist or had a rough restart (for example, after a device failure) after an INSERT to a Distributed table, the inserted data might be lost. If a damaged data part is detected in the table directory, it is transferred to the ‘broken’ subdirectory and no longer used. -When the max_parallel_replicas option is enabled, query processing is parallelized across all replicas within a single shard. 
For more information, see the section [max_parallel_replicas](../settings/settings.md#settings-max_parallel_replicas). +When the max\_parallel\_replicas option is enabled, query processing is parallelized across all replicas within a single shard. For more information, see the section [max\_parallel\_replicas](../settings/settings.md#settings-max_parallel_replicas). -## Virtual Columns +## Virtual Columns {#virtual-columns} -- `_shard_num` — Contains the `shard_num` (from `system.clusters`). Type: [UInt32](../../data_types/int_uint.md). +- `_shard_num` — Contains the `shard_num` (from `system.clusters`). Type: [UInt32](../../data_types/int_uint.md). !!! note "Note" Since [`remote`](../../query_language/table_functions/remote.md)/`cluster` table functions internally create temporary instance of the same Distributed engine, `_shard_num` is available there too. **See Also** -- [Virtual columns](index.md#table_engines-virtual_columns) - +- [Virtual columns](index.md#table_engines-virtual_columns) [Original article](https://clickhouse.tech/docs/en/operations/table_engines/distributed/) diff --git a/docs/en/operations/table_engines/external_data.md b/docs/en/operations/table_engines/external_data.md index 12fc34f516e..e520bb6fe0a 100644 --- a/docs/en/operations/table_engines/external_data.md +++ b/docs/en/operations/table_engines/external_data.md @@ -1,37 +1,36 @@ +# External Data for Query Processing {#external-data-for-query-processing} -# External Data for Query Processing - -ClickHouse allows sending a server the data that is needed for processing a query, together with a SELECT query. This data is put in a temporary table (see the section "Temporary tables") and can be used in the query (for example, in IN operators). +ClickHouse allows sending a server the data that is needed for processing a query, together with a SELECT query. This data is put in a temporary table (see the section “Temporary tables”) and can be used in the query (for example, in IN operators). For example, if you have a text file with important user identifiers, you can upload it to the server along with a query that uses filtration by this list. -If you need to run more than one query with a large volume of external data, don't use this feature. It is better to upload the data to the DB ahead of time. +If you need to run more than one query with a large volume of external data, don’t use this feature. It is better to upload the data to the DB ahead of time. External data can be uploaded using the command-line client (in non-interactive mode), or using the HTTP interface. In the command-line client, you can specify a parameters section in the format -```bash +``` bash --external --file=... [--name=...] [--format=...] [--types=...|--structure=...] ``` You may have multiple sections like this, for the number of tables being transmitted. -**--external** – Marks the beginning of a clause. -**--file** – Path to the file with the table dump, or -, which refers to stdin. +**–external** – Marks the beginning of a clause. +**–file** – Path to the file with the table dump, or -, which refers to stdin. Only a single table can be retrieved from stdin. -The following parameters are optional: **--name**– Name of the table. If omitted, _data is used. -**--format** – Data format in the file. If omitted, TabSeparated is used. +The following parameters are optional: **–name**– Name of the table. If omitted, \_data is used. +**–format** – Data format in the file. If omitted, TabSeparated is used. 
-One of the following parameters is required:**--types** – A list of comma-separated column types. For example: `UInt64,String`. The columns will be named _1, _2, ... -**--structure**– The table structure in the format`UserID UInt64`, `URL String`. Defines the column names and types. +One of the following parameters is required:**–types** – A list of comma-separated column types. For example: `UInt64,String`. The columns will be named \_1, \_2, … +**–structure**– The table structure in the format`UserID UInt64`, `URL String`. Defines the column names and types. -The files specified in 'file' will be parsed by the format specified in 'format', using the data types specified in 'types' or 'structure'. The table will be uploaded to the server and accessible there as a temporary table with the name in 'name'. +The files specified in ‘file’ will be parsed by the format specified in ‘format’, using the data types specified in ‘types’ or ‘structure’. The table will be uploaded to the server and accessible there as a temporary table with the name in ‘name’. Examples: -```bash +``` bash $ echo -ne "1\n2\n3\n" | clickhouse-client --query="SELECT count() FROM test.visits WHERE TraficSourceID IN _data" --external --file=- --types=Int8 849897 $ cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, count() AS c FROM passwd GROUP BY shell ORDER BY c DESC" --external --file=- --name=passwd --structure='login String, unused String, uid UInt16, gid UInt16, comment String, home String, shell String' @@ -42,11 +41,11 @@ $ cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, co /bin/sync 1 ``` -When using the HTTP interface, external data is passed in the multipart/form-data format. Each table is transmitted as a separate file. The table name is taken from the file name. The 'query_string' is passed the parameters 'name_format', 'name_types', and 'name_structure', where 'name' is the name of the table that these parameters correspond to. The meaning of the parameters is the same as when using the command-line client. +When using the HTTP interface, external data is passed in the multipart/form-data format. Each table is transmitted as a separate file. The table name is taken from the file name. The ‘query\_string’ is passed the parameters ‘name\_format’, ‘name\_types’, and ‘name\_structure’, where ‘name’ is the name of the table that these parameters correspond to. The meaning of the parameters is the same as when using the command-line client. Example: -```bash +``` bash $ cat /etc/passwd | sed 's/:/\t/g' > passwd.tsv $ curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+count()+AS+c+FROM+passwd+GROUP+BY+shell+ORDER+BY+c+DESC&passwd_structure=login+String,+unused+String,+uid+UInt16,+gid+UInt16,+comment+String,+home+String,+shell+String' @@ -59,5 +58,4 @@ $ curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+coun For distributed query processing, the temporary tables are sent to all the remote servers. 
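For example, the same mechanism works for a distributed query: a minimal sketch, assuming a Distributed table `dist_hits` with a `UserID` column exists (both names are hypothetical and used only for illustration):

``` bash
# The identifiers are uploaded as the temporary _data table and forwarded to every remote server,
# so the IN filter is evaluated locally on each shard.
$ echo -ne "101\n102\n103\n" | clickhouse-client --query="SELECT count() FROM dist_hits WHERE UserID IN _data" --external --file=- --types=UInt64
```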
- [Original article](https://clickhouse.tech/docs/en/operations/table_engines/external_data/) diff --git a/docs/en/operations/table_engines/file.md b/docs/en/operations/table_engines/file.md index d45eb596ea1..35b74da6eba 100644 --- a/docs/en/operations/table_engines/file.md +++ b/docs/en/operations/table_engines/file.md @@ -1,39 +1,39 @@ -# File {#table_engines-file} +# File {#table-engines-file} The File table engine keeps the data in a file in one of the supported [file formats](../../interfaces/formats.md#formats) (TabSeparated, Native, etc.). Usage examples: -- Data export from ClickHouse to file. -- Convert data from one format to another. -- Updating data in ClickHouse via editing a file on a disk. +- Data export from ClickHouse to file. +- Convert data from one format to another. +- Updating data in ClickHouse via editing a file on a disk. -## Usage in ClickHouse Server +## Usage in ClickHouse Server {#usage-in-clickhouse-server} -```sql +``` sql File(Format) ``` The `Format` parameter specifies one of the available file formats. To perform `SELECT` queries, the format must be supported for input, and to perform -`INSERT` queries -- for output. The available formats are listed in the +`INSERT` queries – for output. The available formats are listed in the [Formats](../../interfaces/formats.md#formats) section. ClickHouse does not allow to specify filesystem path for`File`. It will use folder defined by [path](../server_settings/settings.md) setting in server configuration. -When creating table using `File(Format)` it creates empty subdirectory in that folder. When data is written to that table, it's put into `data.Format` file in that subdirectory. +When creating table using `File(Format)` it creates empty subdirectory in that folder. When data is written to that table, it’s put into `data.Format` file in that subdirectory. You may manually create this subfolder and file in server filesystem and then [ATTACH](../../query_language/misc.md) it to table information with matching name, so you can query data from that file. -!!! warning +!!! warning "Warning" Be careful with this functionality, because ClickHouse does not keep track of external changes to such files. The result of simultaneous writes via ClickHouse and outside of ClickHouse is undefined. **Example:** **1.** Set up the `file_engine_table` table: -```sql +``` sql CREATE TABLE file_engine_table (name String, value UInt32) ENGINE=File(TabSeparated) ``` @@ -41,43 +41,43 @@ By default ClickHouse will create folder `/var/lib/clickhouse/data/default/file_ **2.** Manually create `/var/lib/clickhouse/data/default/file_engine_table/data.TabSeparated` containing: -```bash +``` bash $ cat data.TabSeparated -one 1 -two 2 +one 1 +two 2 ``` **3.** Query the data: -```sql +``` sql SELECT * FROM file_engine_table ``` -```text +``` text ┌─name─┬─value─┐ │ one │ 1 │ │ two │ 2 │ └──────┴───────┘ ``` -## Usage in Clickhouse-local +## Usage in Clickhouse-local {#usage-in-clickhouse-local} In [clickhouse-local](../utils/clickhouse-local.md) File engine accepts file path in addition to `Format`. Default input/output streams can be specified using numeric or human-readable names like `0` or `stdin`, `1` or `stdout`. 
**Example:** -```bash +``` bash $ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin); SELECT a, b FROM table; DROP TABLE table" ``` -## Details of Implementation +## Details of Implementation {#details-of-implementation} -- Multiple `SELECT` queries can be performed concurrently, but `INSERT` queries will wait each other. -- Supported creating new file by `INSERT` query. -- If file exists, `INSERT` would append new values in it. -- Not supported: - - `ALTER` - - `SELECT ... SAMPLE` - - Indices - - Replication +- Multiple `SELECT` queries can be performed concurrently, but `INSERT` queries will wait each other. +- Supported creating new file by `INSERT` query. +- If file exists, `INSERT` would append new values in it. +- Not supported: + - `ALTER` + - `SELECT ... SAMPLE` + - Indices + - Replication [Original article](https://clickhouse.tech/docs/en/operations/table_engines/file/) diff --git a/docs/en/operations/table_engines/generate.md b/docs/en/operations/table_engines/generate.md index fd98b3c9d18..245231b9583 100644 --- a/docs/en/operations/table_engines/generate.md +++ b/docs/en/operations/table_engines/generate.md @@ -1,15 +1,15 @@ -# GenerateRandom {#table_engines-generate} +# GenerateRandom {#table-engines-generate} The GenerateRandom table engine produces random data for given table schema. Usage examples: -- Use in test to populate reproducible large table. -- Generate random input for fuzzing tests. +- Use in test to populate reproducible large table. +- Generate random input for fuzzing tests. -## Usage in ClickHouse Server +## Usage in ClickHouse Server {#usage-in-clickhouse-server} -```sql +``` sql ENGINE = GenerateRandom(random_seed, max_string_length, max_array_length) ``` @@ -24,17 +24,17 @@ It supports all [DataTypes](../../data_types/index.md) that can be stored in a t **1.** Set up the `generate_engine_table` table: -```sql +``` sql CREATE TABLE generate_engine_table (name String, value UInt32) ENGINE = GenerateRandom(1, 5, 3) ``` **2.** Query the data: -```sql +``` sql SELECT * FROM generate_engine_table LIMIT 3 ``` -```text +``` text ┌─name─┬──────value─┐ │ c4xJ │ 1412771199 │ │ r │ 1791099446 │ @@ -42,12 +42,13 @@ SELECT * FROM generate_engine_table LIMIT 3 └──────┴────────────┘ ``` -## Details of Implementation -- Not supported: - - `ALTER` - - `SELECT ... SAMPLE` - - `INSERT` - - Indices - - Replication +## Details of Implementation {#details-of-implementation} + +- Not supported: + - `ALTER` + - `SELECT ... SAMPLE` + - `INSERT` + - Indices + - Replication [Original article](https://clickhouse.tech/docs/en/operations/table_engines/generate/) diff --git a/docs/en/operations/table_engines/graphitemergetree.md b/docs/en/operations/table_engines/graphitemergetree.md index c260acca315..13180b9e0bd 100644 --- a/docs/en/operations/table_engines/graphitemergetree.md +++ b/docs/en/operations/table_engines/graphitemergetree.md @@ -2,13 +2,13 @@ This engine is designed for thinning and aggregating/averaging (rollup) [Graphite](http://graphite.readthedocs.io/en/latest/index.html) data. It may be helpful to developers who want to use ClickHouse as a data store for Graphite. -You can use any ClickHouse table engine to store the Graphite data if you don't need rollup, but if you need a rollup use `GraphiteMergeTree`. The engine reduces the volume of storage and increases the efficiency of queries from Graphite. 
+You can use any ClickHouse table engine to store the Graphite data if you don’t need rollup, but if you need a rollup use `GraphiteMergeTree`. The engine reduces the volume of storage and increases the efficiency of queries from Graphite. The engine inherits properties from [MergeTree](mergetree.md). ## Creating a Table {#creating-table} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( Path String, @@ -27,10 +27,13 @@ See a detailed description of the [CREATE TABLE](../../query_language/create.md# A table for the Graphite data should have the following columns for the following data: -- Metric name (Graphite sensor). Data type: `String`. -- Time of measuring the metric. Data type: `DateTime`. -- Value of the metric. Data type: any numeric. -- Version of the metric. Data type: any numeric. +- Metric name (Graphite sensor). Data type: `String`. + +- Time of measuring the metric. Data type: `DateTime`. + +- Value of the metric. Data type: any numeric. + +- Version of the metric. Data type: any numeric. ClickHouse saves the rows with the highest version or the last written if versions are the same. Other rows are deleted during the merge of data parts. @@ -38,18 +41,20 @@ The names of these columns should be set in the rollup configuration. **GraphiteMergeTree parameters** -- `config_section` — Name of the section in the configuration file, where are the rules of rollup set. +- `config_section` — Name of the section in the configuration file, where are the rules of rollup set. **Query clauses** When creating a `GraphiteMergeTree` table, the same [clauses](mergetree.md#table_engine-mergetree-creating-a-table) are required, as when creating a `MergeTree` table. -
Deprecated Method for Creating a Table +
-!!! attention +Deprecated Method for Creating a Table + +!!! attention "Attention" Do not use this method in new projects and, if possible, switch the old projects to the method described above. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( EventDate Date, @@ -63,32 +68,31 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] All of the parameters excepting `config_section` have the same meaning as in `MergeTree`. -- `config_section` — Name of the section in the configuration file, where are the rules of rollup set. +- `config_section` — Name of the section in the configuration file, where are the rules of rollup set. +
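For contrast with the deprecated form above, a minimal sketch of the current syntax might look as follows (the table name is illustrative, the column names follow the defaults from the rollup configuration below, and a `graphite_rollup` section is assumed to exist in the server configuration):

``` sql
CREATE TABLE graphite_data
(
    Path String,       -- metric name (Graphite sensor)
    Time DateTime,     -- time of measuring the metric
    Value Float64,     -- value of the metric
    Timestamp UInt32   -- version of the metric
)
ENGINE = GraphiteMergeTree('graphite_rollup')
PARTITION BY toYYYYMM(Time)
ORDER BY (Path, Time);
```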
## Rollup configuration {#rollup-configuration} -The settings for rollup are defined by the [graphite_rollup](../server_settings/settings.md#server_settings-graphite_rollup) parameter in the server configuration. The name of the parameter could be any. You can create several configurations and use them for different tables. +The settings for rollup are defined by the [graphite\_rollup](../server_settings/settings.md#server_settings-graphite_rollup) parameter in the server configuration. The name of the parameter could be any. You can create several configurations and use them for different tables. Rollup configuration structure: -``` -required-columns -patterns -``` + required-columns + patterns ### Required Columns {#required-columns} -- `path_column_name` — The name of the column storing the metric name (Graphite sensor). Default value: `Path`. -- `time_column_name` — The name of the column storing the time of measuring the metric. Default value: `Time`. -- `value_column_name` — The name of the column storing the value of the metric at the time set in `time_column_name`. Default value: `Value`. -- `version_column_name` — The name of the column storing the version of the metric. Default value: `Timestamp`. +- `path_column_name` — The name of the column storing the metric name (Graphite sensor). Default value: `Path`. +- `time_column_name` — The name of the column storing the time of measuring the metric. Default value: `Time`. +- `value_column_name` — The name of the column storing the value of the metric at the time set in `time_column_name`. Default value: `Value`. +- `version_column_name` — The name of the column storing the version of the metric. Default value: `Timestamp`. ### Patterns {#patterns} Structure of the `patterns` section: -```text +``` text pattern regexp function @@ -112,23 +116,22 @@ default !!! warning "Attention" Patterns must be strictly ordered: - 1. Patterns without `function` or `retention`. - 1. Patterns with both `function` and `retention`. - 1. Pattern `default`. + 1. Patterns without `function` or `retention`. + 1. Patterns with both `function` and `retention`. + 1. Pattern `default`. When processing a row, ClickHouse checks the rules in the `pattern` sections. Each of `pattern` (including `default`) sections can contain `function` parameter for aggregation, `retention` parameters or both. If the metric name matches the `regexp`, the rules from the `pattern` section (or sections) are applied; otherwise, the rules from the `default` section are used. Fields for `pattern` and `default` sections: -- `regexp`– A pattern for the metric name. -- `age` – The minimum age of the data in seconds. -- `precision`– How precisely to define the age of the data in seconds. Should be a divisor for 86400 (seconds in a day). -- `function` – The name of the aggregating function to apply to data whose age falls within the range `[age, age + precision]`. - +- `regexp`– A pattern for the metric name. +- `age` – The minimum age of the data in seconds. +- `precision`– How precisely to define the age of the data in seconds. Should be a divisor for 86400 (seconds in a day). +- `function` – The name of the aggregating function to apply to data whose age falls within the range `[age, age + precision]`. 
### Configuration Example {#configuration-example} -```xml +``` xml Version diff --git a/docs/en/operations/table_engines/hdfs.md b/docs/en/operations/table_engines/hdfs.md index fb254f11112..b7223731eb0 100644 --- a/docs/en/operations/table_engines/hdfs.md +++ b/docs/en/operations/table_engines/hdfs.md @@ -1,17 +1,18 @@ -# HDFS {#table_engines-hdfs} +# HDFS {#table-engines-hdfs} This engine provides integration with [Apache Hadoop](https://en.wikipedia.org/wiki/Apache_Hadoop) ecosystem by allowing to manage data on [HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)via ClickHouse. This engine is similar to the [File](file.md) and [URL](url.md) engines, but provides Hadoop-specific features. -## Usage +## Usage {#usage} -```sql +``` sql ENGINE = HDFS(URI, format) ``` + The `URI` parameter is the whole file URI in HDFS. The `format` parameter specifies one of the available file formats. To perform `SELECT` queries, the format must be supported for input, and to perform -`INSERT` queries -- for output. The available formats are listed in the +`INSERT` queries – for output. The available formats are listed in the [Formats](../../interfaces/formats.md#formats) section. The path part of `URI` may contain globs. In this case the table would be readonly. @@ -19,94 +20,97 @@ The path part of `URI` may contain globs. In this case the table would be readon **1.** Set up the `hdfs_engine_table` table: -```sql +``` sql CREATE TABLE hdfs_engine_table (name String, value UInt32) ENGINE=HDFS('hdfs://hdfs1:9000/other_storage', 'TSV') ``` **2.** Fill file: -```sql + +``` sql INSERT INTO hdfs_engine_table VALUES ('one', 1), ('two', 2), ('three', 3) ``` **3.** Query the data: -```sql +``` sql SELECT * FROM hdfs_engine_table LIMIT 2 ``` -```text +``` text ┌─name─┬─value─┐ │ one │ 1 │ │ two │ 2 │ └──────┴───────┘ ``` -## Implementation Details +## Implementation Details {#implementation-details} -- Reads and writes can be parallel -- Not supported: - - `ALTER` and `SELECT...SAMPLE` operations. - - Indexes. - - Replication. +- Reads and writes can be parallel +- Not supported: + - `ALTER` and `SELECT...SAMPLE` operations. + - Indexes. + - Replication. **Globs in path** Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern. Listing of files determines during `SELECT` (not at `CREATE` moment). -- `*` — Substitutes any number of any characters except `/` including empty string. -- `?` — Substitutes any single character. -- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. -- `{N..M}` — Substitutes any number in range from N to M including both borders. +- `*` — Substitutes any number of any characters except `/` including empty string. +- `?` — Substitutes any single character. +- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Substitutes any number in range from N to M including both borders. - Constructions with `{}` are similar to the [remote](../../query_language/table_functions/remote.md) table function. +Constructions with `{}` are similar to the [remote](../../query_language/table_functions/remote.md) table function. **Example** -1. Suppose we have several files in TSV format with the following URIs on HDFS: +1. 
Suppose we have several files in TSV format with the following URIs on HDFS: -- 'hdfs://hdfs1:9000/some_dir/some_file_1' -- 'hdfs://hdfs1:9000/some_dir/some_file_2' -- 'hdfs://hdfs1:9000/some_dir/some_file_3' -- 'hdfs://hdfs1:9000/another_dir/some_file_1' -- 'hdfs://hdfs1:9000/another_dir/some_file_2' -- 'hdfs://hdfs1:9000/another_dir/some_file_3' +- ‘hdfs://hdfs1:9000/some\_dir/some\_file\_1’ +- ‘hdfs://hdfs1:9000/some\_dir/some\_file\_2’ +- ‘hdfs://hdfs1:9000/some\_dir/some\_file\_3’ +- ‘hdfs://hdfs1:9000/another\_dir/some\_file\_1’ +- ‘hdfs://hdfs1:9000/another\_dir/some\_file\_2’ +- ‘hdfs://hdfs1:9000/another\_dir/some\_file\_3’ -2. There are several ways to make a table consisting of all six files: +1. There are several ways to make a table consisting of all six files: -```sql + + +``` sql CREATE TABLE table_with_range (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV') ``` Another way: -```sql +``` sql CREATE TABLE table_with_question_mark (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_?', 'TSV') ``` Table consists of all the files in both directories (all files should satisfy format and schema described in query): -```sql +``` sql CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV') ``` -!!! warning +!!! warning "Warning" If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. **Example** -Create table with files named `file000`, `file001`, ... , `file999`: +Create table with files named `file000`, `file001`, … , `file999`: -```sql +``` sql CREARE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV') ``` -## Virtual Columns +## Virtual Columns {#virtual-columns} -- `_path` — Path to the file. -- `_file` — Name of the file. +- `_path` — Path to the file. +- `_file` — Name of the file. **See Also** -- [Virtual columns](https://clickhouse.tech/docs/en/operations/table_engines/#table_engines-virtual_columns) +- [Virtual columns](https://clickhouse.tech/docs/en/operations/table_engines/#table_engines-virtual_columns) [Original article](https://clickhouse.tech/docs/en/operations/table_engines/hdfs/) diff --git a/docs/en/operations/table_engines/index.md b/docs/en/operations/table_engines/index.md index 4ee5eb249e4..919ef387ea5 100644 --- a/docs/en/operations/table_engines/index.md +++ b/docs/en/operations/table_engines/index.md @@ -1,77 +1,77 @@ -# Table engines {#table_engines} +# Table engines {#table-engines} The table engine (type of table) determines: -- How and where data is stored, where to write it to, and where to read it from. -- Which queries are supported, and how. -- Concurrent data access. -- Use of indexes, if present. -- Whether multithreaded request execution is possible. -- Data replication parameters. +- How and where data is stored, where to write it to, and where to read it from. +- Which queries are supported, and how. +- Concurrent data access. +- Use of indexes, if present. +- Whether multithreaded request execution is possible. +- Data replication parameters. -## Engine Families +## Engine Families {#engine-families} -### MergeTree +### MergeTree {#mergetree} -The most universal and functional table engines for high-load tasks. The property shared by these engines is quick data insertion with subsequent background data processing. 
`MergeTree` family engines support data replication (with [Replicated*](replication.md) versions of engines), partitioning, and other features not supported in other engines. +The most universal and functional table engines for high-load tasks. The property shared by these engines is quick data insertion with subsequent background data processing. `MergeTree` family engines support data replication (with [Replicated\*](replication.md) versions of engines), partitioning, and other features not supported in other engines. Engines in the family: -- [MergeTree](mergetree.md) -- [ReplacingMergeTree](replacingmergetree.md) -- [SummingMergeTree](summingmergetree.md) -- [AggregatingMergeTree](aggregatingmergetree.md) -- [CollapsingMergeTree](collapsingmergetree.md) -- [VersionedCollapsingMergeTree](versionedcollapsingmergetree.md) -- [GraphiteMergeTree](graphitemergetree.md) +- [MergeTree](mergetree.md) +- [ReplacingMergeTree](replacingmergetree.md) +- [SummingMergeTree](summingmergetree.md) +- [AggregatingMergeTree](aggregatingmergetree.md) +- [CollapsingMergeTree](collapsingmergetree.md) +- [VersionedCollapsingMergeTree](versionedcollapsingmergetree.md) +- [GraphiteMergeTree](graphitemergetree.md) -### Log +### Log {#log} -Lightweight [engines](log_family.md) with minimum functionality. They're the most effective when you need to quickly write many small tables (up to approximately 1 million rows) and read them later as a whole. +Lightweight [engines](log_family.md) with minimum functionality. They’re the most effective when you need to quickly write many small tables (up to approximately 1 million rows) and read them later as a whole. Engines in the family: -- [TinyLog](tinylog.md) -- [StripeLog](stripelog.md) -- [Log](log.md) +- [TinyLog](tinylog.md) +- [StripeLog](stripelog.md) +- [Log](log.md) -### Intergation engines +### Integration engines {#integration-engines} Engines for communicating with other data storage and processing systems. Engines in the family: -- [Kafka](kafka.md) -- [MySQL](mysql.md) -- [ODBC](odbc.md) -- [JDBC](jdbc.md) -- [HDFS](hdfs.md) +- [Kafka](kafka.md) +- [MySQL](mysql.md) +- [ODBC](odbc.md) +- [JDBC](jdbc.md) +- [HDFS](hdfs.md) -### Special engines +### Special engines {#special-engines} Engines in the family: -- [Distributed](distributed.md) -- [MaterializedView](materializedview.md) -- [Dictionary](dictionary.md) -- [Merge](merge.md) -- [File](file.md) -- [Null](null.md) -- [Set](set.md) -- [Join](join.md) -- [URL](url.md) -- [View](view.md) -- [Memory](memory.md) -- [Buffer](buffer.md) +- [Distributed](distributed.md) +- [MaterializedView](materializedview.md) +- [Dictionary](dictionary.md) +- [Merge](merge.md) +- [File](file.md) +- [Null](null.md) +- [Set](set.md) +- [Join](join.md) +- [URL](url.md) +- [View](view.md) +- [Memory](memory.md) +- [Buffer](buffer.md) -## Virtual columns {#table_engines-virtual_columns} +## Virtual columns {#table-engines-virtual-columns} Virtual column is an integral table engine attribute that is defined in the engine source code. -You shouldn't specify virtual columns in the `CREATE TABLE` query and you can't see them in `SHOW CREATE TABLE` and `DESCRIBE TABLE` query results. Virtual columns are also read-only, so you can't insert data into virtual columns. +You shouldn’t specify virtual columns in the `CREATE TABLE` query and you can’t see them in `SHOW CREATE TABLE` and `DESCRIBE TABLE` query results. Virtual columns are also read-only, so you can’t insert data into virtual columns.
-To select data from a virtual column, you must specify its name in the `SELECT` query. `SELECT *` doesn't return values from virtual columns. +To select data from a virtual column, you must specify its name in the `SELECT` query. `SELECT *` doesn’t return values from virtual columns. -If you create a table with a column that has the same name as one of the table virtual columns, the virtual column becomes inaccessible. We don't recommend doing this. To help avoid conflicts, virtual column names are usually prefixed with an underscore. +If you create a table with a column that has the same name as one of the table virtual columns, the virtual column becomes inaccessible. We don’t recommend doing this. To help avoid conflicts, virtual column names are usually prefixed with an underscore. [Original article](https://clickhouse.tech/docs/en/operations/table_engines/) diff --git a/docs/en/operations/table_engines/jdbc.md b/docs/en/operations/table_engines/jdbc.md index 2f87cd7794e..3deed862850 100644 --- a/docs/en/operations/table_engines/jdbc.md +++ b/docs/en/operations/table_engines/jdbc.md @@ -1,4 +1,4 @@ -# JDBC {#table_engine-jdbc} +# JDBC {#table-engine-jdbc} Allows ClickHouse to connect to external databases via [JDBC](https://en.wikipedia.org/wiki/Java_Database_Connectivity). @@ -6,9 +6,9 @@ To implement the JDBC connection, ClickHouse uses the separate program [clickhou This engine supports the [Nullable](../../data_types/nullable.md) data type. -## Creating a Table +## Creating a Table {#creating-a-table} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name ( columns list... @@ -18,19 +18,20 @@ ENGINE = JDBC(dbms_uri, external_database, external_table) **Engine Parameters** -- `dbms_uri` — URI of an external DBMS. +- `dbms_uri` — URI of an external DBMS. Format: `jdbc:://:/?user=&password=`. Example for MySQL: `jdbc:mysql://localhost:3306/?user=root&password=root`. -- `external_database` — Database in an external DBMS. -- `external_table` — Name of the table in `external_database`. +- `external_database` — Database in an external DBMS. -## Usage Example +- `external_table` — Name of the table in `external_database`. -Creating a table in MySQL server by connecting directly with it's console client: +## Usage Example {#usage-example} -```text +Creating a table in MySQL server by connecting directly with it’s console client: + +``` text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, @@ -53,7 +54,7 @@ mysql> select * from test; Creating a table in ClickHouse server and selecting data from it: -```sql +``` sql CREATE TABLE jdbc_table ( `int_id` Int32, @@ -63,19 +64,20 @@ CREATE TABLE jdbc_table ) ENGINE JDBC('jdbc:mysql://localhost:3306/?user=root&password=root', 'test', 'test') ``` -```sql + +``` sql SELECT * FROM jdbc_table ``` -```text + +``` text ┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ │ 1 │ ᴺᵁᴸᴸ │ 2 │ ᴺᵁᴸᴸ │ └────────┴──────────────┴───────┴────────────────┘ - ``` -## See Also +## See Also {#see-also} -- [JDBC table function](../../query_language/table_functions/jdbc.md). +- [JDBC table function](../../query_language/table_functions/jdbc.md). 
[Original article](https://clickhouse.tech/docs/en/operations/table_engines/jdbc/) diff --git a/docs/en/operations/table_engines/join.md b/docs/en/operations/table_engines/join.md index 4b74ac5b2af..5d902519fde 100644 --- a/docs/en/operations/table_engines/join.md +++ b/docs/en/operations/table_engines/join.md @@ -1,10 +1,10 @@ -# Join +# Join {#join} Prepared data structure for using in [JOIN](../../query_language/select.md#select-join) operations. ## Creating a Table {#creating-a-table} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], @@ -16,40 +16,43 @@ See the detailed description of the [CREATE TABLE](../../query_language/create.m **Engine Parameters** -- `join_strictness` – [JOIN strictness](../../query_language/select.md#select-join-strictness). -- `join_type` – [JOIN type](../../query_language/select.md#select-join-types). -- `k1[, k2, ...]` – Key columns from the `USING` clause that the `JOIN` operation is made with. +- `join_strictness` – [JOIN strictness](../../query_language/select.md#select-join-strictness). +- `join_type` – [JOIN type](../../query_language/select.md#select-join-types). +- `k1[, k2, ...]` – Key columns from the `USING` clause that the `JOIN` operation is made with. -Enter `join_strictness` and `join_type` parameters without quotes, for example, `Join(ANY, LEFT, col1)`. They must match the `JOIN` operation that the table will be used for. If the parameters don't match, ClickHouse doesn't throw an exception and may return incorrect data. +Enter `join_strictness` and `join_type` parameters without quotes, for example, `Join(ANY, LEFT, col1)`. They must match the `JOIN` operation that the table will be used for. If the parameters don’t match, ClickHouse doesn’t throw an exception and may return incorrect data. -## Table Usage +## Table Usage {#table-usage} -### Example +### Example {#example} Creating the left-side table: -```sql +``` sql CREATE TABLE id_val(`id` UInt32, `val` UInt32) ENGINE = TinyLog ``` -```sql + +``` sql INSERT INTO id_val VALUES (1,11)(2,12)(3,13) ``` Creating the right-side `Join` table: -```sql +``` sql CREATE TABLE id_val_join(`id` UInt32, `val` UInt8) ENGINE = Join(ANY, LEFT, id) ``` -```sql + +``` sql INSERT INTO id_val_join VALUES (1,21)(1,22)(3,23) ``` Joining the tables: -```sql +``` sql SELECT * FROM id_val ANY LEFT JOIN id_val_join USING (id) SETTINGS join_use_nulls = 1 ``` -```text + +``` text ┌─id─┬─val─┬─id_val_join.val─┐ │ 1 │ 11 │ 21 │ │ 2 │ 12 │ ᴺᵁᴸᴸ │ @@ -59,39 +62,40 @@ SELECT * FROM id_val ANY LEFT JOIN id_val_join USING (id) SETTINGS join_use_null As an alternative, you can retrieve data from the `Join` table, specifying the join key value: -```sql +``` sql SELECT joinGet('id_val_join', 'val', toUInt32(1)) ``` -```text + +``` text ┌─joinGet('id_val_join', 'val', toUInt32(1))─┐ │ 21 │ └────────────────────────────────────────────┘ ``` -### Selecting and Inserting Data +### Selecting and Inserting Data {#selecting-and-inserting-data} You can use `INSERT` queries to add data to the `Join`-engine tables. If the table was created with the `ANY` strictness, data for duplicate keys are ignored. With the `ALL` strictness, all rows are added. You cannot perform a `SELECT` query directly from the table. Instead, use one of the following methods: -- Place the table to the right side in a `JOIN` clause. 
-- Call the [joinGet](../../query_language/functions/other_functions.md#joinget) function, which lets you extract data from the table the same way as from a dictionary.
+- Place the table to the right side in a `JOIN` clause.
+- Call the [joinGet](../../query_language/functions/other_functions.md#joinget) function, which lets you extract data from the table the same way as from a dictionary.

### Limitations and Settings {#join-limitations-and-settings}

When creating a table, the following settings are applied:

-- [join_use_nulls](../settings/settings.md#join_use_nulls)
-- [max_rows_in_join](../settings/query_complexity.md#settings-max_rows_in_join)
-- [max_bytes_in_join](../settings/query_complexity.md#settings-max_bytes_in_join)
-- [join_overflow_mode](../settings/query_complexity.md#settings-join_overflow_mode)
-- [join_any_take_last_row](../settings/settings.md#settings-join_any_take_last_row)
+- [join\_use\_nulls](../settings/settings.md#join_use_nulls)
+- [max\_rows\_in\_join](../settings/query_complexity.md#settings-max_rows_in_join)
+- [max\_bytes\_in\_join](../settings/query_complexity.md#settings-max_bytes_in_join)
+- [join\_overflow\_mode](../settings/query_complexity.md#settings-join_overflow_mode)
+- [join\_any\_take\_last\_row](../settings/settings.md#settings-join_any_take_last_row)

-The `Join`-engine tables can't be used in `GLOBAL JOIN` operations.
+The `Join`-engine tables can’t be used in `GLOBAL JOIN` operations.

-The `Join`-engine allows use [join_use_nulls](../settings/settings.md#join_use_nulls) setting in the `CREATE TABLE` statement. And [SELECT](../../query_language/select.md) query allows use `join_use_nulls` too. If you have different `join_use_nulls` settings, you can get an error joining table. It depends on kind of JOIN. When you use [joinGet](../../query_language/functions/other_functions.md#joinget) function, you have to use the same `join_use_nulls` setting in `CRATE TABLE` and `SELECT` statements.
+The `Join` engine allows using the [join\_use\_nulls](../settings/settings.md#join_use_nulls) setting in the `CREATE TABLE` statement, and the [SELECT](../../query_language/select.md) query accepts `join_use_nulls` as well. If the `join_use_nulls` values differ, joining the table can return an error, depending on the kind of `JOIN`. When you use the [joinGet](../../query_language/functions/other_functions.md#joinget) function, use the same `join_use_nulls` setting in the `CREATE TABLE` and `SELECT` statements.

-## Data Storage
+## Data Storage {#data-storage}

`Join` table data is always located in the RAM. When inserting rows into a table, ClickHouse writes data blocks to the directory on the disk so that they can be restored when the server restarts.

diff --git a/docs/en/operations/table_engines/kafka.md b/docs/en/operations/table_engines/kafka.md
index 5731d68a61d..cf7b095524b 100644
--- a/docs/en/operations/table_engines/kafka.md
+++ b/docs/en/operations/table_engines/kafka.md
@@ -1,17 +1,16 @@
-# Kafka
+# Kafka {#kafka}

This engine works with [Apache Kafka](http://kafka.apache.org/).

Kafka lets you:

-- Publish or subscribe to data flows.
-- Organize fault-tolerant storage.
-- Process streams as they become available.
+## Creating a Table {#table-engine-kafka-creating-a-table} -## Creating a Table {#table_engine-kafka-creating-a-table} - -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -28,23 +27,24 @@ SETTINGS [kafka_num_consumers = N,] [kafka_skip_broken_messages = N] ``` + Required parameters: -- `kafka_broker_list` – A comma-separated list of brokers (for example, `localhost:9092`). -- `kafka_topic_list` – A list of Kafka topics. -- `kafka_group_name` – A group of Kafka consumers. Reading margins are tracked for each group separately. If you don't want messages to be duplicated in the cluster, use the same group name everywhere. -- `kafka_format` – Message format. Uses the same notation as the SQL `FORMAT` function, such as ` JSONEachRow`. For more information, see the [Formats](../../interfaces/formats.md) section. +- `kafka_broker_list` – A comma-separated list of brokers (for example, `localhost:9092`). +- `kafka_topic_list` – A list of Kafka topics. +- `kafka_group_name` – A group of Kafka consumers. Reading margins are tracked for each group separately. If you don’t want messages to be duplicated in the cluster, use the same group name everywhere. +- `kafka_format` – Message format. Uses the same notation as the SQL `FORMAT` function, such as `JSONEachRow`. For more information, see the [Formats](../../interfaces/formats.md) section. Optional parameters: -- `kafka_row_delimiter` – Delimiter character, which ends the message. -- `kafka_schema` – Parameter that must be used if the format requires a schema definition. For example, [Cap'n Proto](https://capnproto.org/) requires the path to the schema file and the name of the root `schema.capnp:Message` object. -- `kafka_num_consumers` – The number of consumers per table. Default: `1`. Specify more consumers if the throughput of one consumer is insufficient. The total number of consumers should not exceed the number of partitions in the topic, since only one consumer can be assigned per partition. -- `kafka_skip_broken_messages` – Kafka message parser tolerance to schema-incompatible messages per block. Default: `0`. If `kafka_skip_broken_messages = N` then the engine skips *N* Kafka messages that cannot be parsed (a message equals a row of data). +- `kafka_row_delimiter` – Delimiter character, which ends the message. +- `kafka_schema` – Parameter that must be used if the format requires a schema definition. For example, [Cap’n Proto](https://capnproto.org/) requires the path to the schema file and the name of the root `schema.capnp:Message` object. +- `kafka_num_consumers` – The number of consumers per table. Default: `1`. Specify more consumers if the throughput of one consumer is insufficient. The total number of consumers should not exceed the number of partitions in the topic, since only one consumer can be assigned per partition. +- `kafka_skip_broken_messages` – Kafka message parser tolerance to schema-incompatible messages per block. Default: `0`. If `kafka_skip_broken_messages = N` then the engine skips *N* Kafka messages that cannot be parsed (a message equals a row of data). Examples: -```sql +``` sql CREATE TABLE queue ( timestamp UInt64, level String, @@ -72,39 +72,38 @@ Examples: kafka_num_consumers = 4; ``` +
-
Deprecated Method for Creating a Table +Deprecated Method for Creating a Table -!!! attention +!!! attention "Attention" Do not use this method in new projects. If possible, switch old projects to the method described above. - -```sql +``` sql Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format [, kafka_row_delimiter, kafka_schema, kafka_num_consumers, kafka_skip_broken_messages]) ```
-## Description - +## Description {#description} The delivered messages are tracked automatically, so each message in a group is only counted once. If you want to get the data twice, then create a copy of the table with another group name. -Groups are flexible and synced on the cluster. For instance, if you have 10 topics and 5 copies of a table in a cluster, then each copy gets 2 topics. If the number of copies changes, the topics are redistributed across the copies automatically. Read more about this at [http://kafka.apache.org/intro](http://kafka.apache.org/intro). +Groups are flexible and synced on the cluster. For instance, if you have 10 topics and 5 copies of a table in a cluster, then each copy gets 2 topics. If the number of copies changes, the topics are redistributed across the copies automatically. Read more about this at http://kafka.apache.org/intro. `SELECT` is not particularly useful for reading messages (except for debugging), because each message can be read only once. It is more practical to create real-time threads using materialized views. To do this: -1. Use the engine to create a Kafka consumer and consider it a data stream. -2. Create a table with the desired structure. -3. Create a materialized view that converts data from the engine and puts it into a previously created table. +1. Use the engine to create a Kafka consumer and consider it a data stream. +2. Create a table with the desired structure. +3. Create a materialized view that converts data from the engine and puts it into a previously created table. When the `MATERIALIZED VIEW` joins the engine, it starts collecting data in the background. This allows you to continually receive messages from Kafka and convert them to the required format using `SELECT`. One kafka table can have as many materialized views as you like, they do not read data from the kafka table directly, but receive new records (in blocks), this way you can write to several tables with different detail level (with grouping - aggregation and without). Example: -```sql +``` sql CREATE TABLE queue ( timestamp UInt64, level String, @@ -124,22 +123,22 @@ Example: SELECT level, sum(total) FROM daily GROUP BY level; ``` -To improve performance, received messages are grouped into blocks the size of [max_insert_block_size](../settings/settings.md#settings-max_insert_block_size). If the block wasn't formed within [stream_flush_interval_ms](../settings/settings.md) milliseconds, the data will be flushed to the table regardless of the completeness of the block. +To improve performance, received messages are grouped into blocks the size of [max\_insert\_block\_size](../settings/settings.md#settings-max_insert_block_size). If the block wasn’t formed within [stream\_flush\_interval\_ms](../settings/settings.md) milliseconds, the data will be flushed to the table regardless of the completeness of the block. To stop receiving topic data or to change the conversion logic, detach the materialized view: -```sql +``` sql DETACH TABLE consumer; ATTACH MATERIALIZED VIEW consumer; ``` If you want to change the target table by using `ALTER`, we recommend disabling the material view to avoid discrepancies between the target table and the data from the view. -## Configuration +## Configuration {#configuration} Similar to GraphiteMergeTree, the Kafka engine supports extended configuration using the ClickHouse config file. There are two configuration keys that you can use: global (`kafka`) and topic-level (`kafka_*`). 
The global configuration is applied first, and then the topic-level configuration is applied (if it exists). -```xml +``` xml cgrp @@ -155,17 +154,16 @@ Similar to GraphiteMergeTree, the Kafka engine supports extended configuration u For a list of possible configuration options, see the [librdkafka configuration reference](https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md). Use the underscore (`_`) instead of a dot in the ClickHouse configuration. For example, `check.crcs=true` will be `true`. -## Virtual Columns +## Virtual Columns {#virtual-columns} -- `_topic` — Kafka topic. -- `_key` — Key of the message. -- `_offset` — Offset of the message. -- `_timestamp` — Timestamp of the message. -- `_partition` — Partition of Kafka topic. +- `_topic` — Kafka topic. +- `_key` — Key of the message. +- `_offset` — Offset of the message. +- `_timestamp` — Timestamp of the message. +- `_partition` — Partition of Kafka topic. **See Also** -- [Virtual columns](index.md#table_engines-virtual_columns) - +- [Virtual columns](index.md#table_engines-virtual_columns) [Original article](https://clickhouse.tech/docs/en/operations/table_engines/kafka/) diff --git a/docs/en/operations/table_engines/log.md b/docs/en/operations/table_engines/log.md index 4e7b4703f42..5e10b68f63e 100644 --- a/docs/en/operations/table_engines/log.md +++ b/docs/en/operations/table_engines/log.md @@ -1,11 +1,9 @@ -# Log +# Log {#log} Engine belongs to the family of log engines. See the common properties of log engines and their differences in the [Log Engine Family](log_family.md) article. - -Log differs from [TinyLog](tinylog.md) in that a small file of "marks" resides with the column files. These marks are written on every data block and contain offsets that indicate where to start reading the file in order to skip the specified number of rows. This makes it possible to read table data in multiple threads. +Log differs from [TinyLog](tinylog.md) in that a small file of “marks” resides with the column files. These marks are written on every data block and contain offsets that indicate where to start reading the file in order to skip the specified number of rows. This makes it possible to read table data in multiple threads. For concurrent data access, the read operations can be performed simultaneously, while write operations block reads and each other. The Log engine does not support indexes. Similarly, if writing to a table failed, the table is broken, and reading from it returns an error. The Log engine is appropriate for temporary data, write-once tables, and for testing or demonstration purposes. - [Original article](https://clickhouse.tech/docs/en/operations/table_engines/log/) diff --git a/docs/en/operations/table_engines/log_family.md b/docs/en/operations/table_engines/log_family.md index 68ea24542bb..8065c68f705 100644 --- a/docs/en/operations/table_engines/log_family.md +++ b/docs/en/operations/table_engines/log_family.md @@ -1,38 +1,39 @@ -# Log Engine Family +# Log Engine Family {#log-engine-family} These engines were developed for scenarios when you need to quickly write many small tables (up to about 1 million rows) and read them later as a whole. Engines of the family: -- [StripeLog](stripelog.md) -- [Log](log.md) -- [TinyLog](tinylog.md) +- [StripeLog](stripelog.md) +- [Log](log.md) +- [TinyLog](tinylog.md) -## Common properties +## Common properties {#common-properties} Engines: -- Store data on a disk. -- Append data to the end of file when writing. -- Support locks for concurrent data access. 
- +- Store data on a disk. + +- Append data to the end of file when writing. + +- Support locks for concurrent data access. + During `INSERT` queries, the table is locked, and other queries for reading and writing data both wait for the table to unlock. If there are no data writing queries, any number of data reading queries can be performed concurrently. -- Do not support [mutation](../../query_language/alter.md#alter-mutations) operations. -- Do not support indexes. +- Do not support [mutation](../../query_language/alter.md#alter-mutations) operations. + +- Do not support indexes. This means that `SELECT` queries for ranges of data are not efficient. -- Do not write data atomically. +- Do not write data atomically. You can get a table with corrupted data if something breaks the write operation, for example, abnormal server shutdown. +## Differences {#differences} -## Differences - -The `TinyLog` engine is the simplest in the family and provides the poorest functionality and lowest efficiency. The `TinyLog` engine doesn't support parallel data reading by several threads. It reads data slower than other engines in the family that support parallel reading and it uses almost as many descriptors as the `Log` engine because it stores each column in a separate file. Use it in simple low-load scenarios. +The `TinyLog` engine is the simplest in the family and provides the poorest functionality and lowest efficiency. The `TinyLog` engine doesn’t support parallel data reading by several threads. It reads data slower than other engines in the family that support parallel reading and it uses almost as many descriptors as the `Log` engine because it stores each column in a separate file. Use it in simple low-load scenarios. The `Log` and `StripeLog` engines support parallel data reading. When reading data, ClickHouse uses multiple threads. Each thread processes a separate data block. The `Log` engine uses a separate file for each column of the table. `StripeLog` stores all the data in one file. As a result, the `StripeLog` engine uses fewer descriptors in the operating system, but the `Log` engine provides higher efficiency when reading data. - [Original article](https://clickhouse.tech/docs/en/operations/table_engines/log_family/) diff --git a/docs/en/operations/table_engines/materializedview.md b/docs/en/operations/table_engines/materializedview.md index bc4d2288542..c12b9512c79 100644 --- a/docs/en/operations/table_engines/materializedview.md +++ b/docs/en/operations/table_engines/materializedview.md @@ -1,6 +1,5 @@ -# MaterializedView +# MaterializedView {#materializedview} Used for implementing materialized views (for more information, see [CREATE TABLE](../../query_language/create.md)). For storing data, it uses a different engine that was specified when creating the view. When reading from a table, it just uses this engine. - [Original article](https://clickhouse.tech/docs/en/operations/table_engines/materializedview/) diff --git a/docs/en/operations/table_engines/memory.md b/docs/en/operations/table_engines/memory.md index dded3ca8d41..b490eb1e08f 100644 --- a/docs/en/operations/table_engines/memory.md +++ b/docs/en/operations/table_engines/memory.md @@ -1,13 +1,12 @@ -# Memory +# Memory {#memory} The Memory engine stores data in RAM, in uncompressed form. Data is stored in exactly the same form as it is received when read. In other words, reading from this table is completely free. -Concurrent data access is synchronized. Locks are short: read and write operations don't block each other. 
+Concurrent data access is synchronized. Locks are short: read and write operations don’t block each other. Indexes are not supported. Reading is parallelized. Maximal productivity (over 10 GB/sec) is reached on simple queries, because there is no reading from the disk, decompressing, or deserializing data. (We should note that in many cases, the productivity of the MergeTree engine is almost as high.) When restarting a server, data disappears from the table and the table becomes empty. Normally, using this table engine is not justified. However, it can be used for tests, and for tasks where maximum speed is required on a relatively small number of rows (up to approximately 100,000,000). -The Memory engine is used by the system for temporary tables with external query data (see the section "External data for processing a query"), and for implementing GLOBAL IN (see the section "IN operators"). - +The Memory engine is used by the system for temporary tables with external query data (see the section “External data for processing a query”), and for implementing GLOBAL IN (see the section “IN operators”). [Original article](https://clickhouse.tech/docs/en/operations/table_engines/memory/) diff --git a/docs/en/operations/table_engines/merge.md b/docs/en/operations/table_engines/merge.md index e081963cc87..04554057fff 100644 --- a/docs/en/operations/table_engines/merge.md +++ b/docs/en/operations/table_engines/merge.md @@ -1,4 +1,4 @@ -# Merge +# Merge {#merge} The `Merge` engine (not to be confused with `MergeTree`) does not store data itself, but allows reading from any number of other tables simultaneously. Reading is automatically parallelized. Writing to a table is not supported. When reading, the indexes of tables that are actually being read are used, if they exist. @@ -6,27 +6,27 @@ The `Merge` engine accepts parameters: the database name and a regular expressio Example: -```sql +``` sql Merge(hits, '^WatchLog') ``` -Data will be read from the tables in the `hits` database that have names that match the regular expression '`^WatchLog`'. +Data will be read from the tables in the `hits` database that have names that match the regular expression ‘`^WatchLog`’. Instead of the database name, you can use a constant expression that returns a string. For example, `currentDatabase()`. Regular expressions — [re2](https://github.com/google/re2) (supports a subset of PCRE), case-sensitive. -See the notes about escaping symbols in regular expressions in the "match" section. +See the notes about escaping symbols in regular expressions in the “match” section. When selecting tables to read, the `Merge` table itself will not be selected, even if it matches the regex. This is to avoid loops. -It is possible to create two `Merge` tables that will endlessly try to read each others' data, but this is not a good idea. +It is possible to create two `Merge` tables that will endlessly try to read each others’ data, but this is not a good idea. The typical way to use the `Merge` engine is for working with a large number of `TinyLog` tables as if with a single table. Example 2: -Let's say you have a old table (WatchLog_old) and decided to change partitioning without moving data to a new table (WatchLog_new) and you need to see data from both tables. +Let’s say you have a old table (WatchLog\_old) and decided to change partitioning without moving data to a new table (WatchLog\_new) and you need to see data from both tables. 
-```sql +``` sql CREATE TABLE WatchLog_old(date Date, UserId Int64, EventType String, Cnt UInt64) ENGINE=MergeTree(date, (UserId, EventType), 8192); INSERT INTO WatchLog_old VALUES ('2018-01-01', 1, 'hit', 3); @@ -40,7 +40,8 @@ CREATE TABLE WatchLog as WatchLog_old ENGINE=Merge(currentDatabase(), '^WatchLog SELECT * FROM WatchLog ``` -```text + +``` text ┌───────date─┬─UserId─┬─EventType─┬─Cnt─┐ │ 2018-01-01 │ 1 │ hit │ 3 │ └────────────┴────────┴───────────┴─────┘ @@ -49,15 +50,14 @@ FROM WatchLog └────────────┴────────┴───────────┴─────┘ ``` -## Virtual Columns +## Virtual Columns {#virtual-columns} -- `_table` — Contains the name of the table from which data was read. Type: [String](../../data_types/string.md). +- `_table` — Contains the name of the table from which data was read. Type: [String](../../data_types/string.md). You can set the constant conditions on `_table` in the `WHERE/PREWHERE` clause (for example, `WHERE _table='xyz'`). In this case the read operation is performed only for that tables where the condition on `_table` is satisfied, so the `_table` column acts as an index. **See Also** -- [Virtual columns](index.md#table_engines-virtual_columns) - +- [Virtual columns](index.md#table_engines-virtual_columns) [Original article](https://clickhouse.tech/docs/en/operations/table_engines/merge/) diff --git a/docs/en/operations/table_engines/mergetree.md b/docs/en/operations/table_engines/mergetree.md index e80de9def1c..3f9f0ef2492 100644 --- a/docs/en/operations/table_engines/mergetree.md +++ b/docs/en/operations/table_engines/mergetree.md @@ -1,4 +1,4 @@ -# MergeTree {#table_engines-mergetree} +# MergeTree {#table-engines-mergetree} The `MergeTree` engine and other engines of this family (`*MergeTree`) are the most robust ClickHouse table engines. @@ -6,29 +6,28 @@ Engines in the `MergeTree` family are designed for inserting a very large amount Main features: -- Stores data sorted by primary key. +- Stores data sorted by primary key. This allows you to create a small sparse index that helps find data faster. -- Partitions can be used if the [partitioning key](custom_partitioning_key.md) is specified. +- Partitions can be used if the [partitioning key](custom_partitioning_key.md) is specified. ClickHouse supports certain operations with partitions that are more effective than general operations on the same data with the same result. ClickHouse also automatically cuts off the partition data where the partitioning key is specified in the query. This also improves query performance. -- Data replication support. +- Data replication support. The family of `ReplicatedMergeTree` tables provides data replication. For more information, see [Data replication](replication.md). -- Data sampling support. +- Data sampling support. If necessary, you can set the data sampling method in the table. -!!! info +!!! info "Info" The [Merge](merge.md) engine does not belong to the `*MergeTree` family. +## Creating a Table {#table-engine-mergetree-creating-a-table} -## Creating a Table {#table_engine-mergetree-creating-a-table} - -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], @@ -47,30 +46,30 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] For a description of parameters, see the [CREATE query description](../../query_language/create.md). -!!!note "Note" +!!! note "Note" `INDEX` is an experimental feature, see [Data Skipping Indexes](#table_engine-mergetree-data_skipping-indexes). 
### Query Clauses {#mergetree-query-clauses} -- `ENGINE` — Name and parameters of the engine. `ENGINE = MergeTree()`. The `MergeTree` engine does not have parameters. +- `ENGINE` — Name and parameters of the engine. `ENGINE = MergeTree()`. The `MergeTree` engine does not have parameters. -- `PARTITION BY` — The [partitioning key](custom_partitioning_key.md). +- `PARTITION BY` — The [partitioning key](custom_partitioning_key.md). For partitioning by month, use the `toYYYYMM(date_column)` expression, where `date_column` is a column with a date of the type [Date](../../data_types/date.md). The partition names here have the `"YYYYMM"` format. -- `ORDER BY` — The sorting key. +- `ORDER BY` — The sorting key. A tuple of columns or arbitrary expressions. Example: `ORDER BY (CounterID, EventDate)`. -- `PRIMARY KEY` — The primary key if it [differs from the sorting key](mergetree.md). +- `PRIMARY KEY` — The primary key if it [differs from the sorting key](mergetree.md). By default the primary key is the same as the sorting key (which is specified by the `ORDER BY` clause). Thus in most cases it is unnecessary to specify a separate `PRIMARY KEY` clause. -- `SAMPLE BY` — An expression for sampling. +- `SAMPLE BY` — An expression for sampling. If a sampling expression is used, the primary key must contain it. Example: `SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID))`. -- `TTL` — A list of rules specifying storage duration of rows and defining logic of automatic parts movement [between disks and volumes](#table_engine-mergetree-multiple-volumes). +- `TTL` — A list of rules specifying storage duration of rows and defining logic of automatic parts movement [between disks and volumes](#table_engine-mergetree-multiple-volumes). Expression must have one `Date` or `DateTime` column as a result. Example: `TTL date + INTERVAL 1 DAY` @@ -79,22 +78,22 @@ For a description of parameters, see the [CREATE query description](../../query_ For more details, see [TTL for columns and tables](#table_engine-mergetree-ttl) -- `SETTINGS` — Additional parameters that control the behavior of the `MergeTree`: - - `index_granularity` — Maximum number of data rows between the marks of an index. Default value: 8192. See [Data Storage](#mergetree-data-storage). - - `index_granularity_bytes` — Maximum size of data granules in bytes. Default value: 10Mb. To restrict the granule size only by number of rows, set to 0 (not recommended). See [Data Storage](#mergetree-data-storage). - - `enable_mixed_granularity_parts` — Enables or disables transitioning to control the granule size with the `index_granularity_bytes` setting. Before version 19.11, there was only the `index_granularity` setting for restricting granule size. The `index_granularity_bytes` setting improves ClickHouse performance when selecting data from tables with big rows (tens and hundreds of megabytes). If you have tables with big rows, you can enable this setting for the tables to improve the efficiency of `SELECT` queries. - - `use_minimalistic_part_header_in_zookeeper` — Storage method of the data parts headers in ZooKeeper. If `use_minimalistic_part_header_in_zookeeper=1`, then ZooKeeper stores less data. For more information, see the [setting description](../server_settings/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) in "Server configuration parameters". - - `min_merge_bytes_to_use_direct_io` — The minimum data volume for merge operation that is required for using direct I/O access to the storage disk. 
When merging data parts, ClickHouse calculates the total storage volume of all the data to be merged. If the volume exceeds `min_merge_bytes_to_use_direct_io` bytes, ClickHouse reads and writes the data to the storage disk using the direct I/O interface (`O_DIRECT` option). If `min_merge_bytes_to_use_direct_io = 0`, then direct I/O is disabled. Default value: `10 * 1024 * 1024 * 1024` bytes. - - - `merge_with_ttl_timeout` — Minimum delay in seconds before repeating a merge with TTL. Default value: 86400 (1 day). - - `write_final_mark` — Enables or disables writing the final index mark at the end of data part (after the last byte). Default value: 1. Don't turn it off. - - `merge_max_block_size` — Maximum number of rows in block for merge operations. Default value: 8192. - - `storage_policy` — Storage policy. See [Using Multiple Block Devices for Data Storage](#table_engine-mergetree-multiple-volumes). +- `SETTINGS` — Additional parameters that control the behavior of the `MergeTree`: + - `index_granularity` — Maximum number of data rows between the marks of an index. Default value: 8192. See [Data Storage](#mergetree-data-storage). + - `index_granularity_bytes` — Maximum size of data granules in bytes. Default value: 10Mb. To restrict the granule size only by number of rows, set to 0 (not recommended). See [Data Storage](#mergetree-data-storage). + - `enable_mixed_granularity_parts` — Enables or disables transitioning to control the granule size with the `index_granularity_bytes` setting. Before version 19.11, there was only the `index_granularity` setting for restricting granule size. The `index_granularity_bytes` setting improves ClickHouse performance when selecting data from tables with big rows (tens and hundreds of megabytes). If you have tables with big rows, you can enable this setting for the tables to improve the efficiency of `SELECT` queries. + - `use_minimalistic_part_header_in_zookeeper` — Storage method of the data parts headers in ZooKeeper. If `use_minimalistic_part_header_in_zookeeper=1`, then ZooKeeper stores less data. For more information, see the [setting description](../server_settings/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) in “Server configuration parameters”. + - `min_merge_bytes_to_use_direct_io` — The minimum data volume for merge operation that is required for using direct I/O access to the storage disk. When merging data parts, ClickHouse calculates the total storage volume of all the data to be merged. If the volume exceeds `min_merge_bytes_to_use_direct_io` bytes, ClickHouse reads and writes the data to the storage disk using the direct I/O interface (`O_DIRECT` option). If `min_merge_bytes_to_use_direct_io = 0`, then direct I/O is disabled. Default value: `10 * 1024 * 1024 * 1024` bytes. + + - `merge_with_ttl_timeout` — Minimum delay in seconds before repeating a merge with TTL. Default value: 86400 (1 day). + - `write_final_mark` — Enables or disables writing the final index mark at the end of data part (after the last byte). Default value: 1. Don’t turn it off. + - `merge_max_block_size` — Maximum number of rows in block for merge operations. Default value: 8192. + - `storage_policy` — Storage policy. See [Using Multiple Block Devices for Data Storage](#table_engine-mergetree-multiple-volumes). 
**Example of Sections Setting** -```sql +``` sql ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity=8192 ``` @@ -104,12 +103,14 @@ We also set an expression for sampling as a hash by the user ID. This allows you The `index_granularity` setting can be omitted because 8192 is the default value. -
Deprecated Method for Creating a Table +
-!!! attention +Deprecated Method for Creating a Table + +!!! attention "Attention" Do not use this method in new projects. If possible, switch old projects to the method described above. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -120,14 +121,14 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **MergeTree() Parameters** -- `date-column` — The name of a column of the [Date](../../data_types/date.md) type. ClickHouse automatically creates partitions by month based on this column. The partition names are in the `"YYYYMM"` format. -- `sampling_expression` — An expression for sampling. -- `(primary, key)` — Primary key. Type: [Tuple()](../../data_types/tuple.md) -- `index_granularity` — The granularity of an index. The number of data rows between the "marks" of an index. The value 8192 is appropriate for most tasks. +- `date-column` — The name of a column of the [Date](../../data_types/date.md) type. ClickHouse automatically creates partitions by month based on this column. The partition names are in the `"YYYYMM"` format. +- `sampling_expression` — An expression for sampling. +- `(primary, key)` — Primary key. Type: [Tuple()](../../data_types/tuple.md) +- `index_granularity` — The granularity of an index. The number of data rows between the “marks” of an index. The value 8192 is appropriate for most tasks. **Example** -```sql +``` sql MergeTree(EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID)), 8192) ``` @@ -142,7 +143,7 @@ When data is inserted in a table, separate data parts are created and each of th Data belonging to different partitions are separated into different parts. In the background, ClickHouse merges data parts for more efficient storage. Parts belonging to different partitions are not merged. The merge mechanism does not guarantee that all rows with the same primary key will be in the same data part. -Each data part is logically divided into granules. A granule is the smallest indivisible data set that ClickHouse reads when selecting data. ClickHouse doesn't split rows or values, so each granule always contains an integer number of rows. The first row of a granule is marked with the value of the primary key for the row. For each data part, ClickHouse creates an index file that stores the marks. For each column, whether it's in the primary key or not, ClickHouse also stores the same marks. These marks let you find data directly in column files. +Each data part is logically divided into granules. A granule is the smallest indivisible data set that ClickHouse reads when selecting data. ClickHouse doesn’t split rows or values, so each granule always contains an integer number of rows. The first row of a granule is marked with the value of the primary key for the row. For each data part, ClickHouse creates an index file that stores the marks. For each column, whether it’s in the primary key or not, ClickHouse also stores the same marks. These marks let you find data directly in column files. The granule size is restricted by the `index_granularity` and `index_granularity_bytes` settings of the table engine. The number of rows in a granule lays in the `[1, index_granularity]` range, depending on the size of the rows. The size of a granule can exceed `index_granularity_bytes` if the size of a single row is greater than the value of the setting. In this case, the size of the granule equals the size of the row. 
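As an illustrative sketch of the granule limits described above (the table and columns are hypothetical; the values shown are the defaults), both settings can be set per table in the `SETTINGS` clause:

``` sql
CREATE TABLE granule_sketch
(
    EventDate Date,
    CounterID UInt32,
    UserID UInt64
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(EventDate)
ORDER BY (CounterID, EventDate)
SETTINGS index_granularity = 8192, index_granularity_bytes = 10485760
```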
@@ -150,52 +151,49 @@ The granule size is restricted by the `index_granularity` and `index_granularity Take the `(CounterID, Date)` primary key as an example. In this case, the sorting and index can be illustrated as follows: -``` -Whole data: [-------------------------------------------------------------------------] -CounterID: [aaaaaaaaaaaaaaaaaabbbbcdeeeeeeeeeeeeefgggggggghhhhhhhhhiiiiiiiiikllllllll] -Date: [1111111222222233331233211111222222333211111112122222223111112223311122333] -Marks: | | | | | | | | | | | - a,1 a,2 a,3 b,3 e,2 e,3 g,1 h,2 i,1 i,3 l,3 -Marks numbers: 0 1 2 3 4 5 6 7 8 9 10 -``` + Whole data: [-------------------------------------------------------------------------] + CounterID: [aaaaaaaaaaaaaaaaaabbbbcdeeeeeeeeeeeeefgggggggghhhhhhhhhiiiiiiiiikllllllll] + Date: [1111111222222233331233211111222222333211111112122222223111112223311122333] + Marks: | | | | | | | | | | | + a,1 a,2 a,3 b,3 e,2 e,3 g,1 h,2 i,1 i,3 l,3 + Marks numbers: 0 1 2 3 4 5 6 7 8 9 10 If the data query specifies: -- `CounterID in ('a', 'h')`, the server reads the data in the ranges of marks `[0, 3)` and `[6, 8)`. -- `CounterID IN ('a', 'h') AND Date = 3`, the server reads the data in the ranges of marks `[1, 3)` and `[7, 8)`. -- `Date = 3`, the server reads the data in the range of marks `[1, 10]`. +- `CounterID in ('a', 'h')`, the server reads the data in the ranges of marks `[0, 3)` and `[6, 8)`. +- `CounterID IN ('a', 'h') AND Date = 3`, the server reads the data in the ranges of marks `[1, 3)` and `[7, 8)`. +- `Date = 3`, the server reads the data in the range of marks `[1, 10]`. The examples above show that it is always more effective to use an index than a full scan. A sparse index allows extra data to be read. When reading a single range of the primary key, up to `index_granularity * 2` extra rows in each data block can be read. -Sparse indexes allow you to work with a very large number of table rows, because in most cases, such indexes fit in the computer's RAM. +Sparse indexes allow you to work with a very large number of table rows, because in most cases, such indexes fit in the computer’s RAM. ClickHouse does not require a unique primary key. You can insert multiple rows with the same primary key. -### Selecting the Primary Key +### Selecting the Primary Key {#selecting-the-primary-key} The number of columns in the primary key is not explicitly limited. Depending on the data structure, you can include more or fewer columns in the primary key. This may: -- Improve the performance of an index. +- Improve the performance of an index. If the primary key is `(a, b)`, then adding another column `c` will improve the performance if the following conditions are met: - - There are queries with a condition on column `c`. - - Long data ranges (several times longer than the `index_granularity`) with identical values for `(a, b)` are common. In other words, when adding another column allows you to skip quite long data ranges. + - There are queries with a condition on column `c`. + - Long data ranges (several times longer than the `index_granularity`) with identical values for `(a, b)` are common. In other words, when adding another column allows you to skip quite long data ranges. -- Improve data compression. +- Improve data compression. ClickHouse sorts data by primary key, so the higher the consistency, the better the compression. 
-- Provide additional logic when merging data parts in the [CollapsingMergeTree](collapsingmergetree.md#table_engine-collapsingmergetree) and [SummingMergeTree](summingmergetree.md) engines. +- Provide additional logic when merging data parts in the [CollapsingMergeTree](collapsingmergetree.md#table_engine-collapsingmergetree) and [SummingMergeTree](summingmergetree.md) engines. In this case it makes sense to specify the *sorting key* that is different from the primary key. A long primary key will negatively affect the insert performance and memory consumption, but extra columns in the primary key do not affect ClickHouse performance during `SELECT` queries. - -### Choosing a Primary Key that Differs from the Sorting Key +### Choosing a Primary Key that Differs from the Sorting Key {#choosing-a-primary-key-that-differs-from-the-sorting-key} It is possible to specify a primary key (an expression with values that are written in the index file for each mark) that is different from the sorting key (an expression for sorting the rows in data parts). In this case the primary key expression tuple must be a prefix of the sorting key expression tuple. @@ -204,19 +202,17 @@ This feature is helpful when using the [SummingMergeTree](summingmergetree.md) a In this case it makes sense to leave only a few columns in the primary key that will provide efficient range scans and add the remaining dimension columns to the sorting key tuple. -[ALTER](../../query_language/alter.md) of the sorting key is a lightweight operation because when a new column is simultaneously added to the table and to the sorting key, existing data parts don't need to be changed. Since the old sorting key is a prefix of the new sorting key and there is no data in the newly added column, the data is sorted by both the old and new sorting keys at the moment of table modification. +[ALTER](../../query_language/alter.md) of the sorting key is a lightweight operation because when a new column is simultaneously added to the table and to the sorting key, existing data parts don’t need to be changed. Since the old sorting key is a prefix of the new sorting key and there is no data in the newly added column, the data is sorted by both the old and new sorting keys at the moment of table modification. -### Use of Indexes and Partitions in Queries +### Use of Indexes and Partitions in Queries {#use-of-indexes-and-partitions-in-queries} For `SELECT` queries, ClickHouse analyzes whether an index can be used. An index can be used if the `WHERE/PREWHERE` clause has an expression (as one of the conjunction elements, or entirely) that represents an equality or inequality comparison operation, or if it has `IN` or `LIKE` with a fixed prefix on columns or expressions that are in the primary key or partitioning key, or on certain partially repetitive functions of these columns, or logical relationships of these expressions. Thus, it is possible to quickly run queries on one or many ranges of the primary key. In this example, queries will be fast when run for a specific tracking tag, for a specific tag and date range, for a specific tag and date, for multiple tags with a date range, and so on. 
-Let's look at the engine configured as follows: +Let’s look at the engine configured as follows: -``` -ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate) SETTINGS index_granularity=8192 -``` + ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate) SETTINGS index_granularity=8192 In this case, in queries: @@ -228,30 +224,31 @@ SELECT count() FROM table WHERE ((EventDate >= toDate('2014-01-01') AND EventDat ClickHouse will use the primary key index to trim improper data and the monthly partitioning key to trim partitions that are in improper date ranges. -The queries above show that the index is used even for complex expressions. Reading from the table is organized so that using the index can't be slower than a full scan. +The queries above show that the index is used even for complex expressions. Reading from the table is organized so that using the index can’t be slower than a full scan. -In the example below, the index can't be used. +In the example below, the index can’t be used. ``` sql SELECT count() FROM table WHERE CounterID = 34 OR URL LIKE '%upyachka%' ``` -To check whether ClickHouse can use the index when running a query, use the settings [force_index_by_date](../settings/settings.md#settings-force_index_by_date) and [force_primary_key](../settings/settings.md). +To check whether ClickHouse can use the index when running a query, use the settings [force\_index\_by\_date](../settings/settings.md#settings-force_index_by_date) and [force\_primary\_key](../settings/settings.md). The key for partitioning by month allows reading only those data blocks which contain dates from the proper range. In this case, the data block may contain data for many dates (up to an entire month). Within a block, data is sorted by primary key, which might not contain the date as the first column. Because of this, using a query with only a date condition that does not specify the primary key prefix will cause more data to be read than for a single date. -### Use of Index for Partially-Monotonic Primary Keys +### Use of Index for Partially-Monotonic Primary Keys {#use-of-index-for-partially-monotonic-primary-keys} Consider, for example, the days of the month. They form a [monotonic sequence](https://en.wikipedia.org/wiki/Monotonic_function) for one month, but not monotonic for more extended periods. This is a partially-monotonic sequence. If a user creates the table with partially-monotonic primary key, ClickHouse creates a sparse index as usual. When a user selects data from this kind of table, ClickHouse analyzes the query conditions. If the user wants to get data between two marks of the index and both these marks fall within one month, ClickHouse can use the index in this particular case because it can calculate the distance between the parameters of a query and index marks. -ClickHouse cannot use an index if the values of the primary key in the query parameter range don't represent a monotonic sequence. In this case, ClickHouse uses the full scan method. +ClickHouse cannot use an index if the values of the primary key in the query parameter range don’t represent a monotonic sequence. In this case, ClickHouse uses the full scan method. ClickHouse uses this logic not only for days of the month sequences, but for any primary key that represents a partially-monotonic sequence. 
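A hedged sketch of the partially-monotonic case described above (the table and key are hypothetical): a sorting key that starts with the day of the month is monotonic only within a single month.

``` sql
CREATE TABLE monotonic_sketch
(
    EventTime DateTime,
    UserID UInt64
)
ENGINE = MergeTree()
ORDER BY (toDayOfMonth(EventTime), UserID)
```

With such a key, a range condition on `EventTime` can use the index only when both bounds fall within one month; otherwise ClickHouse falls back to a full scan.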
-### Data Skipping Indexes (Experimental) {#table_engine-mergetree-data_skipping-indexes} +### Data Skipping Indexes (Experimental) {#table-engine-mergetree-data-skipping-indexes} The index declaration is in the columns section of the `CREATE` query. -```sql + +``` sql INDEX index_name expr TYPE type(...) GRANULARITY granularity_value ``` @@ -259,10 +256,9 @@ For tables from the `*MergeTree` family, data skipping indices can be specified. These indices aggregate some information about the specified expression on blocks, which consist of `granularity_value` granules (the size of the granule is specified using the `index_granularity` setting in the table engine). Then these aggregates are used in `SELECT` queries for reducing the amount of data to read from the disk by skipping big blocks of data where the `where` query cannot be satisfied. - **Example** -```sql +``` sql CREATE TABLE table_name ( u64 UInt64, @@ -277,35 +273,35 @@ CREATE TABLE table_name Indices from the example can be used by ClickHouse to reduce the amount of data to read from disk in the following queries: -```sql +``` sql SELECT count() FROM table WHERE s < 'z' SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 ``` -#### Available Types of Indices +#### Available Types of Indices {#available-types-of-indices} -- `minmax` +- `minmax` Stores extremes of the specified expression (if the expression is `tuple`, then it stores extremes for each element of `tuple`), uses stored info for skipping blocks of data like the primary key. -- `set(max_rows)` +- `set(max_rows)` - Stores unique values of the specified expression (no more than `max_rows` rows, `max_rows=0` means "no limits"). Uses the values to check if the `WHERE` expression is not satisfiable on a block of data. + Stores unique values of the specified expression (no more than `max_rows` rows, `max_rows=0` means “no limits”). Uses the values to check if the `WHERE` expression is not satisfiable on a block of data. -- `ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)` +- `ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)` Stores a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) that contains all ngrams from a block of data. Works only with strings. Can be used for optimization of `equals`, `like` and `in` expressions. - - `n` — ngram size, - - `size_of_bloom_filter_in_bytes` — Bloom filter size in bytes (you can use large values here, for example, 256 or 512, because it can be compressed well). - - `number_of_hash_functions` — The number of hash functions used in the Bloom filter. - - `random_seed` — The seed for Bloom filter hash functions. + - `n` — ngram size, + - `size_of_bloom_filter_in_bytes` — Bloom filter size in bytes (you can use large values here, for example, 256 or 512, because it can be compressed well). + - `number_of_hash_functions` — The number of hash functions used in the Bloom filter. + - `random_seed` — The seed for Bloom filter hash functions. -- `tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)` +- `tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)` The same as `ngrambf_v1`, but stores tokens instead of ngrams. Tokens are sequences separated by non-alphanumeric characters. -- `bloom_filter([false_positive])` — Stores a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) for the specified columns. 
+- `bloom_filter([false_positive])` — Stores a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) for the specified columns. The optional `false_positive` parameter is the probability of receiving a false positive response from the filter. Possible values: (0, 1). Default value: 0.025. @@ -313,62 +309,63 @@ SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 The following functions can use it: [equals](../../query_language/functions/comparison_functions.md), [notEquals](../../query_language/functions/comparison_functions.md), [in](../../query_language/functions/in_functions.md), [notIn](../../query_language/functions/in_functions.md), [has](../../query_language/functions/array_functions.md). -```sql + + +``` sql INDEX sample_index (u64 * length(s)) TYPE minmax GRANULARITY 4 INDEX sample_index2 (u64 * length(str), i32 + f64 * 100, date, str) TYPE set(100) GRANULARITY 4 INDEX sample_index3 (lower(str), str) TYPE ngrambf_v1(3, 256, 2, 0) GRANULARITY 4 ``` -#### Functions Support +#### Functions Support {#functions-support} Conditions in the `WHERE` clause contains calls of the functions that operate with columns. If the column is a part of an index, ClickHouse tries to use this index when performing the functions. ClickHouse supports different subsets of functions for using indexes. The `set` index can be used with all functions. Function subsets for other indexes are shown in the table below. -Function (operator) / Index | primary key | minmax | ngrambf_v1 | tokenbf_v1 | bloom_filter -----------------------------|-------------|--------|------------|------------|--------------- -[equals (=, ==)](../../query_language/functions/comparison_functions.md#function-equals) | ✔ | ✔ | ✔ | ✔ | ✔ -[notEquals(!=, <>)](../../query_language/functions/comparison_functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ -[like](../../query_language/functions/string_search_functions.md#function-like) | ✔ | ✔ | ✔ | ✗ | ✗ -[notLike](../../query_language/functions/string_search_functions.md#function-notlike) | ✔ | ✔ | ✔ | ✗ | ✗ -[startsWith](../../query_language/functions/string_functions.md#function-startswith) | ✔ | ✔ | ✔ | ✔ | ✗ -[endsWith](../../query_language/functions/string_functions.md#function-endswith) | ✗ | ✗ | ✔ | ✔ | ✗ -[multiSearchAny](../../query_language/functions/string_search_functions.md#function-multisearchany) | ✗ | ✗ | ✔ | ✗ | ✗ -[in](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ -[notIn](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ -[less (<)](../../query_language/functions/comparison_functions.md#function-less) | ✔ | ✔ | ✗ | ✗ | ✗ -[greater (>)](../../query_language/functions/comparison_functions.md#function-greater) | ✔ | ✔ | ✗ | ✗ | ✗ -[lessOrEquals (<=)](../../query_language/functions/comparison_functions.md#function-lessorequals) | ✔ | ✔ | ✗ | ✗ | ✗ -[greaterOrEquals (>=)](../../query_language/functions/comparison_functions.md#function-greaterorequals) | ✔ | ✔ | ✗ | ✗ | ✗ -[empty](../../query_language/functions/array_functions.md#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ -[notEmpty](../../query_language/functions/array_functions.md#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ -hasToken | ✗ | ✗ | ✗ | ✔ | ✗ +| Function (operator) / Index | primary key | minmax | ngrambf\_v1 | tokenbf\_v1 | bloom\_filter | +|----------------------------------------------------------------------------------------------------------|-------------|--------|-------------|-------------|---------------| +| [equals (=, 
==)](../../query_language/functions/comparison_functions.md#function-equals) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [notEquals(!=, \<\>)](../../query_language/functions/comparison_functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [like](../../query_language/functions/string_search_functions.md#function-like) | ✔ | ✔ | ✔ | ✗ | ✗ | +| [notLike](../../query_language/functions/string_search_functions.md#function-notlike) | ✔ | ✔ | ✔ | ✗ | ✗ | +| [startsWith](../../query_language/functions/string_functions.md#startswith) | ✔ | ✔ | ✔ | ✔ | ✗ | +| [endsWith](../../query_language/functions/string_functions.md#endswith) | ✗ | ✗ | ✔ | ✔ | ✗ | +| [multiSearchAny](../../query_language/functions/string_search_functions.md#function-multisearchany) | ✗ | ✗ | ✔ | ✗ | ✗ | +| [in](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [notIn](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [less (\<)](../../query_language/functions/comparison_functions.md#function-less) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [greater (\>)](../../query_language/functions/comparison_functions.md#function-greater) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [lessOrEquals (\<=)](../../query_language/functions/comparison_functions.md#function-lessorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [greaterOrEquals (\>=)](../../query_language/functions/comparison_functions.md#function-greaterorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [empty](../../query_language/functions/array_functions.md#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [notEmpty](../../query_language/functions/array_functions.md#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ | +| hasToken | ✗ | ✗ | ✗ | ✔ | ✗ | -Functions with a constant argument that is less than ngram size can't be used by `ngrambf_v1` for query optimization. +Functions with a constant argument that is less than ngram size can’t be used by `ngrambf_v1` for query optimization. -Bloom filters can have false positive matches, so the `ngrambf_v1`, `tokenbf_v1`, and `bloom_filter` indexes can't be used for optimizing queries where the result of a function is expected to be false, for example: +Bloom filters can have false positive matches, so the `ngrambf_v1`, `tokenbf_v1`, and `bloom_filter` indexes can’t be used for optimizing queries where the result of a function is expected to be false, for example: -- Can be optimized: - - `s LIKE '%test%'` - - `NOT s NOT LIKE '%test%'` - - `s = 1` - - `NOT s != 1` - - `startsWith(s, 'test')` -- Can't be optimized: - - `NOT s LIKE '%test%'` - - `s NOT LIKE '%test%'` - - `NOT s = 1` - - `s != 1` - - `NOT startsWith(s, 'test')` +- Can be optimized: + - `s LIKE '%test%'` + - `NOT s NOT LIKE '%test%'` + - `s = 1` + - `NOT s != 1` + - `startsWith(s, 'test')` +- Can’t be optimized: + - `NOT s LIKE '%test%'` + - `s NOT LIKE '%test%'` + - `NOT s = 1` + - `s != 1` + - `NOT startsWith(s, 'test')` -## Concurrent Data Access +## Concurrent Data Access {#concurrent-data-access} For concurrent table access, we use multi-versioning. In other words, when a table is simultaneously read and updated, data is read from a set of parts that is current at the time of the query. There are no lengthy locks. Inserts do not get in the way of read operations. Reading from a table is automatically parallelized. - -## TTL for Columns and Tables {#table_engine-mergetree-ttl} +## TTL for Columns and Tables {#table-engine-mergetree-ttl} Determines the lifetime of values. @@ -378,14 +375,14 @@ Expressions must evaluate to [Date](../../data_types/date.md) or [DateTime](../. 
Example: -```sql +``` sql TTL time_column TTL time_column + interval ``` To define `interval`, use [time interval](../../query_language/operators.md#operators-datetime) operators. -```sql +``` sql TTL date_time + INTERVAL 1 MONTH TTL date_time + INTERVAL 15 HOUR ``` @@ -394,14 +391,14 @@ TTL date_time + INTERVAL 15 HOUR When the values in the column expire, ClickHouse replaces them with the default values for the column data type. If all the column values in the data part expire, ClickHouse deletes this column from the data part in a filesystem. -The `TTL` clause can't be used for key columns. +The `TTL` clause can’t be used for key columns. Examples: Creating a table with TTL -```sql -CREATE TABLE example_table +``` sql +CREATE TABLE example_table ( d DateTime, a Int TTL d + INTERVAL 1 MONTH, @@ -415,7 +412,7 @@ ORDER BY d; Adding TTL to a column of an existing table -```sql +``` sql ALTER TABLE example_table MODIFY COLUMN c String TTL d + INTERVAL 1 DAY; @@ -423,7 +420,7 @@ ALTER TABLE example_table Altering TTL of the column -```sql +``` sql ALTER TABLE example_table MODIFY COLUMN c String TTL d + INTERVAL 1 MONTH; @@ -433,22 +430,22 @@ ALTER TABLE example_table Table can have an expression for removal of expired rows, and multiple expressions for automatic move of parts between [disks or volumes](#table_engine-mergetree-multiple-volumes). When rows in the table expire, ClickHouse deletes all corresponding rows. For parts moving feature, all rows of a part must satisfy the movement expression criteria. -```sql +``` sql TTL expr [DELETE|TO DISK 'aaa'|TO VOLUME 'bbb'], ... ``` Type of TTL rule may follow each TTL expression. It affects an action which is to be done once the expression is satisfied (reaches current time): -- `DELETE` - delete expired rows (default action); -- `TO DISK 'aaa'` - move part to the disk `aaa`; -- `TO VOLUME 'bbb'` - move part to the disk `bbb`. +- `DELETE` - delete expired rows (default action); +- `TO DISK 'aaa'` - move part to the disk `aaa`; +- `TO VOLUME 'bbb'` - move part to the disk `bbb`. Examples: Creating a table with TTL -```sql -CREATE TABLE example_table +``` sql +CREATE TABLE example_table ( d DateTime, a Int @@ -463,7 +460,7 @@ TTL d + INTERVAL 1 MONTH [DELETE], Altering TTL of the table -```sql +``` sql ALTER TABLE example_table MODIFY TTL d + INTERVAL 1 DAY; ``` @@ -472,37 +469,36 @@ ALTER TABLE example_table Data with an expired TTL is removed when ClickHouse merges data parts. -When ClickHouse see that data is expired, it performs an off-schedule merge. To control the frequency of such merges, you can set [merge_with_ttl_timeout](#mergetree_setting-merge_with_ttl_timeout). If the value is too low, it will perform many off-schedule merges that may consume a lot of resources. +When ClickHouse see that data is expired, it performs an off-schedule merge. To control the frequency of such merges, you can set [merge\_with\_ttl\_timeout](#mergetree_setting-merge_with_ttl_timeout). If the value is too low, it will perform many off-schedule merges that may consume a lot of resources. If you perform the `SELECT` query between merges, you may get expired data. To avoid it, use the [OPTIMIZE](../../query_language/misc.md#misc_operations-optimize) query before `SELECT`. 
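A minimal sketch of forcing that cleanup, using the `example_table` from the examples above; the `FINAL` clause forces the merge even if the data is already in one part:

``` sql
OPTIMIZE TABLE example_table FINAL;
SELECT * FROM example_table;
```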
[Original article](https://clickhouse.tech/docs/en/operations/table_engines/mergetree/) +## Using Multiple Block Devices for Data Storage {#table-engine-mergetree-multiple-volumes} -## Using Multiple Block Devices for Data Storage {#table_engine-mergetree-multiple-volumes} +### Introduction {#introduction} -### Introduction +`MergeTree` family table engines can store data on multiple block devices. For example, it can be useful when the data of a certain table are implicitly split into “hot” and “cold”. The most recent data is regularly requested but requires only a small amount of space. On the contrary, the fat-tailed historical data is requested rarely. If several disks are available, the “hot” data may be located on fast disks (for example, NVMe SSDs or in memory), while the “cold” data - on relatively slow ones (for example, HDD). -`MergeTree` family table engines can store data on multiple block devices. For example, it can be useful when the data of a certain table are implicitly split into "hot" and "cold". The most recent data is regularly requested but requires only a small amount of space. On the contrary, the fat-tailed historical data is requested rarely. If several disks are available, the "hot" data may be located on fast disks (for example, NVMe SSDs or in memory), while the "cold" data - on relatively slow ones (for example, HDD). +Data part is the minimum movable unit for `MergeTree`-engine tables. The data belonging to one part are stored on one disk. Data parts can be moved between disks in the background (according to user settings) as well as by means of the [ALTER](../../query_language/alter.md#alter_move-partition) queries. -Data part is the minimum movable unit for `MergeTree`-engine tables. The data belonging to one part are stored on one disk. Data parts can be moved between disks in the background (according to user settings) as well as by means of the [ALTER](../../query_language/alter.md#alter_move-partition) queries. +### Terms {#terms} -### Terms +- Disk — Block device mounted to the filesystem. +- Default disk — Disk that stores the path specified in the [path](../server_settings/settings.md#server_settings-path) server setting. +- Volume — Ordered set of equal disks (similar to [JBOD](https://en.wikipedia.org/wiki/Non-RAID_drive_architectures)). +- Storage policy — Set of volumes and the rules for moving data between them. -- Disk — Block device mounted to the filesystem. -- Default disk — Disk that stores the path specified in the [path](../server_settings/settings.md#server_settings-path) server setting. -- Volume — Ordered set of equal disks (similar to [JBOD](https://en.wikipedia.org/wiki/Non-RAID_drive_architectures)). -- Storage policy — Set of volumes and the rules for moving data between them. +The names given to the described entities can be found in the system tables, [system.storage\_policies](../system_tables.md#system_tables-storage_policies) and [system.disks](../system_tables.md#system_tables-disks). To apply one of the configured storage policies for a table, use the `storage_policy` setting of `MergeTree`-engine family tables. -The names given to the described entities can be found in the system tables, [system.storage_policies](../system_tables.md#system_tables-storage_policies) and [system.disks](../system_tables.md#system_tables-disks). To apply one of the configured storage policies for a table, use the `storage_policy` setting of `MergeTree`-engine family tables. 
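Since both system tables are mentioned here, a minimal sketch of inspecting the configured disks and storage policies (a plain `SELECT *` keeps the sketch independent of the exact column set):

``` sql
SELECT * FROM system.disks;

SELECT * FROM system.storage_policies;
```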
+### Configuration {#table-engine-mergetree-multiple-volumes-configure} -### Configuration {#table_engine-mergetree-multiple-volumes_configure} - -Disks, volumes and storage policies should be declared inside the `` tag either in the main file `config.xml` or in a distinct file in the `config.d` directory. +Disks, volumes and storage policies should be declared inside the `` tag either in the main file `config.xml` or in a distinct file in the `config.d` directory. Configuration structure: -```xml +``` xml @@ -519,22 +515,22 @@ Configuration structure: ... - + ... ``` Tags: -- `` — Disk name. Names must be different for all disks. -- `path` — path under which a server will store data (`data` and `shadow` folders), should be terminated with '/'. -- `keep_free_space_bytes` — the amount of free disk space to be reserved. +- `` — Disk name. Names must be different for all disks. +- `path` — path under which a server will store data (`data` and `shadow` folders), should be terminated with ‘/’. +- `keep_free_space_bytes` — the amount of free disk space to be reserved. The order of the disk definition is not important. Storage policies configuration markup: -```xml +``` xml ... @@ -563,15 +559,15 @@ Storage policies configuration markup: Tags: -- `policy_name_N` — Policy name. Policy names must be unique. -- `volume_name_N` — Volume name. Volume names must be unique. -- `disk` — a disk within a volume. -- `max_data_part_size_bytes` — the maximum size of a part that can be stored on any of the volume's disks. -- `move_factor` — when the amount of available space gets lower than this factor, data automatically start to move on the next volume if any (by default, 0.1). +- `policy_name_N` — Policy name. Policy names must be unique. +- `volume_name_N` — Volume name. Volume names must be unique. +- `disk` — a disk within a volume. +- `max_data_part_size_bytes` — the maximum size of a part that can be stored on any of the volume’s disks. +- `move_factor` — when the amount of available space gets lower than this factor, data automatically start to move on the next volume if any (by default, 0.1). Cofiguration examples: -```xml +``` xml ... @@ -592,7 +588,7 @@ Cofiguration examples: disk1 - + 0.2 @@ -603,14 +599,14 @@ Cofiguration examples: In given example, the `hdd_in_order` policy implements the [round-robin](https://en.wikipedia.org/wiki/Round-robin_scheduling) approach. Thus this policy defines only one volume (`single`), the data parts are stored on all its disks in circular order. Such policy can be quite useful if there are several similar disks are mounted to the system, but RAID is not configured. Keep in mind that each individual disk drive is not reliable and you might want to compensate it with replication factor of 3 or more. -If there are different kinds of disks available in the system, `moving_from_ssd_to_hdd` policy can be used instead. The volume `hot` consists of an SSD disk (`fast_ssd`), and the maximum size of a part that can be stored on this volume is 1GB. All the parts with the size larger than 1GB will be stored directly on the `cold` volume, which contains an HDD disk `disk1`. +If there are different kinds of disks available in the system, `moving_from_ssd_to_hdd` policy can be used instead. The volume `hot` consists of an SSD disk (`fast_ssd`), and the maximum size of a part that can be stored on this volume is 1GB. All the parts with the size larger than 1GB will be stored directly on the `cold` volume, which contains an HDD disk `disk1`. 
Also, once the disk `fast_ssd` gets filled by more than 80%, data will be transferred to the `disk1` by a background process. -The order of volume enumeration within a storage policy is important. Once a volume is overfilled, data are moved to the next one. The order of disk enumeration is important as well because data are stored on them in turns. +The order of volume enumeration within a storage policy is important. Once a volume is overfilled, data are moved to the next one. The order of disk enumeration is important as well because data are stored on them in turns. -When creating a table, one can apply one of the configured storage policies to it: +When creating a table, one can apply one of the configured storage policies to it: -```sql +``` sql CREATE TABLE table_with_non_default_policy ( EventDate Date, OrderID UInt64, @@ -622,28 +618,28 @@ PARTITION BY toYYYYMM(EventDate) SETTINGS storage_policy = 'moving_from_ssd_to_hdd' ``` -The `default` storage policy implies using only one volume, which consists of only one disk given in ``. Once a table is created, its storage policy cannot be changed. +The `default` storage policy implies using only one volume, which consists of only one disk given in ``. Once a table is created, its storage policy cannot be changed. -### Details +### Details {#details} In the case of `MergeTree` tables, data is getting to disk in different ways: -- As a result of an insert (`INSERT` query). -- During background merges and [mutations](../../query_language/alter.md#alter-mutations). -- When downloading from another replica. -- As a result of partition freezing [ALTER TABLE ... FREEZE PARTITION](../../query_language/alter.md#alter_freeze-partition). +- As a result of an insert (`INSERT` query). +- During background merges and [mutations](../../query_language/alter.md#alter-mutations). +- When downloading from another replica. +- As a result of partition freezing [ALTER TABLE … FREEZE PARTITION](../../query_language/alter.md#alter_freeze-partition). In all these cases except for mutations and partition freezing, a part is stored on a volume and a disk according to the given storage policy: -1. The first volume (in the order of definition) that has enough disk space for storing a part (`unreserved_space > current_part_size`) and allows for storing parts of a given size (`max_data_part_size_bytes > current_part_size`) is chosen. -2. Within this volume, that disk is chosen that follows the one, which was used for storing the previous chunk of data, and that has free space more than the part size (`unreserved_space - keep_free_space_bytes > current_part_size`). +1. The first volume (in the order of definition) that has enough disk space for storing a part (`unreserved_space > current_part_size`) and allows for storing parts of a given size (`max_data_part_size_bytes > current_part_size`) is chosen. +2. Within this volume, that disk is chosen that follows the one, which was used for storing the previous chunk of data, and that has free space more than the part size (`unreserved_space - keep_free_space_bytes > current_part_size`). -Under the hood, mutations and partition freezing make use of [hard links](https://en.wikipedia.org/wiki/Hard_link). Hard links between different disks are not supported, therefore in such cases the resulting parts are stored on the same disks as the initial ones. +Under the hood, mutations and partition freezing make use of [hard links](https://en.wikipedia.org/wiki/Hard_link). 
Hard links between different disks are not supported, therefore in such cases the resulting parts are stored on the same disks as the initial ones. In the background, parts are moved between volumes on the basis of the amount of free space (`move_factor` parameter) according to the order the volumes are declared in the configuration file. -Data is never transferred from the last one and into the first one. One may use system tables [system.part_log](../system_tables.md#system_tables-part-log) (field `type = MOVE_PART`) and [system.parts](../system_tables.md#system_tables-parts) (fields `path` and `disk`) to monitor background moves. Also, the detailed information can be found in server logs. +Data is never transferred from the last one and into the first one. One may use system tables [system.part\_log](../system_tables.md#system_tables-part-log) (field `type = MOVE_PART`) and [system.parts](../system_tables.md#system_tables-parts) (fields `path` and `disk`) to monitor background moves. Also, the detailed information can be found in server logs. -User can force moving a part or a partition from one volume to another using the query [ALTER TABLE ... MOVE PART|PARTITION ... TO VOLUME|DISK ...](../../query_language/alter.md#alter_move-partition), all the restrictions for background operations are taken into account. The query initiates a move on its own and does not wait for background operations to be completed. User will get an error message if not enough free space is available or if any of the required conditions are not met. +User can force moving a part or a partition from one volume to another using the query [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](../../query_language/alter.md#alter_move-partition), all the restrictions for background operations are taken into account. The query initiates a move on its own and does not wait for background operations to be completed. User will get an error message if not enough free space is available or if any of the required conditions are not met. Moving data does not interfere with data replication. Therefore, different storage policies can be specified for the same table on different replicas. diff --git a/docs/en/operations/table_engines/mysql.md b/docs/en/operations/table_engines/mysql.md index d6d1dbff595..741930d48d8 100644 --- a/docs/en/operations/table_engines/mysql.md +++ b/docs/en/operations/table_engines/mysql.md @@ -1,11 +1,10 @@ - -# MySQL +# MySQL {#mysql} The MySQL engine allows you to perform `SELECT` queries on data that is stored on a remote MySQL server. -## Creating a Table +## Creating a Table {#creating-a-table} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], @@ -18,32 +17,38 @@ See a detailed description of the [CREATE TABLE](../../query_language/create.md# The table structure can differ from the original MySQL table structure: -- Column names should be the same as in the original MySQL table, but you can use just some of these columns and in any order. -- Column types may differ from those in the original MySQL table. ClickHouse tries to [cast](../../query_language/functions/type_conversion_functions.md#type_conversion_function-cast) values to the ClickHouse data types. +- Column names should be the same as in the original MySQL table, but you can use just some of these columns and in any order. +- Column types may differ from those in the original MySQL table. 
ClickHouse tries to [cast](../../query_language/functions/type_conversion_functions.md#type_conversion_function-cast) values to the ClickHouse data types. **Engine Parameters** -- `host:port` — MySQL server address. -- `database` — Remote database name. -- `table` — Remote table name. -- `user` — MySQL user. -- `password` — User password. -- `replace_query` — Flag that converts `INSERT INTO` queries to `REPLACE INTO`. If `replace_query=1`, the query is substituted. -- `on_duplicate_clause` — The `ON DUPLICATE KEY on_duplicate_clause` expression that is added to the `INSERT` query. +- `host:port` — MySQL server address. + +- `database` — Remote database name. + +- `table` — Remote table name. + +- `user` — MySQL user. + +- `password` — User password. + +- `replace_query` — Flag that converts `INSERT INTO` queries to `REPLACE INTO`. If `replace_query=1`, the query is substituted. + +- `on_duplicate_clause` — The `ON DUPLICATE KEY on_duplicate_clause` expression that is added to the `INSERT` query. Example: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, where `on_duplicate_clause` is `UPDATE c2 = c2 + 1`. See the [MySQL documentation](https://dev.mysql.com/doc/refman/8.0/en/insert-on-duplicate.html) to find which `on_duplicate_clause` you can use with the `ON DUPLICATE KEY` clause. To specify `on_duplicate_clause` you need to pass `0` to the `replace_query` parameter. If you simultaneously pass `replace_query = 1` and `on_duplicate_clause`, ClickHouse generates an exception. -Simple `WHERE` clauses such as ` =, !=, >, >=, <, <=` are executed on the MySQL server. +Simple `WHERE` clauses such as `=, !=, >, >=, <, <=` are executed on the MySQL server. The rest of the conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to MySQL finishes. -## Usage Example +## Usage Example {#usage-example} Table in MySQL: -```text +``` text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, @@ -66,7 +71,7 @@ mysql> select * from test; Table in ClickHouse, retrieving data from the MySQL table created above: -```sql +``` sql CREATE TABLE mysql_table ( `float_nullable` Nullable(Float32), @@ -74,18 +79,20 @@ CREATE TABLE mysql_table ) ENGINE = MySQL('localhost:3306', 'test', 'test', 'bayonet', '123') ``` -```sql + +``` sql SELECT * FROM mysql_table ``` -```text + +``` text ┌─float_nullable─┬─int_id─┐ │ ᴺᵁᴸᴸ │ 1 │ └────────────────┴────────┘ ``` -## See Also +## See Also {#see-also} -- [The 'mysql' table function](../../query_language/table_functions/mysql.md) -- [Using MySQL as a source of external dictionary](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-mysql) +- [The ‘mysql’ table function](../../query_language/table_functions/mysql.md) +- [Using MySQL as a source of external dictionary](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-mysql) [Original article](https://clickhouse.tech/docs/en/operations/table_engines/mysql/) diff --git a/docs/en/operations/table_engines/null.md b/docs/en/operations/table_engines/null.md index a0bc65a287f..030f470e656 100644 --- a/docs/en/operations/table_engines/null.md +++ b/docs/en/operations/table_engines/null.md @@ -1,9 +1,7 @@ - -# Null +# Null {#null} When writing to a Null table, data is ignored. When reading from a Null table, the response is empty. However, you can create a materialized view on a Null table. 
So the data written to the table will end up in the view. - [Original article](https://clickhouse.tech/docs/en/operations/table_engines/null/) diff --git a/docs/en/operations/table_engines/odbc.md b/docs/en/operations/table_engines/odbc.md index 8bce818d5ed..f9ff4028f5a 100644 --- a/docs/en/operations/table_engines/odbc.md +++ b/docs/en/operations/table_engines/odbc.md @@ -1,4 +1,4 @@ -# ODBC {#table_engine-odbc} +# ODBC {#table-engine-odbc} Allows ClickHouse to connect to external databases via [ODBC](https://en.wikipedia.org/wiki/Open_Database_Connectivity). @@ -6,9 +6,9 @@ To safely implement ODBC connections, ClickHouse uses a separate program `clickh This engine supports the [Nullable](../../data_types/nullable.md) data type. -## Creating a Table +## Creating a Table {#creating-a-table} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1], @@ -22,16 +22,16 @@ See a detailed description of the [CREATE TABLE](../../query_language/create.md# The table structure can differ from the source table structure: -- Column names should be the same as in the source table, but you can use just some of these columns and in any order. -- Column types may differ from those in the source table. ClickHouse tries to [cast](../../query_language/functions/type_conversion_functions.md#type_conversion_function-cast) values to the ClickHouse data types. +- Column names should be the same as in the source table, but you can use just some of these columns and in any order. +- Column types may differ from those in the source table. ClickHouse tries to [cast](../../query_language/functions/type_conversion_functions.md#type_conversion_function-cast) values to the ClickHouse data types. **Engine Parameters** -- `connection_settings` — Name of the section with connection settings in the `odbc.ini` file. -- `external_database` — Name of a database in an external DBMS. -- `external_table` — Name of a table in the `external_database`. +- `connection_settings` — Name of the section with connection settings in the `odbc.ini` file. +- `external_database` — Name of a database in an external DBMS. +- `external_table` — Name of a table in the `external_database`. -## Usage Example +## Usage Example {#usage-example} **Retrieving data from the local MySQL installation via ODBC** @@ -41,17 +41,18 @@ Ensure that unixODBC and MySQL Connector are installed. By default (if installed from packages), ClickHouse starts as user `clickhouse`. Thus, you need to create and configure this user in the MySQL server. -```bash +``` bash $ sudo mysql ``` -```sql + +``` sql mysql> CREATE USER 'clickhouse'@'localhost' IDENTIFIED BY 'clickhouse'; mysql> GRANT ALL PRIVILEGES ON *.* TO 'clickhouse'@'clickhouse' WITH GRANT OPTION; ``` Then configure the connection in `/etc/odbc.ini`. -```bash +``` bash $ cat /etc/odbc.ini [mysqlconn] DRIVER = /usr/local/lib/libmyodbc5w.so @@ -64,7 +65,7 @@ PASSWORD = clickhouse You can check the connection using the `isql` utility from the unixODBC installation. -```bash +``` bash $ isql -v mysqlconn +---------------------------------------+ | Connected! 
| @@ -74,7 +75,7 @@ $ isql -v mysqlconn Table in MySQL: -```text +``` text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, @@ -97,7 +98,7 @@ mysql> select * from test; Table in ClickHouse, retrieving data from the MySQL table: -```sql +``` sql CREATE TABLE odbc_t ( `int_id` Int32, @@ -105,18 +106,20 @@ CREATE TABLE odbc_t ) ENGINE = ODBC('DSN=mysqlconn', 'test', 'test') ``` -```sql + +``` sql SELECT * FROM odbc_t ``` -```text + +``` text ┌─int_id─┬─float_nullable─┐ │ 1 │ ᴺᵁᴸᴸ │ └────────┴────────────────┘ ``` -## See Also +## See Also {#see-also} -- [ODBC external dictionaries](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-odbc) -- [ODBC table function](../../query_language/table_functions/odbc.md) +- [ODBC external dictionaries](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-odbc) +- [ODBC table function](../../query_language/table_functions/odbc.md) [Original article](https://clickhouse.tech/docs/en/operations/table_engines/odbc/) diff --git a/docs/en/operations/table_engines/replacingmergetree.md b/docs/en/operations/table_engines/replacingmergetree.md index ee552544f69..b80c372b814 100644 --- a/docs/en/operations/table_engines/replacingmergetree.md +++ b/docs/en/operations/table_engines/replacingmergetree.md @@ -1,14 +1,14 @@ -# ReplacingMergeTree +# ReplacingMergeTree {#replacingmergetree} The engine differs from [MergeTree](mergetree.md#table_engines-mergetree) in that it removes duplicate entries with the same primary key value (or more accurately, with the same [sorting key](mergetree.md) value). -Data deduplication occurs only during a merge. Merging occurs in the background at an unknown time, so you can't plan for it. Some of the data may remain unprocessed. Although you can run an unscheduled merge using the `OPTIMIZE` query, don't count on using it, because the `OPTIMIZE` query will read and write a large amount of data. +Data deduplication occurs only during a merge. Merging occurs in the background at an unknown time, so you can’t plan for it. Some of the data may remain unprocessed. Although you can run an unscheduled merge using the `OPTIMIZE` query, don’t count on using it, because the `OPTIMIZE` query will read and write a large amount of data. -Thus, `ReplacingMergeTree` is suitable for clearing out duplicate data in the background in order to save space, but it doesn't guarantee the absence of duplicates. +Thus, `ReplacingMergeTree` is suitable for clearing out duplicate data in the background in order to save space, but it doesn’t guarantee the absence of duplicates. -## Creating a Table +## Creating a Table {#creating-a-table} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -26,22 +26,25 @@ For a description of request parameters, see [request description](../../query_l **ReplacingMergeTree Parameters** -- `ver` — column with version. Type `UInt*`, `Date` or `DateTime`. Optional parameter. +- `ver` — column with version. Type `UInt*`, `Date` or `DateTime`. Optional parameter. When merging, `ReplacingMergeTree` from all the rows with the same primary key leaves only one: - - Last in the selection, if `ver` not set. - - With the maximum version, if `ver` specified. + + - Last in the selection, if `ver` not set. + - With the maximum version, if `ver` specified. 
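For illustration, a minimal sketch of a table that keeps the row with the maximum `Version` for each `UserID` (table and column names are invented for this example):

``` sql
CREATE TABLE replacing_example
(
    UserID UInt64,
    PageViews UInt32,
    Version UInt32
)
ENGINE = ReplacingMergeTree(Version)
ORDER BY UserID
```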
**Query clauses** When creating a `ReplacingMergeTree` table the same [clauses](mergetree.md) are required, as when creating a `MergeTree` table. -
Deprecated Method for Creating a Table +
-!!! attention +Deprecated Method for Creating a Table + +!!! attention "Attention" Do not use this method in new projects and, if possible, switch the old projects to the method described above. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -52,8 +55,8 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] All of the parameters excepting `ver` have the same meaning as in `MergeTree`. +- `ver` - column with the version. Optional parameter. For a description, see the text above. -- `ver` - column with the version. Optional parameter. For a description, see the text above.
[Original article](https://clickhouse.tech/docs/en/operations/table_engines/replacingmergetree/) diff --git a/docs/en/operations/table_engines/replication.md b/docs/en/operations/table_engines/replication.md index dbf1f491015..b739861003e 100644 --- a/docs/en/operations/table_engines/replication.md +++ b/docs/en/operations/table_engines/replication.md @@ -1,14 +1,14 @@ -# Data Replication {#table_engines-replication} +# Data Replication {#table-engines-replication} Replication is only supported for tables in the MergeTree family: -- ReplicatedMergeTree -- ReplicatedSummingMergeTree -- ReplicatedReplacingMergeTree -- ReplicatedAggregatingMergeTree -- ReplicatedCollapsingMergeTree -- ReplicatedVersionedCollapsingMergeTree -- ReplicatedGraphiteMergeTree +- ReplicatedMergeTree +- ReplicatedSummingMergeTree +- ReplicatedReplacingMergeTree +- ReplicatedAggregatingMergeTree +- ReplicatedCollapsingMergeTree +- ReplicatedVersionedCollapsingMergeTree +- ReplicatedGraphiteMergeTree Replication works at the level of an individual table, not the entire server. A server can store both replicated and non-replicated tables at the same time. @@ -18,20 +18,20 @@ Compressed data for `INSERT` and `ALTER` queries is replicated (for more informa `CREATE`, `DROP`, `ATTACH`, `DETACH` and `RENAME` queries are executed on a single server and are not replicated: -- The `CREATE TABLE` query creates a new replicatable table on the server where the query is run. If this table already exists on other servers, it adds a new replica. -- The `DROP TABLE` query deletes the replica located on the server where the query is run. -- The `RENAME` query renames the table on one of the replicas. In other words, replicated tables can have different names on different replicas. +- The `CREATE TABLE` query creates a new replicatable table on the server where the query is run. If this table already exists on other servers, it adds a new replica. +- The `DROP TABLE` query deletes the replica located on the server where the query is run. +- The `RENAME` query renames the table on one of the replicas. In other words, replicated tables can have different names on different replicas. ClickHouse uses [Apache ZooKeeper](https://zookeeper.apache.org) for storing replicas meta information. Use ZooKeeper version 3.4.5 or newer. To use replication, set parameters in the [zookeeper](../server_settings/settings.md#server-settings_zookeeper) server configuration section. !!! attention "Attention" - Don't neglect the security setting. ClickHouse supports the `digest` [ACL scheme](https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) of the ZooKeeper security subsystem. + Don’t neglect the security setting. ClickHouse supports the `digest` [ACL scheme](https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) of the ZooKeeper security subsystem. Example of setting the addresses of the ZooKeeper cluster: -```xml +``` xml example1 @@ -47,16 +47,16 @@ Example of setting the addresses of the ZooKeeper cluster: ``` - + You can specify any existing ZooKeeper cluster and the system will use a directory on it for its own data (the directory is specified when creating a replicatable table). -If ZooKeeper isn't set in the config file, you can't create replicated tables, and any existing replicated tables will be read-only. +If ZooKeeper isn’t set in the config file, you can’t create replicated tables, and any existing replicated tables will be read-only. 
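A minimal sketch of checking whether any replicated table on the server has switched to read-only mode, assuming the standard `is_readonly` column of `system.replicas`:

``` sql
SELECT database, table, is_readonly
FROM system.replicas
WHERE is_readonly
```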
-ZooKeeper is not used in `SELECT` queries because replication does not affect the performance of `SELECT` and queries run just as fast as they do for non-replicated tables. When querying distributed replicated tables, ClickHouse behavior is controlled by the settings [max_replica_delay_for_distributed_queries](../settings/settings.md#settings-max_replica_delay_for_distributed_queries) and [fallback_to_stale_replicas_for_distributed_queries](../settings/settings.md#settings-fallback_to_stale_replicas_for_distributed_queries). +ZooKeeper is not used in `SELECT` queries because replication does not affect the performance of `SELECT` and queries run just as fast as they do for non-replicated tables. When querying distributed replicated tables, ClickHouse behavior is controlled by the settings [max\_replica\_delay\_for\_distributed\_queries](../settings/settings.md#settings-max_replica_delay_for_distributed_queries) and [fallback\_to\_stale\_replicas\_for\_distributed\_queries](../settings/settings.md#settings-fallback_to_stale_replicas_for_distributed_queries). -For each `INSERT` query, approximately ten entries are added to ZooKeeper through several transactions. (To be more precise, this is for each inserted block of data; an INSERT query contains one block or one block per `max_insert_block_size = 1048576` rows.) This leads to slightly longer latencies for `INSERT` compared to non-replicated tables. But if you follow the recommendations to insert data in batches of no more than one `INSERT` per second, it doesn't create any problems. The entire ClickHouse cluster used for coordinating one ZooKeeper cluster has a total of several hundred `INSERTs` per second. The throughput on data inserts (the number of rows per second) is just as high as for non-replicated data. +For each `INSERT` query, approximately ten entries are added to ZooKeeper through several transactions. (To be more precise, this is for each inserted block of data; an INSERT query contains one block or one block per `max_insert_block_size = 1048576` rows.) This leads to slightly longer latencies for `INSERT` compared to non-replicated tables. But if you follow the recommendations to insert data in batches of no more than one `INSERT` per second, it doesn’t create any problems. The entire ClickHouse cluster used for coordinating one ZooKeeper cluster has a total of several hundred `INSERTs` per second. The throughput on data inserts (the number of rows per second) is just as high as for non-replicated data. -For very large clusters, you can use different ZooKeeper clusters for different shards. However, this hasn't proven necessary on the Yandex.Metrica cluster (approximately 300 servers). +For very large clusters, you can use different ZooKeeper clusters for different shards. However, this hasn’t proven necessary on the Yandex.Metrica cluster (approximately 300 servers). Replication is asynchronous and multi-master. `INSERT` queries (as well as `ALTER`) can be sent to any available server. Data is inserted on the server where the query is run, and then it is copied to the other servers. Because it is asynchronous, recently inserted data appears on the other replicas with some latency. If part of the replicas are not available, the data is written when they become available. If a replica is available, the latency is the amount of time it takes to transfer the block of compressed data over the network. 
@@ -64,7 +64,7 @@ By default, an INSERT query waits for confirmation of writing the data from only Each block of data is written atomically. The INSERT query is divided into blocks up to `max_insert_block_size = 1048576` rows. In other words, if the `INSERT` query has less than 1048576 rows, it is made atomically. -Data blocks are deduplicated. For multiple writes of the same data block (data blocks of the same size containing the same rows in the same order), the block is only written once. The reason for this is in case of network failures when the client application doesn't know if the data was written to the DB, so the `INSERT` query can simply be repeated. It doesn't matter which replica INSERTs were sent to with identical data. `INSERTs` are idempotent. Deduplication parameters are controlled by [merge_tree](../server_settings/settings.md#server_settings-merge_tree) server settings. +Data blocks are deduplicated. For multiple writes of the same data block (data blocks of the same size containing the same rows in the same order), the block is only written once. The reason for this is in case of network failures when the client application doesn’t know if the data was written to the DB, so the `INSERT` query can simply be repeated. It doesn’t matter which replica INSERTs were sent to with identical data. `INSERTs` are idempotent. Deduplication parameters are controlled by [merge\_tree](../server_settings/settings.md#server_settings-merge_tree) server settings. During replication, only the source data to insert is transferred over the network. Further data transformation (merging) is coordinated and performed on all the replicas in the same way. This minimizes network usage, which means that replication works well when replicas reside in different datacenters. (Note that duplicating data in different datacenters is the main goal of replication.) @@ -72,19 +72,18 @@ You can have any number of replicas of the same data. Yandex.Metrica uses double The system monitors data synchronicity on replicas and is able to recover after a failure. Failover is automatic (for small differences in data) or semi-automatic (when data differs too much, which may indicate a configuration error). - ## Creating Replicated Tables {#creating-replicated-tables} The `Replicated` prefix is added to the table engine name. For example:`ReplicatedMergeTree`. **Replicated\*MergeTree parameters** -- `zoo_path` — The path to the table in ZooKeeper. -- `replica_name` — The replica name in ZooKeeper. +- `zoo_path` — The path to the table in ZooKeeper. +- `replica_name` — The replica name in ZooKeeper. Example: -```sql +``` sql CREATE TABLE table_name ( EventDate DateTime, @@ -96,9 +95,11 @@ ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) ``` -
Example in deprecated syntax +
-```sql +Example in deprecated syntax + +``` sql CREATE TABLE table_name ( EventDate DateTime, @@ -109,9 +110,9 @@ CREATE TABLE table_name
-As the example shows, these parameters can contain substitutions in curly brackets. The substituted values are taken from the 'macros' section of the configuration file. Example: +As the example shows, these parameters can contain substitutions in curly brackets. The substituted values are taken from the ‘macros’ section of the configuration file. Example: -```xml +``` xml 05 02 @@ -126,12 +127,12 @@ In this case, the path consists of the following parts: `{layer}-{shard}` is the shard identifier. In this example it consists of two parts, since the Yandex.Metrica cluster uses bi-level sharding. For most tasks, you can leave just the {shard} substitution, which will be expanded to the shard identifier. -`table_name` is the name of the node for the table in ZooKeeper. It is a good idea to make it the same as the table name. It is defined explicitly, because in contrast to the table name, it doesn't change after a RENAME query. +`table_name` is the name of the node for the table in ZooKeeper. It is a good idea to make it the same as the table name. It is defined explicitly, because in contrast to the table name, it doesn’t change after a RENAME query. *HINT*: you could add a database name in front of `table_name` as well. E.g. `db_name.table_name` The replica name identifies different replicas of the same table. You can use the server name for this, as in the example. The name only needs to be unique within each shard. -You can define the parameters explicitly instead of using substitutions. This might be convenient for testing and for configuring small clusters. However, you can't use distributed DDL queries (`ON CLUSTER`) in this case. +You can define the parameters explicitly instead of using substitutions. This might be convenient for testing and for configuring small clusters. However, you can’t use distributed DDL queries (`ON CLUSTER`) in this case. When working with large clusters, we recommend using substitutions because they reduce the probability of error. @@ -141,7 +142,7 @@ If you add a new replica after the table already contains some data on other rep To delete a replica, run `DROP TABLE`. However, only one replica is deleted – the one that resides on the server where you run the query. -## Recovery After Failures +## Recovery After Failures {#recovery-after-failures} If ZooKeeper is unavailable when a server starts, replicated tables switch to read-only mode. The system periodically attempts to connect to ZooKeeper. @@ -155,35 +156,34 @@ Note that ClickHouse does not perform any destructive actions such as automatica When the server starts (or establishes a new session with ZooKeeper), it only checks the quantity and sizes of all files. If the file sizes match but bytes have been changed somewhere in the middle, this is not detected immediately, but only when attempting to read the data for a `SELECT` query. The query throws an exception about a non-matching checksum or size of a compressed block. In this case, data parts are added to the verification queue and copied from the replicas if necessary. -If the local set of data differs too much from the expected one, a safety mechanism is triggered. The server enters this in the log and refuses to launch. The reason for this is that this case may indicate a configuration error, such as if a replica on a shard was accidentally configured like a replica on a different shard. However, the thresholds for this mechanism are set fairly low, and this situation might occur during normal failure recovery. 
In this case, data is restored semi-automatically - by "pushing a button". +If the local set of data differs too much from the expected one, a safety mechanism is triggered. The server enters this in the log and refuses to launch. The reason for this is that this case may indicate a configuration error, such as if a replica on a shard was accidentally configured like a replica on a different shard. However, the thresholds for this mechanism are set fairly low, and this situation might occur during normal failure recovery. In this case, data is restored semi-automatically - by “pushing a button”. To start recovery, create the node `/path_to_table/replica_name/flags/force_restore_data` in ZooKeeper with any content, or run the command to restore all replicated tables: -```bash +``` bash sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data ``` Then restart the server. On start, the server deletes these flags and starts recovery. -## Recovery After Complete Data Loss +## Recovery After Complete Data Loss {#recovery-after-complete-data-loss} If all data and metadata disappeared from one of the servers, follow these steps for recovery: -1. Install ClickHouse on the server. Define substitutions correctly in the config file that contains the shard identifier and replicas, if you use them. -2. If you had unreplicated tables that must be manually duplicated on the servers, copy their data from a replica (in the directory `/var/lib/clickhouse/data/db_name/table_name/`). -3. Copy table definitions located in `/var/lib/clickhouse/metadata/` from a replica. If a shard or replica identifier is defined explicitly in the table definitions, correct it so that it corresponds to this replica. (Alternatively, start the server and make all the `ATTACH TABLE` queries that should have been in the .sql files in `/var/lib/clickhouse/metadata/`.) -4. To start recovery, create the ZooKeeper node `/path_to_table/replica_name/flags/force_restore_data` with any content, or run the command to restore all replicated tables: `sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data` +1. Install ClickHouse on the server. Define substitutions correctly in the config file that contains the shard identifier and replicas, if you use them. +2. If you had unreplicated tables that must be manually duplicated on the servers, copy their data from a replica (in the directory `/var/lib/clickhouse/data/db_name/table_name/`). +3. Copy table definitions located in `/var/lib/clickhouse/metadata/` from a replica. If a shard or replica identifier is defined explicitly in the table definitions, correct it so that it corresponds to this replica. (Alternatively, start the server and make all the `ATTACH TABLE` queries that should have been in the .sql files in `/var/lib/clickhouse/metadata/`.) +4. To start recovery, create the ZooKeeper node `/path_to_table/replica_name/flags/force_restore_data` with any content, or run the command to restore all replicated tables: `sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data` Then start the server (restart, if it is already running). Data will be downloaded from replicas. -An alternative recovery option is to delete information about the lost replica from ZooKeeper (`/path_to_table/replica_name`), then create the replica again as described in "[Creating replicated tables](#creating-replicated-tables)". 
+An alternative recovery option is to delete information about the lost replica from ZooKeeper (`/path_to_table/replica_name`), then create the replica again as described in “[Creating replicated tables](#creating-replicated-tables)”. There is no restriction on network bandwidth during recovery. Keep this in mind if you are restoring many replicas at once. +## Converting from MergeTree to ReplicatedMergeTree {#converting-from-mergetree-to-replicatedmergetree} -## Converting from MergeTree to ReplicatedMergeTree - -We use the term `MergeTree` to refer to all table engines in the ` MergeTree family`, the same as for `ReplicatedMergeTree`. +We use the term `MergeTree` to refer to all table engines in the `MergeTree family`, the same as for `ReplicatedMergeTree`. If you had a `MergeTree` table that was manually replicated, you can convert it to a replicated table. You might need to do this if you have already collected a large amount of data in a `MergeTree` table and now you want to enable replication. @@ -193,18 +193,18 @@ Rename the existing MergeTree table, then create a `ReplicatedMergeTree` table w Move the data from the old table to the `detached` subdirectory inside the directory with the new table data (`/var/lib/clickhouse/data/db_name/table_name/`). Then run `ALTER TABLE ATTACH PARTITION` on one of the replicas to add these data parts to the working set. -## Converting from ReplicatedMergeTree to MergeTree +## Converting from ReplicatedMergeTree to MergeTree {#converting-from-replicatedmergetree-to-mergetree} -Create a MergeTree table with a different name. Move all the data from the directory with the `ReplicatedMergeTree` table data to the new table's data directory. Then delete the `ReplicatedMergeTree` table and restart the server. +Create a MergeTree table with a different name. Move all the data from the directory with the `ReplicatedMergeTree` table data to the new table’s data directory. Then delete the `ReplicatedMergeTree` table and restart the server. If you want to get rid of a `ReplicatedMergeTree` table without launching the server: -- Delete the corresponding `.sql` file in the metadata directory (`/var/lib/clickhouse/metadata/`). -- Delete the corresponding path in ZooKeeper (`/path_to_table/replica_name`). +- Delete the corresponding `.sql` file in the metadata directory (`/var/lib/clickhouse/metadata/`). +- Delete the corresponding path in ZooKeeper (`/path_to_table/replica_name`). After this, you can launch the server, create a `MergeTree` table, move the data to its directory, and then restart the server. -## Recovery When Metadata in The ZooKeeper Cluster is Lost or Damaged +## Recovery When Metadata in The ZooKeeper Cluster is Lost or Damaged {#recovery-when-metadata-in-the-zookeeper-cluster-is-lost-or-damaged} If the data in ZooKeeper was lost or damaged, you can save data by moving it to an unreplicated table as described above. diff --git a/docs/en/operations/table_engines/set.md b/docs/en/operations/table_engines/set.md index ee369aa257e..cc881821001 100644 --- a/docs/en/operations/table_engines/set.md +++ b/docs/en/operations/table_engines/set.md @@ -1,13 +1,12 @@ -# Set +# Set {#set} -A data set that is always in RAM. It is intended for use on the right side of the IN operator (see the section "IN operators"). +A data set that is always in RAM. It is intended for use on the right side of the IN operator (see the section “IN operators”). You can use INSERT to insert data in the table. 
New elements will be added to the data set, while duplicates will be ignored. -But you can't perform SELECT from the table. The only way to retrieve data is by using it in the right half of the IN operator. +But you can’t perform SELECT from the table. The only way to retrieve data is by using it in the right half of the IN operator. Data is always located in RAM. For INSERT, the blocks of inserted data are also written to the directory of tables on the disk. When starting the server, this data is loaded to RAM. In other words, after restarting, the data remains in place. For a rough server restart, the block of data on the disk might be lost or damaged. In the latter case, you may need to manually delete the file with damaged data. - [Original article](https://clickhouse.tech/docs/en/operations/table_engines/set/) diff --git a/docs/en/operations/table_engines/stripelog.md b/docs/en/operations/table_engines/stripelog.md index 59c918defc1..c589dfe1e78 100644 --- a/docs/en/operations/table_engines/stripelog.md +++ b/docs/en/operations/table_engines/stripelog.md @@ -1,12 +1,12 @@ -# StripeLog +# StripeLog {#stripelog} This engine belongs to the family of log engines. See the common properties of log engines and their differences in the [Log Engine Family](log_family.md) article. Use this engine in scenarios when you need to write many tables with a small amount of data (less than 1 million rows). -## Creating a Table {#table_engines-stripelog-creating-a-table} +## Creating a Table {#table-engines-stripelog-creating-a-table} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( column1_name [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -17,26 +17,26 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] See the detailed description of the [CREATE TABLE](../../query_language/create.md#create-table-query) query. -## Writing the Data {#table_engines-stripelog-writing-the-data} +## Writing the Data {#table-engines-stripelog-writing-the-data} The `StripeLog` engine stores all the columns in one file. For each `INSERT` query, ClickHouse appends the data block to the end of a table file, writing columns one by one. For each table ClickHouse writes the files: -- `data.bin` — Data file. -- `index.mrk` — File with marks. Marks contain offsets for each column of each data block inserted. +- `data.bin` — Data file. +- `index.mrk` — File with marks. Marks contain offsets for each column of each data block inserted. The `StripeLog` engine does not support the `ALTER UPDATE` and `ALTER DELETE` operations. -## Reading the Data {#table_engines-stripelog-reading-the-data} +## Reading the Data {#table-engines-stripelog-reading-the-data} The file with marks allows ClickHouse to parallelize the reading of data. This means that a `SELECT` query returns rows in an unpredictable order. Use the `ORDER BY` clause to sort rows. -## Example of Use {#table_engines-stripelog-example-of-use} +## Example of Use {#table-engines-stripelog-example-of-use} Creating a table: -```sql +``` sql CREATE TABLE stripe_log_table ( timestamp DateTime, @@ -48,7 +48,7 @@ ENGINE = StripeLog Inserting data: -```sql +``` sql INSERT INTO stripe_log_table VALUES (now(),'REGULAR','The first regular message') INSERT INTO stripe_log_table VALUES (now(),'REGULAR','The second regular message'),(now(),'WARNING','The first warning message') ``` @@ -57,10 +57,11 @@ We used two `INSERT` queries to create two data blocks inside the `data.bin` fil ClickHouse uses multiple threads when selecting data. 
Each thread reads a separate data block and returns resulting rows independently as it finishes. As a result, the order of blocks of rows in the output does not match the order of the same blocks in the input in most cases. For example: -```sql +``` sql SELECT * FROM stripe_log_table ``` -```text + +``` text ┌───────────timestamp─┬─message_type─┬─message────────────────────┐ │ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ │ 2019-01-18 14:34:53 │ WARNING │ The first warning message │ @@ -72,10 +73,11 @@ SELECT * FROM stripe_log_table Sorting the results (ascending order by default): -```sql +``` sql SELECT * FROM stripe_log_table ORDER BY timestamp ``` -```text + +``` text ┌───────────timestamp─┬─message_type─┬─message────────────────────┐ │ 2019-01-18 14:23:43 │ REGULAR │ The first regular message │ │ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ diff --git a/docs/en/operations/table_engines/summingmergetree.md b/docs/en/operations/table_engines/summingmergetree.md index bb66d1b7a22..92705718663 100644 --- a/docs/en/operations/table_engines/summingmergetree.md +++ b/docs/en/operations/table_engines/summingmergetree.md @@ -1,13 +1,12 @@ - -# SummingMergeTree +# SummingMergeTree {#summingmergetree} The engine inherits from [MergeTree](mergetree.md#table_engines-mergetree). The difference is that when merging data parts for `SummingMergeTree` tables ClickHouse replaces all the rows with the same primary key (or more accurately, with the same [sorting key](mergetree.md)) with one row which contains summarized values for the columns with the numeric data type. If the sorting key is composed in a way that a single key value corresponds to large number of rows, this significantly reduces storage volume and speeds up data selection. We recommend to use the engine together with `MergeTree`. Store complete data in `MergeTree` table, and use `SummingMergeTree` for aggregated data storing, for example, when preparing reports. Such an approach will prevent you from losing valuable data due to an incorrectly composed primary key. -## Creating a Table +## Creating a Table {#creating-a-table} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -24,8 +23,8 @@ For a description of request parameters, see [request description](../../query_l **Parameters of SummingMergeTree** -- `columns` - a tuple with the names of columns where values will be summarized. Optional parameter. -The columns must be of a numeric type and must not be in the primary key. +- `columns` - a tuple with the names of columns where values will be summarized. Optional parameter. + The columns must be of a numeric type and must not be in the primary key. If `columns` not specified, ClickHouse summarizes the values in all columns with a numeric data type that are not in the primary key. @@ -33,12 +32,14 @@ The columns must be of a numeric type and must not be in the primary key. When creating a `SummingMergeTree` table the same [clauses](mergetree.md) are required, as when creating a `MergeTree` table. -
Deprecated Method for Creating a Table +
-!!! attention +Deprecated Method for Creating a Table + +!!! attention "Attention" Do not use this method in new projects and, if possible, switch the old projects to the method described above. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -49,14 +50,15 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] All of the parameters excepting `columns` have the same meaning as in `MergeTree`. -- `columns` — tuple with names of columns values of which will be summarized. Optional parameter. For a description, see the text above. +- `columns` — tuple with names of columns values of which will be summarized. Optional parameter. For a description, see the text above. +
-## Usage Example +## Usage Example {#usage-example} Consider the following table: -```sql +``` sql CREATE TABLE summtt ( key UInt32, @@ -68,31 +70,30 @@ ORDER BY key Insert data to it: -```sql +``` sql INSERT INTO summtt Values(1,1),(1,2),(2,1) ``` ClickHouse may sum all the rows not completely ([see below](#data-processing)), so we use an aggregate function `sum` and `GROUP BY` clause in the query. -```sql +``` sql SELECT key, sum(value) FROM summtt GROUP BY key ``` -```text +``` text ┌─key─┬─sum(value)─┐ │ 2 │ 1 │ │ 1 │ 3 │ └─────┴────────────┘ ``` - ## Data Processing {#data-processing} When data are inserted into a table, they are saved as-is. Clickhouse merges the inserted parts of data periodically and this is when rows with the same primary key are summed and replaced with one for each resulting part of data. -ClickHouse can merge the data parts so that different resulting parts of data cat consist rows with the same primary key, i.e. the summation will be incomplete. Therefore (`SELECT`) an aggregate function [sum()](../../query_language/agg_functions/reference.md#agg_function-sum) and `GROUP BY` clause should be used in a query as described in the example above. +ClickHouse can merge the data parts so that different resulting parts of data cat consist rows with the same primary key, i.e. the summation will be incomplete. Therefore (`SELECT`) an aggregate function [sum()](../../query_language/agg_functions/reference.md#agg_function-sum) and `GROUP BY` clause should be used in a query as described in the example above. -### Common rules for summation +### Common rules for summation {#common-rules-for-summation} The values in the columns with the numeric data type are summarized. The set of columns is defined by the parameter `columns`. @@ -102,24 +103,24 @@ If column is not in the primary key and is not summarized, an arbitrary value is The values are not summarized for columns in the primary key. -### The Summation in the AggregateFunction Columns +### The Summation in the AggregateFunction Columns {#the-summation-in-the-aggregatefunction-columns} For columns of [AggregateFunction type](../../data_types/nested_data_structures/aggregatefunction.md) ClickHouse behaves as [AggregatingMergeTree](aggregatingmergetree.md) engine aggregating according to the function. -### Nested Structures +### Nested Structures {#nested-structures} Table can have nested data structures that are processed in a special way. If the name of a nested table ends with `Map` and it contains at least two columns that meet the following criteria: -- the first column is numeric `(*Int*, Date, DateTime)` or a string `(String, FixedString)`, let's call it `key`, -- the other columns are arithmetic `(*Int*, Float32/64)`, let's call it `(values...)`, +- the first column is numeric `(*Int*, Date, DateTime)` or a string `(String, FixedString)`, let’s call it `key`, +- the other columns are arithmetic `(*Int*, Float32/64)`, let’s call it `(values...)`, then this nested table is interpreted as a mapping of `key => (values...)`, and when merging its rows, the elements of two data sets are merged by `key` with a summation of the corresponding `(values...)`. 
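A minimal sketch of a table definition with such a nested structure (the `StatsMap` name and its columns are invented for this example; the merge results for rows of this kind are shown in the examples right below):

``` sql
CREATE TABLE summing_nested
(
    key UInt32,
    StatsMap Nested
    (
        id UInt32,
        hits UInt64
    )
)
ENGINE = SummingMergeTree()
ORDER BY key
```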
Examples: -```text +``` text [(1, 100)] + [(2, 150)] -> [(1, 100), (2, 150)] [(1, 100)] + [(1, 150)] -> [(1, 250)] [(1, 100)] + [(1, 150), (2, 150)] -> [(1, 250), (2, 150)] diff --git a/docs/en/operations/table_engines/tinylog.md b/docs/en/operations/table_engines/tinylog.md index d2bcb5356d6..ea6bd14d537 100644 --- a/docs/en/operations/table_engines/tinylog.md +++ b/docs/en/operations/table_engines/tinylog.md @@ -1,9 +1,9 @@ -# TinyLog +# TinyLog {#tinylog} The engine belongs to the log engine family. See [Log Engine Family](log_family.md) for common properties of log engines and their differences. This table engine is typically used with the write-once method: write data one time, then read it as many times as necessary. For example, you can use `TinyLog`-type tables for intermediary data that is processed in small batches. Note that storing data in a large number of small tables is inefficient. -Queries are executed in a single stream. In other words, this engine is intended for relatively small tables (up to about 1,000,000 rows). It makes sense to use this table engine if you have many small tables, since it's simpler than the [Log](log.md) engine (fewer files need to be opened). +Queries are executed in a single stream. In other words, this engine is intended for relatively small tables (up to about 1,000,000 rows). It makes sense to use this table engine if you have many small tables, since it’s simpler than the [Log](log.md) engine (fewer files need to be opened). [Original article](https://clickhouse.tech/docs/en/operations/table_engines/tinylog/) diff --git a/docs/en/operations/table_engines/url.md b/docs/en/operations/table_engines/url.md index 60d8b5e1f2a..393f0d2d637 100644 --- a/docs/en/operations/table_engines/url.md +++ b/docs/en/operations/table_engines/url.md @@ -1,9 +1,9 @@ -# URL(URL, Format) {#table_engines-url} +# URL(URL, Format) {#table-engines-url} Manages data on a remote HTTP/HTTPS server. This engine is similar to the [File](file.md) engine. -## Using the engine in the ClickHouse server +## Using the engine in the ClickHouse server {#using-the-engine-in-the-clickhouse-server} The `format` must be one that ClickHouse can use in `SELECT` queries and, if necessary, in `INSERTs`. For the full list of supported formats, see @@ -17,13 +17,13 @@ additional headers for getting a response from the server. respectively. For processing `POST` requests, the remote server must support [Chunked transfer encoding](https://en.wikipedia.org/wiki/Chunked_transfer_encoding). -You can limit the maximum number of HTTP GET redirect hops using the [max_http_get_redirects](../settings/settings.md#setting-max_http_get_redirects) setting. +You can limit the maximum number of HTTP GET redirect hops using the [max\_http\_get\_redirects](../settings/settings.md#setting-max_http_get_redirects) setting. 
**Example:** **1.** Create a `url_engine_table` table on the server : -```sql +``` sql CREATE TABLE url_engine_table (word String, value UInt64) ENGINE=URL('http://127.0.0.1:12345/', CSV) ``` @@ -31,7 +31,7 @@ ENGINE=URL('http://127.0.0.1:12345/', CSV) **2.** Create a basic HTTP server using the standard Python 3 tools and start it: -```python3 +``` python3 from http.server import BaseHTTPRequestHandler, HTTPServer class CSVHTTPServer(BaseHTTPRequestHandler): @@ -47,29 +47,29 @@ if __name__ == "__main__": HTTPServer(server_address, CSVHTTPServer).serve_forever() ``` -```bash +``` bash $ python3 server.py ``` **3.** Request data: -```sql +``` sql SELECT * FROM url_engine_table ``` -```text +``` text ┌─word──┬─value─┐ │ Hello │ 1 │ │ World │ 2 │ └───────┴───────┘ ``` -## Details of Implementation +## Details of Implementation {#details-of-implementation} -- Reads and writes can be parallel -- Not supported: - - `ALTER` and `SELECT...SAMPLE` operations. - - Indexes. - - Replication. +- Reads and writes can be parallel +- Not supported: + - `ALTER` and `SELECT...SAMPLE` operations. + - Indexes. + - Replication. [Original article](https://clickhouse.tech/docs/en/operations/table_engines/url/) diff --git a/docs/en/operations/table_engines/versionedcollapsingmergetree.md b/docs/en/operations/table_engines/versionedcollapsingmergetree.md index 4e7747ffae1..f8343fec1b5 100644 --- a/docs/en/operations/table_engines/versionedcollapsingmergetree.md +++ b/docs/en/operations/table_engines/versionedcollapsingmergetree.md @@ -1,18 +1,17 @@ - -# VersionedCollapsingMergeTree +# VersionedCollapsingMergeTree {#versionedcollapsingmergetree} This engine: -- Allows quick writing of object states that are continually changing. -- Deletes old object states in the background. This significantly reduces the volume of storage. +- Allows quick writing of object states that are continually changing. +- Deletes old object states in the background. This significantly reduces the volume of storage. See the section [Collapsing](#table_engines_versionedcollapsingmergetree) for details. The engine inherits from [MergeTree](mergetree.md#table_engines-mergetree) and adds the logic for collapsing rows to the algorithm for merging data parts. `VersionedCollapsingMergeTree` serves the same purpose as [CollapsingMergeTree](collapsingmergetree.md) but uses a different collapsing algorithm that allows inserting the data in any order with multiple threads. In particular, the `Version` column helps to collapse the rows properly even if they are inserted in the wrong order. In contrast, `CollapsingMergeTree` allows only strictly consecutive insertion. -## Creating a Table +## Creating a Table {#creating-a-table} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -29,15 +28,15 @@ For a description of query parameters, see the [query description](../../query_l **Engine Parameters** -```sql +``` sql VersionedCollapsingMergeTree(sign, version) ``` -- `sign` — Name of the column with the type of row: `1` is a "state" row, `-1` is a "cancel" row. +- `sign` — Name of the column with the type of row: `1` is a “state” row, `-1` is a “cancel” row. The column data type should be `Int8`. -- `version` — Name of the column with the version of the object state. +- `version` — Name of the column with the version of the object state. The column data type should be `UInt*`. 
@@ -45,12 +44,14 @@ VersionedCollapsingMergeTree(sign, version) When creating a `VersionedCollapsingMergeTree` table, the same [clauses](mergetree.md) are required as when creating a `MergeTree` table. -
Deprecated Method for Creating a Table +
-!!! attention +Deprecated Method for Creating a Table + +!!! attention "Attention" Do not use this method in new projects. If possible, switch the old projects to the method described above. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -61,27 +62,27 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] All of the parameters except `sign` and `version` have the same meaning as in `MergeTree`. -- `sign` — Name of the column with the type of row: `1` is a "state" row, `-1` is a "cancel" row. +- `sign` — Name of the column with the type of row: `1` is a “state” row, `-1` is a “cancel” row. Column Data Type — `Int8`. -- `version` — Name of the column with the version of the object state. +- `version` — Name of the column with the version of the object state. The column data type should be `UInt*`. +
+## Collapsing {#table-engines-versionedcollapsingmergetree} -## Collapsing {#table_engines_versionedcollapsingmergetree} - -### Data +### Data {#data} Consider a situation where you need to save continually changing data for some object. It is reasonable to have one row for an object and update the row whenever there are changes. However, the update operation is expensive and slow for a DBMS because it requires rewriting the data in the storage. Update is not acceptable if you need to write data quickly, but you can write the changes to an object sequentially as follows. -Use the `Sign` column when writing the row. If `Sign = 1` it means that the row is a state of an object (let's call it the "state" row). If `Sign = -1` it indicates the cancellation of the state of an object with the same attributes (let's call it the "cancel" row). Also use the `Version` column, which should identify each state of an object with a separate number. +Use the `Sign` column when writing the row. If `Sign = 1` it means that the row is a state of an object (let’s call it the “state” row). If `Sign = -1` it indicates the cancellation of the state of an object with the same attributes (let’s call it the “cancel” row). Also use the `Version` column, which should identify each state of an object with a separate number. For example, we want to calculate how many pages users visited on some site and how long they were there. At some point in time we write the following row with the state of user activity: -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | └─────────────────────┴───────────┴──────────┴──────┴─────────┘ @@ -89,7 +90,7 @@ For example, we want to calculate how many pages users visited on some site and At some point later we register the change of user activity and write it with the following two rows. -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | │ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 | @@ -102,7 +103,7 @@ The second row contains the current state. Because we need only the last state of user activity, the rows -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | │ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | @@ -115,31 +116,31 @@ To find out why we need two rows for each change, see [Algorithm](#table_engines **Notes on Usage** -1. The program that writes the data should remember the state of an object in order to cancel it. The "cancel" string should be a copy of the "state" string with the opposite `Sign`. This increases the initial size of storage but allows to write the data quickly. -2. Long growing arrays in columns reduce the efficiency of the engine due to the load for writing. The more straightforward the data, the better the efficiency. -3. `SELECT` results depend strongly on the consistency of the history of object changes. Be accurate when preparing data for inserting. You can get unpredictable results with inconsistent data, such as negative values for non-negative metrics like session depth. +1. The program that writes the data should remember the state of an object in order to cancel it. The “cancel” string should be a copy of the “state” string with the opposite `Sign`. This increases the initial size of storage but allows to write the data quickly. +2. Long growing arrays in columns reduce the efficiency of the engine due to the load for writing. 
The more straightforward the data, the better the efficiency. +3. `SELECT` results depend strongly on the consistency of the history of object changes. Be accurate when preparing data for inserting. You can get unpredictable results with inconsistent data, such as negative values for non-negative metrics like session depth. -### Algorithm {#table_engines-versionedcollapsingmergetree-algorithm} +### Algorithm {#table-engines-versionedcollapsingmergetree-algorithm} When ClickHouse merges data parts, it deletes each pair of rows that have the same primary key and version and different `Sign`. The order of rows does not matter. When ClickHouse inserts data, it orders rows by the primary key. If the `Version` column is not in the primary key, ClickHouse adds it to the primary key implicitly as the last field and uses it for ordering. -## Selecting Data +## Selecting Data {#selecting-data} -ClickHouse doesn't guarantee that all of the rows with the same primary key will be in the same resulting data part or even on the same physical server. This is true both for writing the data and for subsequent merging of the data parts. In addition, ClickHouse processes `SELECT` queries with multiple threads, and it cannot predict the order of rows in the result. This means that aggregation is required if there is a need to get completely "collapsed" data from a `VersionedCollapsingMergeTree` table. +ClickHouse doesn’t guarantee that all of the rows with the same primary key will be in the same resulting data part or even on the same physical server. This is true both for writing the data and for subsequent merging of the data parts. In addition, ClickHouse processes `SELECT` queries with multiple threads, and it cannot predict the order of rows in the result. This means that aggregation is required if there is a need to get completely “collapsed” data from a `VersionedCollapsingMergeTree` table. To finalize collapsing, write a query with a `GROUP BY` clause and aggregate functions that account for the sign. For example, to calculate quantity, use `sum(Sign)` instead of `count()`. To calculate the sum of something, use `sum(Sign * x)` instead of `sum(x)`, and add `HAVING sum(Sign) > 0`. -The aggregates `count`, `sum` and `avg` can be calculated this way. The aggregate `uniq` can be calculated if an object has at least one non-collapsed state. The aggregates `min` and `max` can't be calculated because `VersionedCollapsingMergeTree` does not save the history of values of collapsed states. +The aggregates `count`, `sum` and `avg` can be calculated this way. The aggregate `uniq` can be calculated if an object has at least one non-collapsed state. The aggregates `min` and `max` can’t be calculated because `VersionedCollapsingMergeTree` does not save the history of values of collapsed states. -If you need to extract the data with "collapsing" but without aggregation (for example, to check whether rows are present whose newest values match certain conditions), you can use the `FINAL` modifier for the `FROM` clause. This approach is inefficient and should not be used with large tables. +If you need to extract the data with “collapsing” but without aggregation (for example, to check whether rows are present whose newest values match certain conditions), you can use the `FINAL` modifier for the `FROM` clause. This approach is inefficient and should not be used with large tables. 
-## Example of Use +## Example of Use {#example-of-use} Example data: -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | │ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | @@ -149,7 +150,7 @@ Example data: Creating the table: -```sql +``` sql CREATE TABLE UAct ( UserID UInt64, @@ -164,10 +165,11 @@ ORDER BY UserID Inserting the data: -```sql +``` sql INSERT INTO UAct VALUES (4324182021466249494, 5, 146, 1, 1) ``` -```sql + +``` sql INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1, 1),(4324182021466249494, 6, 185, 1, 2) ``` @@ -175,11 +177,11 @@ We use two `INSERT` queries to create two different data parts. If we insert the Getting the data: -```sql +``` sql SELECT * FROM UAct ``` -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┴─────────┘ @@ -195,7 +197,7 @@ Collapsing did not occur because the data parts have not been merged yet. ClickH This is why we need aggregation: -```sql +``` sql SELECT UserID, sum(PageViews * Sign) AS PageViews, @@ -205,23 +207,25 @@ FROM UAct GROUP BY UserID, Version HAVING sum(Sign) > 0 ``` -```text + +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Version─┐ │ 4324182021466249494 │ 6 │ 185 │ 2 │ └─────────────────────┴───────────┴──────────┴─────────┘ ``` -If we don't need aggregation and want to force collapsing, we can use the `FINAL` modifier for the `FROM` clause. +If we don’t need aggregation and want to force collapsing, we can use the `FINAL` modifier for the `FROM` clause. -```sql +``` sql SELECT * FROM UAct FINAL ``` -```text + +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 │ └─────────────────────┴───────────┴──────────┴──────┴─────────┘ ``` -This is a very inefficient way to select data. Don't use it for large tables. +This is a very inefficient way to select data. Don’t use it for large tables. [Original article](https://clickhouse.tech/docs/en/operations/table_engines/versionedcollapsingmergetree/) diff --git a/docs/en/operations/table_engines/view.md b/docs/en/operations/table_engines/view.md index 8c2e1295156..f742b0f68fa 100644 --- a/docs/en/operations/table_engines/view.md +++ b/docs/en/operations/table_engines/view.md @@ -1,6 +1,5 @@ -# View {#table_engines-view} +# View {#table-engines-view} Used for implementing views (for more information, see the `CREATE VIEW query`). It does not store data, but only stores the specified `SELECT` query. When reading from a table, it runs this query (and deletes all unnecessary columns from the query). - [Original article](https://clickhouse.tech/docs/en/operations/table_engines/view/) diff --git a/docs/en/operations/tips.md b/docs/en/operations/tips.md index 3cb64627f2a..01d322c325d 100644 --- a/docs/en/operations/tips.md +++ b/docs/en/operations/tips.md @@ -1,41 +1,42 @@ -# Usage Recommendations +# Usage Recommendations {#usage-recommendations} -## CPU Scaling Governor +## CPU Scaling Governor {#cpu-scaling-governor} Always use the `performance` scaling governor. The `on-demand` scaling governor works much worse with constantly high demand. -```bash +``` bash $ echo 'performance' | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor ``` -## CPU Limitations +## CPU Limitations {#cpu-limitations} -Processors can overheat. Use `dmesg` to see if the CPU's clock rate was limited due to overheating. +Processors can overheat. 
Use `dmesg` to see if the CPU’s clock rate was limited due to overheating. The restriction can also be set externally at the datacenter level. You can use `turbostat` to monitor it under a load. -## RAM +## RAM {#ram} For small amounts of data (up to ~200 GB compressed), it is best to use as much memory as the volume of data. For large amounts of data and when processing interactive (online) queries, you should use a reasonable amount of RAM (128 GB or more) so the hot data subset will fit in the cache of pages. Even for data volumes of ~50 TB per server, using 128 GB of RAM significantly improves query performance compared to 64 GB. Do not disable overcommit. The value `cat /proc/sys/vm/overcommit_memory` should be 0 or 1. Run -```bash + +``` bash $ echo 0 | sudo tee /proc/sys/vm/overcommit_memory ``` -## Huge Pages +## Huge Pages {#huge-pages} Always disable transparent huge pages. It interferes with memory allocators, which leads to significant performance degradation. -```bash +``` bash $ echo 'never' | sudo tee /sys/kernel/mm/transparent_hugepage/enabled ``` Use `perf top` to watch the time spent in the kernel for memory management. Permanent huge pages also do not need to be allocated. -## Storage Subsystem +## Storage Subsystem {#storage-subsystem} If your budget allows you to use SSD, use SSD. If not, use HDD. SATA HDDs 7200 RPM will do. @@ -43,17 +44,17 @@ If not, use HDD. SATA HDDs 7200 RPM will do. Give preference to a lot of servers with local hard drives over a smaller number of servers with attached disk shelves. But for storing archives with rare queries, shelves will work. -## RAID +## RAID {#raid} When using HDD, you can combine their RAID-10, RAID-5, RAID-6 or RAID-50. -For Linux, software RAID is better (with `mdadm`). We don't recommend using LVM. +For Linux, software RAID is better (with `mdadm`). We don’t recommend using LVM. When creating RAID-10, select the `far` layout. If your budget allows, choose RAID-10. If you have more than 4 disks, use RAID-6 (preferred) or RAID-50, instead of RAID-5. -When using RAID-5, RAID-6 or RAID-50, always increase stripe_cache_size, since the default value is usually not the best choice. +When using RAID-5, RAID-6 or RAID-50, always increase stripe\_cache\_size, since the default value is usually not the best choice. -```bash +``` bash $ echo 4096 | sudo tee /sys/block/md2/md/stripe_cache_size ``` @@ -65,33 +66,33 @@ Never set the block size too small or too large. You can use RAID-0 on SSD. Regardless of RAID use, always use replication for data security. -Enable NCQ with a long queue. For HDD, choose the CFQ scheduler, and for SSD, choose noop. Don't reduce the 'readahead' setting. +Enable NCQ with a long queue. For HDD, choose the CFQ scheduler, and for SSD, choose noop. Don’t reduce the ‘readahead’ setting. For HDD, enable the write cache. -## File System +## File System {#file-system} Ext4 is the most reliable option. Set the mount options `noatime, nobarrier`. -XFS is also suitable, but it hasn't been as thoroughly tested with ClickHouse. +XFS is also suitable, but it hasn’t been as thoroughly tested with ClickHouse. Most other file systems should also work fine. File systems with delayed allocation work better. -## Linux Kernel +## Linux Kernel {#linux-kernel} -Don't use an outdated Linux kernel. +Don’t use an outdated Linux kernel. -## Network +## Network {#network} If you are using IPv6, increase the size of the route cache. The Linux kernel prior to 3.2 had a multitude of problems with IPv6 implementation. 
Use at least a 10 GB network, if possible. 1 Gb will also work, but it will be much worse for patching replicas with tens of terabytes of data, or for processing distributed queries with a large amount of intermediate data. -## ZooKeeper +## ZooKeeper {#zookeeper} -You are probably already using ZooKeeper for other purposes. You can use the same installation of ZooKeeper, if it isn't already overloaded. +You are probably already using ZooKeeper for other purposes. You can use the same installation of ZooKeeper, if it isn’t already overloaded. -It's best to use a fresh version of ZooKeeper – 3.4.9 or later. The version in stable Linux distributions may be outdated. +It’s best to use a fresh version of ZooKeeper – 3.4.9 or later. The version in stable Linux distributions may be outdated. -You should never use manually written scripts to transfer data between different ZooKeeper clusters, because the result will be incorrect for sequential nodes. Never use the "zkcopy" utility for the same reason: https://github.com/ksprojects/zkcopy/issues/15 +You should never use manually written scripts to transfer data between different ZooKeeper clusters, because the result will be incorrect for sequential nodes. Never use the “zkcopy” utility for the same reason: https://github.com/ksprojects/zkcopy/issues/15 If you want to divide an existing ZooKeeper cluster into two, the correct way is to increase the number of its replicas and then reconfigure it as two independent clusters. @@ -99,7 +100,7 @@ Do not run ZooKeeper on the same servers as ClickHouse. Because ZooKeeper is ver With the default settings, ZooKeeper is a time bomb: -> The ZooKeeper server won't delete files from old snapshots and logs when using the default configuration (see autopurge), and this is the responsibility of the operator. +> The ZooKeeper server won’t delete files from old snapshots and logs when using the default configuration (see autopurge), and this is the responsibility of the operator. This bomb must be defused. @@ -107,7 +108,7 @@ The ZooKeeper (3.5.1) configuration below is used in the Yandex.Metrica producti zoo.cfg: -```bash +``` bash # http://hadoop.apache.org/zookeeper/docs/current/zookeeperAdmin.html # The number of milliseconds of each tick @@ -123,9 +124,9 @@ maxClientCnxns=2000 maxSessionTimeout=60000000 # the directory where the snapshot is stored. 
-dataDir=/opt/zookeeper/{{ cluster['name'] }}/data +dataDir=/opt/zookeeper/{{ '{{' }} cluster['name'] {{ '}}' }}/data # Place the dataLogDir to a separate physical disc for better performance -dataLogDir=/opt/zookeeper/{{ cluster['name'] }}/logs +dataLogDir=/opt/zookeeper/{{ '{{' }} cluster['name'] {{ '}}' }}/logs autopurge.snapRetainCount=10 autopurge.purgeInterval=1 @@ -158,20 +159,20 @@ snapCount=3000000 leaderServes=yes standaloneEnabled=false -dynamicConfigFile=/etc/zookeeper-{{ cluster['name'] }}/conf/zoo.cfg.dynamic +dynamicConfigFile=/etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/zoo.cfg.dynamic ``` Java version: -```text +``` text Java(TM) SE Runtime Environment (build 1.8.0_25-b17) Java HotSpot(TM) 64-Bit Server VM (build 25.25-b02, mixed mode) ``` JVM parameters: -```bash -NAME=zookeeper-{{ cluster['name'] }} +``` bash +NAME=zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} ZOOCFGDIR=/etc/$NAME/conf # TODO this is really ugly @@ -190,8 +191,8 @@ JAVA=/usr/bin/java ZOOMAIN="org.apache.zookeeper.server.quorum.QuorumPeerMain" ZOO_LOG4J_PROP="INFO,ROLLINGFILE" JMXLOCALONLY=false -JAVA_OPTS="-Xms{{ cluster.get('xms','128M') }} \ - -Xmx{{ cluster.get('xmx','1G') }} \ +JAVA_OPTS="-Xms{{ '{{' }} cluster.get('xms','128M') {{ '}}' }} \ + -Xmx{{ '{{' }} cluster.get('xmx','1G') {{ '}}' }} \ -Xloggc:/var/log/$NAME/zookeeper-gc.log \ -XX:+UseGCLogFileRotation \ -XX:NumberOfGCLogFiles=16 \ @@ -211,8 +212,8 @@ JAVA_OPTS="-Xms{{ cluster.get('xms','128M') }} \ Salt init: -```text -description "zookeeper-{{ cluster['name'] }} centralized coordination service" +``` text +description "zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} centralized coordination service" start on runlevel [2345] stop on runlevel [!2345] @@ -222,23 +223,22 @@ respawn limit nofile 8192 8192 pre-start script - [ -r "/etc/zookeeper-{{ cluster['name'] }}/conf/environment" ] || exit 0 - . /etc/zookeeper-{{ cluster['name'] }}/conf/environment + [ -r "/etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/environment" ] || exit 0 + . /etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/environment [ -d $ZOO_LOG_DIR ] || mkdir -p $ZOO_LOG_DIR chown $USER:$GROUP $ZOO_LOG_DIR end script script - . /etc/zookeeper-{{ cluster['name'] }}/conf/environment + . /etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/environment [ -r /etc/default/zookeeper ] && . 
/etc/default/zookeeper if [ -z "$JMXDISABLE" ]; then JAVA_OPTS="$JAVA_OPTS -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.local.only=$JMXLOCALONLY" fi - exec start-stop-daemon --start -c $USER --exec $JAVA --name zookeeper-{{ cluster['name'] }} \ + exec start-stop-daemon --start -c $USER --exec $JAVA --name zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} \ -- -cp $CLASSPATH $JAVA_OPTS -Dzookeeper.log.dir=${ZOO_LOG_DIR} \ -Dzookeeper.root.logger=${ZOO_LOG4J_PROP} $ZOOMAIN $ZOOCFG end script ``` - [Original article](https://clickhouse.tech/docs/en/operations/tips/) diff --git a/docs/en/operations/troubleshooting.md b/docs/en/operations/troubleshooting.md index 656a12bad2a..8520a0e5ca4 100644 --- a/docs/en/operations/troubleshooting.md +++ b/docs/en/operations/troubleshooting.md @@ -1,38 +1,37 @@ -# Troubleshooting +# Troubleshooting {#troubleshooting} -- [Installation](#troubleshooting-installation-errors) -- [Connecting to the server](#troubleshooting-accepts-no-connections) -- [Query processing](#troubleshooting-does-not-process-queries) -- [Efficiency of query processing](#troubleshooting-too-slow) +- [Installation](#troubleshooting-installation-errors) +- [Connecting to the server](#troubleshooting-accepts-no-connections) +- [Query processing](#troubleshooting-does-not-process-queries) +- [Efficiency of query processing](#troubleshooting-too-slow) ## Installation {#troubleshooting-installation-errors} -### You Cannot Get Deb Packages from ClickHouse Repository With apt-get - -- Check firewall settings. -- If you cannot access the repository for any reason, download packages as described in the [Getting started](../getting_started/index.md) article and install them manually using the `sudo dpkg -i ` command. You will also need the `tzdata` package. +### You Cannot Get Deb Packages from ClickHouse Repository With apt-get {#you-cannot-get-deb-packages-from-clickhouse-repository-with-apt-get} +- Check firewall settings. +- If you cannot access the repository for any reason, download packages as described in the [Getting started](../getting_started/index.md) article and install them manually using the `sudo dpkg -i ` command. You will also need the `tzdata` package. ## Connecting to the Server {#troubleshooting-accepts-no-connections} Possible issues: -- The server is not running. -- Unexpected or wrong configuration parameters. +- The server is not running. +- Unexpected or wrong configuration parameters. -### Server Is Not Running +### Server Is Not Running {#server-is-not-running} **Check if server is runnnig** Command: -```bash +``` bash $ sudo service clickhouse-server status ``` If the server is not running, start it with the command: -```bash +``` bash $ sudo service clickhouse-server start ``` @@ -42,24 +41,24 @@ The main log of `clickhouse-server` is in `/var/log/clickhouse-server/clickhouse If the server started successfully, you should see the strings: -- ` Application: starting up.` — Server started. -- ` Application: Ready for connections.` — Server is running and ready for connections. +- ` Application: starting up.` — Server started. +- ` Application: Ready for connections.` — Server is running and ready for connections. If `clickhouse-server` start failed with a configuration error, you should see the `` string with an error description. For example: -```text +``` text 2019.01.11 15:23:25.549505 [ 45 ] {} ExternalDictionaries: Failed reloading 'event2id' external dictionary: Poco::Exception. 
Code: 1000, e.code() = 111, e.displayText() = Connection refused, e.what() = Connection refused ``` -If you don't see an error at the end of the file, look through the entire file starting from the string: +If you don’t see an error at the end of the file, look through the entire file starting from the string: -```text +``` text Application: starting up. ``` If you try to start a second instance of `clickhouse-server` on the server, you see the following log: -```text +``` text 2019.01.11 15:25:11.151730 [ 1 ] {} : Starting ClickHouse 19.1.0 with revision 54413 2019.01.11 15:25:11.154578 [ 1 ] {} Application: starting up 2019.01.11 15:25:11.156361 [ 1 ] {} StatusFile: Status file ./status already exists - unclean restart. Contents: @@ -75,48 +74,48 @@ Revision: 54413 **See system.d logs** -If you don't find any useful information in `clickhouse-server` logs or there aren't any logs, you can view `system.d` logs using the command: +If you don’t find any useful information in `clickhouse-server` logs or there aren’t any logs, you can view `system.d` logs using the command: -```bash +``` bash $ sudo journalctl -u clickhouse-server ``` **Start clickhouse-server in interactive mode** -```bash +``` bash $ sudo -u clickhouse /usr/bin/clickhouse-server --config-file /etc/clickhouse-server/config.xml ``` This command starts the server as an interactive app with standard parameters of the autostart script. In this mode `clickhouse-server` prints all the event messages in the console. -### Configuration Parameters +### Configuration Parameters {#configuration-parameters} Check: -- Docker settings. +- Docker settings. If you run ClickHouse in Docker in an IPv6 network, make sure that `network=host` is set. -- Endpoint settings. +- Endpoint settings. - Check [listen_host](server_settings/settings.md#server_settings-listen_host) and [tcp_port](server_settings/settings.md#server_settings-tcp_port) settings. + Check [listen\_host](server_settings/settings.md#server_settings-listen_host) and [tcp\_port](server_settings/settings.md#server_settings-tcp_port) settings. ClickHouse server accepts localhost connections only by default. -- HTTP protocol settings. +- HTTP protocol settings. Check protocol settings for the HTTP API. -- Secure connection settings. +- Secure connection settings. Check: - - The [tcp_port_secure](server_settings/settings.md#server_settings-tcp_port_secure) setting. - - Settings for [SSL sertificates](server_settings/settings.md#server_settings-openssl). + - The [tcp\_port\_secure](server_settings/settings.md#server_settings-tcp_port_secure) setting. + - Settings for [SSL sertificates](server_settings/settings.md#server_settings-openssl). - Use proper parameters while connecting. For example, use the `port_secure` parameter with `clickhouse_client`. + Use proper parameters while connecting. For example, use the `port_secure` parameter with `clickhouse_client`. -- User settings. +- User settings. You might be using the wrong user name or password. @@ -124,7 +123,7 @@ Check: If ClickHouse is not able to process the query, it sends an error description to the client. In the `clickhouse-client` you get a description of the error in the console. If you are using the HTTP interface, ClickHouse sends the error description in the response body. For example: -```bash +``` bash $ curl 'http://localhost:8123/' --data-binary "SELECT a" Code: 47, e.displayText() = DB::Exception: Unknown identifier: a. 
Note that there are no tables (FROM clause) in your query, context: required_names: 'a' source_tables: table_aliases: private_aliases: column_aliases: public_columns: 'a' masked_columns: array_join_columns: source_columns: , e.what() = DB::Exception ``` diff --git a/docs/en/operations/update.md b/docs/en/operations/update.md index d008a0e3c61..a5594b7a34f 100644 --- a/docs/en/operations/update.md +++ b/docs/en/operations/update.md @@ -1,8 +1,8 @@ -# ClickHouse Update +# ClickHouse Update {#clickhouse-update} If ClickHouse was installed from deb packages, execute the following commands on the server: -```bash +``` bash $ sudo apt-get update $ sudo apt-get install clickhouse-client clickhouse-server $ sudo service clickhouse-server restart diff --git a/docs/en/operations/utils/clickhouse-benchmark.md b/docs/en/operations/utils/clickhouse-benchmark.md index 5707158e671..b85277e5000 100644 --- a/docs/en/operations/utils/clickhouse-benchmark.md +++ b/docs/en/operations/utils/clickhouse-benchmark.md @@ -1,49 +1,51 @@ -# clickhouse-benchmark +# clickhouse-benchmark {#clickhouse-benchmark} Connects to a ClickHouse server and repeatedly sends specified queries. Syntax: -```bash +``` bash $ echo "single query" | clickhouse-benchmark [keys] ``` + or -```bash + +``` bash $ clickhouse-benchmark [keys] <<< "single query" ``` If you want to send a set of queries, create a text file and place each query on the individual string in this file. For example: -```sql +``` sql SELECT * FROM system.numbers LIMIT 10000000 SELECT 1 ``` Then pass this file to a standard input of `clickhouse-benchmark`. -```bash +``` bash clickhouse-benchmark [keys] < queries_file ``` ## Keys {#clickhouse-benchmark-keys} -- `-c N`, `--concurrency=N` — Number of queries that `clickhouse-benchmark` sends simultaneously. Default value: 1. -- `-d N`, `--delay=N` — Interval in seconds between intermediate reports (set 0 to disable reports). Default value: 1. -- `-h WORD`, `--host=WORD` — Server host. Default value: `localhost`. For the [comparison mode](#clickhouse-benchmark-comparison-mode) you can use multiple `-h` keys. -- `-p N`, `--port=N` — Server port. Default value: 9000. For the [comparison mode](#clickhouse-benchmark-comparison-mode) you can use multiple `-p` keys. -- `-i N`, `--iterations=N` — Total number of queries. Default value: 0. -- `-r`, `--randomize` — Random order of queries execution if there is more then one input query. -- `-s`, `--secure` — Using TLS connection. -- `-t N`, `--timelimit=N` — Time limit in seconds. `clickhouse-benchmark` stops sending queries when the specified time limit is reached. Default value: 0 (time limit disabled). -- `--confidence=N` — Level of confidence for T-test. Possible values: 0 (80%), 1 (90%), 2 (95%), 3 (98%), 4 (99%), 5 (99.5%). Default value: 5. In the [comparison mode](#clickhouse-benchmark-comparison-mode) `clickhouse-benchmark` performs the [Independent two-sample Student's t-test](https://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test) test to determine whether the two distributions aren't different with the selected level of confidence. -- `--cumulative` — Printing cumulative data instead of data per interval. -- `--database=DATABASE_NAME` — ClickHouse database name. Default value: `default`. -- `--json=FILEPATH` — JSON output. When the key is set, `clickhouse-benchmark` outputs a report to the specified JSON-file. -- `--user=USERNAME` — ClickHouse user name. Default value: `default`. -- `--password=PSWD` — ClickHouse user password. 
Default value: empty string. -- `--stacktrace` — Stack traces output. When the key is set, `clickhouse-bencmark` outputs stack traces of exceptions. -- `--stage=WORD` — Query processing stage at server. ClickHouse stops query processing and returns answer to `clickhouse-benchmark` at the specified stage. Possible values: `complete`, `fetch_columns`, `with_mergeable_state`. Default value: `complete`. -- `--help` — Shows the help message. +- `-c N`, `--concurrency=N` — Number of queries that `clickhouse-benchmark` sends simultaneously. Default value: 1. +- `-d N`, `--delay=N` — Interval in seconds between intermediate reports (set 0 to disable reports). Default value: 1. +- `-h WORD`, `--host=WORD` — Server host. Default value: `localhost`. For the [comparison mode](#clickhouse-benchmark-comparison-mode) you can use multiple `-h` keys. +- `-p N`, `--port=N` — Server port. Default value: 9000. For the [comparison mode](#clickhouse-benchmark-comparison-mode) you can use multiple `-p` keys. +- `-i N`, `--iterations=N` — Total number of queries. Default value: 0. +- `-r`, `--randomize` — Random order of queries execution if there is more then one input query. +- `-s`, `--secure` — Using TLS connection. +- `-t N`, `--timelimit=N` — Time limit in seconds. `clickhouse-benchmark` stops sending queries when the specified time limit is reached. Default value: 0 (time limit disabled). +- `--confidence=N` — Level of confidence for T-test. Possible values: 0 (80%), 1 (90%), 2 (95%), 3 (98%), 4 (99%), 5 (99.5%). Default value: 5. In the [comparison mode](#clickhouse-benchmark-comparison-mode) `clickhouse-benchmark` performs the [Independent two-sample Student’s t-test](https://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test) test to determine whether the two distributions aren’t different with the selected level of confidence. +- `--cumulative` — Printing cumulative data instead of data per interval. +- `--database=DATABASE_NAME` — ClickHouse database name. Default value: `default`. +- `--json=FILEPATH` — JSON output. When the key is set, `clickhouse-benchmark` outputs a report to the specified JSON-file. +- `--user=USERNAME` — ClickHouse user name. Default value: `default`. +- `--password=PSWD` — ClickHouse user password. Default value: empty string. +- `--stacktrace` — Stack traces output. When the key is set, `clickhouse-bencmark` outputs stack traces of exceptions. +- `--stage=WORD` — Query processing stage at server. ClickHouse stops query processing and returns answer to `clickhouse-benchmark` at the specified stage. Possible values: `complete`, `fetch_columns`, `with_mergeable_state`. Default value: `complete`. +- `--help` — Shows the help message. If you want to apply some [settings](../../operations/settings/index.md) for queries, pass them as a key `--= SETTING_VALUE`. For example, `--max_memory_usage=1048576`. @@ -53,75 +55,76 @@ By default, `clickhouse-benchmark` reports for each `--delay` interval. Example of the report: -```text +``` text Queries executed: 10. localhost:9000, queries 10, QPS: 6.772, RPS: 67904487.440, MiB/s: 518.070, result RPS: 67721584.984, result MiB/s: 516.675. -0.000% 0.145 sec. -10.000% 0.146 sec. -20.000% 0.146 sec. -30.000% 0.146 sec. -40.000% 0.147 sec. -50.000% 0.148 sec. -60.000% 0.148 sec. -70.000% 0.148 sec. -80.000% 0.149 sec. -90.000% 0.150 sec. -95.000% 0.150 sec. -99.000% 0.150 sec. -99.900% 0.150 sec. -99.990% 0.150 sec. +0.000% 0.145 sec. +10.000% 0.146 sec. +20.000% 0.146 sec. +30.000% 0.146 sec. +40.000% 0.147 sec. 
+50.000% 0.148 sec. +60.000% 0.148 sec. +70.000% 0.148 sec. +80.000% 0.149 sec. +90.000% 0.150 sec. +95.000% 0.150 sec. +99.000% 0.150 sec. +99.900% 0.150 sec. +99.990% 0.150 sec. ``` In the report you can find: -- Number of queries in the `Queries executed: ` field. -- Status string containing (in order): +- Number of queries in the `Queries executed:` field. - - Endpoint of ClickHouse server. - - Number of processed queries. - - QPS: QPS: How many queries server performed per second during a period specified in the `--delay` argument. - - RPS: How many rows server read per second during a period specified in the `--delay` argument. - - MiB/s: How many mebibytes server read per second during a period specified in the `--delay` argument. - - result RPS: How many rows placed by server to the result of a query per second during a period specified in the `--delay` argument. - - result MiB/s. How many mebibytes placed by server to the result of a query per second during a period specified in the `--delay` argument. +- Status string containing (in order): -- Percentiles of queries execution time. + - Endpoint of ClickHouse server. + - Number of processed queries. + - QPS: QPS: How many queries server performed per second during a period specified in the `--delay` argument. + - RPS: How many rows server read per second during a period specified in the `--delay` argument. + - MiB/s: How many mebibytes server read per second during a period specified in the `--delay` argument. + - result RPS: How many rows placed by server to the result of a query per second during a period specified in the `--delay` argument. + - result MiB/s. How many mebibytes placed by server to the result of a query per second during a period specified in the `--delay` argument. +- Percentiles of queries execution time. ## Comparison mode {#clickhouse-benchmark-comparison-mode} -`clickhouse-benchmark` can compare performances for two running ClickHouse servers. +`clickhouse-benchmark` can compare performances for two running ClickHouse servers. To use the comparison mode, specify endpoints of both servers by two pairs of `--host`, `--port` keys. Keys matched together by position in arguments list, the first `--host` is matched with the first `--port` and so on. `clickhouse-benchmark` establishes connections to both servers, then sends queries. Each query addressed to a randomly selected server. The results are shown for each server separately. ## Example {#clickhouse-benchmark-example} -```bash +``` bash $ echo "SELECT * FROM system.numbers LIMIT 10000000 OFFSET 10000000" | clickhouse-benchmark -i 10 ``` -```text + +``` text Loaded 1 queries. Queries executed: 6. localhost:9000, queries 6, QPS: 6.153, RPS: 123398340.957, MiB/s: 941.455, result RPS: 61532982.200, result MiB/s: 469.459. -0.000% 0.159 sec. -10.000% 0.159 sec. -20.000% 0.159 sec. -30.000% 0.160 sec. -40.000% 0.160 sec. -50.000% 0.162 sec. -60.000% 0.164 sec. -70.000% 0.165 sec. -80.000% 0.166 sec. -90.000% 0.166 sec. -95.000% 0.167 sec. -99.000% 0.167 sec. -99.900% 0.167 sec. -99.990% 0.167 sec. +0.000% 0.159 sec. +10.000% 0.159 sec. +20.000% 0.159 sec. +30.000% 0.160 sec. +40.000% 0.160 sec. +50.000% 0.162 sec. +60.000% 0.164 sec. +70.000% 0.165 sec. +80.000% 0.166 sec. +90.000% 0.166 sec. +95.000% 0.167 sec. +99.000% 0.167 sec. +99.900% 0.167 sec. +99.990% 0.167 sec. @@ -129,18 +132,18 @@ Queries executed: 10. localhost:9000, queries 10, QPS: 6.082, RPS: 121959604.568, MiB/s: 930.478, result RPS: 60815551.642, result MiB/s: 463.986. -0.000% 0.159 sec. 
-10.000% 0.159 sec. -20.000% 0.160 sec. -30.000% 0.163 sec. -40.000% 0.164 sec. -50.000% 0.165 sec. -60.000% 0.166 sec. -70.000% 0.166 sec. -80.000% 0.167 sec. -90.000% 0.167 sec. -95.000% 0.170 sec. -99.000% 0.172 sec. -99.900% 0.172 sec. -99.990% 0.172 sec. +0.000% 0.159 sec. +10.000% 0.159 sec. +20.000% 0.160 sec. +30.000% 0.163 sec. +40.000% 0.164 sec. +50.000% 0.165 sec. +60.000% 0.166 sec. +70.000% 0.166 sec. +80.000% 0.167 sec. +90.000% 0.167 sec. +95.000% 0.170 sec. +99.000% 0.172 sec. +99.900% 0.172 sec. +99.990% 0.172 sec. ``` diff --git a/docs/en/operations/utils/clickhouse-copier.md b/docs/en/operations/utils/clickhouse-copier.md index 795edc9748d..a96e1709b14 100644 --- a/docs/en/operations/utils/clickhouse-copier.md +++ b/docs/en/operations/utils/clickhouse-copier.md @@ -1,5 +1,4 @@ - -# clickhouse-copier +# clickhouse-copier {#clickhouse-copier} Copies data from the tables in one cluster to tables in another (or the same) cluster. @@ -7,38 +6,39 @@ You can run multiple `clickhouse-copier` instances on different servers to perfo After starting, `clickhouse-copier`: -- Connects to ZooKeeper and receives: - - Copying jobs. - - The state of the copying jobs. +- Connects to ZooKeeper and receives: -- It performs the jobs. + - Copying jobs. + - The state of the copying jobs. - Each running process chooses the "closest" shard of the source cluster and copies the data into the destination cluster, resharding the data if necessary. +- It performs the jobs. + + Each running process chooses the “closest” shard of the source cluster and copies the data into the destination cluster, resharding the data if necessary. `clickhouse-copier` tracks the changes in ZooKeeper and applies them on the fly. To reduce network traffic, we recommend running `clickhouse-copier` on the same server where the source data is located. -## Running clickhouse-copier +## Running clickhouse-copier {#running-clickhouse-copier} The utility should be run manually: -```bash +``` bash $ clickhouse-copier copier --daemon --config zookeeper.xml --task-path /task/path --base-dir /path/to/dir ``` Parameters: -- `daemon` — Starts `clickhouse-copier` in daemon mode. -- `config` — The path to the `zookeeper.xml` file with the parameters for the connection to ZooKeeper. -- `task-path` — The path to the ZooKeeper node. This node is used for syncing `clickhouse-copier` processes and storing tasks. Tasks are stored in `$task-path/description`. -- `task-file` — Optional path to file with task configuration for initial upload to ZooKeeper. -- `task-upload-force` — Force upload `task-file` even if node already exists. -- `base-dir` — The path to logs and auxiliary files. When it starts, `clickhouse-copier` creates `clickhouse-copier_YYYYMMHHSS_` subdirectories in `$base-dir`. If this parameter is omitted, the directories are created in the directory where `clickhouse-copier` was launched. +- `daemon` — Starts `clickhouse-copier` in daemon mode. +- `config` — The path to the `zookeeper.xml` file with the parameters for the connection to ZooKeeper. +- `task-path` — The path to the ZooKeeper node. This node is used for syncing `clickhouse-copier` processes and storing tasks. Tasks are stored in `$task-path/description`. +- `task-file` — Optional path to file with task configuration for initial upload to ZooKeeper. +- `task-upload-force` — Force upload `task-file` even if node already exists. +- `base-dir` — The path to logs and auxiliary files. 
When it starts, `clickhouse-copier` creates `clickhouse-copier_YYYYMMHHSS_` subdirectories in `$base-dir`. If this parameter is omitted, the directories are created in the directory where `clickhouse-copier` was launched. -## Format of zookeeper.xml +## Format of zookeeper.xml {#format-of-zookeeper-xml} -```xml +``` xml trace @@ -55,9 +55,9 @@ Parameters: ``` -## Configuration of copying tasks +## Configuration of copying tasks {#configuration-of-copying-tasks} -```xml +``` xml @@ -166,5 +166,4 @@ Parameters: `clickhouse-copier` tracks the changes in `/task/path/description` and applies them on the fly. For instance, if you change the value of `max_workers`, the number of processes running tasks will also change. - [Original article](https://clickhouse.tech/docs/en/operations/utils/clickhouse-copier/) diff --git a/docs/en/operations/utils/clickhouse-local.md b/docs/en/operations/utils/clickhouse-local.md index e6ec112b0c3..bc08eed23ad 100644 --- a/docs/en/operations/utils/clickhouse-local.md +++ b/docs/en/operations/utils/clickhouse-local.md @@ -1,5 +1,4 @@ - -# clickhouse-local +# clickhouse-local {#clickhouse-local} The `clickhouse-local` program enables you to perform fast processing on local files, without having to deploy and configure the ClickHouse server. @@ -9,59 +8,58 @@ Accepts data that represent tables and queries them using [ClickHouse SQL dialec By default `clickhouse-local` does not have access to data on the same host, but it supports loading server configuration using `--config-file` argument. -!!! warning +!!! warning "Warning" It is not recommended to load production server configuration into `clickhouse-local` because data can be damaged in case of human error. - -## Usage +## Usage {#usage} Basic usage: -```bash +``` bash $ clickhouse-local --structure "table_structure" --input-format "format_of_incoming_data" -q "query" ``` Arguments: -- `-S`, `--structure` — table structure for input data. -- `-if`, `--input-format` — input format, `TSV` by default. -- `-f`, `--file` — path to data, `stdin` by default. -- `-q` `--query` — queries to execute with `;` as delimeter. -- `-N`, `--table` — table name where to put output data, `table` by default. -- `-of`, `--format`, `--output-format` — output format, `TSV` by default. -- `--stacktrace` — whether to dump debug output in case of exception. -- `--verbose` — more details on query execution. -- `-s` — disables `stderr` logging. -- `--config-file` — path to configuration file in same format as for ClickHouse server, by default the configuration empty. -- `--help` — arguments references for `clickhouse-local`. +- `-S`, `--structure` — table structure for input data. +- `-if`, `--input-format` — input format, `TSV` by default. +- `-f`, `--file` — path to data, `stdin` by default. +- `-q` `--query` — queries to execute with `;` as delimeter. +- `-N`, `--table` — table name where to put output data, `table` by default. +- `-of`, `--format`, `--output-format` — output format, `TSV` by default. +- `--stacktrace` — whether to dump debug output in case of exception. +- `--verbose` — more details on query execution. +- `-s` — disables `stderr` logging. +- `--config-file` — path to configuration file in same format as for ClickHouse server, by default the configuration empty. +- `--help` — arguments references for `clickhouse-local`. Also there are arguments for each ClickHouse configuration variable which are more commonly used instead of `--config-file`. 
+## Examples {#examples} -## Examples - -```bash +``` bash $ echo -e "1,2\n3,4" | clickhouse-local -S "a Int64, b Int64" -if "CSV" -q "SELECT * FROM table" Read 2 rows, 32.00 B in 0.000 sec., 5182 rows/sec., 80.97 KiB/sec. -1 2 -3 4 +1 2 +3 4 ``` Previous example is the same as: -```bash +``` bash $ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin); SELECT a, b FROM table; DROP TABLE table" Read 2 rows, 32.00 B in 0.000 sec., 4987 rows/sec., 77.93 KiB/sec. -1 2 -3 4 +1 2 +3 4 ``` -Now let's output memory user for each Unix user: +Now let’s output memory user for each Unix user: -```bash +``` bash $ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' | clickhouse-local -S "user String, mem Float64" -q "SELECT user, round(sum(mem), 2) as memTotal FROM table GROUP BY user ORDER BY memTotal DESC FORMAT Pretty" ``` -```text + +``` text Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec. ┏━━━━━━━━━━┳━━━━━━━━━━┓ ┃ user ┃ memTotal ┃ diff --git a/docs/en/operations/utils/index.md b/docs/en/operations/utils/index.md index 71d46f3f7c8..ebc1396d031 100644 --- a/docs/en/operations/utils/index.md +++ b/docs/en/operations/utils/index.md @@ -1,7 +1,7 @@ -# ClickHouse Utility +# ClickHouse Utility {#clickhouse-utility} -- [clickhouse-local](clickhouse-local.md) — Allows running SQL queries on data without stopping the ClickHouse server, similar to how `awk` does this. -- [clickhouse-copier](clickhouse-copier.md) — Copies (and reshards) data from one cluster to another cluster. -- [clickhouse-benchmark](clickhouse-benchmark.md) — Loads server with the custom queries and settings. +- [clickhouse-local](clickhouse-local.md) — Allows running SQL queries on data without stopping the ClickHouse server, similar to how `awk` does this. +- [clickhouse-copier](clickhouse-copier.md) — Copies (and reshards) data from one cluster to another cluster. +- [clickhouse-benchmark](clickhouse-benchmark.md) — Loads server with the custom queries and settings. [Original article](https://clickhouse.tech/docs/en/operations/utils/) diff --git a/docs/en/query_language/agg_functions/combinators.md b/docs/en/query_language/agg_functions/combinators.md index 9cb6c4be4fa..c9fa6dbb370 100644 --- a/docs/en/query_language/agg_functions/combinators.md +++ b/docs/en/query_language/agg_functions/combinators.md @@ -1,4 +1,4 @@ -# Aggregate function combinators {#aggregate_functions_combinators} +# Aggregate function combinators {#aggregate-functions-combinators} The name of an aggregate function can have a suffix appended to it. This changes the way the aggregate function works. @@ -12,33 +12,33 @@ With conditional aggregate functions, you can calculate aggregates for several c ## -Array {#agg-functions-combinator-array} -The -Array suffix can be appended to any aggregate function. In this case, the aggregate function takes arguments of the 'Array(T)' type (arrays) instead of 'T' type arguments. If the aggregate function accepts multiple arguments, this must be arrays of equal lengths. When processing arrays, the aggregate function works like the original aggregate function across all array elements. +The -Array suffix can be appended to any aggregate function. In this case, the aggregate function takes arguments of the ‘Array(T)’ type (arrays) instead of ‘T’ type arguments. If the aggregate function accepts multiple arguments, this must be arrays of equal lengths. 
When processing arrays, the aggregate function works like the original aggregate function across all array elements. -Example 1: `sumArray(arr)` - Totals all the elements of all 'arr' arrays. In this example, it could have been written more simply: `sum(arraySum(arr))`. +Example 1: `sumArray(arr)` - Totals all the elements of all ‘arr’ arrays. In this example, it could have been written more simply: `sum(arraySum(arr))`. -Example 2: `uniqArray(arr)` – Counts the number of unique elements in all 'arr' arrays. This could be done an easier way: `uniq(arrayJoin(arr))`, but it's not always possible to add 'arrayJoin' to a query. +Example 2: `uniqArray(arr)` – Counts the number of unique elements in all ‘arr’ arrays. This could be done an easier way: `uniq(arrayJoin(arr))`, but it’s not always possible to add ‘arrayJoin’ to a query. --If and -Array can be combined. However, 'Array' must come first, then 'If'. Examples: `uniqArrayIf(arr, cond)`, `quantilesTimingArrayIf(level1, level2)(arr, cond)`. Due to this order, the 'cond' argument won't be an array. +-If and -Array can be combined. However, ‘Array’ must come first, then ‘If’. Examples: `uniqArrayIf(arr, cond)`, `quantilesTimingArrayIf(level1, level2)(arr, cond)`. Due to this order, the ‘cond’ argument won’t be an array. ## -State {#agg-functions-combinator-state} -If you apply this combinator, the aggregate function doesn't return the resulting value (such as the number of unique values for the [uniq](reference.md#agg_function-uniq) function), but an intermediate state of the aggregation (for `uniq`, this is the hash table for calculating the number of unique values). This is an `AggregateFunction(...)` that can be used for further processing or stored in a table to finish aggregating later. +If you apply this combinator, the aggregate function doesn’t return the resulting value (such as the number of unique values for the [uniq](reference.md#agg_function-uniq) function), but an intermediate state of the aggregation (for `uniq`, this is the hash table for calculating the number of unique values). This is an `AggregateFunction(...)` that can be used for further processing or stored in a table to finish aggregating later. To work with these states, use: -- [AggregatingMergeTree](../../operations/table_engines/aggregatingmergetree.md) table engine. -- [finalizeAggregation](../functions/other_functions.md#function-finalizeaggregation) function. -- [runningAccumulate](../functions/other_functions.md#function-runningaccumulate) function. -- [-Merge](#aggregate_functions_combinators_merge) combinator. -- [-MergeState](#aggregate_functions_combinators_mergestate) combinator. +- [AggregatingMergeTree](../../operations/table_engines/aggregatingmergetree.md) table engine. +- [finalizeAggregation](../functions/other_functions.md#function-finalizeaggregation) function. +- [runningAccumulate](../functions/other_functions.md#function-runningaccumulate) function. +- [-Merge](#aggregate_functions_combinators_merge) combinator. +- [-MergeState](#aggregate_functions_combinators_mergestate) combinator. -## -Merge {#aggregate_functions_combinators_merge} +## -Merge {#aggregate-functions-combinators-merge} If you apply this combinator, the aggregate function takes the intermediate aggregation state as an argument, combines the states to finish aggregation, and returns the resulting value. 
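A minimal sketch of how `-State` and `-Merge` pair up (the `visits` table and its `UserID`/`RegionID` columns are hypothetical): the inner query keeps an intermediate `uniq` state per region, and the outer query merges those states into one final value.

``` sql
SELECT uniqMerge(state) AS total_unique_users
FROM
(
    -- one intermediate aggregation state per region
    SELECT uniqState(UserID) AS state
    FROM visits
    GROUP BY RegionID
)
```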
-## -MergeState {#aggregate_functions_combinators_mergestate} +## -MergeState {#aggregate-functions-combinators-mergestate} -Merges the intermediate aggregation states in the same way as the -Merge combinator. However, it doesn't return the resulting value, but an intermediate aggregation state, similar to the -State combinator. +Merges the intermediate aggregation states in the same way as the -Merge combinator. However, it doesn’t return the resulting value, but an intermediate aggregation state, similar to the -State combinator. ## -ForEach {#agg-functions-combinator-foreach} @@ -46,12 +46,13 @@ Converts an aggregate function for tables into an aggregate function for arrays ## -OrDefault {#agg-functions-combinator-ordefault} -Fills the default value of the aggregate function's return type if there is nothing to aggregate. +Fills the default value of the aggregate function’s return type if there is nothing to aggregate. -```sql +``` sql SELECT avg(number), avgOrDefault(number) FROM numbers(0) ``` -```text + +``` text ┌─avg(number)─┬─avgOrDefault(number)─┐ │ nan │ 0 │ └─────────────┴──────────────────────┘ @@ -61,10 +62,11 @@ SELECT avg(number), avgOrDefault(number) FROM numbers(0) Fills `null` if there is nothing to aggregate. The return column will be nullable. -```sql +``` sql SELECT avg(number), avgOrNull(number) FROM numbers(0) ``` -```text + +``` text ┌─avg(number)─┬─avgOrNull(number)─┐ │ nan │ ᴺᵁᴸᴸ │ └─────────────┴───────────────────┘ @@ -72,45 +74,45 @@ SELECT avg(number), avgOrNull(number) FROM numbers(0) -OrDefault and -OrNull can be combined with other combinators. It is useful when the aggregate function does not accept the empty input. -```sql +``` sql SELECT avgOrNullIf(x, x > 10) FROM ( SELECT toDecimal32(1.23, 2) AS x ) ``` -```text + +``` text ┌─avgOrNullIf(x, greater(x, 10))─┐ │ ᴺᵁᴸᴸ │ └────────────────────────────────┘ ``` -## -Resample {#agg_functions-combinator-resample} +## -Resample {#agg-functions-combinator-resample} Lets you divide data into groups, and then separately aggregates the data in those groups. Groups are created by splitting the values from one column into intervals. -```sql +``` sql Resample(start, end, step)(, resampling_key) ``` **Parameters** -- `start` — Starting value of the whole required interval for `resampling_key` values. -- `stop` — Ending value of the whole required interval for `resampling_key` values. The whole interval doesn't include the `stop` value `[start, stop)`. -- `step` — Step for separating the whole interval into subintervals. The `aggFunction` is executed over each of those subintervals independently. -- `resampling_key` — Column whose values are used for separating data into intervals. -- `aggFunction_params` — `aggFunction` parameters. - +- `start` — Starting value of the whole required interval for `resampling_key` values. +- `stop` — Ending value of the whole required interval for `resampling_key` values. The whole interval doesn’t include the `stop` value `[start, stop)`. +- `step` — Step for separating the whole interval into subintervals. The `aggFunction` is executed over each of those subintervals independently. +- `resampling_key` — Column whose values are used for separating data into intervals. +- `aggFunction_params` — `aggFunction` parameters. **Returned values** -- Array of `aggFunction` results for each subinterval. +- Array of `aggFunction` results for each subinterval. 
**Example** Consider the `people` table with the following data: -```text +``` text ┌─name───┬─age─┬─wage─┐ │ John │ 16 │ 10 │ │ Alice │ 30 │ 15 │ @@ -121,14 +123,15 @@ Consider the `people` table with the following data: └────────┴─────┴──────┘ ``` -Let's get the names of the people whose age lies in the intervals of `[30,60)` and `[60,75)`. Since we use integer representation for age, we get ages in the `[30, 59]` and `[60,74]` intervals. +Let’s get the names of the people whose age lies in the intervals of `[30,60)` and `[60,75)`. Since we use integer representation for age, we get ages in the `[30, 59]` and `[60,74]` intervals. -To aggregate names in an array, we use the [groupArray](reference.md#agg_function-grouparray) aggregate function. It takes one argument. In our case, it's the `name` column. The `groupArrayResample` function should use the `age` column to aggregate names by age. To define the required intervals, we pass the `30, 75, 30` arguments into the `groupArrayResample` function. +To aggregate names in an array, we use the [groupArray](reference.md#agg_function-grouparray) aggregate function. It takes one argument. In our case, it’s the `name` column. The `groupArrayResample` function should use the `age` column to aggregate names by age. To define the required intervals, we pass the `30, 75, 30` arguments into the `groupArrayResample` function. -```sql +``` sql SELECT groupArrayResample(30, 75, 30)(name, age) FROM people ``` -```text + +``` text ┌─groupArrayResample(30, 75, 30)(name, age)─────┐ │ [['Alice','Mary','Evelyn'],['David','Brian']] │ └───────────────────────────────────────────────┘ @@ -136,17 +139,18 @@ SELECT groupArrayResample(30, 75, 30)(name, age) FROM people Consider the results. -`Jonh` is out of the sample because he's too young. Other people are distributed according to the specified age intervals. +`Jonh` is out of the sample because he’s too young. Other people are distributed according to the specified age intervals. -Now let's count the total number of people and their average wage in the specified age intervals. +Now let’s count the total number of people and their average wage in the specified age intervals. -```sql +``` sql SELECT countResample(30, 75, 30)(name, age) AS amount, avgResample(30, 75, 30)(wage, age) AS avg_wage FROM people ``` -```text + +``` text ┌─amount─┬─avg_wage──────────────────┐ │ [3,2] │ [11.5,12.949999809265137] │ └────────┴───────────────────────────┘ diff --git a/docs/en/query_language/agg_functions/index.md b/docs/en/query_language/agg_functions/index.md index 613518611f6..18f1555f9dc 100644 --- a/docs/en/query_language/agg_functions/index.md +++ b/docs/en/query_language/agg_functions/index.md @@ -1,13 +1,13 @@ -# Aggregate functions {#aggregate_functions} +# Aggregate functions {#aggregate-functions} Aggregate functions work in the [normal](http://www.sql-tutorial.com/sql-aggregate-functions-sql-tutorial) way as expected by database experts. ClickHouse also supports: -- [Parametric aggregate functions](parametric_functions.md#aggregate_functions_parametric), which accept other parameters in addition to columns. -- [Combinators](combinators.md#aggregate_functions_combinators), which change the behavior of aggregate functions. +- [Parametric aggregate functions](parametric_functions.md#aggregate_functions_parametric), which accept other parameters in addition to columns. +- [Combinators](combinators.md#aggregate_functions_combinators), which change the behavior of aggregate functions. 
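As a brief illustration of these two extensions, here is a sketch over a hypothetical table `t` with a numeric column `x`: `quantiles` takes its levels as parameters in a first pair of brackets, and the `-If` combinator adds a condition argument to an ordinary aggregate function.

``` sql
-- quantiles(0.5, 0.9) is parametric: the levels are parameters, x is the argument.
-- sumIf is sum extended with the -If combinator: only rows matching x > 0 are summed.
SELECT
    quantiles(0.5, 0.9)(x) AS q,
    sumIf(x, x > 0) AS positive_sum
FROM t
```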
-## NULL processing +## NULL processing {#null-processing} During aggregation, all `NULL`s are skipped. @@ -15,7 +15,7 @@ During aggregation, all `NULL`s are skipped. Consider this table: -```text +``` text ┌─x─┬────y─┐ │ 1 │ 2 │ │ 2 │ ᴺᵁᴸᴸ │ @@ -25,32 +25,30 @@ Consider this table: └───┴──────┘ ``` -Let's say you need to total the values in the `y` column: +Let’s say you need to total the values in the `y` column: -```sql +``` sql SELECT sum(y) FROM t_null_big ``` -``` -┌─sum(y)─┐ -│ 7 │ -└────────┘ -``` + + ┌─sum(y)─┐ + │ 7 │ + └────────┘ The `sum` function interprets `NULL` as `0`. In particular, this means that if the function receives input of a selection where all the values are `NULL`, then the result will be `0`, not `NULL`. Now you can use the `groupArray` function to create an array from the `y` column: -```sql +``` sql SELECT groupArray(y) FROM t_null_big ``` -```text + +``` text ┌─groupArray(y)─┐ │ [2,2,3] │ └───────────────┘ - ``` `groupArray` does not include `NULL` in the resulting array. - [Original article](https://clickhouse.tech/docs/en/query_language/agg_functions/) diff --git a/docs/en/query_language/agg_functions/parametric_functions.md b/docs/en/query_language/agg_functions/parametric_functions.md index 9a7a356d7c4..dd53b66d711 100644 --- a/docs/en/query_language/agg_functions/parametric_functions.md +++ b/docs/en/query_language/agg_functions/parametric_functions.md @@ -1,15 +1,15 @@ -# Parametric aggregate functions {#aggregate_functions_parametric} +# Parametric aggregate functions {#aggregate-functions-parametric} Some aggregate functions can accept not only argument columns (used for compression), but a set of parameters – constants for initialization. The syntax is two pairs of brackets instead of one. The first is for parameters, and the second is for arguments. -## histogram +## histogram {#histogram} -Calculates an adaptive histogram. It doesn't guarantee precise results. +Calculates an adaptive histogram. It doesn’t guarantee precise results. -```sql +``` sql histogram(number_of_bins)(values) ``` - + The functions uses [A Streaming Parallel Decision Tree Algorithm](http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf). The borders of histogram bins are adjusted as new data enters a function. In common case, the widths of bins are not equal. **Parameters** @@ -19,27 +19,28 @@ The functions uses [A Streaming Parallel Decision Tree Algorithm](http://jmlr.or **Returned values** -- [Array](../../data_types/array.md) of [Tuples](../../data_types/tuple.md) of the following format: +- [Array](../../data_types/array.md) of [Tuples](../../data_types/tuple.md) of the following format: - ``` - [(lower_1, upper_1, height_1), ... (lower_N, upper_N, height_N)] - ``` + ``` + [(lower_1, upper_1, height_1), ... (lower_N, upper_N, height_N)] + ``` - - `lower` — Lower bound of the bin. - - `upper` — Upper bound of the bin. - - `height` — Calculated height of the bin. + - `lower` — Lower bound of the bin. + - `upper` — Upper bound of the bin. + - `height` — Calculated height of the bin. 
**Example** -```sql -SELECT histogram(5)(number + 1) +``` sql +SELECT histogram(5)(number + 1) FROM ( - SELECT * - FROM system.numbers + SELECT * + FROM system.numbers LIMIT 20 ) ``` -```text + +``` text ┌─histogram(5)(plus(number, 1))───────────────────────────────────────────┐ │ [(1,4.5,4),(4.5,8.5,4),(8.5,12.75,4.125),(12.75,17,4.625),(17,20,3.25)] │ └─────────────────────────────────────────────────────────────────────────┘ @@ -47,19 +48,20 @@ FROM ( You can visualize a histogram with the [bar](../functions/other_functions.md#function-bar) function, for example: -```sql +``` sql WITH histogram(5)(rand() % 100) AS hist -SELECT - arrayJoin(hist).3 AS height, +SELECT + arrayJoin(hist).3 AS height, bar(height, 0, 6, 5) AS bar -FROM +FROM ( SELECT * FROM system.numbers LIMIT 20 ) ``` -```text + +``` text ┌─height─┬─bar───┐ │ 2.125 │ █▋ │ │ 3.25 │ ██▌ │ @@ -69,52 +71,48 @@ FROM └────────┴───────┘ ``` -In this case, you should remember that you don't know the histogram bin borders. +In this case, you should remember that you don’t know the histogram bin borders. -## sequenceMatch(pattern)(timestamp, cond1, cond2, ...) {#function-sequencematch} +## sequenceMatch(pattern)(timestamp, cond1, cond2, …) {#function-sequencematch} Checks whether the sequence contains an event chain that matches the pattern. -```sql +``` sql sequenceMatch(pattern)(timestamp, cond1, cond2, ...) ``` !!! warning "Warning" Events that occur at the same second may lay in the sequence in an undefined order affecting the result. - **Parameters** -- `pattern` — Pattern string. See [Pattern syntax](#sequence-function-pattern-syntax). +- `pattern` — Pattern string. See [Pattern syntax](#sequence-function-pattern-syntax). -- `timestamp` — Column considered to contain time data. Typical data types are `Date` and `DateTime`. You can also use any of the supported [UInt](../../data_types/int_uint.md) data types. - -- `cond1`, `cond2` — Conditions that describe the chain of events. Data type: `UInt8`. You can pass up to 32 condition arguments. The function takes only the events described in these conditions into account. If the sequence contains data that isn't described in a condition, the function skips them. +- `timestamp` — Column considered to contain time data. Typical data types are `Date` and `DateTime`. You can also use any of the supported [UInt](../../data_types/int_uint.md) data types. +- `cond1`, `cond2` — Conditions that describe the chain of events. Data type: `UInt8`. You can pass up to 32 condition arguments. The function takes only the events described in these conditions into account. If the sequence contains data that isn’t described in a condition, the function skips them. **Returned values** -- 1, if the pattern is matched. -- 0, if the pattern isn't matched. - +- 1, if the pattern is matched. +- 0, if the pattern isn’t matched. Type: `UInt8`. - **Pattern syntax** -- `(?N)` — Matches the condition argument at position `N`. Conditions are numbered in the `[1, 32]` range. For example, `(?1)` matches the argument passed to the `cond1` parameter. +- `(?N)` — Matches the condition argument at position `N`. Conditions are numbered in the `[1, 32]` range. For example, `(?1)` matches the argument passed to the `cond1` parameter. -- `.*` — Matches any number of events. You don't need conditional arguments to match this element of the pattern. +- `.*` — Matches any number of events. You don’t need conditional arguments to match this element of the pattern. 
-- `(?t operator value)` — Sets the time in seconds that should separate two events. For example, pattern `(?1)(?t>1800)(?2)` matches events that occur more than 1800 seconds from each other. An arbitrary number of any events can lay between these events. You can use the `>=`, `>`, `<`, `<=` operators. +- `(?t operator value)` — Sets the time in seconds that should separate two events. For example, pattern `(?1)(?t>1800)(?2)` matches events that occur more than 1800 seconds from each other. An arbitrary number of any events can lay between these events. You can use the `>=`, `>`, `<`, `<=` operators. **Examples** Consider data in the `t` table: -```text +``` text ┌─time─┬─number─┐ │ 1 │ 1 │ │ 2 │ 3 │ @@ -124,10 +122,11 @@ Consider data in the `t` table: Perform the query: -```sql +``` sql SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2) FROM t ``` -```text + +``` text ┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2))─┐ │ 1 │ └───────────────────────────────────────────────────────────────────────┘ @@ -135,64 +134,62 @@ SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2) FROM t The function found the event chain where number 2 follows number 1. It skipped number 3 between them, because the number is not described as an event. If we want to take this number into account when searching for the event chain given in the example, we should make a condition for it. -```sql +``` sql SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 3) FROM t ``` -```text + +``` text ┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2), equals(number, 3))─┐ │ 0 │ └──────────────────────────────────────────────────────────────────────────────────────────┘ ``` -In this case, the function couldn't find the event chain matching the pattern, because the event for number 3 occured between 1 and 2. If in the same case we checked the condition for number 4, the sequence would match the pattern. +In this case, the function couldn’t find the event chain matching the pattern, because the event for number 3 occured between 1 and 2. If in the same case we checked the condition for number 4, the sequence would match the pattern. -```sql +``` sql SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM t ``` -```text + +``` text ┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2), equals(number, 4))─┐ │ 1 │ └──────────────────────────────────────────────────────────────────────────────────────────┘ ``` - **See Also** -- [sequenceCount](#function-sequencecount) +- [sequenceCount](#function-sequencecount) +## sequenceCount(pattern)(time, cond1, cond2, …) {#function-sequencecount} -## sequenceCount(pattern)(time, cond1, cond2, ...) {#function-sequencecount} - -Counts the number of event chains that matched the pattern. The function searches event chains that don't overlap. It starts to search for the next chain after the current chain is matched. +Counts the number of event chains that matched the pattern. The function searches event chains that don’t overlap. It starts to search for the next chain after the current chain is matched. !!! warning "Warning" Events that occur at the same second may lay in the sequence in an undefined order affecting the result. -```sql +``` sql sequenceCount(pattern)(timestamp, cond1, cond2, ...) ``` **Parameters** -- `pattern` — Pattern string. See [Pattern syntax](#sequence-function-pattern-syntax). +- `pattern` — Pattern string. 
See [Pattern syntax](#sequence-function-pattern-syntax). -- `timestamp` — Column considered to contain time data. Typical data types are `Date` and `DateTime`. You can also use any of the supported [UInt](../../data_types/int_uint.md) data types. - -- `cond1`, `cond2` — Conditions that describe the chain of events. Data type: `UInt8`. You can pass up to 32 condition arguments. The function takes only the events described in these conditions into account. If the sequence contains data that isn't described in a condition, the function skips them. +- `timestamp` — Column considered to contain time data. Typical data types are `Date` and `DateTime`. You can also use any of the supported [UInt](../../data_types/int_uint.md) data types. +- `cond1`, `cond2` — Conditions that describe the chain of events. Data type: `UInt8`. You can pass up to 32 condition arguments. The function takes only the events described in these conditions into account. If the sequence contains data that isn’t described in a condition, the function skips them. **Returned values** -- Number of non-overlapping event chains that are matched. +- Number of non-overlapping event chains that are matched. Type: `UInt64`. - **Example** Consider data in the `t` table: -```text +``` text ┌─time─┬─number─┐ │ 1 │ 1 │ │ 2 │ 3 │ @@ -205,10 +202,11 @@ Consider data in the `t` table: Count how many times the number 2 occurs after the number 1 with any amount of other numbers between them: -```sql +``` sql SELECT sequenceCount('(?1).*(?2)')(time, number = 1, number = 2) FROM t ``` -```text + +``` text ┌─sequenceCount('(?1).*(?2)')(time, equals(number, 1), equals(number, 2))─┐ │ 2 │ └─────────────────────────────────────────────────────────────────────────┘ @@ -216,8 +214,7 @@ SELECT sequenceCount('(?1).*(?2)')(time, number = 1, number = 2) FROM t **See Also** -- [sequenceMatch](#function-sequencematch) - +- [sequenceMatch](#function-sequencematch) ## windowFunnel {#windowfunnel} @@ -225,25 +222,25 @@ Searches for event chains in a sliding time window and calculates the maximum nu The function works according to the algorithm: -- The function searches for data that triggers the first condition in the chain and sets the event counter to 1. This is the moment when the sliding window starts. +- The function searches for data that triggers the first condition in the chain and sets the event counter to 1. This is the moment when the sliding window starts. -- If events from the chain occur sequentially within the window, the counter is incremented. If the sequence of events is disrupted, the counter isn't incremented. +- If events from the chain occur sequentially within the window, the counter is incremented. If the sequence of events is disrupted, the counter isn’t incremented. -- If the data has multiple event chains at varying points of completion, the function will only output the size of the longest chain. +- If the data has multiple event chains at varying points of completion, the function will only output the size of the longest chain. -**Syntax** +**Syntax** -```sql +``` sql windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN) ``` **Parameters** -- `window` — Length of the sliding window in seconds. -- `mode` - It is an optional argument. - - `'strict'` - When the `'strict'` is set, the windowFunnel() applies conditions only for the unique values. -- `timestamp` — Name of the column containing the timestamp. 
Data types supported: [Date](../../data_types/date.md), [DateTime](../../data_types/datetime.md#data_type-datetime) and other unsigned integer types (note that even though timestamp supports the `UInt64` type, it's value can't exceed the Int64 maximum, which is 2^63 - 1). -- `cond` — Conditions or data describing the chain of events. [UInt8](../../data_types/int_uint.md). +- `window` — Length of the sliding window in seconds. +- `mode` - It is an optional argument. + - `'strict'` - When the `'strict'` is set, the windowFunnel() applies conditions only for the unique values. +- `timestamp` — Name of the column containing the timestamp. Data types supported: [Date](../../data_types/date.md), [DateTime](../../data_types/datetime.md#data_type-datetime) and other unsigned integer types (note that even though timestamp supports the `UInt64` type, it’s value can’t exceed the Int64 maximum, which is 2^63 - 1). +- `cond` — Conditions or data describing the chain of events. [UInt8](../../data_types/int_uint.md). **Returned value** @@ -258,14 +255,14 @@ Determine if a set period of time is enough for the user to select a phone and p Set the following chain of events: -1. The user logged in to their account on the store (`eventID = 1003`). -2. The user searches for a phone (`eventID = 1007, product = 'phone'`). -3. The user placed an order (`eventID = 1009`). -4. The user made the order again (`eventID = 1010`). +1. The user logged in to their account on the store (`eventID = 1003`). +2. The user searches for a phone (`eventID = 1007, product = 'phone'`). +3. The user placed an order (`eventID = 1009`). +4. The user made the order again (`eventID = 1010`). Input table: -```text +``` text ┌─event_date─┬─user_id─┬───────────timestamp─┬─eventID─┬─product─┐ │ 2019-01-28 │ 1 │ 2019-01-29 10:00:00 │ 1003 │ phone │ └────────────┴─────────┴─────────────────────┴─────────┴─────────┘ @@ -284,7 +281,7 @@ Find out how far the user `user_id` could get through the chain in a period in J Query: -```sql +``` sql SELECT level, count() AS c @@ -303,7 +300,7 @@ ORDER BY level ASC Result: -```text +``` text ┌─level─┬─c─┐ │ 4 │ 1 │ └───────┴───┘ @@ -316,32 +313,32 @@ Any condition can be specified as an argument (as in [WHERE](../../query_languag The conditions, except the first, apply in pairs: the result of the second will be true if the first and second are true, of the third if the first and fird are true, etc. -**Syntax** +**Syntax** -```sql +``` sql retention(cond1, cond2, ..., cond32); ``` **Parameters** -- `cond` — an expression that returns a `UInt8` result (1 or 0). +- `cond` — an expression that returns a `UInt8` result (1 or 0). **Returned value** The array of 1 or 0. -- 1 — condition was met for the event. -- 0 — condition wasn't met for the event. +- 1 — condition was met for the event. +- 0 — condition wasn’t met for the event. Type: `UInt8`. **Example** -Let's consider an example of calculating the `retention` function to determine site traffic. +Let’s consider an example of calculating the `retention` function to determine site traffic. **1.** Сreate a table to illustrate an example. 
-```sql +``` sql CREATE TABLE retention_test(date Date, uid Int32) ENGINE = Memory; INSERT INTO retention_test SELECT '2020-01-01', number FROM numbers(5); @@ -353,13 +350,13 @@ Input table: Query: -```sql +``` sql SELECT * FROM retention_test ``` Result: -```text +``` text ┌───────date─┬─uid─┐ │ 2020-01-01 │ 0 │ │ 2020-01-01 │ 1 │ @@ -402,7 +399,7 @@ Result: Query: -```sql +``` sql SELECT uid, retention(date = '2020-01-01', date = '2020-01-02', date = '2020-01-03') AS r @@ -414,7 +411,7 @@ ORDER BY uid ASC Result: -```text +``` text ┌─uid─┬─r───────┐ │ 0 │ [1,1,1] │ │ 1 │ [1,1,1] │ @@ -434,11 +431,11 @@ Result: └─────┴─────────┘ ``` -**3.** Calculate the total number of site visits per day. +**3.** Calculate the total number of site visits per day. Query: -```sql +``` sql SELECT sum(r[1]) AS r1, sum(r[2]) AS r2, @@ -456,7 +453,7 @@ FROM Result: -```text +``` text ┌─r1─┬─r2─┬─r3─┐ │ 5 │ 5 │ 5 │ └────┴────┴────┘ @@ -464,11 +461,11 @@ Result: Where: -- `r1`- the number of unique visitors who visited the site during 2020-01-01 (the `cond1` condition). -- `r2`- the number of unique visitors who visited the site during a specific time period between 2020-01-01 and 2020-01-02 (`cond1` and `cond2` conditions). -- `r3`- the number of unique visitors who visited the site during a specific time period between 2020-01-01 and 2020-01-03 (`cond1` and `cond3` conditions). +- `r1`- the number of unique visitors who visited the site during 2020-01-01 (the `cond1` condition). +- `r2`- the number of unique visitors who visited the site during a specific time period between 2020-01-01 and 2020-01-02 (`cond1` and `cond2` conditions). +- `r3`- the number of unique visitors who visited the site during a specific time period between 2020-01-01 and 2020-01-03 (`cond1` and `cond3` conditions). -## uniqUpTo(N)(x) +## uniqUpTo(N)(x) {#uniquptonx} Calculates the number of different argument values ​​if it is less than or equal to N. If the number of different argument values is greater than N, it returns N + 1. @@ -483,14 +480,13 @@ It works as fast as possible, except for cases when a large N value is used and Usage example: -```text +``` text Problem: Generate a report that shows only keywords that produced at least 5 unique users. Solution: Write in the GROUP BY query SearchPhrase HAVING uniqUpTo(4)(UserID) >= 5 ``` [Original article](https://clickhouse.tech/docs/en/query_language/agg_functions/parametric_functions/) - -## sumMapFiltered(keys_to_keep)(keys, values) +## sumMapFiltered(keys\_to\_keep)(keys, values) {#summapfilteredkeys-to-keepkeys-values} Same behavior as [sumMap](reference.md#agg_functions-summap) except that an array of keys is passed as a parameter. This can be especially useful when working with a high cardinality of keys. diff --git a/docs/en/query_language/agg_functions/reference.md b/docs/en/query_language/agg_functions/reference.md index 7c099c26580..e57806af0c3 100644 --- a/docs/en/query_language/agg_functions/reference.md +++ b/docs/en/query_language/agg_functions/reference.md @@ -1,6 +1,6 @@ -# Function Reference +# Function Reference {#function-reference} -## count {#agg_function-count} +## count {#agg-function-count} Counts the number of rows or not-NULL values. @@ -12,32 +12,31 @@ ClickHouse supports the following syntaxes for `count`: The function can take: -- Zero parameters. -- One [expression](../syntax.md#syntax-expressions). - +- Zero parameters. +- One [expression](../syntax.md#syntax-expressions). 
**Returned value** -- If the function is called without parameters it counts the number of rows. -- If the [expression](../syntax.md#syntax-expressions) is passed, then the function counts how many times this expression returned not null. If the expression returns a [Nullable](../../data_types/nullable.md)-type value, then the result of `count` stays not `Nullable`. The function returns 0 if the expression returned `NULL` for all the rows. +- If the function is called without parameters it counts the number of rows. +- If the [expression](../syntax.md#syntax-expressions) is passed, then the function counts how many times this expression returned not null. If the expression returns a [Nullable](../../data_types/nullable.md)-type value, then the result of `count` stays not `Nullable`. The function returns 0 if the expression returned `NULL` for all the rows. In both cases the type of the returned value is [UInt64](../../data_types/int_uint.md). **Details** -ClickHouse supports the `COUNT(DISTINCT ...)` syntax. The behavior of this construction depends on the [count_distinct_implementation](../../operations/settings/settings.md#settings-count_distinct_implementation) setting. It defines which of the [uniq*](#agg_function-uniq) functions is used to perform the operation. The default is the [uniqExact](#agg_function-uniqexact) function. +ClickHouse supports the `COUNT(DISTINCT ...)` syntax. The behavior of this construction depends on the [count\_distinct\_implementation](../../operations/settings/settings.md#settings-count_distinct_implementation) setting. It defines which of the [uniq\*](#agg_function-uniq) functions is used to perform the operation. The default is the [uniqExact](#agg_function-uniqexact) function. The `SELECT count() FROM table` query is not optimized, because the number of entries in the table is not stored separately. It chooses a small column from the table and counts the number of values in it. - **Examples** Example 1: -```sql +``` sql SELECT count() FROM t ``` -```text + +``` text ┌─count()─┐ │ 5 │ └─────────┘ @@ -45,18 +44,21 @@ SELECT count() FROM t Example 2: -```sql +``` sql SELECT name, value FROM system.settings WHERE name = 'count_distinct_implementation' ``` -```text + +``` text ┌─name──────────────────────────┬─value─────┐ │ count_distinct_implementation │ uniqExact │ └───────────────────────────────┴───────────┘ ``` -```sql + +``` sql SELECT count(DISTINCT num) FROM t ``` -```text + +``` text ┌─uniqExact(num)─┐ │ 3 │ └────────────────┘ @@ -64,54 +66,53 @@ SELECT count(DISTINCT num) FROM t This example shows that `count(DISTINCT num)` is performed by the `uniqExact` function according to the `count_distinct_implementation` setting value. - -## any(x) {#agg_function-any} +## any(x) {#agg-function-any} Selects the first encountered value. The query can be executed in any order and even in a different order each time, so the result of this function is indeterminate. -To get a determinate result, you can use the 'min' or 'max' function instead of 'any'. +To get a determinate result, you can use the ‘min’ or ‘max’ function instead of ‘any’. In some cases, you can rely on the order of execution. This applies to cases when SELECT comes from a subquery that uses ORDER BY. When a `SELECT` query has the `GROUP BY` clause or at least one aggregate function, ClickHouse (in contrast to MySQL) requires that all expressions in the `SELECT`, `HAVING`, and `ORDER BY` clauses be calculated from keys or from aggregate functions. 
In other words, each column selected from the table must be used either in keys or inside aggregate functions. To get behavior like in MySQL, you can put the other columns in the `any` aggregate function. -## anyHeavy(x) +## anyHeavy(x) {#anyheavyx} -Selects a frequently occurring value using the [heavy hitters](http://www.cs.umd.edu/~samir/498/karp.pdf) algorithm. If there is a value that occurs more than in half the cases in each of the query's execution threads, this value is returned. Normally, the result is nondeterministic. +Selects a frequently occurring value using the [heavy hitters](http://www.cs.umd.edu/~samir/498/karp.pdf) algorithm. If there is a value that occurs more than in half the cases in each of the query’s execution threads, this value is returned. Normally, the result is nondeterministic. -```sql +``` sql anyHeavy(column) ``` **Arguments** -- `column` – The column name. +- `column` – The column name. **Example** Take the [OnTime](../../getting_started/example_datasets/ontime.md) data set and select any frequently occurring value in the `AirlineID` column. -```sql +``` sql SELECT anyHeavy(AirlineID) AS res FROM ontime ``` -```text +``` text ┌───res─┐ │ 19690 │ └───────┘ ``` -## anyLast(x) +## anyLast(x) {#anylastx} Selects the last value encountered. The result is just as indeterminate as for the `any` function. -##groupBitAnd +## groupBitAnd {#groupbitand} Applies bitwise `AND` for series of numbers. -```sql +``` sql groupBitAnd(expr) ``` @@ -127,7 +128,7 @@ Value of the `UInt*` type. Test data: -```text +``` text binary decimal 00101100 = 44 00011100 = 28 @@ -137,7 +138,7 @@ binary decimal Query: -```sql +``` sql SELECT groupBitAnd(num) FROM t ``` @@ -145,16 +146,16 @@ Where `num` is the column with the test data. Result: -```text +``` text binary decimal 00000100 = 4 ``` -##groupBitOr +## groupBitOr {#groupbitor} Applies bitwise `OR` for series of numbers. -```sql +``` sql groupBitOr(expr) ``` @@ -170,7 +171,7 @@ Value of the `UInt*` type. Test data: -```text +``` text binary decimal 00101100 = 44 00011100 = 28 @@ -180,7 +181,7 @@ binary decimal Query: -```sql +``` sql SELECT groupBitOr(num) FROM t ``` @@ -188,16 +189,16 @@ Where `num` is the column with the test data. Result: -```text +``` text binary decimal 01111101 = 125 ``` -##groupBitXor +## groupBitXor {#groupbitxor} Applies bitwise `XOR` for series of numbers. -```sql +``` sql groupBitXor(expr) ``` @@ -213,7 +214,7 @@ Value of the `UInt*` type. Test data: -```text +``` text binary decimal 00101100 = 44 00011100 = 28 @@ -223,7 +224,7 @@ binary decimal Query: -```sql +``` sql SELECT groupBitXor(num) FROM t ``` @@ -231,16 +232,16 @@ Where `num` is the column with the test data. Result: -```text +``` text binary decimal 01101000 = 104 ``` -## groupBitmap +## groupBitmap {#groupbitmap} Bitmap or Aggregate calculations from a unsigned integer column, return cardinality of type UInt64, if add suffix -State, then return [bitmap object](../functions/bitmap_functions.md). -```sql +``` sql groupBitmap(expr) ``` @@ -256,7 +257,7 @@ Value of the `UInt64` type. Test data: -```text +``` text UserID 1 1 @@ -266,72 +267,73 @@ UserID Query: -```sql +``` sql SELECT groupBitmap(UserID) as num FROM t ``` Result: -```text +``` text num 3 ``` -## min(x) {#agg_function-min} +## min(x) {#agg-function-min} Calculates the minimum. -## max(x) {#agg_function-max} +## max(x) {#agg-function-max} Calculates the maximum. 
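A minimal usage sketch (the table `t` and column `x` are placeholders, not from the original):

``` sql
-- min and max return the smallest and the largest value of the column, respectively.
SELECT min(x), max(x) FROM t
```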
-## argMin(arg, val) {#agg_function-argMin} +## argMin(arg, val) {#agg-function-argmin} -Calculates the 'arg' value for a minimal 'val' value. If there are several different values of 'arg' for minimal values of 'val', the first of these values encountered is output. +Calculates the ‘arg’ value for a minimal ‘val’ value. If there are several different values of ‘arg’ for minimal values of ‘val’, the first of these values encountered is output. **Example:** -```text + +``` text ┌─user─────┬─salary─┐ │ director │ 5000 │ │ manager │ 3000 │ │ worker │ 1000 │ └──────────┴────────┘ ``` -```sql + +``` sql SELECT argMin(user, salary) FROM salary ``` -```text + +``` text ┌─argMin(user, salary)─┐ │ worker │ └──────────────────────┘ ``` -## argMax(arg, val) {#agg_function-argMax} +## argMax(arg, val) {#agg-function-argmax} -Calculates the 'arg' value for a maximum 'val' value. If there are several different values of 'arg' for maximum values of 'val', the first of these values encountered is output. +Calculates the ‘arg’ value for a maximum ‘val’ value. If there are several different values of ‘arg’ for maximum values of ‘val’, the first of these values encountered is output. - -## sum(x) {#agg_function-sum} +## sum(x) {#agg-function-sum} Calculates the sum. Only works for numbers. -## sumWithOverflow(x) +## sumWithOverflow(x) {#sumwithoverflowx} Computes the sum of the numbers, using the same data type for the result as for the input parameters. If the sum exceeds the maximum value for this data type, the function returns an error. Only works for numbers. +## sumMap(key, value) {#agg-functions-summap} -## sumMap(key, value) {#agg_functions-summap} - -Totals the 'value' array according to the keys specified in the 'key' array. -The number of elements in 'key' and 'value' must be the same for each row that is totaled. +Totals the ‘value’ array according to the keys specified in the ‘key’ array. +The number of elements in ‘key’ and ‘value’ must be the same for each row that is totaled. Returns a tuple of two arrays: keys in sorted order, and values ​​summed for the corresponding keys. Example: -```sql +``` sql CREATE TABLE sum_map( date Date, timeslot DateTime, @@ -352,18 +354,18 @@ FROM sum_map GROUP BY timeslot ``` -```text +``` text ┌────────────timeslot─┬─sumMap(statusMap.status, statusMap.requests)─┐ │ 2000-01-01 00:00:00 │ ([1,2,3,4,5],[10,10,20,10,10]) │ │ 2000-01-01 00:01:00 │ ([4,5,6,7,8],[10,10,20,10,10]) │ └─────────────────────┴──────────────────────────────────────────────┘ ``` -## skewPop +## skewPop {#skewpop} Computes the [skewness](https://en.wikipedia.org/wiki/Skewness) of a sequence. -```sql +``` sql skewPop(expr) ``` @@ -377,17 +379,17 @@ The skewness of the given distribution. Type — [Float64](../../data_types/floa **Example** -```sql +``` sql SELECT skewPop(value) FROM series_with_value_column ``` -## skewSamp +## skewSamp {#skewsamp} Computes the [sample skewness](https://en.wikipedia.org/wiki/Skewness) of a sequence. It represents an unbiased estimate of the skewness of a random variable if passed values form its sample. -```sql +``` sql skewSamp(expr) ``` @@ -401,15 +403,15 @@ The skewness of the given distribution. Type — [Float64](../../data_types/floa **Example** -```sql +``` sql SELECT skewSamp(value) FROM series_with_value_column ``` -## kurtPop +## kurtPop {#kurtpop} Computes the [kurtosis](https://en.wikipedia.org/wiki/Kurtosis) of a sequence. -```sql +``` sql kurtPop(expr) ``` @@ -423,17 +425,17 @@ The kurtosis of the given distribution. 
Type — [Float64](../../data_types/floa **Example** -```sql +``` sql SELECT kurtPop(value) FROM series_with_value_column ``` -## kurtSamp +## kurtSamp {#kurtsamp} Computes the [sample kurtosis](https://en.wikipedia.org/wiki/Kurtosis) of a sequence. It represents an unbiased estimate of the kurtosis of a random variable if passed values form its sample. -```sql +``` sql kurtSamp(expr) ``` @@ -447,24 +449,26 @@ The kurtosis of the given distribution. Type — [Float64](../../data_types/floa **Example** -```sql +``` sql SELECT kurtSamp(value) FROM series_with_value_column ``` -## timeSeriesGroupSum(uid, timestamp, value) {#agg_function-timeseriesgroupsum} +## timeSeriesGroupSum(uid, timestamp, value) {#agg-function-timeseriesgroupsum} + `timeSeriesGroupSum` can aggregate different time series that sample timestamp not alignment. It will use linear interpolation between two sample timestamp and then sum time-series together. -- `uid` is the time series unique id, `UInt64`. -- `timestamp` is Int64 type in order to support millisecond or microsecond. -- `value` is the metric. +- `uid` is the time series unique id, `UInt64`. +- `timestamp` is Int64 type in order to support millisecond or microsecond. +- `value` is the metric. The function returns array of tuples with `(timestamp, aggregated_value)` pairs. Before using this function make sure `timestamp` is in ascending order. Example: -```text + +``` text ┌─uid─┬─timestamp─┬─value─┐ │ 1 │ 2 │ 0.2 │ │ 1 │ 7 │ 0.7 │ @@ -478,7 +482,8 @@ Example: │ 2 │ 24 │ 4.8 │ └─────┴───────────┴───────┘ ``` -```sql + +``` sql CREATE TABLE time_series( uid UInt64, timestamp Int64, @@ -493,31 +498,35 @@ FROM ( SELECT * FROM time_series order by timestamp ASC ); ``` + And the result will be: -```text + +``` text [(2,0.2),(3,0.9),(7,2.1),(8,2.4),(12,3.6),(17,5.1),(18,5.4),(24,7.2),(25,2.5)] ``` -## timeSeriesGroupRateSum(uid, ts, val) {#agg_function-timeseriesgroupratesum} +## timeSeriesGroupRateSum(uid, ts, val) {#agg-function-timeseriesgroupratesum} + Similarly timeSeriesGroupRateSum, timeSeriesGroupRateSum will Calculate the rate of time-series and then sum rates together. Also, timestamp should be in ascend order before use this function. Use this function, the result above case will be: -```text + +``` text [(2,0),(3,0.1),(7,0.3),(8,0.3),(12,0.3),(17,0.3),(18,0.3),(24,0.3),(25,0.1)] ``` -## avg(x) {#agg_function-avg} +## avg(x) {#agg-function-avg} Calculates the average. Only works for numbers. The result is always Float64. -## uniq {#agg_function-uniq} +## uniq {#agg-function-uniq} Calculates the approximate number of different values of the argument. -```sql +``` sql uniq(x[, ...]) ``` @@ -527,33 +536,34 @@ The function takes a variable number of parameters. Parameters can be `Tuple`, ` **Returned value** -- A [UInt64](../../data_types/int_uint.md)-type number. +- A [UInt64](../../data_types/int_uint.md)-type number. **Implementation details** Function: -- Calculates a hash for all parameters in the aggregate, then uses it in calculations. -- Uses an adaptive sampling algorithm. For the calculation state, the function uses a sample of element hash values up to 65536. +- Calculates a hash for all parameters in the aggregate, then uses it in calculations. - This algorithm is very accurate and very efficient on the CPU. When the query contains several of these functions, using `uniq` is almost as fast as using other aggregate functions. +- Uses an adaptive sampling algorithm. For the calculation state, the function uses a sample of element hash values up to 65536. 
-- Provides the result deterministically (it doesn't depend on the query processing order). + This algorithm is very accurate and very efficient on the CPU. When the query contains several of these functions, using `uniq` is almost as fast as using other aggregate functions. + +- Provides the result deterministically (it doesn’t depend on the query processing order). We recommend using this function in almost all scenarios. **See Also** -- [uniqCombined](#agg_function-uniqcombined) -- [uniqCombined64](#agg_function-uniqcombined64) -- [uniqHLL12](#agg_function-uniqhll12) -- [uniqExact](#agg_function-uniqexact) +- [uniqCombined](#agg_function-uniqcombined) +- [uniqCombined64](#agg_function-uniqcombined64) +- [uniqHLL12](#agg_function-uniqhll12) +- [uniqExact](#agg_function-uniqexact) -## uniqCombined {#agg_function-uniqcombined} +## uniqCombined {#agg-function-uniqcombined} Calculates the approximate number of different argument values. -```sql +``` sql uniqCombined(HLL_precision)(x[, ...]) ``` @@ -567,44 +577,45 @@ The function takes a variable number of parameters. Parameters can be `Tuple`, ` **Returned value** -- A number [UInt64](../../data_types/int_uint.md)-type number. +- A number [UInt64](../../data_types/int_uint.md)-type number. **Implementation details** Function: -- Calculates a hash (64-bit hash for `String` and 32-bit otherwise) for all parameters in the aggregate, then uses it in calculations. -- Uses a combination of three algorithms: array, hash table, and HyperLogLog with an error correction table. +- Calculates a hash (64-bit hash for `String` and 32-bit otherwise) for all parameters in the aggregate, then uses it in calculations. - For a small number of distinct elements, an array is used. When the set size is larger, a hash table is used. For a larger number of elements, HyperLogLog is used, which will occupy a fixed amount of memory. +- Uses a combination of three algorithms: array, hash table, and HyperLogLog with an error correction table. -- Provides the result deterministically (it doesn't depend on the query processing order). + For a small number of distinct elements, an array is used. When the set size is larger, a hash table is used. For a larger number of elements, HyperLogLog is used, which will occupy a fixed amount of memory. + +- Provides the result deterministically (it doesn’t depend on the query processing order). !!! note "Note" Since it uses 32-bit hash for non-`String` type, the result will have very high error for cardinalities significantly larger than `UINT_MAX` (error will raise quickly after a few tens of billions of distinct values), hence in this case you should use [uniqCombined64](#agg_function-uniqcombined64) Compared to the [uniq](#agg_function-uniq) function, the `uniqCombined`: -- Consumes several times less memory. -- Calculates with several times higher accuracy. -- Usually has slightly lower performance. In some scenarios, `uniqCombined` can perform better than `uniq`, for example, with distributed queries that transmit a large number of aggregation states over the network. +- Consumes several times less memory. +- Calculates with several times higher accuracy. +- Usually has slightly lower performance. In some scenarios, `uniqCombined` can perform better than `uniq`, for example, with distributed queries that transmit a large number of aggregation states over the network. 
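For example, a small sketch comparing the two functions on the same column (the `visits` table and `UserID` column are illustrative); the explicit precision argument is optional:

``` sql
-- uniqCombined with the default HyperLogLog precision and with an explicit precision of 12,
-- next to uniq on the same data.
SELECT
    uniq(UserID) AS u,
    uniqCombined(UserID) AS uc_default,
    uniqCombined(12)(UserID) AS uc_hll_12
FROM visits
```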
**See Also** -- [uniq](#agg_function-uniq) -- [uniqCombined64](#agg_function-uniqcombined64) -- [uniqHLL12](#agg_function-uniqhll12) -- [uniqExact](#agg_function-uniqexact) +- [uniq](#agg_function-uniq) +- [uniqCombined64](#agg_function-uniqcombined64) +- [uniqHLL12](#agg_function-uniqhll12) +- [uniqExact](#agg_function-uniqexact) -## uniqCombined64 {#agg_function-uniqcombined64} +## uniqCombined64 {#agg-function-uniqcombined64} Same as [uniqCombined](#agg_function-uniqcombined), but uses 64-bit hash for all data types. -## uniqHLL12 {#agg_function-uniqhll12} +## uniqHLL12 {#agg-function-uniqhll12} Calculates the approximate number of different argument values, using the [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) algorithm. -```sql +``` sql uniqHLL12(x[, ...]) ``` @@ -614,33 +625,33 @@ The function takes a variable number of parameters. Parameters can be `Tuple`, ` **Returned value** -- A [UInt64](../../data_types/int_uint.md)-type number. +- A [UInt64](../../data_types/int_uint.md)-type number. **Implementation details** Function: -- Calculates a hash for all parameters in the aggregate, then uses it in calculations. -- Uses the HyperLogLog algorithm to approximate the number of different argument values. +- Calculates a hash for all parameters in the aggregate, then uses it in calculations. - 212 5-bit cells are used. The size of the state is slightly more than 2.5 KB. The result is not very accurate (up to ~10% error) for small data sets (<10K elements). However, the result is fairly accurate for high-cardinality data sets (10K-100M), with a maximum error of ~1.6%. Starting from 100M, the estimation error increases, and the function will return very inaccurate results for data sets with extremely high cardinality (1B+ elements). +- Uses the HyperLogLog algorithm to approximate the number of different argument values. -- Provides the determinate result (it doesn't depend on the query processing order). + 212 5-bit cells are used. The size of the state is slightly more than 2.5 KB. The result is not very accurate (up to ~10% error) for small data sets (<10K elements). However, the result is fairly accurate for high-cardinality data sets (10K-100M), with a maximum error of ~1.6%. Starting from 100M, the estimation error increases, and the function will return very inaccurate results for data sets with extremely high cardinality (1B+ elements). -We don't recommend using this function. In most cases, use the [uniq](#agg_function-uniq) or [uniqCombined](#agg_function-uniqcombined) function. +- Provides the determinate result (it doesn’t depend on the query processing order). + +We don’t recommend using this function. In most cases, use the [uniq](#agg_function-uniq) or [uniqCombined](#agg_function-uniqcombined) function. **See Also** -- [uniq](#agg_function-uniq) -- [uniqCombined](#agg_function-uniqcombined) -- [uniqExact](#agg_function-uniqexact) +- [uniq](#agg_function-uniq) +- [uniqCombined](#agg_function-uniqcombined) +- [uniqExact](#agg_function-uniqexact) - -## uniqExact {#agg_function-uniqexact} +## uniqExact {#agg-function-uniqexact} Calculates the exact number of different argument values. -```sql +``` sql uniqExact(x[, ...]) ``` @@ -654,12 +665,11 @@ The function takes a variable number of parameters. 
Parameters can be `Tuple`, ` **See Also** -- [uniq](#agg_function-uniq) -- [uniqCombined](#agg_function-uniqcombined) -- [uniqHLL12](#agg_function-uniqhll12) +- [uniq](#agg_function-uniq) +- [uniqCombined](#agg_function-uniqcombined) +- [uniqHLL12](#agg_function-uniqhll12) - -## groupArray(x), groupArray(max_size)(x) {#agg_function-grouparray} +## groupArray(x), groupArray(max\_size)(x) {#agg-function-grouparray} Creates an array of argument values. Values can be added to the array in any (indeterminate) order. @@ -669,8 +679,7 @@ For example, `groupArray (1) (x)` is equivalent to `[any (x)]`. In some cases, you can still rely on the order of execution. This applies to cases when `SELECT` comes from a subquery that uses `ORDER BY`. - -## groupArrayInsertAt(value, position) +## groupArrayInsertAt(value, position) {#grouparrayinsertatvalue-position} Inserts a value into the array in the specified position. @@ -681,14 +690,14 @@ Accepts the value and position as input. If several values ​​are inserted in Optional parameters: -- The default value for substituting in empty positions. -- The length of the resulting array. This allows you to receive arrays of the same size for all the aggregate keys. When using this parameter, the default value must be specified. +- The default value for substituting in empty positions. +- The length of the resulting array. This allows you to receive arrays of the same size for all the aggregate keys. When using this parameter, the default value must be specified. -## groupArrayMovingSum {#agg_function-grouparraymovingsum} +## groupArrayMovingSum {#agg-function-grouparraymovingsum} Calculates the moving sum of input values. -```sql +``` sql groupArrayMovingSum(numbers_for_summing) groupArrayMovingSum(window_size)(numbers_for_summing) ``` @@ -697,18 +706,18 @@ The function can take the window size as a parameter. If left unspecified, the f **Parameters** -- `numbers_for_summing` — [Expression](../syntax.md#syntax-expressions) resulting in a numeric data type value. -- `window_size` — Size of the calculation window. +- `numbers_for_summing` — [Expression](../syntax.md#syntax-expressions) resulting in a numeric data type value. +- `window_size` — Size of the calculation window. **Returned values** -- Array of the same size and type as the input data. +- Array of the same size and type as the input data. 
**Example** The sample table: -```sql +``` sql CREATE TABLE t ( `int` UInt8, @@ -717,7 +726,8 @@ CREATE TABLE t ) ENGINE = TinyLog ``` -```text + +``` text ┌─int─┬─float─┬──dec─┐ │ 1 │ 1.1 │ 1.10 │ │ 2 │ 2.2 │ 2.20 │ @@ -728,36 +738,39 @@ ENGINE = TinyLog The queries: -```sql +``` sql SELECT groupArrayMovingSum(int) AS I, groupArrayMovingSum(float) AS F, groupArrayMovingSum(dec) AS D FROM t ``` -```text + +``` text ┌─I──────────┬─F───────────────────────────────┬─D──────────────────────┐ │ [1,3,7,14] │ [1.1,3.3000002,7.7000003,15.47] │ [1.10,3.30,7.70,15.47] │ └────────────┴─────────────────────────────────┴────────────────────────┘ ``` -```sql + +``` sql SELECT groupArrayMovingSum(2)(int) AS I, groupArrayMovingSum(2)(float) AS F, groupArrayMovingSum(2)(dec) AS D FROM t ``` -```text + +``` text ┌─I──────────┬─F───────────────────────────────┬─D──────────────────────┐ │ [1,3,6,11] │ [1.1,3.3000002,6.6000004,12.17] │ [1.10,3.30,6.60,12.17] │ └────────────┴─────────────────────────────────┴────────────────────────┘ ``` -## groupArrayMovingAvg {#agg_function-grouparraymovingavg} +## groupArrayMovingAvg {#agg-function-grouparraymovingavg} Calculates the moving average of input values. -```sql +``` sql groupArrayMovingAvg(numbers_for_summing) groupArrayMovingAvg(window_size)(numbers_for_summing) ``` @@ -766,12 +779,12 @@ The function can take the window size as a parameter. If left unspecified, the f **Parameters** -- `numbers_for_summing` — [Expression](../syntax.md#syntax-expressions) resulting in a numeric data type value. -- `window_size` — Size of the calculation window. +- `numbers_for_summing` — [Expression](../syntax.md#syntax-expressions) resulting in a numeric data type value. +- `window_size` — Size of the calculation window. **Returned values** -- Array of the same size and type as the input data. +- Array of the same size and type as the input data. The function uses [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero). It truncates the decimal places insignificant for the resulting data type. @@ -779,7 +792,7 @@ The function uses [rounding towards zero](https://en.wikipedia.org/wiki/Rounding The sample table `b`: -```sql +``` sql CREATE TABLE t ( `int` UInt8, @@ -788,7 +801,8 @@ CREATE TABLE t ) ENGINE = TinyLog ``` -```text + +``` text ┌─int─┬─float─┬──dec─┐ │ 1 │ 1.1 │ 1.10 │ │ 2 │ 2.2 │ 2.20 │ @@ -799,32 +813,35 @@ ENGINE = TinyLog The queries: -```sql +``` sql SELECT groupArrayMovingAvg(int) AS I, groupArrayMovingAvg(float) AS F, groupArrayMovingAvg(dec) AS D FROM t ``` -```text + +``` text ┌─I─────────┬─F───────────────────────────────────┬─D─────────────────────┐ │ [0,0,1,3] │ [0.275,0.82500005,1.9250001,3.8675] │ [0.27,0.82,1.92,3.86] │ └───────────┴─────────────────────────────────────┴───────────────────────┘ ``` -```sql + +``` sql SELECT groupArrayMovingAvg(2)(int) AS I, groupArrayMovingAvg(2)(float) AS F, groupArrayMovingAvg(2)(dec) AS D FROM t ``` -```text + +``` text ┌─I─────────┬─F────────────────────────────────┬─D─────────────────────┐ │ [0,1,3,5] │ [0.55,1.6500001,3.3000002,6.085] │ [0.55,1.65,3.30,6.08] │ └───────────┴──────────────────────────────────┴───────────────────────┘ ``` -## groupUniqArray(x), groupUniqArray(max_size)(x) +## groupUniqArray(x), groupUniqArray(max\_size)(x) {#groupuniqarrayx-groupuniqarraymax-sizex} Creates an array from different argument values. Memory consumption is the same as for the `uniqExact` function. 
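A minimal sketch (hypothetical table `t` with a column `x` that contains duplicate values):

``` sql
-- Each distinct value appears once in the result; element order is not defined.
SELECT groupUniqArray(x) FROM t

-- With the max_size parameter, at most that many distinct values are collected.
SELECT groupUniqArray(3)(x) FROM t
```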
@@ -839,35 +856,34 @@ This function applies [reservoir sampling](https://en.wikipedia.org/wiki/Reservo When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](#quantiles) function. -**Syntax** +**Syntax** -```sql +``` sql quantile(level)(expr) ``` Alias: `median`. -**Parameters** +**Parameters** -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). -- `expr` — Expression over the column values resulting in numeric [data types](../../data_types/index.md#data_types), [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../data_types/index.md#data_types), [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). **Returned value** -- Approximate quantile of the specified level. +- Approximate quantile of the specified level. Type: -- [Float64](../../data_types/float.md) for numeric data type input. -- [Date](../../data_types/date.md) if input values have the `Date` type. -- [DateTime](../../data_types/datetime.md) if input values have the `DateTime` type. - +- [Float64](../../data_types/float.md) for numeric data type input. +- [Date](../../data_types/date.md) if input values have the `Date` type. +- [DateTime](../../data_types/datetime.md) if input values have the `DateTime` type. **Example** Input table: -```text +``` text ┌─val─┐ │ 1 │ │ 1 │ @@ -878,13 +894,13 @@ Input table: Query: -```sql +``` sql SELECT quantile(val) FROM t ``` Result: -```text +``` text ┌─quantile(val)─┐ │ 1.5 │ └───────────────┘ @@ -892,9 +908,8 @@ Result: **See Also** -- [median](#median) -- [quantiles](#quantiles) - +- [median](#median) +- [quantiles](#quantiles) ## quantileDeterministic {#quantiledeterministic} @@ -904,36 +919,35 @@ This function applies [reservoir sampling](https://en.wikipedia.org/wiki/Reservo When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](#quantiles) function. -**Syntax** +**Syntax** -```sql +``` sql quantileDeterministic(level)(expr, determinator) ``` Alias: `medianDeterministic`. -**Parameters** +**Parameters** -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). -- `expr` — Expression over the column values resulting in numeric [data types](../../data_types/index.md#data_types), [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). -- `determinator` — Number whose hash is used instead of a random number generator in the reservoir sampling algorithm to make the result of sampling deterministic. 
As a determinator you can use any deterministic positive number, for example, a user id or an event id. If the same determinator value occures too often, the function works incorrectly. +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../data_types/index.md#data_types), [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). +- `determinator` — Number whose hash is used instead of a random number generator in the reservoir sampling algorithm to make the result of sampling deterministic. As a determinator you can use any deterministic positive number, for example, a user id or an event id. If the same determinator value occures too often, the function works incorrectly. **Returned value** -- Approximate quantile of the specified level. +- Approximate quantile of the specified level. Type: -- [Float64](../../data_types/float.md) for numeric data type input. -- [Date](../../data_types/date.md) if input values have the `Date` type. -- [DateTime](../../data_types/datetime.md) if input values have the `DateTime` type. - +- [Float64](../../data_types/float.md) for numeric data type input. +- [Date](../../data_types/date.md) if input values have the `Date` type. +- [DateTime](../../data_types/datetime.md) if input values have the `DateTime` type. **Example** Input table: -```text +``` text ┌─val─┐ │ 1 │ │ 1 │ @@ -944,13 +958,13 @@ Input table: Query: -```sql +``` sql SELECT quantileDeterministic(val, 1) FROM t ``` Result: -```text +``` text ┌─quantileDeterministic(val, 1)─┐ │ 1.5 │ └───────────────────────────────┘ @@ -958,9 +972,8 @@ Result: **See Also** -- [median](#median) -- [quantiles](#quantiles) - +- [median](#median) +- [quantiles](#quantiles) ## quantileExact {#quantileexact} @@ -970,40 +983,40 @@ To get exact value, all the passed values ​​are combined into an array, whic When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](#quantiles) function. -**Syntax** +**Syntax** -```sql +``` sql quantileExact(level)(expr) ``` Alias: `medianExact`. -**Parameters** +**Parameters** -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). -- `expr` — Expression over the column values resulting in numeric [data types](../../data_types/index.md#data_types), [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../data_types/index.md#data_types), [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). **Returned value** -- Quantile of the specified level. +- Quantile of the specified level. 
Type: -- [Float64](../../data_types/float.md) for numeric data type input. -- [Date](../../data_types/date.md) if input values have the `Date` type. -- [DateTime](../../data_types/datetime.md) if input values have the `DateTime` type. +- [Float64](../../data_types/float.md) for numeric data type input. +- [Date](../../data_types/date.md) if input values have the `Date` type. +- [DateTime](../../data_types/datetime.md) if input values have the `DateTime` type. **Example** Query: -```sql +``` sql SELECT quantileExact(number) FROM numbers(10) ``` Result: -```text +``` text ┌─quantileExact(number)─┐ │ 5 │ └───────────────────────┘ @@ -1011,8 +1024,8 @@ Result: **See Also** -- [median](#median) -- [quantiles](#quantiles) +- [median](#median) +- [quantiles](#quantiles) ## quantileExactWeighted {#quantileexactweighted} @@ -1022,35 +1035,35 @@ To get exact value, all the passed values ​​are combined into an array, whic When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](#quantiles) function. -**Syntax** +**Syntax** -```sql +``` sql quantileExactWeighted(level)(expr, weight) ``` Alias: `medianExactWeighted`. -**Parameters** +**Parameters** -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). -- `expr` — Expression over the column values resulting in numeric [data types](../../data_types/index.md#data_types), [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). -- `weight` — Column with weights of sequence members. Weight is a number of value occurrences. +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../data_types/index.md#data_types), [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). +- `weight` — Column with weights of sequence members. Weight is a number of value occurrences. **Returned value** -- Quantile of the specified level. +- Quantile of the specified level. Type: -- [Float64](../../data_types/float.md) for numeric data type input. -- [Date](../../data_types/date.md) if input values have the `Date` type. -- [DateTime](../../data_types/datetime.md) if input values have the `DateTime` type. +- [Float64](../../data_types/float.md) for numeric data type input. +- [Date](../../data_types/date.md) if input values have the `Date` type. +- [DateTime](../../data_types/datetime.md) if input values have the `DateTime` type. 
**Example** Input table: -```text +``` text ┌─n─┬─val─┐ │ 0 │ 3 │ │ 1 │ 2 │ @@ -1061,34 +1074,34 @@ Input table: Query: -```sql +``` sql SELECT quantileExactWeighted(n, val) FROM t ``` Result: -```text +``` text ┌─quantileExactWeighted(n, val)─┐ │ 1 │ └───────────────────────────────┘ ``` -**See Also** +**See Also** -- [median](#median) -- [quantiles](#quantiles) +- [median](#median) +- [quantiles](#quantiles) ## quantileTiming {#quantiletiming} With the determined precision computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence. -The result is deterministic (it doesn't depend on the query processing order). The function is optimized for working with sequences which describe distributions like loading web pages times or backend response times. +The result is deterministic (it doesn’t depend on the query processing order). The function is optimized for working with sequences which describe distributions like loading web pages times or backend response times. When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](#quantiles) function. **Syntax** -```sql +``` sql quantileTiming(level)(expr) ``` @@ -1096,18 +1109,19 @@ Alias: `medianTiming`. **Parameters** -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). -- `expr` — [Expression](../syntax.md#syntax-expressions) over a column values returning a [Float*](../../data_types/float.md)-type number. +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). - - If negative values are passed to the function, the behavior is undefined. - - If the value is greater than 30,000 (a page loading time of more than 30 seconds), it is assumed to be 30,000. +- `expr` — [Expression](../syntax.md#syntax-expressions) over a column values returning a [Float\*](../../data_types/float.md)-type number. + + - If negative values are passed to the function, the behavior is undefined. + - If the value is greater than 30,000 (a page loading time of more than 30 seconds), it is assumed to be 30,000. **Accuracy** The calculation is accurate if: -- Total number of values doesn't exceed 5670. -- Total number of values exceeds 5670, but the page loading time is less than 1024ms. +- Total number of values doesn’t exceed 5670. +- Total number of values exceeds 5670, but the page loading time is less than 1024ms. Otherwise, the result of the calculation is rounded to the nearest multiple of 16 ms. @@ -1116,7 +1130,7 @@ Otherwise, the result of the calculation is rounded to the nearest multiple of 1 **Returned value** -- Quantile of the specified level. +- Quantile of the specified level. Type: `Float32`. @@ -1127,7 +1141,7 @@ Type: `Float32`. 
Input table: -```text +``` text ┌─response_time─┐ │ 72 │ │ 112 │ @@ -1143,13 +1157,13 @@ Input table: Query: -```sql +``` sql SELECT quantileTiming(response_time) FROM t ``` Result: -```text +``` text ┌─quantileTiming(response_time)─┐ │ 126 │ └───────────────────────────────┘ @@ -1157,20 +1171,20 @@ Result: **See Also** -- [median](#median) -- [quantiles](#quantiles) +- [median](#median) +- [quantiles](#quantiles) ## quantileTimingWeighted {#quantiletimingweighted} With the determined precision computes the [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence according to the weight of each sequence member. -The result is deterministic (it doesn't depend on the query processing order). The function is optimized for working with sequences which describe distributions like loading web pages times or backend response times. +The result is deterministic (it doesn’t depend on the query processing order). The function is optimized for working with sequences which describe distributions like loading web pages times or backend response times. When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](#quantiles) function. **Syntax** -```sql +``` sql quantileTimingWeighted(level)(expr, weight) ``` @@ -1178,20 +1192,21 @@ Alias: `medianTimingWeighted`. **Parameters** -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). -- `expr` — [Expression](../syntax.md#syntax-expressions) over a column values returning a [Float*](../../data_types/float.md)-type number. +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). - - If negative values are passed to the function, the behavior is undefined. - - If the value is greater than 30,000 (a page loading time of more than 30 seconds), it is assumed to be 30,000. +- `expr` — [Expression](../syntax.md#syntax-expressions) over a column values returning a [Float\*](../../data_types/float.md)-type number. -- `weight` — Column with weights of sequence elements. Weight is a number of value occurrences. + - If negative values are passed to the function, the behavior is undefined. + - If the value is greater than 30,000 (a page loading time of more than 30 seconds), it is assumed to be 30,000. + +- `weight` — Column with weights of sequence elements. Weight is a number of value occurrences. **Accuracy** The calculation is accurate if: -- Total number of values doesn't exceed 5670. -- Total number of values exceeds 5670, but the page loading time is less than 1024ms. +- Total number of values doesn’t exceed 5670. +- Total number of values exceeds 5670, but the page loading time is less than 1024ms. Otherwise, the result of the calculation is rounded to the nearest multiple of 16 ms. @@ -1200,19 +1215,18 @@ Otherwise, the result of the calculation is rounded to the nearest multiple of 1 **Returned value** -- Quantile of the specified level. +- Quantile of the specified level. Type: `Float32`. !!! 
note "Note" If no values are passed to the function (when using `quantileTimingIf`), [NaN](../../data_types/float.md#data_type-float-nan-inf) is returned. The purpose of this is to differentiate these cases from cases that result in zero. See [ORDER BY clause](../select.md#select-order-by) for notes on sorting `NaN` values. - **Example** Input table: -```text +``` text ┌─response_time─┬─weight─┐ │ 68 │ 1 │ │ 104 │ 2 │ @@ -1225,13 +1239,13 @@ Input table: Query: -```sql +``` sql SELECT quantileTimingWeighted(response_time, weight) FROM t ``` Result: -```text +``` text ┌─quantileTimingWeighted(response_time, weight)─┐ │ 112 │ └───────────────────────────────────────────────┘ @@ -1239,55 +1253,53 @@ Result: **See Also** -- [median](#median) -- [quantiles](#quantiles) - +- [median](#median) +- [quantiles](#quantiles) ## quantileTDigest {#quantiletdigest} Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence using the [t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) algorithm. -The maximum error is 1%. Memory consumption is `log(n)`, where `n` is a number of values. The result depends on the order of running the query, and is nondeterministic. +The maximum error is 1%. Memory consumption is `log(n)`, where `n` is a number of values. The result depends on the order of running the query, and is nondeterministic. The performance of the function is lower than performance of [quantile](#quantile) or [quantileTiming](#quantiletiming). In terms of the ratio of State size to precision, this function is much better than `quantile`. When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](#quantiles) function. -**Syntax** +**Syntax** -```sql +``` sql quantileTDigest(level)(expr) ``` Alias: `medianTDigest`. -**Parameters** +**Parameters** -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). -- `expr` — Expression over the column values resulting in numeric [data types](../../data_types/index.md#data_types), [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../data_types/index.md#data_types), [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). **Returned value** -- Approximate quantile of the specified level. +- Approximate quantile of the specified level. Type: -- [Float64](../../data_types/float.md) for numeric data type input. -- [Date](../../data_types/date.md) if input values have the `Date` type. -- [DateTime](../../data_types/datetime.md) if input values have the `DateTime` type. - +- [Float64](../../data_types/float.md) for numeric data type input. +- [Date](../../data_types/date.md) if input values have the `Date` type. +- [DateTime](../../data_types/datetime.md) if input values have the `DateTime` type. 
**Example** Query: -```sql +``` sql SELECT quantileTDigest(number) FROM numbers(10) ``` Result: -```text +``` text ┌─quantileTDigest(number)─┐ │ 4.5 │ └─────────────────────────┘ @@ -1295,12 +1307,12 @@ Result: **See Also** -- [median](#median) -- [quantiles](#quantiles) +- [median](#median) +- [quantiles](#quantiles) ## quantileTDigestWeighted {#quantiletdigestweighted} -Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence using the [t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) algorithm. The function takes into account the weight of each sequence member. The maximum error is 1%. Memory consumption is `log(n)`, where `n` is a number of values. +Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence using the [t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) algorithm. The function takes into account the weight of each sequence member. The maximum error is 1%. Memory consumption is `log(n)`, where `n` is a number of values. The performance of the function is lower than performance of [quantile](#quantile) or [quantileTiming](#quantiletiming). In terms of the ratio of State size to precision, this function is much better than `quantile`. @@ -1308,41 +1320,41 @@ The result depends on the order of running the query, and is nondeterministic. When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](#quantiles) function. -**Syntax** +**Syntax** -```sql +``` sql quantileTDigest(level)(expr) ``` Alias: `medianTDigest`. -**Parameters** +**Parameters** -- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). -- `expr` — Expression over the column values resulting in numeric [data types](../../data_types/index.md#data_types), [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). -- `weight` — Column with weights of sequence elements. Weight is a number of value occurrences. +- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median). +- `expr` — Expression over the column values resulting in numeric [data types](../../data_types/index.md#data_types), [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). +- `weight` — Column with weights of sequence elements. Weight is a number of value occurrences. **Returned value** -- Approximate quantile of the specified level. +- Approximate quantile of the specified level. Type: -- [Float64](../../data_types/float.md) for numeric data type input. -- [Date](../../data_types/date.md) if input values have the `Date` type. -- [DateTime](../../data_types/datetime.md) if input values have the `DateTime` type. +- [Float64](../../data_types/float.md) for numeric data type input. +- [Date](../../data_types/date.md) if input values have the `Date` type. +- [DateTime](../../data_types/datetime.md) if input values have the `DateTime` type. 
**Example** Query: -```sql +``` sql SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) ``` Result: -```text +``` text ┌─quantileTDigestWeighted(number, 1)─┐ │ 4.5 │ └────────────────────────────────────┘ @@ -1350,9 +1362,8 @@ Result: **See Also** -- [median](#median) -- [quantiles](#quantiles) - +- [median](#median) +- [quantiles](#quantiles) ## median {#median} @@ -1360,20 +1371,20 @@ The `median*` functions are the aliases for the corresponding `quantile*` functi Functions: -- `median` — Alias for [quantile](#quantile). -- `medianDeterministic` — Alias for [quantileDeterministic](#quantiledeterministic). -- `medianExact` — Alias for [quantileExact](#quantileexact). -- `medianExactWeighted` — Alias for [quantileExactWeighted](#quantileexactweighted). -- `medianTiming` — Alias for [quantileTiming](#quantiletiming). -- `medianTimingWeighted` — Alias for [quantileTimingWeighted](#quantiletimingweighted). -- `medianTDigest` — Alias for [quantileTDigest](#quantiletdigest). -- `medianTDigestWeighted` — Alias for [quantileTDigestWeighted](#quantiletdigestweighted). +- `median` — Alias for [quantile](#quantile). +- `medianDeterministic` — Alias for [quantileDeterministic](#quantiledeterministic). +- `medianExact` — Alias for [quantileExact](#quantileexact). +- `medianExactWeighted` — Alias for [quantileExactWeighted](#quantileexactweighted). +- `medianTiming` — Alias for [quantileTiming](#quantiletiming). +- `medianTimingWeighted` — Alias for [quantileTimingWeighted](#quantiletimingweighted). +- `medianTDigest` — Alias for [quantileTDigest](#quantiletdigest). +- `medianTDigestWeighted` — Alias for [quantileTDigestWeighted](#quantiletdigestweighted). **Example** Input table: -```text +``` text ┌─val─┐ │ 1 │ │ 1 │ @@ -1384,24 +1395,23 @@ Input table: Query: -```sql +``` sql SELECT medianDeterministic(val, 1) FROM t ``` Result: -```text +``` text ┌─medianDeterministic(val, 1)─┐ │ 1.5 │ └─────────────────────────────┘ ``` - -## quantiles(level1, level2, ...)(x) {#quantiles} +## quantiles(level1, level2, …)(x) {#quantiles} All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values. -## varSamp(x) +## varSamp(x) {#varsampx} Calculates the amount `Σ((x - x̅)^2) / (n - 1)`, where `n` is the sample size and `x̅`is the average value of `x`. @@ -1409,54 +1419,54 @@ It represents an unbiased estimate of the variance of a random variable if passe Returns `Float64`. When `n <= 1`, returns `+∞`. -## varPop(x) +## varPop(x) {#varpopx} Calculates the amount `Σ((x - x̅)^2) / n`, where `n` is the sample size and `x̅`is the average value of `x`. In other words, dispersion for a set of values. Returns `Float64`. -## stddevSamp(x) +## stddevSamp(x) {#stddevsampx} The result is equal to the square root of `varSamp(x)`. -## stddevPop(x) +## stddevPop(x) {#stddevpopx} The result is equal to the square root of `varPop(x)`. -## topK(N)(x) +## topK(N)(x) {#topknx} Returns an array of the approximately most frequent values in the specified column. The resulting array is sorted in descending order of approximate frequency of values (not by the values themselves). 
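As a quick, hedged illustration of that ordering (the literal values and the derived table are arbitrary choices; the algorithm reference and formal syntax follow below): a value seen three times should come before one seen twice, regardless of their magnitudes.

``` sql
-- 5 occurs three times, 1 twice and 9 once; on such a tiny input the approximate
-- counters are exact, so the expected result is [5,1,9], ordered by frequency.
SELECT topK(3)(x) AS res
FROM
(
    SELECT arrayJoin([5, 5, 5, 1, 1, 9]) AS x
)
```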
-Implements the [ Filtered Space-Saving](http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf) algorithm for analyzing TopK, based on the reduce-and-combine algorithm from [Parallel Space Saving](https://arxiv.org/pdf/1401.0702.pdf). +Implements the [Filtered Space-Saving](http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf) algorithm for analyzing TopK, based on the reduce-and-combine algorithm from [Parallel Space Saving](https://arxiv.org/pdf/1401.0702.pdf). -```sql +``` sql topK(N)(column) ``` -This function doesn't provide a guaranteed result. In certain situations, errors might occur and it might return frequent values that aren't the most frequent values. +This function doesn’t provide a guaranteed result. In certain situations, errors might occur and it might return frequent values that aren’t the most frequent values. -We recommend using the `N < 10 ` value; performance is reduced with large `N` values. Maximum value of ` N = 65536`. +We recommend using the `N < 10` value; performance is reduced with large `N` values. Maximum value of `N = 65536`. **Parameters** -- 'N' is the number of elements to return. +- ‘N’ is the number of elements to return. If the parameter is omitted, default value 10 is used. **Arguments** -- ' x ' – The value to calculate frequency. +- ’ x ’ – The value to calculate frequency. **Example** Take the [OnTime](../../getting_started/example_datasets/ontime.md) data set and select the three most frequently occurring values in the `AirlineID` column. -```sql +``` sql SELECT topK(3)(AirlineID) AS res FROM ontime ``` -```text +``` text ┌─res─────────────────┐ │ [19393,19790,19805] │ └─────────────────────┘ @@ -1468,18 +1478,18 @@ Similar to `topK` but takes one additional argument of integer type - `weight`. **Syntax** -```sql +``` sql topKWeighted(N)(x, weight) ``` **Parameters** -- `N` — The number of elements to return. +- `N` — The number of elements to return. **Arguments** -- `x` – The value. -- `weight` — The weight. [UInt8](../../data_types/int_uint.md). +- `x` – The value. +- `weight` — The weight. [UInt8](../../data_types/int_uint.md). **Returned value** @@ -1489,54 +1499,54 @@ Returns an array of the values with maximum approximate sum of weights. Query: -```sql +``` sql SELECT topKWeighted(10)(number, number) FROM numbers(1000) ``` Result: -```text +``` text ┌─topKWeighted(10)(number, number)──────────┐ │ [999,998,997,996,995,994,993,992,991,990] │ └───────────────────────────────────────────┘ ``` -## covarSamp(x, y) +## covarSamp(x, y) {#covarsampx-y} Calculates the value of `Σ((x - x̅)(y - y̅)) / (n - 1)`. Returns Float64. When `n <= 1`, returns +∞. -## covarPop(x, y) +## covarPop(x, y) {#covarpopx-y} Calculates the value of `Σ((x - x̅)(y - y̅)) / n`. -## corr(x, y) +## corr(x, y) {#corrx-y} Calculates the Pearson correlation coefficient: `Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)^2) * Σ((y - y̅)^2))`. -## categoricalInformationValue +## categoricalInformationValue {#categoricalinformationvalue} Calculates the value of `(P(tag = 1) - P(tag = 0))(log(P(tag = 1)) - log(P(tag = 0)))` for each category. -```sql +``` sql categoricalInformationValue(category1, category2, ..., tag) ``` The result indicates how a discrete (categorical) feature `[category1, category2, ...]` contribute to a learning model which predicting the value of `tag`. -## simpleLinearRegression +## simpleLinearRegression {#simplelinearregression} Performs simple (unidimensional) linear regression. 
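The formal syntax and worked examples follow below. As a hedged cross-check of what the returned constants mean, assuming an ordinary least-squares fit (which the examples below are consistent with), the slope and intercept can be reproduced with the `covarPop` and `varPop` aggregates defined earlier in this section; the synthetic `x`/`y` columns built from `numbers(10)` are purely illustrative:

``` sql
-- y is constructed as 2 * x + 1, so both the manual formulas and the aggregate
-- should report a slope of 2 and an intercept of 1.
SELECT
    covarPop(x, y) / varPop(x) AS slope,
    avg(y) - (covarPop(x, y) / varPop(x)) * avg(x) AS intercept,
    simpleLinearRegression(x, y) AS fitted
FROM
(
    SELECT
        toFloat64(number) AS x,
        2 * toFloat64(number) + 1 AS y
    FROM numbers(10)
)
```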
-```sql +``` sql simpleLinearRegression(x, y) ``` Parameters: -- `x` — Column with dependent variable values. -- `y` — Column with explanatory variable values. +- `x` — Column with dependent variable values. +- `y` — Column with explanatory variable values. Returned values: @@ -1544,44 +1554,44 @@ Constants `(a, b)` of the resulting line `y = a*x + b`. **Examples** -```sql +``` sql SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [0, 1, 2, 3]) ``` -```text + +``` text ┌─arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [0, 1, 2, 3])─┐ │ (1,0) │ └───────────────────────────────────────────────────────────────────┘ ``` -```sql +``` sql SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6]) ``` -```text + +``` text ┌─arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6])─┐ │ (1,3) │ └───────────────────────────────────────────────────────────────────┘ ``` -## stochasticLinearRegression {#agg_functions-stochasticlinearregression} - +## stochasticLinearRegression {#agg-functions-stochasticlinearregression} This function implements stochastic linear regression. It supports custom parameters for learning rate, L2 regularization coefficient, mini-batch size and has few methods for updating weights ([Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (used by default), [simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf)). -### Parameters {#agg_functions-stochasticlinearregression-parameters} +### Parameters {#agg-functions-stochasticlinearregression-parameters} There are 4 customizable parameters. They are passed to the function sequentially, but there is no need to pass all four - default values will be used, however good model required some parameter tuning. -```text +``` text stochasticLinearRegression(1.0, 1.0, 10, 'SGD') ``` -1. `learning rate` is the coefficient on step length, when gradient descent step is performed. Too big learning rate may cause infinite weights of the model. Default is `0.00001`. -2. `l2 regularization coefficient` which may help to prevent overfitting. Default is `0.1`. -3. `mini-batch size` sets the number of elements, which gradients will be computed and summed to perform one step of gradient descent. Pure stochastic descent uses one element, however having small batches(about 10 elements) make gradient steps more stable. Default is `15`. -4. `method for updating weights`, they are: `Adam` (by default), `SGD`, `Momentum`, `Nesterov`. `Momentum` and `Nesterov` require little bit more computations and memory, however they happen to be useful in terms of speed of convergance and stability of stochastic gradient methods. +1. `learning rate` is the coefficient on step length, when gradient descent step is performed. Too big learning rate may cause infinite weights of the model. Default is `0.00001`. +2. `l2 regularization coefficient` which may help to prevent overfitting. Default is `0.1`. +3. `mini-batch size` sets the number of elements, which gradients will be computed and summed to perform one step of gradient descent. Pure stochastic descent uses one element, however having small batches(about 10 elements) make gradient steps more stable. Default is `15`. +4. `method for updating weights`, they are: `Adam` (by default), `SGD`, `Momentum`, `Nesterov`. 
`Momentum` and `Nesterov` require little bit more computations and memory, however they happen to be useful in terms of speed of convergance and stability of stochastic gradient methods. - -### Usage {#agg_functions-stochasticlinearregression-usage} +### Usage {#agg-functions-stochasticlinearregression-usage} `stochasticLinearRegression` is used in two steps: fitting the model and predicting on new data. In order to fit the model and save its state for later usage we use `-State` combinator, which basically saves the state (model weights, etc). To predict we use function [evalMLMethod](../functions/machine_learning_functions.md#machine_learning_methods-evalmlmethod), which takes a state as an argument as well as features to predict on. @@ -1592,7 +1602,7 @@ To predict we use function [evalMLMethod](../functions/machine_learning_function Such query may be used. -```sql +``` sql CREATE TABLE IF NOT EXISTS train_data ( param1 Float64, @@ -1603,7 +1613,6 @@ CREATE TABLE IF NOT EXISTS train_data CREATE TABLE your_model ENGINE = Memory AS SELECT stochasticLinearRegressionState(0.1, 0.0, 5, 'SGD')(target, param1, param2) AS state FROM train_data; - ``` Here we also need to insert data into `train_data` table. The number of parameters is not fixed, it depends only on number of arguments, passed into `linearRegressionState`. They all must be numeric values. @@ -1613,7 +1622,7 @@ Note that the column with target value(which we would like to learn to predict) After saving a state into the table, we may use it multiple times for prediction, or even merge with other states and create new even better models. -```sql +``` sql WITH (SELECT state FROM your_model) AS model SELECT evalMLMethod(model, param1, param2) FROM test_data ``` @@ -1622,52 +1631,50 @@ The query will return a column of predicted values. Note that first argument of `test_data` is a table like `train_data` but may not contain target value. -### Notes {#agg_functions-stochasticlinearregression-notes} +### Notes {#agg-functions-stochasticlinearregression-notes} -1. To merge two models user may create such query: - ```sql - SELECT state1 + state2 FROM your_models - ``` +1. To merge two models user may create such query: + `sql SELECT state1 + state2 FROM your_models` where `your_models` table contains both models. This query will return new `AggregateFunctionState` object. -2. User may fetch weights of the created model for its own purposes without saving the model if no `-State` combinator is used. - ```sql - SELECT stochasticLinearRegression(0.01)(target, param1, param2) FROM train_data - ``` +2. User may fetch weights of the created model for its own purposes without saving the model if no `-State` combinator is used. + `sql SELECT stochasticLinearRegression(0.01)(target, param1, param2) FROM train_data` Such query will fit the model and return its weights - first are weights, which correspond to the parameters of the model, the last one is bias. So in the example above the query will return a column with 3 values. 
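To make the second note concrete, a hedged sketch that reuses the illustrative `train_data` table and the `target`, `param1`, `param2` columns from the Usage section above (the alias is arbitrary):

``` sql
-- Without the -State combinator the aggregate fits the model and returns the
-- coefficients directly: per the note above, one weight per feature followed by
-- the bias, i.e. three values for the two features used here.
SELECT stochasticLinearRegression(0.01)(target, param1, param2) AS weights_and_bias
FROM train_data
```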
**See Also** -- [stochasticLogisticRegression](#agg_functions-stochasticlogisticregression) -- [Difference between linear and logistic regressions](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) - - -## stochasticLogisticRegression {#agg_functions-stochasticlogisticregression} +- [stochasticLogisticRegression](#agg_functions-stochasticlogisticregression) +- [Difference between linear and logistic regressions](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) +## stochasticLogisticRegression {#agg-functions-stochasticlogisticregression} This function implements stochastic logistic regression. It can be used for binary classification problem, supports the same custom parameters as stochasticLinearRegression and works the same way. -### Parameters {#agg_functions-stochasticlogisticregression-parameters} +### Parameters {#agg-functions-stochasticlogisticregression-parameters} Parameters are exactly the same as in stochasticLinearRegression: `learning rate`, `l2 regularization coefficient`, `mini-batch size`, `method for updating weights`. For more information see [parameters](#agg_functions-stochasticlinearregression-parameters). -```text +``` text stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') ``` -1. Fitting +1. Fitting + + See the `Fitting` section in the [stochasticLinearRegression](#stochasticlinearregression-usage-fitting) description. - Predicted labels have to be in [-1, 1]. + Predicted labels have to be in \[-1, 1\]. -2. Predicting +1. Predicting + + Using saved state we can predict probability of object having label `1`. - ```sql + ``` sql WITH (SELECT state FROM your_model) AS model SELECT evalMLMethod(model, param1, param2) FROM test_data ``` @@ -1676,7 +1683,7 @@ stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') We can also set a bound of probability, which assigns elements to different labels. - ```sql + ``` sql SELECT ans < 1.1 AND ans > 0.5 FROM (WITH (SELECT state FROM your_model) AS model SELECT evalMLMethod(model, param1, param2) AS ans FROM test_data) @@ -1688,15 +1695,14 @@ stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') **See Also** -- [stochasticLinearRegression](#agg_functions-stochasticlinearregression) -- [Difference between linear and logistic regressions.](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) +- [stochasticLinearRegression](#agg_functions-stochasticlinearregression) +- [Difference between linear and logistic regressions.](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) - -## groupBitmapAnd +## groupBitmapAnd {#groupbitmapand} Calculations the AND of a bitmap column, return cardinality of type UInt64, if add suffix -State, then return [bitmap object](../functions/bitmap_functions.md). -```sql +``` sql groupBitmapAnd(expr) ``` @@ -1710,7 +1716,7 @@ Value of the `UInt64` type. 
**Example** -```sql +``` sql DROP TABLE IF EXISTS bitmap_column_expr_test2; CREATE TABLE bitmap_column_expr_test2 ( @@ -1733,15 +1739,13 @@ SELECT arraySort(bitmapToArray(groupBitmapAndState(z))) FROM bitmap_column_expr_ ┌─arraySort(bitmapToArray(groupBitmapAndState(z)))─┐ │ [6,8,10] │ └──────────────────────────────────────────────────┘ - ``` - -## groupBitmapOr +## groupBitmapOr {#groupbitmapor} Calculations the OR of a bitmap column, return cardinality of type UInt64, if add suffix -State, then return [bitmap object](../functions/bitmap_functions.md). This is equivalent to `groupBitmapMerge`. -```sql +``` sql groupBitmapOr(expr) ``` @@ -1755,7 +1759,7 @@ Value of the `UInt64` type. **Example** -```sql +``` sql DROP TABLE IF EXISTS bitmap_column_expr_test2; CREATE TABLE bitmap_column_expr_test2 ( @@ -1778,15 +1782,13 @@ SELECT arraySort(bitmapToArray(groupBitmapOrState(z))) FROM bitmap_column_expr_t ┌─arraySort(bitmapToArray(groupBitmapOrState(z)))─┐ │ [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] │ └─────────────────────────────────────────────────┘ - ``` - -## groupBitmapXor +## groupBitmapXor {#groupbitmapxor} Calculations the XOR of a bitmap column, return cardinality of type UInt64, if add suffix -State, then return [bitmap object](../functions/bitmap_functions.md). -```sql +``` sql groupBitmapOr(expr) ``` @@ -1800,7 +1802,7 @@ Value of the `UInt64` type. **Example** -```sql +``` sql DROP TABLE IF EXISTS bitmap_column_expr_test2; CREATE TABLE bitmap_column_expr_test2 ( @@ -1823,8 +1825,6 @@ SELECT arraySort(bitmapToArray(groupBitmapXorState(z))) FROM bitmap_column_expr_ ┌─arraySort(bitmapToArray(groupBitmapXorState(z)))─┐ │ [1,3,5,6,8,10,11,13,14,15] │ └──────────────────────────────────────────────────┘ - ``` - [Original article](https://clickhouse.tech/docs/en/query_language/agg_functions/reference/) diff --git a/docs/en/query_language/alter.md b/docs/en/query_language/alter.md index eeb11282f65..fb906a7468d 100644 --- a/docs/en/query_language/alter.md +++ b/docs/en/query_language/alter.md @@ -1,12 +1,12 @@ -## ALTER {#query_language_queries_alter} +## ALTER {#query-language-queries-alter} The `ALTER` query is only supported for `*MergeTree` tables, as well as `Merge`and`Distributed`. The query has several variations. -### Column Manipulations +### Column Manipulations {#column-manipulations} Changing the table structure. -```sql +``` sql ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|CLEAR|COMMENT|MODIFY COLUMN ... ``` @@ -15,73 +15,73 @@ Each action is an operation on a column. The following actions are supported: -- [ADD COLUMN](#alter_add-column) — Adds a new column to the table. -- [DROP COLUMN](#alter_drop-column) — Deletes the column. -- [CLEAR COLUMN](#alter_clear-column) — Resets column values. -- [COMMENT COLUMN](#alter_comment-column) — Adds a text comment to the column. -- [MODIFY COLUMN](#alter_modify-column) — Changes column's type, default expression and TTL. +- [ADD COLUMN](#alter_add-column) — Adds a new column to the table. +- [DROP COLUMN](#alter_drop-column) — Deletes the column. +- [CLEAR COLUMN](#alter_clear-column) — Resets column values. +- [COMMENT COLUMN](#alter_comment-column) — Adds a text comment to the column. +- [MODIFY COLUMN](#alter_modify-column) — Changes column’s type, default expression and TTL. These actions are described in detail below. 
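Several of these actions can be combined, comma-separated, into a single chained `ALTER` statement (the `ADD COLUMN` section below relies on this when it mentions chains of actions). A hedged sketch, with the `visits` table and the column names being illustrative only:

``` sql
-- Both actions are applied within one chained ALTER: the new column is added
-- after user_id, and the obsolete column is dropped.
ALTER TABLE visits
    ADD COLUMN browser String AFTER user_id,
    DROP COLUMN obsolete_flag
```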
-#### ADD COLUMN {#alter_add-column} +#### ADD COLUMN {#alter-add-column} -```sql +``` sql ADD COLUMN [IF NOT EXISTS] name [type] [default_expr] [codec] [AFTER name_after] ``` Adds a new column to the table with the specified `name`, `type`, [`codec`](create.md#codecs) and `default_expr` (see the section [Default expressions](create.md#create-default-values)). -If the `IF NOT EXISTS` clause is included, the query won't return an error if the column already exists. If you specify `AFTER name_after` (the name of another column), the column is added after the specified one in the list of table columns. Otherwise, the column is added to the end of the table. Note that there is no way to add a column to the beginning of a table. For a chain of actions, `name_after` can be the name of a column that is added in one of the previous actions. +If the `IF NOT EXISTS` clause is included, the query won’t return an error if the column already exists. If you specify `AFTER name_after` (the name of another column), the column is added after the specified one in the list of table columns. Otherwise, the column is added to the end of the table. Note that there is no way to add a column to the beginning of a table. For a chain of actions, `name_after` can be the name of a column that is added in one of the previous actions. -Adding a column just changes the table structure, without performing any actions with data. The data doesn't appear on the disk after `ALTER`. If the data is missing for a column when reading from the table, it is filled in with default values (by performing the default expression if there is one, or using zeros or empty strings). The column appears on the disk after merging data parts (see [MergeTree](../operations/table_engines/mergetree.md)). +Adding a column just changes the table structure, without performing any actions with data. The data doesn’t appear on the disk after `ALTER`. If the data is missing for a column when reading from the table, it is filled in with default values (by performing the default expression if there is one, or using zeros or empty strings). The column appears on the disk after merging data parts (see [MergeTree](../operations/table_engines/mergetree.md)). This approach allows us to complete the `ALTER` query instantly, without increasing the volume of old data. Example: -```sql +``` sql ALTER TABLE visits ADD COLUMN browser String AFTER user_id ``` -#### DROP COLUMN {#alter_drop-column} +#### DROP COLUMN {#alter-drop-column} -```sql +``` sql DROP COLUMN [IF EXISTS] name ``` -Deletes the column with the name `name`. If the `IF EXISTS` clause is specified, the query won't return an error if the column doesn't exist. +Deletes the column with the name `name`. If the `IF EXISTS` clause is specified, the query won’t return an error if the column doesn’t exist. Deletes data from the file system. Since this deletes entire files, the query is completed almost instantly. Example: -```sql +``` sql ALTER TABLE visits DROP COLUMN browser ``` -#### CLEAR COLUMN {#alter_clear-column} +#### CLEAR COLUMN {#alter-clear-column} -```sql +``` sql CLEAR COLUMN [IF EXISTS] name IN PARTITION partition_name ``` Resets all data in a column for a specified partition. Read more about setting the partition name in the section [How to specify the partition expression](#alter-how-to-specify-part-expr). -If the `IF EXISTS` clause is specified, the query won't return an error if the column doesn't exist. 
+If the `IF EXISTS` clause is specified, the query won’t return an error if the column doesn’t exist. Example: -```sql +``` sql ALTER TABLE visits CLEAR COLUMN browser IN PARTITION tuple() ``` -#### COMMENT COLUMN {#alter_comment-column} +#### COMMENT COLUMN {#alter-comment-column} -```sql +``` sql COMMENT COLUMN [IF EXISTS] name 'comment' ``` -Adds a comment to the column. If the `IF EXISTS` clause is specified, the query won't return an error if the column doesn't exist. +Adds a comment to the column. If the `IF EXISTS` clause is specified, the query won’t return an error if the column doesn’t exist. Each column can have one comment. If a comment already exists for the column, a new comment overwrites the previous comment. @@ -89,32 +89,33 @@ Comments are stored in the `comment_expression` column returned by the [DESCRIBE Example: -```sql +``` sql ALTER TABLE visits COMMENT COLUMN browser 'The table shows the browser used for accessing the site.' ``` -#### MODIFY COLUMN {#alter_modify-column} +#### MODIFY COLUMN {#alter-modify-column} -```sql +``` sql MODIFY COLUMN [IF EXISTS] name [type] [default_expr] [TTL] ``` This query changes the `name` column properties: -- Type -- Default expression -- TTL +- Type - For examples of columns TTL modifying, see [Column TTL](../operations/table_engines/mergetree.md#mergetree-column-ttl). +- Default expression +- TTL -If the `IF EXISTS` clause is specified, the query won't return an error if the column doesn't exist. + For examples of columns TTL modifying, see [Column TTL](../operations/table_engines/mergetree.md#mergetree-column-ttl). -When changing the type, values are converted as if the [toType](functions/type_conversion_functions.md) functions were applied to them. If only the default expression is changed, the query doesn't do anything complex, and is completed almost instantly. +If the `IF EXISTS` clause is specified, the query won’t return an error if the column doesn’t exist. + +When changing the type, values are converted as if the [toType](functions/type_conversion_functions.md) functions were applied to them. If only the default expression is changed, the query doesn’t do anything complex, and is completed almost instantly. Example: -```sql +``` sql ALTER TABLE visits MODIFY COLUMN browser Array(String) ``` @@ -122,17 +123,17 @@ Changing the column type is the only complex action – it changes the contents There are several processing stages: -- Preparing temporary (new) files with modified data. -- Renaming old files. -- Renaming the temporary (new) files to the old names. -- Deleting the old files. +- Preparing temporary (new) files with modified data. +- Renaming old files. +- Renaming the temporary (new) files to the old names. +- Deleting the old files. Only the first stage takes time. If there is a failure at this stage, the data is not changed. If there is a failure during one of the successive stages, data can be restored manually. The exception is if the old files were deleted from the file system but the data for the new files did not get written to the disk and was lost. The `ALTER` query for changing columns is replicated. The instructions are saved in ZooKeeper, then each replica applies them. All `ALTER` queries are run in the same order. The query waits for the appropriate actions to be completed on the other replicas. However, a query to change columns in a replicated table can be interrupted, and all actions will be performed asynchronously. 
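For the lightweight case described above, where only the default expression changes, a hedged sketch reusing the illustrative `visits` table and `browser` column from the earlier examples:

``` sql
-- Only the default expression changes, so no data is rewritten and the query
-- completes almost instantly.
ALTER TABLE visits MODIFY COLUMN browser String DEFAULT ''
```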
-#### ALTER Query Limitations +#### ALTER Query Limitations {#alter-query-limitations} The `ALTER` query lets you create and delete separate elements (columns) in nested data structures, but not whole nested data structures. To add a nested data structure, you can add columns with a name like `name.nested_name` and the type `Array(T)`. A nested data structure is equivalent to multiple array columns with a name that has the same prefix before the dot. @@ -142,13 +143,13 @@ If the `ALTER` query is not sufficient to make the table changes you need, you c The `ALTER` query blocks all reads and writes for the table. In other words, if a long `SELECT` is running at the time of the `ALTER` query, the `ALTER` query will wait for it to complete. At the same time, all new queries to the same table will wait while this `ALTER` is running. -For tables that don't store data themselves (such as `Merge` and `Distributed`), `ALTER` just changes the table structure, and does not change the structure of subordinate tables. For example, when running ALTER for a `Distributed` table, you will also need to run `ALTER` for the tables on all remote servers. +For tables that don’t store data themselves (such as `Merge` and `Distributed`), `ALTER` just changes the table structure, and does not change the structure of subordinate tables. For example, when running ALTER for a `Distributed` table, you will also need to run `ALTER` for the tables on all remote servers. -### Manipulations With Key Expressions +### Manipulations With Key Expressions {#manipulations-with-key-expressions} The following command is supported: -```sql +``` sql MODIFY ORDER BY new_expression ``` @@ -161,26 +162,26 @@ The command is lightweight in a sense that it only changes metadata. To keep the rows are ordered by the sorting key expression you cannot add expressions containing existing columns to the sorting key (only columns added by the `ADD COLUMN` command in the same `ALTER` query). - -### Manipulations With Data Skipping Indices +### Manipulations With Data Skipping Indices {#manipulations-with-data-skipping-indices} It only works for tables in the [`*MergeTree`](../operations/table_engines/mergetree.md) family (including [replicated](../operations/table_engines/replication.md) tables). The following operations are available: -* `ALTER TABLE [db].name ADD INDEX name expression TYPE type GRANULARITY value AFTER name [AFTER name2]` - Adds index description to tables metadata. +- `ALTER TABLE [db].name ADD INDEX name expression TYPE type GRANULARITY value AFTER name [AFTER name2]` - Adds index description to tables metadata. -* `ALTER TABLE [db].name DROP INDEX name` - Removes index description from tables metadata and deletes index files from disk. +- `ALTER TABLE [db].name DROP INDEX name` - Removes index description from tables metadata and deletes index files from disk. These commands are lightweight in a sense that they only change metadata or remove files. Also, they are replicated (syncing indices metadata through ZooKeeper). -### Manipulations with constraints +### Manipulations with constraints {#manipulations-with-constraints} See more on [constraints](create.md#constraints) Constraints could be added or deleted using following syntax: -```sql + +``` sql ALTER TABLE [db].name ADD CONSTRAINT constraint_name CHECK expression; ALTER TABLE [db].name DROP CONSTRAINT constraint_name; ``` @@ -191,25 +192,28 @@ Constraint check *will not be executed* on existing data if it was added. 
All changes on replicated tables are broadcasting to ZooKeeper so will be applied on other replicas. -### Manipulations With Partitions and Parts {#alter_manipulations-with-partitions} +### Manipulations With Partitions and Parts {#alter-manipulations-with-partitions} The following operations with [partitions](../operations/table_engines/custom_partitioning_key.md) are available: -- [DETACH PARTITION](#alter_detach-partition) – Moves a partition to the `detached` directory and forget it. -- [DROP PARTITION](#alter_drop-partition) – Deletes a partition. -- [ATTACH PART|PARTITION](#alter_attach-partition) – Adds a part or partition from the `detached` directory to the table. -- [REPLACE PARTITION](#alter_replace-partition) - Copies the data partition from one table to another. -- [ATTACH PARTITION FROM](#alter_attach-partition-from) – Copies the data partition from one table to another and adds. -- [REPLACE PARTITION](#alter_replace-partition) - Copies the data partition from one table to another and replaces. -- [MOVE PARTITION TO TABLE] (#alter_move_to_table-partition) - Move the data partition from one table to another. -- [CLEAR COLUMN IN PARTITION](#alter_clear-column-partition) - Resets the value of a specified column in a partition. -- [CLEAR INDEX IN PARTITION](#alter_clear-index-partition) - Resets the specified secondary index in a partition. -- [FREEZE PARTITION](#alter_freeze-partition) – Creates a backup of a partition. -- [FETCH PARTITION](#alter_fetch-partition) – Downloads a partition from another server. -- [MOVE PARTITION|PART](#alter_move-partition) – Move partition/data part to another disk or volume. -#### DETACH PARTITION {#alter_detach-partition} +- [DETACH PARTITION](#alter_detach-partition) – Moves a partition to the `detached` directory and forget it. +- [DROP PARTITION](#alter_drop-partition) – Deletes a partition. +- [ATTACH PART\|PARTITION](#alter_attach-partition) – Adds a part or partition from the `detached` directory to the table. +- [REPLACE PARTITION](#alter_replace-partition) - Copies the data partition from one table to another. +- [ATTACH PARTITION FROM](#alter_attach-partition-from) – Copies the data partition from one table to another and adds. +- [REPLACE PARTITION](#alter_replace-partition) - Copies the data partition from one table to another and replaces. +- [MOVE PARTITION TO TABLE](#alter_move_to_table-partition) (\#alter\_move\_to\_table-partition) - Move the data partition from one table to another. +- [CLEAR COLUMN IN PARTITION](#alter_clear-column-partition) - Resets the value of a specified column in a partition. +- [CLEAR INDEX IN PARTITION](#alter_clear-index-partition) - Resets the specified secondary index in a partition. +- [FREEZE PARTITION](#alter_freeze-partition) – Creates a backup of a partition. +- [FETCH PARTITION](#alter_fetch-partition) – Downloads a partition from another server. +- [MOVE PARTITION\|PART](#alter_move-partition) – Move partition/data part to another disk or volume. -```sql + + +#### DETACH PARTITION {\#alter\_detach-partition} {#detach-partition-alter-detach-partition} + +``` sql ALTER TABLE table_name DETACH PARTITION partition_expr ``` @@ -217,7 +221,7 @@ Moves all data for the specified partition to the `detached` directory. The serv Example: -```sql +``` sql ALTER TABLE visits DETACH PARTITION 201901 ``` @@ -227,9 +231,9 @@ After the query is executed, you can do whatever you want with the data in the ` This query is replicated – it moves the data to the `detached` directory on all replicas. 
Note that you can execute this query only on a leader replica. To find out if a replica is a leader, perform the `SELECT` query to the [system.replicas](../operations/system_tables.md#system_tables-replicas) table. Alternatively, it is easier to make a `DETACH` query on all replicas - all the replicas throw an exception, except the leader replica. -#### DROP PARTITION {#alter_drop-partition} +#### DROP PARTITION {#alter-drop-partition} -```sql +``` sql ALTER TABLE table_name DROP PARTITION partition_expr ``` @@ -239,17 +243,16 @@ Read about setting the partition expression in a section [How to specify the par The query is replicated – it deletes data on all replicas. -#### DROP DETACHED PARTITION|PART {#alter_drop-detached} +#### DROP DETACHED PARTITION\|PART {#alter-drop-detached} -```sql +``` sql ALTER TABLE table_name DROP DETACHED PARTITION|PART partition_expr ``` Removes the specified part or all parts of the specified partition from `detached`. Read more about setting the partition expression in a section [How to specify the partition expression](#alter-how-to-specify-part-expr). - -#### ATTACH PARTITION|PART {#alter_attach-partition} +#### ATTACH PARTITION\|PART {#alter-attach-partition} ``` sql ALTER TABLE table_name ATTACH PARTITION|PART partition_expr @@ -257,7 +260,7 @@ ALTER TABLE table_name ATTACH PARTITION|PART partition_expr Adds data to the table from the `detached` directory. It is possible to add data for an entire partition or for a separate part. Examples: -```sql +``` sql ALTER TABLE visits ATTACH PARTITION 201901; ALTER TABLE visits ATTACH PART 201901_2_2_0; ``` @@ -268,33 +271,33 @@ This query is replicated. The replica-initiator checks whether there is data in So you can put data to the `detached` directory on one replica, and use the `ALTER ... ATTACH` query to add it to the table on all replicas. -#### ATTACH PARTITION FROM {#alter_attach-partition-from} +#### ATTACH PARTITION FROM {#alter-attach-partition-from} -```sql +``` sql ALTER TABLE table2 ATTACH PARTITION partition_expr FROM table1 ``` -This query copies the data partition from the `table1` to `table2` adds data to exsisting in the `table2`. Note that data won't be deleted from `table1`. +This query copies the data partition from the `table1` to `table2` adds data to exsisting in the `table2`. Note that data won’t be deleted from `table1`. For the query to run successfully, the following conditions must be met: -- Both tables must have the same structure. -- Both tables must have the same partition key. +- Both tables must have the same structure. +- Both tables must have the same partition key. -#### REPLACE PARTITION {#alter_replace-partition} +#### REPLACE PARTITION {#alter-replace-partition} -```sql +``` sql ALTER TABLE table2 REPLACE PARTITION partition_expr FROM table1 ``` -This query copies the data partition from the `table1` to `table2` and replaces existing partition in the `table2`. Note that data won't be deleted from `table1`. +This query copies the data partition from the `table1` to `table2` and replaces existing partition in the `table2`. Note that data won’t be deleted from `table1`. For the query to run successfully, the following conditions must be met: -- Both tables must have the same structure. -- Both tables must have the same partition key. +- Both tables must have the same structure. +- Both tables must have the same partition key. 
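A hedged usage sketch (the `visits_bak` table is illustrative; both tables must satisfy the conditions above):

``` sql
-- Partition 201901 in visits_bak is replaced by a copy of the same partition
-- from visits; the source data in visits is left untouched.
ALTER TABLE visits_bak REPLACE PARTITION 201901 FROM visits
```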
-#### MOVE PARTITION TO TABLE {#alter_move_to_table-partition} +#### MOVE PARTITION TO TABLE {#alter-move-to-table-partition} ``` sql ALTER TABLE table_source MOVE PARTITION partition_expr TO TABLE table_dest @@ -304,16 +307,14 @@ This query move the data partition from the `table_source` to `table_dest` with For the query to run successfully, the following conditions must be met: -- Both tables must have the same structure. -- Both tables must have the same partition key. -- Both tables must be the same engine family. (replicated or non-replicated) -- Both tables must have the same storage policy. +- Both tables must have the same structure. +- Both tables must have the same partition key. +- Both tables must be the same engine family. (replicated or non-replicated) +- Both tables must have the same storage policy. +#### CLEAR COLUMN IN PARTITION {#alter-clear-column-partition} - -#### CLEAR COLUMN IN PARTITION {#alter_clear-column-partition} - -```sql +``` sql ALTER TABLE table_name CLEAR COLUMN column_name IN PARTITION partition_expr ``` @@ -321,13 +322,13 @@ Resets all values in the specified column in a partition. If the `DEFAULT` claus Example: -```sql +``` sql ALTER TABLE visits CLEAR COLUMN hour in PARTITION 201902 ``` -#### FREEZE PARTITION {#alter_freeze-partition} +#### FREEZE PARTITION {#alter-freeze-partition} -```sql +``` sql ALTER TABLE table_name FREEZE [PARTITION partition_expr] ``` @@ -336,17 +337,17 @@ This query creates a local backup of a specified partition. If the `PARTITION` c !!! note "Note" The entire backup process is performed without stopping the server. -Note that for old-styled tables you can specify the prefix of the partition name (for example, '2019') - then the query creates the backup for all the corresponding partitions. Read about setting the partition expression in a section [How to specify the partition expression](#alter-how-to-specify-part-expr). +Note that for old-styled tables you can specify the prefix of the partition name (for example, ‘2019’) - then the query creates the backup for all the corresponding partitions. Read about setting the partition expression in a section [How to specify the partition expression](#alter-how-to-specify-part-expr). At the time of execution, for a data snapshot, the query creates hardlinks to a table data. Hardlinks are placed in the directory `/var/lib/clickhouse/shadow/N/...`, where: -- `/var/lib/clickhouse/` is the working ClickHouse directory specified in the config. -- `N` is the incremental number of the backup. +- `/var/lib/clickhouse/` is the working ClickHouse directory specified in the config. +- `N` is the incremental number of the backup. !!! note "Note" If you use [a set of disks for data storage in a table](../operations/table_engines/mergetree.md#table_engine-mergetree-multiple-volumes), the `shadow/N` directory appears on every disk, storing data parts that matched by the `PARTITION` expression. -The same structure of directories is created inside the backup as inside `/var/lib/clickhouse/`. The query performs 'chmod' for all files, forbidding writing into them. +The same structure of directories is created inside the backup as inside `/var/lib/clickhouse/`. The query performs ‘chmod’ for all files, forbidding writing into them. After creating the backup, you can copy the data from `/var/lib/clickhouse/shadow/` to the remote server and then delete it from the local server. Note that the `ALTER t FREEZE PARTITION` query is not replicated. It creates a local backup only on the local server. 
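A hedged sketch, continuing with the illustrative `visits` table used above:

``` sql
-- Creates hardlinks for partition 201902 under /var/lib/clickhouse/shadow/<N>/
-- on the local server only; the table keeps serving reads and writes meanwhile.
ALTER TABLE visits FREEZE PARTITION 201902
```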
@@ -356,25 +357,25 @@ The query creates backup almost instantly (but first it waits for the current qu To restore data from a backup, do the following: -1. Create the table if it does not exist. To view the query, use the .sql file (replace `ATTACH` in it with `CREATE`). -2. Copy the data from the `data/database/table/` directory inside the backup to the `/var/lib/clickhouse/data/database/table/detached/` directory. -3. Run `ALTER TABLE t ATTACH PARTITION` queries to add the data to a table. +1. Create the table if it does not exist. To view the query, use the .sql file (replace `ATTACH` in it with `CREATE`). +2. Copy the data from the `data/database/table/` directory inside the backup to the `/var/lib/clickhouse/data/database/table/detached/` directory. +3. Run `ALTER TABLE t ATTACH PARTITION` queries to add the data to a table. -Restoring from a backup doesn't require stopping the server. +Restoring from a backup doesn’t require stopping the server. For more information about backups and restoring data, see the [Data Backup](../operations/backup.md) section. -#### CLEAR INDEX IN PARTITION {#alter_clear-index-partition} +#### CLEAR INDEX IN PARTITION {#alter-clear-index-partition} -```sql +``` sql ALTER TABLE table_name CLEAR INDEX index_name IN PARTITION partition_expr ``` The query works similar to `CLEAR COLUMN`, but it resets an index instead of a column data. -#### FETCH PARTITION {#alter_fetch-partition} +#### FETCH PARTITION {#alter-fetch-partition} -```sql +``` sql ALTER TABLE table_name FETCH PARTITION partition_expr FROM 'path-in-zookeeper' ``` @@ -382,41 +383,42 @@ Downloads a partition from another server. This query only works for the replica The query does the following: -1. Downloads the partition from the specified shard. In 'path-in-zookeeper' you must specify a path to the shard in ZooKeeper. -2. Then the query puts the downloaded data to the `detached` directory of the `table_name` table. Use the [ATTACH PARTITION|PART](#alter_attach-partition) query to add the data to the table. +1. Downloads the partition from the specified shard. In ‘path-in-zookeeper’ you must specify a path to the shard in ZooKeeper. +2. Then the query puts the downloaded data to the `detached` directory of the `table_name` table. Use the [ATTACH PARTITION\|PART](#alter_attach-partition) query to add the data to the table. For example: -```sql +``` sql ALTER TABLE users FETCH PARTITION 201902 FROM '/clickhouse/tables/01-01/visits'; ALTER TABLE users ATTACH PARTITION 201902; ``` + Note that: -- The `ALTER ... FETCH PARTITION` query isn't replicated. It places the partition to the `detached` directory only on the local server. -- The `ALTER TABLE ... ATTACH` query is replicated. It adds the data to all replicas. The data is added to one of the replicas from the `detached` directory, and to the others - from neighboring replicas. +- The `ALTER ... FETCH PARTITION` query isn’t replicated. It places the partition to the `detached` directory only on the local server. +- The `ALTER TABLE ... ATTACH` query is replicated. It adds the data to all replicas. The data is added to one of the replicas from the `detached` directory, and to the others - from neighboring replicas. Before downloading, the system checks if the partition exists and the table structure matches. The most appropriate replica is selected automatically from the healthy replicas. Although the query is called `ALTER TABLE`, it does not change the table structure and does not immediately change the data available in the table. 
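Before running the `ATTACH` step shown above, it can be useful to check what actually landed in the `detached` directory. A sketch using the `system.detached_parts` table referenced later in this section (the `users` table name and the exact column set are assumptions):

``` sql
-- List detached parts for the table that just received the fetched partition.
SELECT name, partition_id
FROM system.detached_parts
WHERE table = 'users'
```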
-#### MOVE PARTITION|PART {#alter_move-partition} +#### MOVE PARTITION\|PART {#alter-move-partition} Moves partitions or data parts to another volume or disk for `MergeTree`-engine tables. See [Using Multiple Block Devices for Data Storage](../operations/table_engines/mergetree.md#table_engine-mergetree-multiple-volumes). -```sql +``` sql ALTER TABLE table_name MOVE PARTITION|PART partition_expr TO DISK|VOLUME 'disk_name' ``` The `ALTER TABLE t MOVE` query: -- Not replicated, because different replicas can have different storage policies. -- Returns an error if the specified disk or volume is not configured. Query also returns an error if conditions of data moving, that specified in the storage policy, can't be applied. -- Can return an error in the case, when data to be moved is already moved by a background process, concurrent `ALTER TABLE t MOVE` query or as a result of background data merging. A user shouldn't perform any additional actions in this case. +- Not replicated, because different replicas can have different storage policies. +- Returns an error if the specified disk or volume is not configured. Query also returns an error if conditions of data moving, that specified in the storage policy, can’t be applied. +- Can return an error in the case, when data to be moved is already moved by a background process, concurrent `ALTER TABLE t MOVE` query or as a result of background data merging. A user shouldn’t perform any additional actions in this case. Example: -```sql +``` sql ALTER TABLE hits MOVE PART '20190301_14343_16206_438' TO VOLUME 'slow' ALTER TABLE hits MOVE PARTITION '2019-09-01' TO DISK 'fast_ssd' ``` @@ -425,10 +427,10 @@ ALTER TABLE hits MOVE PARTITION '2019-09-01' TO DISK 'fast_ssd' You can specify the partition expression in `ALTER ... PARTITION` queries in different ways: -- As a value from the `partition` column of the `system.parts` table. For example, `ALTER TABLE visits DETACH PARTITION 201901`. -- As the expression from the table column. Constants and constant expressions are supported. For example, `ALTER TABLE visits DETACH PARTITION toYYYYMM(toDate('2019-01-25'))`. -- Using the partition ID. Partition ID is a string identifier of the partition (human-readable, if possible) that is used as the names of partitions in the file system and in ZooKeeper. The partition ID must be specified in the `PARTITION ID` clause, in a single quotes. For example, `ALTER TABLE visits DETACH PARTITION ID '201901'`. -- In the [ALTER ATTACH PART](#alter_attach-partition) and [DROP DETACHED PART](#alter_drop-detached) query, to specify the name of a part, use string literal with a value from the `name` column of the [system.detached_parts](../operations/system_tables.md#system_tables-detached_parts) table. For example, `ALTER TABLE visits ATTACH PART '201901_1_1_0'`. +- As a value from the `partition` column of the `system.parts` table. For example, `ALTER TABLE visits DETACH PARTITION 201901`. +- As the expression from the table column. Constants and constant expressions are supported. For example, `ALTER TABLE visits DETACH PARTITION toYYYYMM(toDate('2019-01-25'))`. +- Using the partition ID. Partition ID is a string identifier of the partition (human-readable, if possible) that is used as the names of partitions in the file system and in ZooKeeper. The partition ID must be specified in the `PARTITION ID` clause, in a single quotes. For example, `ALTER TABLE visits DETACH PARTITION ID '201901'`. 
+- In the [ALTER ATTACH PART](#alter_attach-partition) and [DROP DETACHED PART](#alter_drop-detached) query, to specify the name of a part, use string literal with a value from the `name` column of the [system.detached\_parts](../operations/system_tables.md#system_tables-detached_parts) table. For example, `ALTER TABLE visits ATTACH PART '201901_1_1_0'`. Usage of quotes when specifying the partition depends on the type of partition expression. For example, for the `String` type, you have to specify its name in quotes (`'`). For the `Date` and `Int*` types no quotes are needed. @@ -436,22 +438,21 @@ For old-style tables, you can specify the partition either as a number `201901` All the rules above are also true for the [OPTIMIZE](misc.md#misc_operations-optimize) query. If you need to specify the only partition when optimizing a non-partitioned table, set the expression `PARTITION tuple()`. For example: -```sql +``` sql OPTIMIZE TABLE table_not_partitioned PARTITION tuple() FINAL; ``` The examples of `ALTER ... PARTITION` queries are demonstrated in the tests [`00502_custom_partitioning_local`](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00502_custom_partitioning_local.sql) and [`00502_custom_partitioning_replicated_zookeeper`](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00502_custom_partitioning_replicated_zookeeper.sql). - -### Manipulations with Table TTL +### Manipulations with Table TTL {#manipulations-with-table-ttl} You can change [table TTL](../operations/table_engines/mergetree.md#mergetree-table-ttl) with a request of the following form: -```sql +``` sql ALTER TABLE table-name MODIFY TTL ttl-expression ``` -### Synchronicity of ALTER Queries +### Synchronicity of ALTER Queries {#synchronicity-of-alter-queries} For non-replicatable tables, all `ALTER` queries are performed synchronously. For replicatable tables, the query just adds instructions for the appropriate actions to `ZooKeeper`, and the actions themselves are performed as soon as possible. However, the query can wait for these actions to be completed on all the replicas. @@ -466,19 +467,19 @@ Existing tables are ready for mutations as-is (no conversion necessary), but aft Currently available commands: -```sql +``` sql ALTER TABLE [db.]table DELETE WHERE filter_expr ``` The `filter_expr` must be of type `UInt8`. The query deletes rows in the table for which this expression takes a non-zero value. -```sql +``` sql ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] WHERE filter_expr ``` The `filter_expr` must be of type `UInt8`. This query updates values of specified columns to the values of corresponding expressions in rows for which the `filter_expr` takes a non-zero value. Values are casted to the column type using the `CAST` operator. Updating columns that are used in the calculation of the primary or the partition key is not supported. -```sql +``` sql ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name ``` @@ -486,7 +487,7 @@ The query rebuilds the secondary index `name` in the partition `partition_name`. One query can contain several commands separated by commas. -For *MergeTree tables mutations execute by rewriting whole data parts. There is no atomicity - parts are substituted for mutated parts as soon as they are ready and a `SELECT` query that started executing during a mutation will see data from parts that have already been mutated along with data from parts that have not been mutated yet. 
+For \*MergeTree tables mutations execute by rewriting whole data parts. There is no atomicity - parts are substituted for mutated parts as soon as they are ready and a `SELECT` query that started executing during a mutation will see data from parts that have already been mutated along with data from parts that have not been mutated yet. Mutations are totally ordered by their creation order and are applied to each part in that order. Mutations are also partially ordered with INSERTs - data that was inserted into the table before the mutation was submitted will be mutated and data that was inserted after that will not be mutated. Note that mutations do not block INSERTs in any way. diff --git a/docs/en/query_language/create.md b/docs/en/query_language/create.md index f399eb27a4f..a4a96ada7b1 100644 --- a/docs/en/query_language/create.md +++ b/docs/en/query_language/create.md @@ -1,39 +1,39 @@ -# CREATE Queries +# CREATE Queries {#create-queries} -## CREATE DATABASE {#query_language-create-database} +## CREATE DATABASE {#query-language-create-database} Creates database. -```sql +``` sql CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] [ENGINE = engine(...)] ``` -### Clauses +### Clauses {#clauses} -- `IF NOT EXISTS` +- `IF NOT EXISTS` - If the `db_name` database already exists, then ClickHouse doesn't create a new database and: + If the `db_name` database already exists, then ClickHouse doesn't create a new database and: - - Doesn't throw an exception if clause is specified. - - Throws an exception if clause isn't specified. + - Doesn't throw an exception if clause is specified. + - Throws an exception if clause isn't specified. -- `ON CLUSTER` +- `ON CLUSTER` - ClickHouse creates the `db_name` database on all the servers of a specified cluster. + ClickHouse creates the `db_name` database on all the servers of a specified cluster. -- `ENGINE` +- `ENGINE` - - [MySQL](../database_engines/mysql.md) + - [MySQL](../database_engines/mysql.md) - Allows you to retrieve data from the remote MySQL server. + Allows you to retrieve data from the remote MySQL server. - By default, ClickHouse uses its own [database engine](../database_engines/index.md). + By default, ClickHouse uses its own [database engine](../database_engines/index.md). ## CREATE TABLE {#create-table-query} The `CREATE TABLE` query can have several forms. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec] [TTL expr1], @@ -42,31 +42,31 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ) ENGINE = engine ``` -Creates a table named 'name' in the 'db' database or the current database if 'db' is not set, with the structure specified in brackets and the 'engine' engine. +Creates a table named ‘name’ in the ‘db’ database or the current database if ‘db’ is not set, with the structure specified in brackets and the ‘engine’ engine. The structure of the table is a list of column descriptions. If indexes are supported by the engine, they are indicated as parameters for the table engine. A column description is `name type` in the simplest case. Example: `RegionID UInt32`. Expressions can also be defined for default values (see below). -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name AS [db2.]name2 [ENGINE = engine] ``` Creates a table with the same structure as another table. You can specify a different engine for the table. 
If the engine is not specified, the same engine will be used as for the `db2.name2` table. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name AS table_function() ``` Creates a table with the structure and data returned by a [table function](table_functions/index.md). -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = engine AS SELECT ... ``` -Creates a table with a structure like the result of the `SELECT` query, with the 'engine' engine, and fills it with data from SELECT. +Creates a table with a structure like the result of the `SELECT` query, with the ‘engine’ engine, and fills it with data from SELECT. -In all cases, if `IF NOT EXISTS` is specified, the query won't return an error if the table already exists. In this case, the query won't do anything. +In all cases, if `IF NOT EXISTS` is specified, the query won’t return an error if the table already exists. In this case, the query won’t do anything. There can be other clauses after the `ENGINE` clause in the query. See detailed documentation on how to create tables in the descriptions of [table engines](../operations/table_engines/index.md#table_engines). @@ -77,26 +77,26 @@ Example: `URLDomain String DEFAULT domain(URL)`. If an expression for the default value is not defined, the default values will be set to zeros for numbers, empty strings for strings, empty arrays for arrays, and `0000-00-00` for dates or `0000-00-00 00:00:00` for dates with time. NULLs are not supported. -If the default expression is defined, the column type is optional. If there isn't an explicitly defined type, the default expression type is used. Example: `EventDate DEFAULT toDate(EventTime)` – the 'Date' type will be used for the 'EventDate' column. +If the default expression is defined, the column type is optional. If there isn’t an explicitly defined type, the default expression type is used. Example: `EventDate DEFAULT toDate(EventTime)` – the ‘Date’ type will be used for the ‘EventDate’ column. If the data type and default expression are defined explicitly, this expression will be cast to the specified type using type casting functions. Example: `Hits UInt32 DEFAULT 0` means the same thing as `Hits UInt32 DEFAULT toUInt32(0)`. -Default expressions may be defined as an arbitrary expression from table constants and columns. When creating and changing the table structure, it checks that expressions don't contain loops. For INSERT, it checks that expressions are resolvable – that all columns they can be calculated from have been passed. +Default expressions may be defined as an arbitrary expression from table constants and columns. When creating and changing the table structure, it checks that expressions don’t contain loops. For INSERT, it checks that expressions are resolvable – that all columns they can be calculated from have been passed. `DEFAULT expr` -Normal default value. If the INSERT query doesn't specify the corresponding column, it will be filled in by computing the corresponding expression. +Normal default value. If the INSERT query doesn’t specify the corresponding column, it will be filled in by computing the corresponding expression. `MATERIALIZED expr` -Materialized expression. Such a column can't be specified for INSERT, because it is always calculated. +Materialized expression. Such a column can’t be specified for INSERT, because it is always calculated. For an INSERT without a list of columns, these columns are not considered. 
In addition, this column is not substituted when using an asterisk in a SELECT query. This is to preserve the invariant that the dump obtained using `SELECT *` can be inserted back into the table using INSERT without specifying the list of columns. `ALIAS expr` -Synonym. Such a column isn't stored in the table at all. -Its values can't be inserted in a table, and it is not substituted when using an asterisk in a SELECT query. +Synonym. Such a column isn’t stored in the table at all. +Its values can’t be inserted in a table, and it is not substituted when using an asterisk in a SELECT query. It can be used in SELECTs if the alias is expanded during query parsing. When using the ALTER query to add new columns, old data for these columns is not written. Instead, when reading old data that does not have values for the new columns, expressions are computed on the fly by default. However, if running the expressions requires different columns that are not indicated in the query, these columns will additionally be read, but only for the blocks of data that need it. @@ -109,7 +109,7 @@ It is not possible to set default values for elements in nested data structures. Along with columns descriptions constraints could be defined: -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec] [TTL expr1], @@ -123,7 +123,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Adding large amount of constraints can negatively affect performance of big `INSERT` queries. -### TTL Expression +### TTL Expression {#ttl-expression} Defines storage time for values. Can be specified only for MergeTree-family tables. For the detailed description, see [TTL for columns and tables](../operations/table_engines/mergetree.md#table_engine-mergetree-ttl). @@ -131,7 +131,7 @@ Defines storage time for values. Can be specified only for MergeTree-family tabl By default, ClickHouse applies the `lz4` compression method. For `MergeTree`-engine family you can change the default compression method in the [compression](../operations/server_settings/settings.md#server-settings-compression) section of a server configuration. You can also define the compression method for each individual column in the `CREATE TABLE` query. -```sql +``` sql CREATE TABLE codec_example ( dt Date CODEC(ZSTD), @@ -144,34 +144,34 @@ ENGINE = ... ``` -If a codec is specified, the default codec doesn't apply. Codecs can be combined in a pipeline, for example, `CODEC(Delta, ZSTD)`. To select the best codec combination for you project, pass benchmarks similar to described in the Altinity [New Encodings to Improve ClickHouse Efficiency](https://www.altinity.com/blog/2019/7/new-encodings-to-improve-clickhouse) article. +If a codec is specified, the default codec doesn’t apply. Codecs can be combined in a pipeline, for example, `CODEC(Delta, ZSTD)`. To select the best codec combination for you project, pass benchmarks similar to described in the Altinity [New Encodings to Improve ClickHouse Efficiency](https://www.altinity.com/blog/2019/7/new-encodings-to-improve-clickhouse) article. -!!!warning "Warning" - You can't decompress ClickHouse database files with external utilities like `lz4`. Instead, use the special [clickhouse-compressor](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/programs/compressor) utility. +!!! warning "Warning" + You can’t decompress ClickHouse database files with external utilities like `lz4`. 
Instead, use the special [clickhouse-compressor](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/programs/compressor) utility. Compression is supported for the following table engines: -- [MergeTree](../operations/table_engines/mergetree.md) family. Supports column compression codecs and selecting the default compression method by [compression](../operations/server_settings/settings.md#server-settings-compression) settings. -- [Log](../operations/table_engines/log_family.md) family. Uses the `lz4` compression method by default and supports column compression codecs. -- [Set](../operations/table_engines/set.md). Only supported the default compression. -- [Join](../operations/table_engines/join.md). Only supported the default compression. +- [MergeTree](../operations/table_engines/mergetree.md) family. Supports column compression codecs and selecting the default compression method by [compression](../operations/server_settings/settings.md#server-settings-compression) settings. +- [Log](../operations/table_engines/log_family.md) family. Uses the `lz4` compression method by default and supports column compression codecs. +- [Set](../operations/table_engines/set.md). Only supported the default compression. +- [Join](../operations/table_engines/join.md). Only supported the default compression. ClickHouse supports common purpose codecs and specialized codecs. #### Specialized Codecs {#create-query-specialized-codecs} -These codecs are designed to make compression more effective by using specific features of data. Some of these codecs don't compress data themself. Instead, they prepare the data for a common purpose codec, which compresses it better than without this preparation. +These codecs are designed to make compression more effective by using specific features of data. Some of these codecs don’t compress data themself. Instead, they prepare the data for a common purpose codec, which compresses it better than without this preparation. Specialized codecs: -- `Delta(delta_bytes)` — Compression approach in which raw values are replaced by the difference of two neighboring values, except for the first value that stays unchanged. Up to `delta_bytes` are used for storing delta values, so `delta_bytes` is the maximum size of raw values. Possible `delta_bytes` values: 1, 2, 4, 8. The default value for `delta_bytes` is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, it's 1. -- `DoubleDelta` — Calculates delta of deltas and writes it in compact binary form. Optimal compression rates are achieved for monotonic sequences with a constant stride, such as time series data. Can be used with any fixed-width type. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Uses 1 extra bit for 32-byte deltas: 5-bit prefixes instead of 4-bit prefixes. For additional information, see Compressing Time Stamps in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). -- `Gorilla` — Calculates XOR between current and previous value and writes it in compact binary form. Efficient when storing a series of floating point values that change slowly, because the best compression rate is achieved when neighboring values are binary equal. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. For additional information, see Compressing Values in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). 
-- `T64` — Compression approach that crops unused high bits of values in integer data types (including `Enum`, `Date` and `DateTime`). At each step of its algorithm, codec takes a block of 64 values, puts them into 64x64 bit matrix, transposes it, crops the unused bits of values and returns the rest as a sequence. Unused bits are the bits, that don't differ between maximum and minimum values in the whole data part for which the compression is used. +- `Delta(delta_bytes)` — Compression approach in which raw values are replaced by the difference of two neighboring values, except for the first value that stays unchanged. Up to `delta_bytes` are used for storing delta values, so `delta_bytes` is the maximum size of raw values. Possible `delta_bytes` values: 1, 2, 4, 8. The default value for `delta_bytes` is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, it’s 1. +- `DoubleDelta` — Calculates delta of deltas and writes it in compact binary form. Optimal compression rates are achieved for monotonic sequences with a constant stride, such as time series data. Can be used with any fixed-width type. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Uses 1 extra bit for 32-byte deltas: 5-bit prefixes instead of 4-bit prefixes. For additional information, see Compressing Time Stamps in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). +- `Gorilla` — Calculates XOR between current and previous value and writes it in compact binary form. Efficient when storing a series of floating point values that change slowly, because the best compression rate is achieved when neighboring values are binary equal. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. For additional information, see Compressing Values in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). +- `T64` — Compression approach that crops unused high bits of values in integer data types (including `Enum`, `Date` and `DateTime`). At each step of its algorithm, codec takes a block of 64 values, puts them into 64x64 bit matrix, transposes it, crops the unused bits of values and returns the rest as a sequence. Unused bits are the bits, that don’t differ between maximum and minimum values in the whole data part for which the compression is used. `DoubleDelta` and `Gorilla` codecs are used in Gorilla TSDB as the components of its compressing algorithm. Gorilla approach is effective in scenarios when there is a sequence of slowly changing values with their timestamps. Timestamps are effectively compressed by the `DoubleDelta` codec, and values are effectively compressed by the `Gorilla` codec. For example, to get an effectively stored table, you can create it in the following configuration: -```sql +``` sql CREATE TABLE codec_example ( timestamp DateTime CODEC(DoubleDelta), @@ -184,27 +184,27 @@ ENGINE = MergeTree() Codecs: -- `NONE` — No compression. -- `LZ4` — Lossless [data compression algorithm](https://github.com/lz4/lz4) used by default. Applies LZ4 fast compression. -- `LZ4HC[(level)]` — LZ4 HC (high compression) algorithm with configurable level. Default level: 9. Setting `level <= 0` applies the default level. Possible levels: [1, 12]. Recommended level range: [4, 9]. -- `ZSTD[(level)]` — [ZSTD compression algorithm](https://en.wikipedia.org/wiki/Zstandard) with configurable `level`. Possible levels: [1, 22]. Default value: 1. 
+- `NONE` — No compression. +- `LZ4` — Lossless [data compression algorithm](https://github.com/lz4/lz4) used by default. Applies LZ4 fast compression. +- `LZ4HC[(level)]` — LZ4 HC (high compression) algorithm with configurable level. Default level: 9. Setting `level <= 0` applies the default level. Possible levels: \[1, 12\]. Recommended level range: \[4, 9\]. +- `ZSTD[(level)]` — [ZSTD compression algorithm](https://en.wikipedia.org/wiki/Zstandard) with configurable `level`. Possible levels: \[1, 22\]. Default value: 1. High compression levels are useful for asymmetric scenarios, like compress once, decompress repeatedly. Higher levels mean better compression and higher CPU usage. -## Temporary Tables +## Temporary Tables {#temporary-tables} ClickHouse supports temporary tables which have the following characteristics: -- Temporary tables disappear when the session ends, including if the connection is lost. -- A temporary table uses the Memory engine only. -- The DB can't be specified for a temporary table. It is created outside of databases. -- Impossible to create a temporary table with distributed DDL query on all cluster servers (by using `ON CLUSTER`): this table exists only in the current session. -- If a temporary table has the same name as another one and a query specifies the table name without specifying the DB, the temporary table will be used. -- For distributed query processing, temporary tables used in a query are passed to remote servers. +- Temporary tables disappear when the session ends, including if the connection is lost. +- A temporary table uses the Memory engine only. +- The DB can’t be specified for a temporary table. It is created outside of databases. +- Impossible to create a temporary table with distributed DDL query on all cluster servers (by using `ON CLUSTER`): this table exists only in the current session. +- If a temporary table has the same name as another one and a query specifies the table name without specifying the DB, the temporary table will be used. +- For distributed query processing, temporary tables used in a query are passed to remote servers. To create a temporary table, use the following syntax: -```sql +``` sql CREATE TEMPORARY TABLE [IF NOT EXISTS] table_name ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -215,45 +215,45 @@ CREATE TEMPORARY TABLE [IF NOT EXISTS] table_name In most cases, temporary tables are not created manually, but when using external data for a query, or for distributed `(GLOBAL) IN`. For more information, see the appropriate sections -It's possible to use tables with [ENGINE = Memory](../operations/table_engines/memory.md) instead of temporary tables. +It’s possible to use tables with [ENGINE = Memory](../operations/table_engines/memory.md) instead of temporary tables. -## Distributed DDL queries (ON CLUSTER clause) +## Distributed DDL queries (ON CLUSTER clause) {#distributed-ddl-queries-on-cluster-clause} The `CREATE`, `DROP`, `ALTER`, and `RENAME` queries support distributed execution on a cluster. For example, the following query creates the `all_hits` `Distributed` table on each host in `cluster`: -```sql +``` sql CREATE TABLE IF NOT EXISTS all_hits ON CLUSTER cluster (p Date, i Int32) ENGINE = Distributed(cluster, default, hits) ``` In order to run these queries correctly, each host must have the same cluster definition (to simplify syncing configs, you can use substitutions from ZooKeeper). They must also connect to the ZooKeeper servers. 
The local version of the query will eventually be implemented on each host in the cluster, even if some hosts are currently not available. The order for executing queries within a single host is guaranteed. -## CREATE VIEW +## CREATE VIEW {#create-view} -```sql +``` sql CREATE [MATERIALIZED] VIEW [IF NOT EXISTS] [db.]table_name [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ... ``` Creates a view. There are two types of views: normal and MATERIALIZED. -Normal views don't store any data, but just perform a read from another table. In other words, a normal view is nothing more than a saved query. When reading from a view, this saved query is used as a subquery in the FROM clause. +Normal views don’t store any data, but just perform a read from another table. In other words, a normal view is nothing more than a saved query. When reading from a view, this saved query is used as a subquery in the FROM clause. -As an example, assume you've created a view: +As an example, assume you’ve created a view: -```sql +``` sql CREATE VIEW view AS SELECT ... ``` and written a query: -```sql +``` sql SELECT a, b, c FROM view ``` This query is fully equivalent to using the subquery: -```sql +``` sql SELECT a, b, c FROM (SELECT ...) ``` @@ -261,23 +261,23 @@ Materialized views store data transformed by the corresponding SELECT query. When creating a materialized view without `TO [db].[table]`, you must specify ENGINE – the table engine for storing data. -When creating a materialized view with `TO [db].[table]`, you must not use `POPULATE`. +When creating a materialized view with `TO [db].[table]`, you must not use `POPULATE`. A materialized view is arranged as follows: when inserting data to the table specified in SELECT, part of the inserted data is converted by this SELECT query, and the result is inserted in the view. -If you specify POPULATE, the existing table data is inserted in the view when creating it, as if making a `CREATE TABLE ... AS SELECT ...` . Otherwise, the query contains only the data inserted in the table after creating the view. We don't recommend using POPULATE, since data inserted in the table during the view creation will not be inserted in it. +If you specify POPULATE, the existing table data is inserted in the view when creating it, as if making a `CREATE TABLE ... AS SELECT ...` . Otherwise, the query contains only the data inserted in the table after creating the view. We don’t recommend using POPULATE, since data inserted in the table during the view creation will not be inserted in it. -A `SELECT` query can contain `DISTINCT`, `GROUP BY`, `ORDER BY`, `LIMIT`... Note that the corresponding conversions are performed independently on each block of inserted data. For example, if `GROUP BY` is set, data is aggregated during insertion, but only within a single packet of inserted data. The data won't be further aggregated. The exception is when using an ENGINE that independently performs data aggregation, such as `SummingMergeTree`. +A `SELECT` query can contain `DISTINCT`, `GROUP BY`, `ORDER BY`, `LIMIT`… Note that the corresponding conversions are performed independently on each block of inserted data. For example, if `GROUP BY` is set, data is aggregated during insertion, but only within a single packet of inserted data. The data won’t be further aggregated. The exception is when using an ENGINE that independently performs data aggregation, such as `SummingMergeTree`. 
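A hedged sketch of the per-block aggregation behaviour described above (the `hits` source table and the `hits_daily` view are hypothetical and not part of the original text):

``` sql
-- Each inserted block is aggregated independently; SummingMergeTree then
-- merges the partial sums in the background, so totals converge after merges.
CREATE MATERIALIZED VIEW hits_daily
ENGINE = SummingMergeTree() ORDER BY EventDate
AS SELECT EventDate, count() AS Hits
FROM hits
GROUP BY EventDate
```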
-The execution of `ALTER` queries on materialized views has not been fully developed, so they might be inconvenient. If the materialized view uses the construction ``TO [db.]name``, you can ``DETACH`` the view, run ``ALTER`` for the target table, and then ``ATTACH`` the previously detached (``DETACH``) view. +The execution of `ALTER` queries on materialized views has not been fully developed, so they might be inconvenient. If the materialized view uses the construction `TO [db.]name`, you can `DETACH` the view, run `ALTER` for the target table, and then `ATTACH` the previously detached (`DETACH`) view. Views look the same as normal tables. For example, they are listed in the result of the `SHOW TABLES` query. -There isn't a separate query for deleting views. To delete a view, use `DROP TABLE`. +There isn’t a separate query for deleting views. To delete a view, use `DROP TABLE`. ## CREATE DICTIONARY {#create-dictionary-query} -```sql +``` sql CREATE DICTIONARY [IF NOT EXISTS] [db.]dictionary_name ( key1 type1 [DEFAULT|EXPRESSION expr1] [HIERARCHICAL|INJECTIVE|IS_OBJECT_ID], @@ -291,7 +291,7 @@ LAYOUT(LAYOUT_NAME([param_name param_value])) LIFETIME([MIN val1] MAX val2) ``` -Creates [external dictionary](dicts/external_dicts.md) with given [structure](dicts/external_dicts_dict_structure.md), [source](dicts/external_dicts_dict_sources.md), [layout](dicts/external_dicts_dict_layout.md) and [lifetime](dicts/external_dicts_dict_lifetime.md). +Creates [external dictionary](dicts/external_dicts.md) with given [structure](dicts/external_dicts_dict_structure.md), [source](dicts/external_dicts_dict_sources.md), [layout](dicts/external_dicts_dict_layout.md) and [lifetime](dicts/external_dicts_dict_lifetime.md). External dictionary structure consists of attributes. Dictionary attributes are specified similarly to table columns. The only required attribute property is its type, all other properties may have default values. diff --git a/docs/en/query_language/dicts/external_dicts.md b/docs/en/query_language/dicts/external_dicts.md index 46733654f68..6e9f2541447 100644 --- a/docs/en/query_language/dicts/external_dicts.md +++ b/docs/en/query_language/dicts/external_dicts.md @@ -1,20 +1,20 @@ -# External Dictionaries {#dicts-external_dicts} +# External Dictionaries {#dicts-external-dicts} -You can add your own dictionaries from various data sources. The data source for a dictionary can be a local text or executable file, an HTTP(s) resource, or another DBMS. For more information, see "[Sources for external dictionaries](external_dicts_dict_sources.md)". +You can add your own dictionaries from various data sources. The data source for a dictionary can be a local text or executable file, an HTTP(s) resource, or another DBMS. For more information, see “[Sources for external dictionaries](external_dicts_dict_sources.md)”. ClickHouse: -- Fully or partially stores dictionaries in RAM. -- Periodically updates dictionaries and dynamically loads missing values. In other words, dictionaries can be loaded dynamically. -- Allows to create external dictionaries with xml files or [DDL queries](../create.md#create-dictionary-query). +- Fully or partially stores dictionaries in RAM. +- Periodically updates dictionaries and dynamically loads missing values. In other words, dictionaries can be loaded dynamically. +- Allows to create external dictionaries with xml files or [DDL queries](../create.md#create-dictionary-query). -The configuration of external dictionaries can be located in one or more xml-files. 
The path to the configuration is specified in the [dictionaries_config](../../operations/server_settings/settings.md#server_settings-dictionaries_config) parameter. +The configuration of external dictionaries can be located in one or more xml-files. The path to the configuration is specified in the [dictionaries\_config](../../operations/server_settings/settings.md#server_settings-dictionaries_config) parameter. -Dictionaries can be loaded at server startup or at first use, depending on the [dictionaries_lazy_load](../../operations/server_settings/settings.md#server_settings-dictionaries_lazy_load) setting. +Dictionaries can be loaded at server startup or at first use, depending on the [dictionaries\_lazy\_load](../../operations/server_settings/settings.md#server_settings-dictionaries_lazy_load) setting. The dictionary configuration file has the following format: -```xml +``` xml An optional element with any content. Ignored by the ClickHouse server. @@ -32,18 +32,18 @@ The dictionary configuration file has the following format: You can [configure](external_dicts_dict.md) any number of dictionaries in the same file. -[DDL queries for dictionaries](../create.md#create-dictionary-query) doesn't require any additional records in server configuration. They allow to work with dictionaries as first-class entities, like tables or views. +[DDL queries for dictionaries](../create.md#create-dictionary-query) doesn’t require any additional records in server configuration. They allow to work with dictionaries as first-class entities, like tables or views. !!! attention "Attention" You can convert values for a small dictionary by describing it in a `SELECT` query (see the [transform](../functions/other_functions.md) function). This functionality is not related to external dictionaries. ## See also {#ext-dicts-see-also} -- [Configuring an External Dictionary](external_dicts_dict.md) -- [Storing Dictionaries in Memory](external_dicts_dict_layout.md) -- [Dictionary Updates](external_dicts_dict_lifetime.md) -- [Sources of External Dictionaries](external_dicts_dict_sources.md) -- [Dictionary Key and Fields](external_dicts_dict_structure.md) -- [Functions for Working with External Dictionaries](../functions/ext_dict_functions.md) +- [Configuring an External Dictionary](external_dicts_dict.md) +- [Storing Dictionaries in Memory](external_dicts_dict_layout.md) +- [Dictionary Updates](external_dicts_dict_lifetime.md) +- [Sources of External Dictionaries](external_dicts_dict_sources.md) +- [Dictionary Key and Fields](external_dicts_dict_structure.md) +- [Functions for Working with External Dictionaries](../functions/ext_dict_functions.md) [Original article](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts/) diff --git a/docs/en/query_language/dicts/external_dicts_dict.md b/docs/en/query_language/dicts/external_dicts_dict.md index fba76199784..f92c5b33791 100644 --- a/docs/en/query_language/dicts/external_dicts_dict.md +++ b/docs/en/query_language/dicts/external_dicts_dict.md @@ -1,8 +1,8 @@ -# Configuring an External Dictionary {#dicts-external_dicts_dict} +# Configuring an External Dictionary {#dicts-external-dicts-dict} If dictionary is configured using xml file, than dictionary configuration has the following structure: -```xml +``` xml dict_name @@ -26,7 +26,7 @@ If dictionary is configured using xml file, than dictionary configuration has th Corresponding [DDL-query](../create.md#create-dictionary-query) has the following structure: -```sql +``` sql CREATE DICTIONARY dict_name ( ... 
-- attributes @@ -37,10 +37,10 @@ LAYOUT(...) -- Memory layout configuration LIFETIME(...) -- Lifetime of dictionary in memory ``` -- `name` – The identifier that can be used to access the dictionary. Use the characters `[a-zA-Z0-9_\-]`. -- [source](external_dicts_dict_sources.md) — Source of the dictionary. -- [layout](external_dicts_dict_layout.md) — Dictionary layout in memory. -- [structure](external_dicts_dict_structure.md) — Structure of the dictionary . A key and attributes that can be retrieved by this key. -- [lifetime](external_dicts_dict_lifetime.md) — Frequency of dictionary updates. +- `name` – The identifier that can be used to access the dictionary. Use the characters `[a-zA-Z0-9_\-]`. +- [source](external_dicts_dict_sources.md) — Source of the dictionary. +- [layout](external_dicts_dict_layout.md) — Dictionary layout in memory. +- [structure](external_dicts_dict_structure.md) — Structure of the dictionary . A key and attributes that can be retrieved by this key. +- [lifetime](external_dicts_dict_lifetime.md) — Frequency of dictionary updates. [Original article](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts_dict/) diff --git a/docs/en/query_language/dicts/external_dicts_dict_hierarchical.md b/docs/en/query_language/dicts/external_dicts_dict_hierarchical.md index bca5df38110..0f94eac6149 100644 --- a/docs/en/query_language/dicts/external_dicts_dict_hierarchical.md +++ b/docs/en/query_language/dicts/external_dicts_dict_hierarchical.md @@ -1,17 +1,17 @@ -# Hierarchical Dictionaries +# Hierarchical Dictionaries {#hierarchical-dictionaries} ClickHouse supports hierarchical dictionaries with a [numeric key](external_dicts_dict_structure.md#ext_dict-numeric-key). Look at the following hierarchical structure: -```text +``` text 0 (Common parent) │ ├── 1 (Russia) │ │ -│ └── 2 (Moscow) +│ └── 2 (Moscow) │ │ -│ └── 3 (Center) +│ └── 3 (Center) │ └── 4 (Great Britain) │ @@ -20,13 +20,13 @@ Look at the following hierarchical structure: This hierarchy can be expressed as the following dictionary table. -region_id | parent_region | region_name -----|--------------|------ -1 | 0 | Russia -2 | 1 | Moscow -3 | 2 | Center -4 | 0 | Great Britain -5 | 4 | London +| region\_id | parent\_region | region\_name | +|------------|----------------|---------------| +| 1 | 0 | Russia | +| 2 | 1 | Moscow | +| 3 | 2 | Center | +| 4 | 0 | Great Britain | +| 5 | 4 | London | This table contains a column `parent_region` that contains the key of the nearest parent for the element. @@ -36,7 +36,7 @@ The [dictGetHierarchy](../functions/ext_dict_functions.md#dictgethierarchy) func For our example, the structure of dictionary can be the following: -```xml +``` xml diff --git a/docs/en/query_language/dicts/external_dicts_dict_layout.md b/docs/en/query_language/dicts/external_dicts_dict_layout.md index 87034a5aa14..7426c281c47 100644 --- a/docs/en/query_language/dicts/external_dicts_dict_layout.md +++ b/docs/en/query_language/dicts/external_dicts_dict_layout.md @@ -1,26 +1,26 @@ -# Storing Dictionaries in Memory {#dicts-external_dicts_dict_layout} +# Storing Dictionaries in Memory {#dicts-external-dicts-dict-layout} There are a variety of ways to store dictionaries in memory. -We recommend [flat](#flat), [hashed](#dicts-external_dicts_dict_layout-hashed) and [complex_key_hashed](#complex-key-hashed). which provide optimal processing speed. +We recommend [flat](#flat), [hashed](#dicts-external_dicts_dict_layout-hashed) and [complex\_key\_hashed](#complex-key-hashed). 
which provide optimal processing speed. -Caching is not recommended because of potentially poor performance and difficulties in selecting optimal parameters. Read more in the section "[cache](#cache)". +Caching is not recommended because of potentially poor performance and difficulties in selecting optimal parameters. Read more in the section “[cache](#cache)”. There are several ways to improve dictionary performance: -- Call the function for working with the dictionary after `GROUP BY`. -- Mark attributes to extract as injective. An attribute is called injective if different attribute values correspond to different keys. So when `GROUP BY` uses a function that fetches an attribute value by the key, this function is automatically taken out of `GROUP BY`. +- Call the function for working with the dictionary after `GROUP BY`. +- Mark attributes to extract as injective. An attribute is called injective if different attribute values correspond to different keys. So when `GROUP BY` uses a function that fetches an attribute value by the key, this function is automatically taken out of `GROUP BY`. ClickHouse generates an exception for errors with dictionaries. Examples of errors: -- The dictionary being accessed could not be loaded. -- Error querying a `cached` dictionary. +- The dictionary being accessed could not be loaded. +- Error querying a `cached` dictionary. You can view the list of external dictionaries and their statuses in the `system.dictionaries` table. The configuration looks like this: -```xml +``` xml ... @@ -36,30 +36,29 @@ The configuration looks like this: Corresponding [DDL-query](../create.md#create-dictionary-query): -```sql +``` sql CREATE DICTIONARY (...) ... LAYOUT(LAYOUT_TYPE(param value)) -- layout settings ... ``` +## Ways to Store Dictionaries in Memory {#ways-to-store-dictionaries-in-memory} -## Ways to Store Dictionaries in Memory +- [flat](#flat) +- [hashed](#dicts-external_dicts_dict_layout-hashed) +- [sparse\_hashed](#dicts-external_dicts_dict_layout-sparse_hashed) +- [cache](#cache) +- [range\_hashed](#range-hashed) +- [complex\_key\_hashed](#complex-key-hashed) +- [complex\_key\_cache](#complex-key-cache) +- [ip\_trie](#ip-trie) -- [flat](#flat) -- [hashed](#dicts-external_dicts_dict_layout-hashed) -- [sparse_hashed](#dicts-external_dicts_dict_layout-sparse_hashed) -- [cache](#cache) -- [range_hashed](#range-hashed) -- [complex_key_hashed](#complex-key-hashed) -- [complex_key_cache](#complex-key-cache) -- [ip_trie](#ip-trie) - -### flat +### flat {#flat} The dictionary is completely stored in memory in the form of flat arrays. How much memory does the dictionary use? The amount is proportional to the size of the largest key (in space used). -The dictionary key has the ` UInt64` type and the value is limited to 500,000. If a larger key is discovered when creating the dictionary, ClickHouse throws an exception and does not create the dictionary. +The dictionary key has the `UInt64` type and the value is limited to 500,000. If a larger key is discovered when creating the dictionary, ClickHouse throws an exception and does not create the dictionary. All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety. 
@@ -67,7 +66,7 @@ This method provides the best performance among all available methods of storing Configuration example: -```xml +``` xml @@ -75,11 +74,11 @@ Configuration example: or -```sql +``` sql LAYOUT(FLAT()) ``` -### hashed {#dicts-external_dicts_dict_layout-hashed} +### hashed {#dicts-external-dicts-dict-layout-hashed} The dictionary is completely stored in memory in the form of a hash table. The dictionary can contain any number of elements with any identifiers In practice, the number of keys can reach tens of millions of items. @@ -87,7 +86,7 @@ All types of sources are supported. When updating, data (from a file or from a t Configuration example: -```xml +``` xml @@ -95,43 +94,43 @@ Configuration example: or -```sql +``` sql LAYOUT(HASHED()) ``` -### sparse_hashed {#dicts-external_dicts_dict_layout-sparse_hashed} +### sparse\_hashed {#dicts-external-dicts-dict-layout-sparse-hashed} Similar to `hashed`, but uses less memory in favor more CPU usage. Configuration example: -```xml +``` xml ``` -```sql +``` sql LAYOUT(SPARSE_HASHED()) ``` -### complex_key_hashed +### complex\_key\_hashed {#complex-key-hashed} This type of storage is for use with composite [keys](external_dicts_dict_structure.md). Similar to `hashed`. Configuration example: -```xml +``` xml ``` -```sql +``` sql LAYOUT(COMPLEX_KEY_HASHED()) ``` -### range_hashed +### range\_hashed {#range-hashed} The dictionary is stored in memory in the form of a hash table with an ordered array of ranges and their corresponding values. @@ -139,7 +138,7 @@ This storage method works the same way as hashed and allows using date/time (arb Example: The table contains discounts for each advertiser in the format: -```text +``` text +---------------|---------------------|-------------------|--------+ | advertiser id | discount start date | discount end date | amount | +===============+=====================+===================+========+ @@ -151,11 +150,11 @@ Example: The table contains discounts for each advertiser in the format: +---------------|---------------------|-------------------|--------+ ``` -To use a sample for date ranges, define the `range_min` and `range_max` elements in the [structure](external_dicts_dict_structure.md). These elements must contain elements `name` and` type` (if `type` is not specified, the default type will be used - Date). `type` can be any numeric type (Date / DateTime / UInt64 / Int32 / others). +To use a sample for date ranges, define the `range_min` and `range_max` elements in the [structure](external_dicts_dict_structure.md). These elements must contain elements `name` and`type` (if `type` is not specified, the default type will be used - Date). `type` can be any numeric type (Date / DateTime / UInt64 / Int32 / others). Example: -```xml +``` xml Id @@ -173,7 +172,7 @@ Example: or -```sql +``` sql CREATE DICTIONARY somedict ( id UInt64, first Date, @@ -186,7 +185,7 @@ RANGE(MIN first MAX last) To work with these dictionaries, you need to pass an additional argument to the `dictGetT` function, for which a range is selected: -```sql +``` sql dictGetT('dict_name', 'attr_name', id, date) ``` @@ -194,13 +193,13 @@ This function returns the value for the specified `id`s and the date range that Details of the algorithm: -- If the ` id` is not found or a range is not found for the ` id`, it returns the default value for the dictionary. -- If there are overlapping ranges, you can use any. -- If the range delimiter is `NULL` or an invalid date (such as 1900-01-01 or 2039-01-01), the range is left open. 
The range can be open on both sides. +- If the `id` is not found or a range is not found for the `id`, it returns the default value for the dictionary. +- If there are overlapping ranges, you can use any. +- If the range delimiter is `NULL` or an invalid date (such as 1900-01-01 or 2039-01-01), the range is left open. The range can be open on both sides. Configuration example: -```xml +``` xml @@ -235,7 +234,7 @@ Configuration example: or -```sql +``` sql CREATE DICTIONARY somedict( Abcdef UInt64, StartTimeStamp UInt64, @@ -246,22 +245,22 @@ PRIMARY KEY Abcdef RANGE(MIN StartTimeStamp MAX EndTimeStamp) ``` -### cache +### cache {#cache} The dictionary is stored in a cache that has a fixed number of cells. These cells contain frequently used elements. -When searching for a dictionary, the cache is searched first. For each block of data, all keys that are not found in the cache or are outdated are requested from the source using ` SELECT attrs... FROM db.table WHERE id IN (k1, k2, ...)`. The received data is then written to the cache. +When searching for a dictionary, the cache is searched first. For each block of data, all keys that are not found in the cache or are outdated are requested from the source using `SELECT attrs... FROM db.table WHERE id IN (k1, k2, ...)`. The received data is then written to the cache. -For cache dictionaries, the expiration [lifetime](external_dicts_dict_lifetime.md) of data in the cache can be set. If more time than `lifetime` has passed since loading the data in a cell, the cell's value is not used, and it is re-requested the next time it needs to be used. +For cache dictionaries, the expiration [lifetime](external_dicts_dict_lifetime.md) of data in the cache can be set. If more time than `lifetime` has passed since loading the data in a cell, the cell’s value is not used, and it is re-requested the next time it needs to be used. This is the least effective of all the ways to store dictionaries. The speed of the cache depends strongly on correct settings and the usage scenario. A cache type dictionary performs well only when the hit rates are high enough (recommended 99% and higher). You can view the average hit rate in the `system.dictionaries` table. -To improve cache performance, use a subquery with ` LIMIT`, and call the function with the dictionary externally. +To improve cache performance, use a subquery with `LIMIT`, and call the function with the dictionary externally. Supported [sources](external_dicts_dict_sources.md): MySQL, ClickHouse, executable, HTTP. Example of settings: -```xml +``` xml @@ -272,33 +271,31 @@ Example of settings: or -```sql +``` sql LAYOUT(CACHE(SIZE_IN_CELLS 1000000000)) ``` Set a large enough cache size. You need to experiment to select the number of cells: -1. Set some value. -2. Run queries until the cache is completely full. -3. Assess memory consumption using the `system.dictionaries` table. -4. Increase or decrease the number of cells until the required memory consumption is reached. +1. Set some value. +2. Run queries until the cache is completely full. +3. Assess memory consumption using the `system.dictionaries` table. +4. Increase or decrease the number of cells until the required memory consumption is reached. -!!! warning +!!! warning "Warning" Do not use ClickHouse as a source, because it is slow to process queries with random reads. - -### complex_key_cache +### complex\_key\_cache {#complex-key-cache} This type of storage is for use with composite [keys](external_dicts_dict_structure.md). Similar to `cache`. 
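The cache-sizing steps above rely on inspecting `system.dictionaries`. A possible query for that inspection (a sketch; the selected columns are an assumption based on how the table is referenced in this section):

``` sql
-- Check size and effectiveness of loaded dictionaries, including the cache hit rate.
SELECT name, element_count, bytes_allocated, hit_rate
FROM system.dictionaries
```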
- -### ip_trie +### ip\_trie {#ip-trie} This type of storage is for mapping network prefixes (IP addresses) to metadata such as ASN. Example: The table contains network prefixes and their corresponding AS number and country code: -```text +``` text +-----------------|-------|--------+ | prefix | asn | cca2 | +=================+=======+========+ @@ -316,7 +313,7 @@ When using this type of layout, the structure must have a composite key. Example: -```xml +``` xml @@ -339,7 +336,7 @@ Example: or -```sql +``` sql CREATE DICTIONARY somedict ( prefix String, asn UInt32, @@ -352,13 +349,13 @@ The key must have only one String type attribute that contains an allowed IP pre For queries, you must use the same functions (`dictGetT` with a tuple) as for dictionaries with composite keys: -```sql +``` sql dictGetT('dict_name', 'attr_name', tuple(ip)) ``` The function takes either `UInt32` for IPv4, or `FixedString(16)` for IPv6: -```sql +``` sql dictGetString('prefix', 'asn', tuple(IPv6StringToNum('2001:db8::1'))) ``` diff --git a/docs/en/query_language/dicts/external_dicts_dict_lifetime.md b/docs/en/query_language/dicts/external_dicts_dict_lifetime.md index dabb07f82af..12fca45e39f 100644 --- a/docs/en/query_language/dicts/external_dicts_dict_lifetime.md +++ b/docs/en/query_language/dicts/external_dicts_dict_lifetime.md @@ -1,5 +1,4 @@ - -# Dictionary Updates +# Dictionary Updates {#dictionary-updates} ClickHouse periodically updates the dictionaries. The update interval for fully downloaded dictionaries and the invalidation interval for cached dictionaries are defined in the `` tag in seconds. @@ -7,7 +6,7 @@ Dictionary updates (other than loading for first use) do not block queries. Duri Example of settings: -```xml +``` xml ... 300 @@ -15,7 +14,7 @@ Example of settings: ``` -```sql +``` sql CREATE DICTIONARY (...) ... LIFETIME(300) @@ -28,7 +27,7 @@ You can set a time interval for upgrades, and ClickHouse will choose a uniformly Example of settings: -```xml +``` xml ... @@ -41,24 +40,24 @@ Example of settings: or -```sql +``` sql LIFETIME(MIN 300 MAX 360) ``` -When upgrading the dictionaries, the ClickHouse server applies different logic depending on the type of [ source](external_dicts_dict_sources.md): +When upgrading the dictionaries, the ClickHouse server applies different logic depending on the type of [source](external_dicts_dict_sources.md): -- For a text file, it checks the time of modification. If the time differs from the previously recorded time, the dictionary is updated. -- For MyISAM tables, the time of modification is checked using a `SHOW TABLE STATUS` query. -- Dictionaries from other sources are updated every time by default. +- For a text file, it checks the time of modification. If the time differs from the previously recorded time, the dictionary is updated. +- For MyISAM tables, the time of modification is checked using a `SHOW TABLE STATUS` query. +- Dictionaries from other sources are updated every time by default. For MySQL (InnoDB), ODBC and ClickHouse sources, you can set up a query that will update the dictionaries only if they really changed, rather than each time. To do this, follow these steps: -- The dictionary table must have a field that always changes when the source data is updated. -- The settings of the source must specify a query that retrieves the changing field. The ClickHouse server interprets the query result as a row, and if this row has changed relative to its previous state, the dictionary is updated. 
Specify the query in the `` field in the settings for the [source](external_dicts_dict_sources.md). +- The dictionary table must have a field that always changes when the source data is updated. +- The settings of the source must specify a query that retrieves the changing field. The ClickHouse server interprets the query result as a row, and if this row has changed relative to its previous state, the dictionary is updated. Specify the query in the `` field in the settings for the [source](external_dicts_dict_sources.md). Example of settings: -```xml +``` xml ... @@ -71,11 +70,10 @@ Example of settings: or -```sql +``` sql ... SOURCE(ODBC(... invalidate_query 'SELECT update_time FROM dictionary_source where id = 1')) ... ``` - [Original article](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts_dict_lifetime/) diff --git a/docs/en/query_language/dicts/external_dicts_dict_sources.md b/docs/en/query_language/dicts/external_dicts_dict_sources.md index b52b07b4256..d7ec3741006 100644 --- a/docs/en/query_language/dicts/external_dicts_dict_sources.md +++ b/docs/en/query_language/dicts/external_dicts_dict_sources.md @@ -1,10 +1,10 @@ -# Sources of External Dictionaries {#dicts-external_dicts_dict_sources} +# Sources of External Dictionaries {#dicts-external-dicts-dict-sources} An external dictionary can be connected from many different sources. If dictionary is configured using xml-file, the configuration looks like this: -```xml +``` xml ... @@ -21,7 +21,7 @@ If dictionary is configured using xml-file, the configuration looks like this: In case of [DDL-query](../create.md#create-dictionary-query), equal configuration will looks like: -```sql +``` sql CREATE DICTIONARY dict_name (...) ... SOURCE(SOURCE_TYPE(param1 val1 ... paramN valN)) -- Source configuration @@ -32,22 +32,21 @@ The source is configured in the `source` section. Types of sources (`source_type`): -- [Local file](#dicts-external_dicts_dict_sources-local_file) -- [Executable file](#dicts-external_dicts_dict_sources-executable) -- [HTTP(s)](#dicts-external_dicts_dict_sources-http) -- DBMS - - [ODBC](#dicts-external_dicts_dict_sources-odbc) - - [MySQL](#dicts-external_dicts_dict_sources-mysql) - - [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) - - [MongoDB](#dicts-external_dicts_dict_sources-mongodb) - - [Redis](#dicts-external_dicts_dict_sources-redis) +- [Local file](#dicts-external_dicts_dict_sources-local_file) +- [Executable file](#dicts-external_dicts_dict_sources-executable) +- [HTTP(s)](#dicts-external_dicts_dict_sources-http) +- DBMS + - [ODBC](#dicts-external_dicts_dict_sources-odbc) + - [MySQL](#dicts-external_dicts_dict_sources-mysql) + - [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) + - [MongoDB](#dicts-external_dicts_dict_sources-mongodb) + - [Redis](#dicts-external_dicts_dict_sources-redis) - -## Local File {#dicts-external_dicts_dict_sources-local_file} +## Local File {#dicts-external-dicts-dict-sources-local-file} Example of settings: -```xml +``` xml /opt/dictionaries/os.tsv @@ -58,23 +57,22 @@ Example of settings: or -```sql +``` sql SOURCE(FILE(path '/opt/dictionaries/os.tsv' format 'TabSeparated')) ``` Setting fields: -- `path` – The absolute path to the file. -- `format` – The file format. All the formats described in "[Formats](../../interfaces/formats.md#formats)" are supported. +- `path` – The absolute path to the file. +- `format` – The file format. All the formats described in “[Formats](../../interfaces/formats.md#formats)” are supported. 
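For illustration, the `FILE` source above can be combined with a layout and a lifetime into a complete dictionary roughly as follows; the dictionary name, attribute names and the path are placeholders, and depending on server settings the file may need to live in a directory the server is allowed to read dictionaries from:

``` sql
-- Minimal sketch of a dictionary read from a local TSV file (names and path are placeholders).
CREATE DICTIONARY os_dict (
    id UInt64,
    os_name String DEFAULT ''
)
PRIMARY KEY id
SOURCE(FILE(path '/opt/dictionaries/os.tsv' format 'TabSeparated'))
LAYOUT(FLAT())
LIFETIME(300)
```

It can then be queried with, for example, `dictGetString('os_dict', 'os_name', toUInt64(42))`.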
+## Executable File {#dicts-external-dicts-dict-sources-executable} -## Executable File {#dicts-external_dicts_dict_sources-executable} - -Working with executable files depends on [how the dictionary is stored in memory](external_dicts_dict_layout.md). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request to the executable file's STDIN. Otherwise, ClickHouse starts executable file and treats its output as dictionary data. +Working with executable files depends on [how the dictionary is stored in memory](external_dicts_dict_layout.md). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request to the executable file’s STDIN. Otherwise, ClickHouse starts executable file and treats its output as dictionary data. Example of settings: -```xml +``` xml cat /opt/dictionaries/os.tsv @@ -85,23 +83,22 @@ Example of settings: or -```sql +``` sql SOURCE(EXECUTABLE(command 'cat /opt/dictionaries/os.tsv' format 'TabSeparated')) ``` Setting fields: -- `command` – The absolute path to the executable file, or the file name (if the program directory is written to `PATH`). -- `format` – The file format. All the formats described in "[Formats](../../interfaces/formats.md#formats)" are supported. +- `command` – The absolute path to the executable file, or the file name (if the program directory is written to `PATH`). +- `format` – The file format. All the formats described in “[Formats](../../interfaces/formats.md#formats)” are supported. - -## HTTP(s) {#dicts-external_dicts_dict_sources-http} +## HTTP(s) {#dicts-external-dicts-dict-sources-http} Working with an HTTP(s) server depends on [how the dictionary is stored in memory](external_dicts_dict_layout.md). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request via the `POST` method. Example of settings: -```xml +``` xml http://[::1]/os.tsv @@ -122,7 +119,7 @@ Example of settings: or -```sql +``` sql SOURCE(HTTP( url 'http://[::1]/os.tsv' format 'TabSeparated' @@ -135,24 +132,23 @@ In order for ClickHouse to access an HTTPS resource, you must [configure openSSL Setting fields: -- `url` – The source URL. -- `format` – The file format. All the formats described in "[Formats](../../interfaces/formats.md#formats)" are supported. -- `credentials` – Basic HTTP authentication. Optional parameter. - - `user` – Username required for the authentication. - - `password` – Password required for the authentication. -- `headers` – All custom HTTP headers entries used for the HTTP request. Optional parameter. - - `header` – Single HTTP header entry. - - `name` – Identifiant name used for the header send on the request. - - `value` – Value set for a specific identifiant name. +- `url` – The source URL. +- `format` – The file format. All the formats described in “[Formats](../../interfaces/formats.md#formats)” are supported. +- `credentials` – Basic HTTP authentication. Optional parameter. + - `user` – Username required for the authentication. + - `password` – Password required for the authentication. +- `headers` – All custom HTTP headers entries used for the HTTP request. Optional parameter. + - `header` – Single HTTP header entry. + - `name` – Identifiant name used for the header send on the request. + - `value` – Value set for a specific identifiant name. 
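As a sketch, the `credentials` and `headers` fields described above map onto the DDL form roughly as follows; the URL, user, password and header values are placeholders, and the exact nesting should be checked against the DDL grammar of your ClickHouse version:

``` sql
-- HTTP(s) source with basic authentication and a custom header (all values are placeholders).
SOURCE(HTTP(
    url 'https://example.com/os.tsv'
    format 'TabSeparated'
    credentials(user 'user' password 'password')
    headers(header(name 'API-KEY' value 'key'))
))
```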
- -## ODBC {#dicts-external_dicts_dict_sources-odbc} +## ODBC {#dicts-external-dicts-dict-sources-odbc} You can use this method to connect any database that has an ODBC driver. Example of settings: -```xml +``` xml DatabaseName @@ -165,7 +161,7 @@ Example of settings: or -```sql +``` sql SOURCE(ODBC( db 'DatabaseName' table 'SchemaName.TableName' @@ -176,25 +172,25 @@ SOURCE(ODBC( Setting fields: -- `db` – Name of the database. Omit it if the database name is set in the `` parameters. -- `table` – Name of the table and schema if exists. -- `connection_string` – Connection string. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](external_dicts_dict_lifetime.md). +- `db` – Name of the database. Omit it if the database name is set in the `` parameters. +- `table` – Name of the table and schema if exists. +- `connection_string` – Connection string. +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](external_dicts_dict_lifetime.md). -ClickHouse receives quoting symbols from ODBC-driver and quote all settings in queries to driver, so it's necessary to set table name accordingly to table name case in database. +ClickHouse receives quoting symbols from ODBC-driver and quote all settings in queries to driver, so it’s necessary to set table name accordingly to table name case in database. If you have a problems with encodings when using Oracle, see the corresponding [FAQ](../../faq/general.md#oracle-odbc-encodings) article. -### Known vulnerability of the ODBC dictionary functionality +### Known vulnerability of the ODBC dictionary functionality {#known-vulnerability-of-the-odbc-dictionary-functionality} -!!! attention +!!! attention "Attention" When connecting to the database through the ODBC driver connection parameter `Servername` can be substituted. In this case values of `USERNAME` and `PASSWORD` from `odbc.ini` are sent to the remote server and can be compromised. **Example of insecure use** -Let's configure unixODBC for PostgreSQL. Content of `/etc/odbc.ini`: +Let’s configure unixODBC for PostgreSQL. Content of `/etc/odbc.ini`: -```text +``` text [gregtest] Driver = /usr/lib/psqlodbca.so Servername = localhost @@ -207,24 +203,25 @@ PASSWORD = test If you then make a query such as -```sql -SELECT * FROM odbc('DSN=gregtest;Servername=some-server.com', 'test_db'); +``` sql +SELECT * FROM odbc('DSN=gregtest;Servername=some-server.com', 'test_db'); ``` ODBC driver will send values of `USERNAME` and `PASSWORD` from `odbc.ini` to `some-server.com`. -### Example of Connecting PostgreSQL +### Example of Connecting PostgreSQL {#example-of-connecting-postgresql} Ubuntu OS. Installing unixODBC and the ODBC driver for PostgreSQL: -```bash + +``` bash $ sudo apt-get install -y unixodbc odbcinst odbc-postgresql ``` Configuring `/etc/odbc.ini` (or `~/.odbc.ini`): -```text +``` text [DEFAULT] Driver = myconnection @@ -245,7 +242,7 @@ Configuring `/etc/odbc.ini` (or `~/.odbc.ini`): The dictionary configuration in ClickHouse: -```xml +``` xml table_name @@ -280,7 +277,7 @@ The dictionary configuration in ClickHouse: or -```sql +``` sql CREATE DICTIONARY table_name ( id UInt64, some_column UInt64 DEFAULT 0 @@ -293,19 +290,19 @@ LIFETIME(MIN 300 MAX 360) You may need to edit `odbc.ini` to specify the full path to the library with the driver `DRIVER=/usr/local/lib/psqlodbcw.so`. 
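Before wiring the DSN into a dictionary, it can be handy to sanity-check it with the `odbc` table function shown earlier in this section; the DSN and table name below are placeholders matching the example configuration:

``` sql
-- Connectivity check for the configured DSN (the DSN and table name are placeholders).
SELECT * FROM odbc('DSN=myconnection', 'postgresql_table') LIMIT 1
```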
-### Example of Connecting MS SQL Server +### Example of Connecting MS SQL Server {#example-of-connecting-ms-sql-server} Ubuntu OS. Installing the driver: : -```bash +``` bash $ sudo apt-get install tdsodbc freetds-bin sqsh ``` Configuring the driver: -```bash +``` bash $ cat /etc/freetds/freetds.conf ... @@ -340,7 +337,7 @@ Configuring the driver: Configuring the dictionary in ClickHouse: -```xml +``` xml test @@ -376,7 +373,7 @@ Configuring the dictionary in ClickHouse: or -```sql +``` sql CREATE DICTIONARY test ( k UInt64, s String DEFAULT '' @@ -387,14 +384,13 @@ LAYOUT(FLAT()) LIFETIME(MIN 300 MAX 360) ``` -## DBMS +## DBMS {#dbms} - -### MySQL {#dicts-external_dicts_dict_sources-mysql} +### MySQL {#dicts-external-dicts-dict-sources-mysql} Example of settings: -```xml +``` xml 3306 @@ -418,7 +414,7 @@ Example of settings: or -```sql +``` sql SOURCE(MYSQL( port 3306 user 'clickhouse' @@ -434,24 +430,30 @@ SOURCE(MYSQL( Setting fields: -- `port` – The port on the MySQL server. You can specify it for all replicas, or for each one individually (inside ``). -- `user` – Name of the MySQL user. You can specify it for all replicas, or for each one individually (inside ``). -- `password` – Password of the MySQL user. You can specify it for all replicas, or for each one individually (inside ``). -- `replica` – Section of replica configurations. There can be multiple sections. +- `port` – The port on the MySQL server. You can specify it for all replicas, or for each one individually (inside ``). - - `replica/host` – The MySQL host. - - `replica/priority` – The replica priority. When attempting to connect, ClickHouse traverses the replicas in order of priority. The lower the number, the higher the priority. +- `user` – Name of the MySQL user. You can specify it for all replicas, or for each one individually (inside ``). -- `db` – Name of the database. -- `table` – Name of the table. -- `where ` – The selection criteria. The syntax for conditions is the same as for `WHERE` clause in MySQL, for example, `id > 10 AND id < 20`. Optional parameter. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](external_dicts_dict_lifetime.md). +- `password` – Password of the MySQL user. You can specify it for all replicas, or for each one individually (inside ``). + +- `replica` – Section of replica configurations. There can be multiple sections. + + - `replica/host` – The MySQL host. + - `replica/priority` – The replica priority. When attempting to connect, ClickHouse traverses the replicas in order of priority. The lower the number, the higher the priority. + +- `db` – Name of the database. + +- `table` – Name of the table. + +- `where` – The selection criteria. The syntax for conditions is the same as for `WHERE` clause in MySQL, for example, `id > 10 AND id < 20`. Optional parameter. + +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](external_dicts_dict_lifetime.md). MySQL can be connected on a local host via sockets. To do this, set `host` and `socket`. 
Example of settings: -```xml +``` xml localhost @@ -468,7 +470,7 @@ Example of settings: or -```sql +``` sql SOURCE(MYSQL( host 'localhost' socket '/path/to/socket/file.sock' @@ -481,12 +483,11 @@ SOURCE(MYSQL( )) ``` - -### ClickHouse {#dicts-external_dicts_dict_sources-clickhouse} +### ClickHouse {#dicts-external-dicts-dict-sources-clickhouse} Example of settings: -```xml +``` xml example01-01-1 @@ -502,7 +503,7 @@ Example of settings: or -```sql +``` sql SOURCE(CLICKHOUSE( host 'example01-01-1' port 9000 @@ -516,21 +517,20 @@ SOURCE(CLICKHOUSE( Setting fields: -- `host` – The ClickHouse host. If it is a local host, the query is processed without any network activity. To improve fault tolerance, you can create a [Distributed](../../operations/table_engines/distributed.md) table and enter it in subsequent configurations. -- `port` – The port on the ClickHouse server. -- `user` – Name of the ClickHouse user. -- `password` – Password of the ClickHouse user. -- `db` – Name of the database. -- `table` – Name of the table. -- `where ` – The selection criteria. May be omitted. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](external_dicts_dict_lifetime.md). +- `host` – The ClickHouse host. If it is a local host, the query is processed without any network activity. To improve fault tolerance, you can create a [Distributed](../../operations/table_engines/distributed.md) table and enter it in subsequent configurations. +- `port` – The port on the ClickHouse server. +- `user` – Name of the ClickHouse user. +- `password` – Password of the ClickHouse user. +- `db` – Name of the database. +- `table` – Name of the table. +- `where` – The selection criteria. May be omitted. +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](external_dicts_dict_lifetime.md). - -### MongoDB {#dicts-external_dicts_dict_sources-mongodb} +### MongoDB {#dicts-external-dicts-dict-sources-mongodb} Example of settings: -```xml +``` xml localhost @@ -545,7 +545,7 @@ Example of settings: or -```sql +``` sql SOURCE(MONGO( host 'localhost' port 27017 @@ -558,19 +558,18 @@ SOURCE(MONGO( Setting fields: -- `host` – The MongoDB host. -- `port` – The port on the MongoDB server. -- `user` – Name of the MongoDB user. -- `password` – Password of the MongoDB user. -- `db` – Name of the database. -- `collection` – Name of the collection. +- `host` – The MongoDB host. +- `port` – The port on the MongoDB server. +- `user` – Name of the MongoDB user. +- `password` – Password of the MongoDB user. +- `db` – Name of the database. +- `collection` – Name of the collection. - -### Redis {#dicts-external_dicts_dict_sources-redis} +### Redis {#dicts-external-dicts-dict-sources-redis} Example of settings: -```xml +``` xml localhost @@ -583,7 +582,7 @@ Example of settings: or -```sql +``` sql SOURCE(REDIS( host 'localhost' port 6379 @@ -594,9 +593,9 @@ SOURCE(REDIS( Setting fields: -- `host` – The Redis host. -- `port` – The port on the Redis server. -- `storage_type` – The structure of internal Redis storage using for work with keys. `simple` is for simple sources and for hashed single key sources, `hash_map` is for hashed sources with two keys. Ranged sources and cache sources with complex key are unsupported. May be omitted, default value is `simple`. -- `db_index` – The specific numeric index of Redis logical database. May be omitted, default value is 0. +- `host` – The Redis host. 
+- `port` – The port on the Redis server. +- `storage_type` – The structure of internal Redis storage using for work with keys. `simple` is for simple sources and for hashed single key sources, `hash_map` is for hashed sources with two keys. Ranged sources and cache sources with complex key are unsupported. May be omitted, default value is `simple`. +- `db_index` – The specific numeric index of Redis logical database. May be omitted, default value is 0. [Original article](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts_dict_sources/) diff --git a/docs/en/query_language/dicts/external_dicts_dict_structure.md b/docs/en/query_language/dicts/external_dicts_dict_structure.md index f5a0b0b6017..0973a3c754f 100644 --- a/docs/en/query_language/dicts/external_dicts_dict_structure.md +++ b/docs/en/query_language/dicts/external_dicts_dict_structure.md @@ -1,10 +1,10 @@ -# Dictionary Key and Fields +# Dictionary Key and Fields {#dictionary-key-and-fields} The `` clause describes the dictionary key and fields available for queries. XML description: -```xml +``` xml @@ -23,12 +23,12 @@ XML description: Attributes are described in the elements: -- `` — [Key column](external_dicts_dict_structure.md#ext_dict_structure-key). -- `` — [Data column](external_dicts_dict_structure.md#ext_dict_structure-attributes). There can be a multiple number of attributes. +- `` — [Key column](external_dicts_dict_structure.md#ext_dict_structure-key). +- `` — [Data column](external_dicts_dict_structure.md#ext_dict_structure-attributes). There can be a multiple number of attributes. DDL query: -```sql +``` sql CREATE DICTIONARY dict_name ( Id UInt64, -- attributes @@ -39,30 +39,28 @@ PRIMARY KEY Id Attributes are described in the query body: -- `PRIMARY KEY` — [Key column](external_dicts_dict_structure.md#ext_dict_structure-key) -- `AttrName AttrType` — [Data column](external_dicts_dict_structure.md#ext_dict_structure-attributes). There can be a multiple number of attributes. +- `PRIMARY KEY` — [Key column](external_dicts_dict_structure.md#ext_dict_structure-key) +- `AttrName AttrType` — [Data column](external_dicts_dict_structure.md#ext_dict_structure-attributes). There can be a multiple number of attributes. - -## Key {#ext_dict_structure-key} +## Key {#ext-dict-structure-key} ClickHouse supports the following types of keys: -- Numeric key. `UInt64`. Defined in the `` tag or using `PRIMARY KEY` keyword. -- Composite key. Set of values of different types. Defined in the tag `` or `PRIMARY KEY` keyword. +- Numeric key. `UInt64`. Defined in the `` tag or using `PRIMARY KEY` keyword. +- Composite key. Set of values of different types. Defined in the tag `` or `PRIMARY KEY` keyword. An xml structure can contain either `` or ``. DDL-query must contain single `PRIMARY KEY`. !!! warning "Warning" You must not describe key as an attribute. - -### Numeric Key {#ext_dict-numeric-key} +### Numeric Key {#ext-dict-numeric-key} Type: `UInt64`. Configuration example: -```xml +``` xml Id @@ -70,12 +68,11 @@ Configuration example: Configuration fields: -- `name` – The name of the column with keys. - +- `name` – The name of the column with keys. For DDL-query: -```sql +``` sql CREATE DICTIONARY ( Id UInt64, ... @@ -84,18 +81,18 @@ PRIMARY KEY Id ... ``` -- `PRIMARY KEY` – The name of the column with keys. +- `PRIMARY KEY` – The name of the column with keys. -### Composite Key +### Composite Key {#composite-key} The key can be a `tuple` from any types of fields. 
The [layout](external_dicts_dict_layout.md) in this case must be `complex_key_hashed` or `complex_key_cache`. -!!! tip +!!! tip "Tip" A composite key can consist of a single element. This makes it possible to use a string as the key, for instance. The key structure is set in the element ``. Key fields are specified in the same format as the dictionary [attributes](external_dicts_dict_structure.md). Example: -```xml +``` xml @@ -113,7 +110,7 @@ The key structure is set in the element ``. Key fields are specified in the or -```sql +``` sql CREATE DICTIONARY ( field1 String, field2 String @@ -125,12 +122,11 @@ PRIMARY KEY field1, field2 For a query to the `dictGet*` function, a tuple is passed as the key. Example: `dictGetString('dict_name', 'attr_name', tuple('string for field1', num_for_field2))`. - -## Attributes {#ext_dict_structure-attributes} +## Attributes {#ext-dict-structure-attributes} Configuration example: -```xml +``` xml ... @@ -147,27 +143,26 @@ Configuration example: or -```sql +``` sql CREATE DICTIONARY somename ( Name ClickHouseDataType DEFAULT '' EXPRESSION rand64() HIERARCHICAL INJECTIVE IS_OBJECT_ID ) ``` - Configuration fields: -Tag | Description | Required -----|-------------|--------- -`name`| Column name. | Yes -`type`| ClickHouse data type.
ClickHouse tries to cast value from dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.<br/>[Nullable](../../data_types/nullable.md) is not supported. | Yes
-`null_value` | Default value for a non-existing element.<br/>In the example, it is an empty string. You cannot use `NULL` in this field. | Yes
-`expression` | [Expression](../syntax.md#syntax-expressions) that ClickHouse executes on the value.<br/>The expression can be a column name in the remote SQL database. Thus, you can use it to create an alias for the remote column.<br/><br/>Default value: no expression. | No
- `hierarchical` | If `true`, the attribute contains the value of a parent key for the current key. See [Hierarchical Dictionaries](external_dicts_dict_hierarchical.md).<br/><br/>Default value: `false`. | No
-`injective` | Flag that shows whether the `id -> attribute` image is [injective](https://en.wikipedia.org/wiki/Injective_function).<br/>If `true`, ClickHouse can automatically place after the `GROUP BY` clause the requests to dictionaries with injection. Usually it significantly reduces the amount of such requests.<br/><br/>Default value: `false`. | No
-`is_object_id` | Flag that shows whether the query is executed for a MongoDB document by `ObjectID`.<br/><br/>Default value: `false`. | No
+| Tag | Description | Required |
+|-----|-------------|----------|
+| `name` | Column name. | Yes |
+| `type` | ClickHouse data type.<br/>ClickHouse tries to cast value from dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.<br/>[Nullable](../../data_types/nullable.md) is not supported. | Yes |
+| `null_value` | Default value for a non-existing element.<br/>In the example, it is an empty string. You cannot use `NULL` in this field. | Yes |
+| `expression` | [Expression](../syntax.md#syntax-expressions) that ClickHouse executes on the value.<br/>The expression can be a column name in the remote SQL database. Thus, you can use it to create an alias for the remote column.<br/><br/>Default value: no expression. | No |
+| `hierarchical` | If `true`, the attribute contains the value of a parent key for the current key. See [Hierarchical Dictionaries](external_dicts_dict_hierarchical.md).<br/><br/>Default value: `false`. | No |
+| `injective` | Flag that shows whether the `id -> attribute` image is [injective](https://en.wikipedia.org/wiki/Injective_function).<br/>If `true`, ClickHouse can automatically place after the `GROUP BY` clause the requests to dictionaries with injection. Usually it significantly reduces the amount of such requests.<br/><br/>Default value: `false`. | No |
+| `is_object_id` | Flag that shows whether the query is executed for a MongoDB document by `ObjectID`.<br/><br/>
Default value: `false`. | No | -## See Also +## See Also {#see-also} -- [Functions for working with external dictionaries](../functions/ext_dict_functions.md). +- [Functions for working with external dictionaries](../functions/ext_dict_functions.md). [Original article](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts_dict_structure/) diff --git a/docs/en/query_language/dicts/index.md b/docs/en/query_language/dicts/index.md index d189712c490..593b37d13ba 100644 --- a/docs/en/query_language/dicts/index.md +++ b/docs/en/query_language/dicts/index.md @@ -1,15 +1,14 @@ -# Dictionaries +# Dictionaries {#dictionaries} A dictionary is a mapping (`key -> attributes`) that is convenient for various types of reference lists. ClickHouse supports special functions for working with dictionaries that can be used in queries. It is easier and more efficient to use dictionaries with functions than a `JOIN` with reference tables. -[NULL](../syntax.md#null) values can't be stored in a dictionary. +[NULL](../syntax.md#null) values can’t be stored in a dictionary. ClickHouse supports: -- [Built-in dictionaries](internal_dicts.md#internal_dicts) with a specific [set of functions](../functions/ym_dict_functions.md). -- [Plug-in (external) dictionaries](external_dicts.md) with a [set of functions](../functions/ext_dict_functions.md). - +- [Built-in dictionaries](internal_dicts.md#internal_dicts) with a specific [set of functions](../functions/ym_dict_functions.md). +- [Plug-in (external) dictionaries](external_dicts.md) with a [set of functions](../functions/ext_dict_functions.md). [Original article](https://clickhouse.tech/docs/en/query_language/dicts/) diff --git a/docs/en/query_language/dicts/internal_dicts.md b/docs/en/query_language/dicts/internal_dicts.md index 7df915dd998..af341c9133c 100644 --- a/docs/en/query_language/dicts/internal_dicts.md +++ b/docs/en/query_language/dicts/internal_dicts.md @@ -1,15 +1,15 @@ -# Internal dictionaries {#internal_dicts} +# Internal dictionaries {#internal-dicts} ClickHouse contains a built-in feature for working with a geobase. This allows you to: -- Use a region's ID to get its name in the desired language. -- Use a region's ID to get the ID of a city, area, federal district, country, or continent. -- Check whether a region is part of another region. -- Get a chain of parent regions. +- Use a region’s ID to get its name in the desired language. +- Use a region’s ID to get the ID of a city, area, federal district, country, or continent. +- Check whether a region is part of another region. +- Get a chain of parent regions. -All the functions support "translocality," the ability to simultaneously use different perspectives on region ownership. For more information, see the section "Functions for working with Yandex.Metrica dictionaries". +All the functions support “translocality,” the ability to simultaneously use different perspectives on region ownership. For more information, see the section “Functions for working with Yandex.Metrica dictionaries”. The internal dictionaries are disabled in the default package. To enable them, uncomment the parameters `path_to_regions_hierarchy_file` and `path_to_regions_names_files` in the server configuration file. @@ -24,17 +24,17 @@ You can also create these files yourself. 
The file format is as follows: `regions_hierarchy*.txt`: TabSeparated (no header), columns: -- region ID (`UInt32`) -- parent region ID (`UInt32`) -- region type (`UInt8`): 1 - continent, 3 - country, 4 - federal district, 5 - region, 6 - city; other types don't have values -- population (`UInt32`) — optional column +- region ID (`UInt32`) +- parent region ID (`UInt32`) +- region type (`UInt8`): 1 - continent, 3 - country, 4 - federal district, 5 - region, 6 - city; other types don’t have values +- population (`UInt32`) — optional column `regions_names_*.txt`: TabSeparated (no header), columns: -- region ID (`UInt32`) -- region name (`String`) — Can't contain tabs or line feeds, even escaped ones. +- region ID (`UInt32`) +- region name (`String`) — Can’t contain tabs or line feeds, even escaped ones. -A flat array is used for storing in RAM. For this reason, IDs shouldn't be more than a million. +A flat array is used for storing in RAM. For this reason, IDs shouldn’t be more than a million. Dictionaries can be updated without restarting the server. However, the set of available dictionaries is not updated. For updates, the file modification times are checked. If a file has changed, the dictionary is updated. @@ -43,6 +43,6 @@ Dictionary updates (other than loading at first use) do not block queries. Durin We recommend periodically updating the dictionaries with the geobase. During an update, generate new files and write them to a separate location. When everything is ready, rename them to the files used by the server. -There are also functions for working with OS identifiers and Yandex.Metrica search engines, but they shouldn't be used. +There are also functions for working with OS identifiers and Yandex.Metrica search engines, but they shouldn’t be used. [Original article](https://clickhouse.tech/docs/en/query_language/dicts/internal_dicts/) diff --git a/docs/en/query_language/functions/arithmetic_functions.md b/docs/en/query_language/functions/arithmetic_functions.md index 55b62b404ca..7701d4eec87 100644 --- a/docs/en/query_language/functions/arithmetic_functions.md +++ b/docs/en/query_language/functions/arithmetic_functions.md @@ -1,14 +1,14 @@ -# Arithmetic functions +# Arithmetic functions {#arithmetic-functions} For all arithmetic functions, the result type is calculated as the smallest number type that the result fits in, if there is such a type. The minimum is taken simultaneously based on the number of bits, whether it is signed, and whether it floats. If there are not enough bits, the highest bit type is taken. Example: -```sql +``` sql SELECT toTypeName(0), toTypeName(0 + 0), toTypeName(0 + 0 + 0), toTypeName(0 + 0 + 0 + 0) ``` -```text +``` text ┌─toTypeName(0)─┬─toTypeName(plus(0, 0))─┬─toTypeName(plus(plus(0, 0), 0))─┬─toTypeName(plus(plus(plus(0, 0), 0), 0))─┐ │ UInt8 │ UInt16 │ UInt32 │ UInt64 │ └───────────────┴────────────────────────┴─────────────────────────────────┴──────────────────────────────────────────┘ @@ -18,64 +18,63 @@ Arithmetic functions work for any pair of types from UInt8, UInt16, UInt32, UInt Overflow is produced the same way as in C++. -## plus(a, b), a + b operator +## plus(a, b), a + b operator {#plusa-b-a-b-operator} Calculates the sum of the numbers. You can also add integer numbers with a date or date and time. In the case of a date, adding an integer means adding the corresponding number of days. For a date with time, it means adding the corresponding number of seconds. 
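For example, assuming the usual Date/DateTime conversion functions, the behaviour described above looks like this:

``` sql
-- Adding an integer to a Date adds days; adding it to a DateTime adds seconds.
SELECT
    toDate('2020-01-01') + 7                  AS plus_seven_days,  -- 2020-01-08
    toDateTime('2020-01-01 00:00:00') + 3600  AS plus_one_hour     -- 2020-01-01 01:00:00
```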
-## minus(a, b), a - b operator +## minus(a, b), a - b operator {#minusa-b-a-b-operator} Calculates the difference. The result is always signed. -You can also calculate integer numbers from a date or date with time. The idea is the same – see above for 'plus'. +You can also calculate integer numbers from a date or date with time. The idea is the same – see above for ‘plus’. -## multiply(a, b), a \* b operator +## multiply(a, b), a \* b operator {#multiplya-b-a-b-operator} Calculates the product of the numbers. -## divide(a, b), a / b operator +## divide(a, b), a / b operator {#dividea-b-a-b-operator} Calculates the quotient of the numbers. The result type is always a floating-point type. -It is not integer division. For integer division, use the 'intDiv' function. -When dividing by zero you get 'inf', '-inf', or 'nan'. +It is not integer division. For integer division, use the ‘intDiv’ function. +When dividing by zero you get ‘inf’, ‘-inf’, or ‘nan’. -## intDiv(a, b) +## intDiv(a, b) {#intdiva-b} Calculates the quotient of the numbers. Divides into integers, rounding down (by the absolute value). An exception is thrown when dividing by zero or when dividing a minimal negative number by minus one. -## intDivOrZero(a, b) +## intDivOrZero(a, b) {#intdivorzeroa-b} -Differs from 'intDiv' in that it returns zero when dividing by zero or when dividing a minimal negative number by minus one. +Differs from ‘intDiv’ in that it returns zero when dividing by zero or when dividing a minimal negative number by minus one. -## modulo(a, b), a % b operator +## modulo(a, b), a % b operator {#moduloa-b-a-b-operator} Calculates the remainder after division. If arguments are floating-point numbers, they are pre-converted to integers by dropping the decimal portion. The remainder is taken in the same sense as in C++. Truncated division is used for negative numbers. An exception is thrown when dividing by zero or when dividing a minimal negative number by minus one. -## moduloOrZero(a, b) +## moduloOrZero(a, b) {#moduloorzeroa-b} -Differs from 'modulo' in that it returns zero when the divisor is zero. +Differs from ‘modulo’ in that it returns zero when the divisor is zero. -## negate(a), -a operator +## negate(a), -a operator {#negatea-a-operator} Calculates a number with the reverse sign. The result is always signed. -## abs(a) {#arithm_func-abs} +## abs(a) {#arithm-func-abs} -Calculates the absolute value of the number (a). That is, if a < 0, it returns -a. For unsigned types it doesn't do anything. For signed integer types, it returns an unsigned number. +Calculates the absolute value of the number (a). That is, if a \< 0, it returns -a. For unsigned types it doesn’t do anything. For signed integer types, it returns an unsigned number. -## gcd(a, b) +## gcd(a, b) {#gcda-b} Returns the greatest common divisor of the numbers. An exception is thrown when dividing by zero or when dividing a minimal negative number by minus one. -## lcm(a, b) +## lcm(a, b) {#lcma-b} Returns the least common multiple of the numbers. An exception is thrown when dividing by zero or when dividing a minimal negative number by minus one. 
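A short sketch contrasting the integer-division family described above; the expected results noted in the comments follow the truncation rules given earlier:

``` sql
SELECT
    intDiv(7, 3)       AS quotient,                 -- 2
    intDivOrZero(7, 0) AS quotient_or_zero,         -- 0 instead of an exception
    modulo(-7, 3)      AS remainder,                -- -1, truncated (C++-style) division
    gcd(12, 18)        AS greatest_common_divisor,  -- 6
    lcm(4, 6)          AS least_common_multiple     -- 12
```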
- [Original article](https://clickhouse.tech/docs/en/query_language/functions/arithmetic_functions/) diff --git a/docs/en/query_language/functions/array_functions.md b/docs/en/query_language/functions/array_functions.md index 6c33b018c25..c8f96f770f9 100644 --- a/docs/en/query_language/functions/array_functions.md +++ b/docs/en/query_language/functions/array_functions.md @@ -1,4 +1,4 @@ -# Functions for working with arrays +# Functions for working with arrays {#functions-for-working-with-arrays} ## empty {#function-empty} @@ -12,29 +12,29 @@ Returns 0 for an empty array, or 1 for a non-empty array. The result type is UInt8. The function also works for strings. -## length {#array_functions-length} +## length {#array-functions-length} Returns the number of items in the array. The result type is UInt64. The function also works for strings. -## emptyArrayUInt8, emptyArrayUInt16, emptyArrayUInt32, emptyArrayUInt64 +## emptyArrayUInt8, emptyArrayUInt16, emptyArrayUInt32, emptyArrayUInt64 {#emptyarrayuint8-emptyarrayuint16-emptyarrayuint32-emptyarrayuint64} -## emptyArrayInt8, emptyArrayInt16, emptyArrayInt32, emptyArrayInt64 +## emptyArrayInt8, emptyArrayInt16, emptyArrayInt32, emptyArrayInt64 {#emptyarrayint8-emptyarrayint16-emptyarrayint32-emptyarrayint64} -## emptyArrayFloat32, emptyArrayFloat64 +## emptyArrayFloat32, emptyArrayFloat64 {#emptyarrayfloat32-emptyarrayfloat64} -## emptyArrayDate, emptyArrayDateTime +## emptyArrayDate, emptyArrayDateTime {#emptyarraydate-emptyarraydatetime} -## emptyArrayString +## emptyArrayString {#emptyarraystring} Accepts zero arguments and returns an empty array of the appropriate type. -## emptyArrayToSingle +## emptyArrayToSingle {#emptyarraytosingle} Accepts an empty array and returns a one-element array that is equal to the default value. -## range(end), range(start, end [, step]) +## range(end), range(start, end \[, step\]) {#rangeend-rangestart-end-step} Returns an array of numbers from start to end-1 by step. If the argument `start` is not specified, defaults to 0. @@ -42,35 +42,38 @@ If the argument `step` is not specified, defaults to 1. It behaviors almost like pythonic `range`. But the difference is that all the arguments type must be `UInt` numbers. Just in case, an exception is thrown if arrays with a total length of more than 100,000,000 elements are created in a data block. -## array(x1, ...), operator \[x1, ...\] +## array(x1, …), operator \[x1, …\] {#arrayx1-operator-x1} Creates an array from the function arguments. -The arguments must be constants and have types that have the smallest common type. At least one argument must be passed, because otherwise it isn't clear which type of array to create. That is, you can't use this function to create an empty array (to do that, use the 'emptyArray\*' function described above). -Returns an 'Array(T)' type result, where 'T' is the smallest common type out of the passed arguments. +The arguments must be constants and have types that have the smallest common type. At least one argument must be passed, because otherwise it isn’t clear which type of array to create. That is, you can’t use this function to create an empty array (to do that, use the ‘emptyArray\*’ function described above). +Returns an ‘Array(T)’ type result, where ‘T’ is the smallest common type out of the passed arguments. -## arrayConcat +## arrayConcat {#arrayconcat} Combines arrays passed as arguments. 
-```sql +``` sql arrayConcat(arrays) ``` **Parameters** -- `arrays` – Arbitrary number of arguments of [Array](../../data_types/array.md) type. -**Example** +- `arrays` – Arbitrary number of arguments of [Array](../../data_types/array.md) type. + **Example** -```sql + + +``` sql SELECT arrayConcat([1, 2], [3, 4], [5, 6]) AS res ``` -```text + +``` text ┌─res───────────┐ │ [1,2,3,4,5,6] │ └───────────────┘ ``` -## arrayElement(arr, n), operator arr[n] +## arrayElement(arr, n), operator arr\[n\] {#arrayelementarr-n-operator-arrn} Get the element with the index `n` from the array `arr`. `n` must be any integer type. Indexes in an array begin from one. @@ -78,45 +81,46 @@ Negative indexes are supported. In this case, it selects the corresponding eleme If the index falls outside of the bounds of an array, it returns some default value (0 for numbers, an empty string for strings, etc.), except for the case with a non-constant array and a constant index 0 (in this case there will be an error `Array indices are 1-based`). -## has(arr, elem) +## has(arr, elem) {#hasarr-elem} -Checks whether the 'arr' array has the 'elem' element. +Checks whether the ‘arr’ array has the ‘elem’ element. Returns 0 if the the element is not in the array, or 1 if it is. `NULL` is processed as a value. -```sql +``` sql SELECT has([1, 2, NULL], NULL) ``` -```text + +``` text ┌─has([1, 2, NULL], NULL)─┐ │ 1 │ └─────────────────────────┘ ``` -## hasAll +## hasAll {#hasall} Checks whether one array is a subset of another. -```sql +``` sql hasAll(set, subset) ``` **Parameters** -- `set` – Array of any type with a set of elements. -- `subset` – Array of any type with elements that should be tested to be a subset of `set`. +- `set` – Array of any type with a set of elements. +- `subset` – Array of any type with elements that should be tested to be a subset of `set`. **Return values** -- `1`, if `set` contains all of the elements from `subset`. -- `0`, otherwise. +- `1`, if `set` contains all of the elements from `subset`. +- `0`, otherwise. **Peculiar properties** -- An empty array is a subset of any array. -- `Null` processed as a value. -- Order of values in both of arrays doesn't matter. +- An empty array is a subset of any array. +- `Null` processed as a value. +- Order of values in both of arrays doesn’t matter. **Examples** @@ -132,28 +136,28 @@ hasAll(set, subset) `SELECT hasAll([[1, 2], [3, 4]], [[1, 2], [3, 5]])` returns 0. -## hasAny +## hasAny {#hasany} Checks whether two arrays have intersection by some elements. -```sql +``` sql hasAny(array1, array2) ``` **Parameters** -- `array1` – Array of any type with a set of elements. -- `array2` – Array of any type with a set of elements. +- `array1` – Array of any type with a set of elements. +- `array2` – Array of any type with a set of elements. **Return values** -- `1`, if `array1` and `array2` have one similar element at least. -- `0`, otherwise. +- `1`, if `array1` and `array2` have one similar element at least. +- `0`, otherwise. **Peculiar properties** -- `Null` processed as a value. -- Order of values in both of arrays doesn't matter. +- `Null` processed as a value. +- Order of values in both of arrays doesn’t matter. **Examples** @@ -167,16 +171,17 @@ hasAny(array1, array2) `SELECT hasAll([[1, 2], [3, 4]], [[1, 2], [1, 2]])` returns `1`. -## indexOf(arr, x) +## indexOf(arr, x) {#indexofarr-x} -Returns the index of the first 'x' element (starting from 1) if it is in the array, or 0 if it is not. 
+Returns the index of the first ‘x’ element (starting from 1) if it is in the array, or 0 if it is not. Example: -```sql +``` sql SELECT indexOf([1, 3, NULL, NULL], NULL) ``` -```text + +``` text ┌─indexOf([1, 3, NULL, NULL], NULL)─┐ │ 3 │ @@ -185,30 +190,31 @@ SELECT indexOf([1, 3, NULL, NULL], NULL) Elements set to `NULL` are handled as normal values. -## countEqual(arr, x) +## countEqual(arr, x) {#countequalarr-x} -Returns the number of elements in the array equal to x. Equivalent to arrayCount (elem -> elem = x, arr). +Returns the number of elements in the array equal to x. Equivalent to arrayCount (elem -\> elem = x, arr). `NULL` elements are handled as separate values. Example: -```sql +``` sql SELECT countEqual([1, 2, NULL, NULL], NULL) ``` -```text + +``` text ┌─countEqual([1, 2, NULL, NULL], NULL)─┐ │ 2 │ └──────────────────────────────────────┘ ``` -## arrayEnumerate(arr) {#array_functions-arrayenumerate} +## arrayEnumerate(arr) {#array-functions-arrayenumerate} -Returns the array \[1, 2, 3, ..., length (arr) \] +Returns the array \[1, 2, 3, …, length (arr) \] This function is normally used with ARRAY JOIN. It allows counting something just once for each array after applying ARRAY JOIN. Example: -```sql +``` sql SELECT count() AS Reaches, countIf(num = 1) AS Hits @@ -219,7 +225,8 @@ ARRAY JOIN WHERE CounterID = 160656 LIMIT 10 ``` -```text + +``` text ┌─Reaches─┬──Hits─┐ │ 95606 │ 31406 │ └─────────┴───────┘ @@ -227,14 +234,15 @@ LIMIT 10 In this example, Reaches is the number of conversions (the strings received after applying ARRAY JOIN), and Hits is the number of pageviews (strings before ARRAY JOIN). In this particular case, you can get the same result in an easier way: -```sql +``` sql SELECT sum(length(GoalsReached)) AS Reaches, count() AS Hits FROM test.hits WHERE (CounterID = 160656) AND notEmpty(GoalsReached) ``` -```text + +``` text ┌─Reaches─┬──Hits─┐ │ 95606 │ 31406 │ └─────────┴───────┘ @@ -242,7 +250,7 @@ WHERE (CounterID = 160656) AND notEmpty(GoalsReached) This function can also be used in higher-order functions. For example, you can use it to get array indexes for elements that match a condition. -## arrayEnumerateUniq(arr, ...) +## arrayEnumerateUniq(arr, …) {#arrayenumerateuniqarr} Returns an array the same size as the source array, indicating for each element what its position is among elements with the same value. For example: arrayEnumerateUniq(\[10, 20, 10, 30\]) = \[1, 1, 2, 1\]. @@ -250,7 +258,7 @@ For example: arrayEnumerateUniq(\[10, 20, 10, 30\]) = \[1, 1, 2, 1\]. This function is useful when using ARRAY JOIN and aggregation of array elements. Example: -```sql +``` sql SELECT Goals.ID AS GoalID, sum(Sign) AS Reaches, @@ -264,7 +272,8 @@ GROUP BY GoalID ORDER BY Reaches DESC LIMIT 10 ``` -```text + +``` text ┌──GoalID─┬─Reaches─┬─Visits─┐ │ 53225 │ 3214 │ 1097 │ │ 2825062 │ 3188 │ 1097 │ @@ -283,10 +292,11 @@ In this example, each goal ID has a calculation of the number of conversions (ea The arrayEnumerateUniq function can take multiple arrays of the same size as arguments. In this case, uniqueness is considered for tuples of elements in the same positions in all the arrays. 
-```sql +``` sql SELECT arrayEnumerateUniq([1, 1, 1, 2, 2, 2], [1, 1, 2, 1, 1, 2]) AS res ``` -```text + +``` text ┌─res───────────┐ │ [1,2,1,1,2,1] │ └───────────────┘ @@ -294,115 +304,119 @@ SELECT arrayEnumerateUniq([1, 1, 1, 2, 2, 2], [1, 1, 2, 1, 1, 2]) AS res This is necessary when using ARRAY JOIN with a nested data structure and further aggregation across multiple elements in this structure. -## arrayPopBack +## arrayPopBack {#arraypopback} Removes the last item from the array. -```sql +``` sql arrayPopBack(array) ``` **Parameters** -- `array` – Array. +- `array` – Array. **Example** -```sql +``` sql SELECT arrayPopBack([1, 2, 3]) AS res ``` -```text + +``` text ┌─res───┐ │ [1,2] │ └───────┘ ``` -## arrayPopFront +## arrayPopFront {#arraypopfront} Removes the first item from the array. -```sql +``` sql arrayPopFront(array) ``` **Parameters** -- `array` – Array. +- `array` – Array. **Example** -```sql +``` sql SELECT arrayPopFront([1, 2, 3]) AS res ``` -```text + +``` text ┌─res───┐ │ [2,3] │ └───────┘ ``` -## arrayPushBack +## arrayPushBack {#arraypushback} Adds one item to the end of the array. -```sql +``` sql arrayPushBack(array, single_value) ``` **Parameters** -- `array` – Array. -- `single_value` – A single value. Only numbers can be added to an array with numbers, and only strings can be added to an array of strings. When adding numbers, ClickHouse automatically sets the `single_value` type for the data type of the array. For more information about the types of data in ClickHouse, see "[Data types](../../data_types/index.md#data_types)". Can be `NULL`. The function adds a `NULL` element to an array, and the type of array elements converts to `Nullable`. +- `array` – Array. +- `single_value` – A single value. Only numbers can be added to an array with numbers, and only strings can be added to an array of strings. When adding numbers, ClickHouse automatically sets the `single_value` type for the data type of the array. For more information about the types of data in ClickHouse, see “[Data types](../../data_types/index.md#data_types)”. Can be `NULL`. The function adds a `NULL` element to an array, and the type of array elements converts to `Nullable`. **Example** -```sql +``` sql SELECT arrayPushBack(['a'], 'b') AS res ``` -```text + +``` text ┌─res───────┐ │ ['a','b'] │ └───────────┘ ``` -## arrayPushFront +## arrayPushFront {#arraypushfront} Adds one element to the beginning of the array. -```sql +``` sql arrayPushFront(array, single_value) ``` **Parameters** -- `array` – Array. -- `single_value` – A single value. Only numbers can be added to an array with numbers, and only strings can be added to an array of strings. When adding numbers, ClickHouse automatically sets the `single_value` type for the data type of the array. For more information about the types of data in ClickHouse, see "[Data types](../../data_types/index.md#data_types)". Can be `NULL`. The function adds a `NULL` element to an array, and the type of array elements converts to `Nullable`. +- `array` – Array. +- `single_value` – A single value. Only numbers can be added to an array with numbers, and only strings can be added to an array of strings. When adding numbers, ClickHouse automatically sets the `single_value` type for the data type of the array. For more information about the types of data in ClickHouse, see “[Data types](../../data_types/index.md#data_types)”. Can be `NULL`. The function adds a `NULL` element to an array, and the type of array elements converts to `Nullable`. 
**Example** -```sql +``` sql SELECT arrayPushFront(['b'], 'a') AS res ``` -```text + +``` text ┌─res───────┐ │ ['a','b'] │ └───────────┘ ``` -## arrayResize +## arrayResize {#arrayresize} Changes the length of the array. -```sql +``` sql arrayResize(array, size[, extender]) ``` **Parameters:** -- `array` — Array. -- `size` — Required length of the array. - - If `size` is less than the original size of the array, the array is truncated from the right. -- If `size` is larger than the initial size of the array, the array is extended to the right with `extender` values or default values for the data type of the array items. -- `extender` — Value for extending an array. Can be `NULL`. +- `array` — Array. +- `size` — Required length of the array. + - If `size` is less than the original size of the array, the array is truncated from the right. +- If `size` is larger than the initial size of the array, the array is extended to the right with `extender` values or default values for the data type of the array items. +- `extender` — Value for extending an array. Can be `NULL`. **Returned value:** @@ -410,44 +424,47 @@ An array of length `size`. **Examples of calls** -```sql +``` sql SELECT arrayResize([1], 3) ``` -```text + +``` text ┌─arrayResize([1], 3)─┐ │ [1,0,0] │ └─────────────────────┘ ``` -```sql +``` sql SELECT arrayResize([1], 3, NULL) ``` -```text + +``` text ┌─arrayResize([1], 3, NULL)─┐ │ [1,NULL,NULL] │ └───────────────────────────┘ ``` -## arraySlice +## arraySlice {#arrayslice} Returns a slice of the array. -```sql +``` sql arraySlice(array, offset[, length]) ``` **Parameters** -- `array` – Array of data. -- `offset` – Indent from the edge of the array. A positive value indicates an offset on the left, and a negative value is an indent on the right. Numbering of the array items begins with 1. -- `length` - The length of the required slice. If you specify a negative value, the function returns an open slice `[offset, array_length - length)`. If you omit the value, the function returns the slice `[offset, the_end_of_array]`. +- `array` – Array of data. +- `offset` – Indent from the edge of the array. A positive value indicates an offset on the left, and a negative value is an indent on the right. Numbering of the array items begins with 1. +- `length` - The length of the required slice. If you specify a negative value, the function returns an open slice `[offset, array_length - length)`. If you omit the value, the function returns the slice `[offset, the_end_of_array]`. **Example** -```sql +``` sql SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res ``` -```text + +``` text ┌─res────────┐ │ [2,NULL,4] │ └────────────┘ @@ -455,16 +472,17 @@ SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res Array elements set to `NULL` are handled as normal values. -## arraySort(\[func,\] arr, ...) {#array_functions-sort} +## arraySort(\[func,\] arr, …) {#array-functions-sort} Sorts the elements of the `arr` array in ascending order. If the `func` function is specified, sorting order is determined by the result of the `func` function applied to the elements of the array. If `func` accepts multiple arguments, the `arraySort` function is passed several arrays that the arguments of `func` will correspond to. Detailed examples are shown at the end of `arraySort` description. 
Example of integer values sorting: -```sql +``` sql SELECT arraySort([1, 3, 3, 0]); ``` -```text + +``` text ┌─arraySort([1, 3, 3, 0])─┐ │ [0,1,3,3] │ └─────────────────────────┘ @@ -472,10 +490,11 @@ SELECT arraySort([1, 3, 3, 0]); Example of string values sorting: -```sql +``` sql SELECT arraySort(['hello', 'world', '!']); ``` -```text + +``` text ┌─arraySort(['hello', 'world', '!'])─┐ │ ['!','hello','world'] │ └────────────────────────────────────┘ @@ -483,81 +502,87 @@ SELECT arraySort(['hello', 'world', '!']); Consider the following sorting order for the `NULL`, `NaN` and `Inf` values: -```sql +``` sql SELECT arraySort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]); ``` -```text + +``` text ┌─arraySort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf])─┐ │ [-inf,-4,1,2,3,inf,nan,nan,NULL,NULL] │ └───────────────────────────────────────────────────────────┘ ``` -- `-Inf` values are first in the array. -- `NULL` values are last in the array. -- `NaN` values are right before `NULL`. -- `Inf` values are right before `NaN`. +- `-Inf` values are first in the array. +- `NULL` values are last in the array. +- `NaN` values are right before `NULL`. +- `Inf` values are right before `NaN`. Note that `arraySort` is a [higher-order function](higher_order_functions.md). You can pass a lambda function to it as the first argument. In this case, sorting order is determined by the result of the lambda function applied to the elements of the array. -Let's consider the following example: +Let’s consider the following example: -```sql +``` sql SELECT arraySort((x) -> -x, [1, 2, 3]) as res; ``` -```text + +``` text ┌─res─────┐ │ [3,2,1] │ └─────────┘ ``` -For each element of the source array, the lambda function returns the sorting key, that is, [1 –> -1, 2 –> -2, 3 –> -3]. Since the `arraySort` function sorts the keys in ascending order, the result is [3, 2, 1]. Thus, the `(x) –> -x` lambda function sets the [descending order](#array_functions-reverse-sort) in a sorting. +For each element of the source array, the lambda function returns the sorting key, that is, \[1 –\> -1, 2 –\> -2, 3 –\> -3\]. Since the `arraySort` function sorts the keys in ascending order, the result is \[3, 2, 1\]. Thus, the `(x) –> -x` lambda function sets the [descending order](#array_functions-reverse-sort) in a sorting. The lambda function can accept multiple arguments. In this case, you need to pass the `arraySort` function several arrays of identical length that the arguments of lambda function will correspond to. The resulting array will consist of elements from the first input array; elements from the next input array(s) specify the sorting keys. For example: -```sql +``` sql SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; ``` -```text + +``` text ┌─res────────────────┐ │ ['world', 'hello'] │ └────────────────────┘ ``` -Here, the elements that are passed in the second array ([2, 1]) define a sorting key for the corresponding element from the source array (['hello', 'world']), that is, ['hello' –> 2, 'world' –> 1]. Since the lambda function doesn't use `x`, actual values of the source array don't affect the order in the result. So, 'hello' will be the second element in the result, and 'world' will be the first. +Here, the elements that are passed in the second array (\[2, 1\]) define a sorting key for the corresponding element from the source array (\[‘hello’, ‘world’\]), that is, \[‘hello’ –\> 2, ‘world’ –\> 1\]. 
Since the lambda function doesn’t use `x`, actual values of the source array don’t affect the order in the result. So, ‘hello’ will be the second element in the result, and ‘world’ will be the first. Other examples are shown below. -```sql +``` sql SELECT arraySort((x, y) -> y, [0, 1, 2], ['c', 'b', 'a']) as res; ``` -```text + +``` text ┌─res─────┐ │ [2,1,0] │ └─────────┘ ``` -```sql +``` sql SELECT arraySort((x, y) -> -y, [0, 1, 2], [1, 2, 3]) as res; ``` -```text + +``` text ┌─res─────┐ │ [2,1,0] │ └─────────┘ ``` -!!! note +!!! note "Note" To improve sorting efficiency, the [Schwartzian transform](https://en.wikipedia.org/wiki/Schwartzian_transform) is used. -## arrayReverseSort([func,] arr, ...) {#array_functions-reverse-sort} +## arrayReverseSort(\[func,\] arr, …) {#array-functions-reverse-sort} Sorts the elements of the `arr` array in descending order. If the `func` function is specified, `arr` is sorted according to the result of the `func` function applied to the elements of the array, and then the sorted array is reversed. If `func` accepts multiple arguments, the `arrayReverseSort` function is passed several arrays that the arguments of `func` will correspond to. Detailed examples are shown at the end of `arrayReverseSort` description. Example of integer values sorting: -```sql +``` sql SELECT arrayReverseSort([1, 3, 3, 0]); ``` -```text + +``` text ┌─arrayReverseSort([1, 3, 3, 0])─┐ │ [3,3,1,0] │ └────────────────────────────────┘ @@ -565,10 +590,11 @@ SELECT arrayReverseSort([1, 3, 3, 0]); Example of string values sorting: -```sql +``` sql SELECT arrayReverseSort(['hello', 'world', '!']); ``` -```text + +``` text ┌─arrayReverseSort(['hello', 'world', '!'])─┐ │ ['world','hello','!'] │ └───────────────────────────────────────────┘ @@ -576,26 +602,28 @@ SELECT arrayReverseSort(['hello', 'world', '!']); Consider the following sorting order for the `NULL`, `NaN` and `Inf` values: -```sql +``` sql SELECT arrayReverseSort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]) as res; ``` -```text + +``` text ┌─res───────────────────────────────────┐ │ [inf,3,2,1,-4,-inf,nan,nan,NULL,NULL] │ └───────────────────────────────────────┘ ``` -- `Inf` values are first in the array. -- `NULL` values are last in the array. -- `NaN` values are right before `NULL`. -- `-Inf` values are right before `NaN`. +- `Inf` values are first in the array. +- `NULL` values are last in the array. +- `NaN` values are right before `NULL`. +- `-Inf` values are right before `NaN`. Note that the `arrayReverseSort` is a [higher-order function](higher_order_functions.md). You can pass a lambda function to it as the first argument. Example is shown below. -```sql +``` sql SELECT arrayReverseSort((x) -> -x, [1, 2, 3]) as res; ``` -```text + +``` text ┌─res─────┐ │ [1,2,3] │ └─────────┘ @@ -603,15 +631,16 @@ SELECT arrayReverseSort((x) -> -x, [1, 2, 3]) as res; The array is sorted in the following way: -1. At first, the source array ([1, 2, 3]) is sorted according to the result of the lambda function applied to the elements of the array. The result is an array [3, 2, 1]. -2. Array that is obtained on the previous step, is reversed. So, the final result is [1, 2, 3]. +1. At first, the source array (\[1, 2, 3\]) is sorted according to the result of the lambda function applied to the elements of the array. The result is an array \[3, 2, 1\]. +2. Array that is obtained on the previous step, is reversed. So, the final result is \[1, 2, 3\]. The lambda function can accept multiple arguments. 
In this case, you need to pass the `arrayReverseSort` function several arrays of identical length that the arguments of lambda function will correspond to. The resulting array will consist of elements from the first input array; elements from the next input array(s) specify the sorting keys. For example: -```sql +``` sql SELECT arrayReverseSort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; ``` -```text + +``` text ┌─res───────────────┐ │ ['hello','world'] │ └───────────────────┘ @@ -619,70 +648,73 @@ SELECT arrayReverseSort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; In this example, the array is sorted in the following way: -1. At first, the source array (['hello', 'world']) is sorted according to the result of the lambda function applied to the elements of the arrays. The elements that are passed in the second array ([2, 1]), define the sorting keys for corresponding elements from the source array. The result is an array ['world', 'hello']. -2. Array that was sorted on the previous step, is reversed. So, the final result is ['hello', 'world']. +1. At first, the source array (\[‘hello’, ‘world’\]) is sorted according to the result of the lambda function applied to the elements of the arrays. The elements that are passed in the second array (\[2, 1\]), define the sorting keys for corresponding elements from the source array. The result is an array \[‘world’, ‘hello’\]. +2. Array that was sorted on the previous step, is reversed. So, the final result is \[‘hello’, ‘world’\]. Other examples are shown below. -```sql +``` sql SELECT arrayReverseSort((x, y) -> y, [4, 3, 5], ['a', 'b', 'c']) AS res; ``` -```text + +``` text ┌─res─────┐ │ [5,3,4] │ └─────────┘ ``` -```sql + +``` sql SELECT arrayReverseSort((x, y) -> -y, [4, 3, 5], [1, 2, 3]) AS res; ``` -```text + +``` text ┌─res─────┐ │ [4,3,5] │ └─────────┘ ``` -## arrayUniq(arr, ...) +## arrayUniq(arr, …) {#arrayuniqarr} If one argument is passed, it counts the number of different elements in the array. If multiple arguments are passed, it counts the number of different tuples of elements at corresponding positions in multiple arrays. -If you want to get a list of unique items in an array, you can use arrayReduce('groupUniqArray', arr). +If you want to get a list of unique items in an array, you can use arrayReduce(‘groupUniqArray’, arr). -## arrayJoin(arr) {#array_functions-join} +## arrayJoin(arr) {#array-functions-join} -A special function. See the section ["ArrayJoin function"](array_join.md#functions_arrayjoin). +A special function. See the section [“ArrayJoin function”](array_join.md#functions_arrayjoin). ## arrayDifference {#arraydifference} -Calculates the difference between adjacent array elements. Returns an array where the first element will be 0, the second is the difference between `a[1] - a[0]`, etc. The type of elements in the resulting array is determined by the type inference rules for subtraction (e.g. `UInt8` - `UInt8` = `Int16`). +Calculates the difference between adjacent array elements. Returns an array where the first element will be 0, the second is the difference between `a[1] - a[0]`, etc. The type of elements in the resulting array is determined by the type inference rules for subtraction (e.g. `UInt8` - `UInt8` = `Int16`). -**Syntax** +**Syntax** -```sql +``` sql arrayDifference(array) ``` **Parameters** -- `array` – [Array](https://clickhouse.yandex/docs/en/data_types/array/). +- `array` – [Array](https://clickhouse.yandex/docs/en/data_types/array/). 
**Returned values** Returns an array of differences between adjacent elements. -Type: [UInt*](https://clickhouse.yandex/docs/en/data_types/int_uint/#uint-ranges), [Int*](https://clickhouse.yandex/docs/en/data_types/int_uint/#int-ranges), [Float*](https://clickhouse.yandex/docs/en/data_types/float/). +Type: [UInt\*](https://clickhouse.yandex/docs/en/data_types/int_uint/#uint-ranges), [Int\*](https://clickhouse.yandex/docs/en/data_types/int_uint/#int-ranges), [Float\*](https://clickhouse.yandex/docs/en/data_types/float/). **Example** Query: -```sql +``` sql SELECT arrayDifference([1, 2, 3, 4]) ``` Result: -```text +``` text ┌─arrayDifference([1, 2, 3, 4])─┐ │ [0,1,1,1] │ └───────────────────────────────┘ @@ -692,13 +724,13 @@ Example of the overflow due to result type Int64: Query: -```sql +``` sql SELECT arrayDifference([0, 10000000000000000000]) ``` Result: -```text +``` text ┌─arrayDifference([0, 10000000000000000000])─┐ │ [0,-8446744073709551616] │ └────────────────────────────────────────────┘ @@ -708,15 +740,15 @@ Result: Takes an array, returns an array containing the distinct elements only. -**Syntax** +**Syntax** -```sql +``` sql arrayDistinct(array) ``` -**Parameters** +**Parameters** -- `array` – [Array](https://clickhouse.yandex/docs/en/data_types/array/). +- `array` – [Array](https://clickhouse.yandex/docs/en/data_types/array/). **Returned values** @@ -726,63 +758,63 @@ Returns an array containing the distinct elements. Query: -```sql +``` sql SELECT arrayDistinct([1, 2, 2, 3, 1]) ``` Result: -```text +``` text ┌─arrayDistinct([1, 2, 2, 3, 1])─┐ │ [1,2,3] │ └────────────────────────────────┘ ``` -## arrayEnumerateDense(arr) {#array_functions-arrayenumeratedense} +## arrayEnumerateDense(arr) {#array-functions-arrayenumeratedense} Returns an array of the same size as the source array, indicating where each element first appears in the source array. Example: -```sql +``` sql SELECT arrayEnumerateDense([10, 20, 10, 30]) ``` -```text +``` text ┌─arrayEnumerateDense([10, 20, 10, 30])─┐ │ [1,2,1,3] │ └───────────────────────────────────────┘ ``` -## arrayIntersect(arr) {#array_functions-arrayintersect} +## arrayIntersect(arr) {#array-functions-arrayintersect} Takes multiple arrays, returns an array with elements that are present in all source arrays. Elements order in the resulting array is the same as in the first array. Example: -```sql +``` sql SELECT arrayIntersect([1, 2], [1, 3], [2, 3]) AS no_intersect, arrayIntersect([1, 2], [1, 3], [1, 4]) AS intersect ``` -```text +``` text ┌─no_intersect─┬─intersect─┐ │ [] │ [1] │ └──────────────┴───────────┘ ``` -## arrayReduce(agg_func, arr1, ...) {#array_functions-arrayreduce} +## arrayReduce(agg\_func, arr1, …) {#array-functions-arrayreduce} Applies an aggregate function to array elements and returns its result. The name of the aggregation function is passed as a string in single quotes `'max'`, `'sum'`. When using parametric aggregate functions, the parameter is indicated after the function name in parentheses `'uniqUpTo(6)'`. 
Example: -```sql +``` sql SELECT arrayReduce('max', [1, 2, 3]) ``` -```text +``` text ┌─arrayReduce('max', [1, 2, 3])─┐ │ 3 │ └───────────────────────────────┘ @@ -792,11 +824,11 @@ If an aggregate function takes multiple arguments, then this function must be ap Example: -```sql +``` sql SELECT arrayReduce('maxIf', [3, 5], [1, 0]) ``` -```text +``` text ┌─arrayReduce('maxIf', [3, 5], [1, 0])─┐ │ 3 │ └──────────────────────────────────────┘ @@ -804,35 +836,35 @@ SELECT arrayReduce('maxIf', [3, 5], [1, 0]) Example with a parametric aggregate function: -```sql +``` sql SELECT arrayReduce('uniqUpTo(3)', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) ``` -```text +``` text ┌─arrayReduce('uniqUpTo(3)', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])─┐ │ 4 │ └─────────────────────────────────────────────────────────────┘ ``` -## arrayReverse(arr) {#array_functions-arrayreverse} +## arrayReverse(arr) {#array-functions-arrayreverse} Returns an array of the same size as the original array containing the elements in reverse order. Example: -```sql +``` sql SELECT arrayReverse([1, 2, 3]) ``` -```text +``` text ┌─arrayReverse([1, 2, 3])─┐ │ [3,2,1] │ └─────────────────────────┘ ``` -## reverse(arr) {#array_functions-reverse} +## reverse(arr) {#array-functions-reverse} -Synonym for ["arrayReverse"](#array_functions-arrayreverse) +Synonym for [“arrayReverse”](#array_functions-arrayreverse) ## arrayFlatten {#arrayflatten} @@ -840,30 +872,30 @@ Converts an array of arrays to a flat array. Function: -- Applies to any depth of nested arrays. -- Does not change arrays that are already flat. +- Applies to any depth of nested arrays. +- Does not change arrays that are already flat. The flattened array contains all the elements from all source arrays. **Syntax** -```sql +``` sql flatten(array_of_arrays) ``` Alias: `flatten`. - **Parameters** -- `array_of_arrays` — [Array](../../data_types/array.md) of arrays. For example, `[[1,2,3], [4,5]]`. +- `array_of_arrays` — [Array](../../data_types/array.md) of arrays. For example, `[[1,2,3], [4,5]]`. **Examples** -```sql +``` sql SELECT flatten([[[1]], [[2], [3]]]) ``` -```text + +``` text ┌─flatten(array(array([1]), array([2], [3])))─┐ │ [1,2,3] │ └─────────────────────────────────────────────┘ @@ -875,7 +907,7 @@ Removes consecutive duplicate elements from an array. The order of result values **Syntax** -```sql +``` sql arrayCompact(arr) ``` @@ -893,13 +925,13 @@ Type: `Array`. 
Query: -```sql +``` sql SELECT arrayCompact([1, 1, nan, nan, 2, 3, 3, 3]) ``` Result: -```text +``` text ┌─arrayCompact([1, 1, nan, nan, 2, 3, 3, 3])─┐ │ [1,nan,nan,2,3] │ └────────────────────────────────────────────┘ @@ -907,11 +939,11 @@ Result: ## arrayZip {#arrayzip} -Combine multiple Array type columns into one Array[Tuple(...)] column +Combine multiple Array type columns into one Array\[Tuple(…)\] column **Syntax** -```sql +``` sql arrayZip(arr1, arr2, ..., arrN) ``` @@ -921,19 +953,19 @@ arrayZip(arr1, arr2, ..., arrN) **Returned value** -The result of Array[Tuple(...)] type after the combination of these arrays +The result of Array\[Tuple(…)\] type after the combination of these arrays **Example** Query: -```sql +``` sql SELECT arrayZip(['a', 'b', 'c'], ['d', 'e', 'f']); ``` Result: -```text +``` text ┌─arrayZip(['a', 'b', 'c'], ['d', 'e', 'f'])─┐ │ [('a','d'),('b','e'),('c','f')] │ └────────────────────────────────────────────┘ diff --git a/docs/en/query_language/functions/array_join.md b/docs/en/query_language/functions/array_join.md index 5db4b0e601e..869d168b5c1 100644 --- a/docs/en/query_language/functions/array_join.md +++ b/docs/en/query_language/functions/array_join.md @@ -1,10 +1,10 @@ -# arrayJoin function {#functions_arrayjoin} +# arrayJoin function {#functions-arrayjoin} This is a very unusual function. -Normal functions don't change a set of rows, but just change the values in each row (map). +Normal functions don’t change a set of rows, but just change the values in each row (map). Aggregate functions compress a set of rows (fold or reduce). -The 'arrayJoin' function takes each row and generates a set of rows (unfold). +The ‘arrayJoin’ function takes each row and generates a set of rows (unfold). This function takes an array as an argument, and propagates the source row to multiple rows for the number of elements in the array. All the values in columns are simply copied, except the values in the column where this function is applied; it is replaced with the corresponding array value. @@ -19,7 +19,7 @@ Example: SELECT arrayJoin([1, 2, 3] AS src) AS dst, 'Hello', src ``` -```text +``` text ┌─dst─┬─\'Hello\'─┬─src─────┐ │ 1 │ Hello │ [1,2,3] │ │ 2 │ Hello │ [1,2,3] │ @@ -27,5 +27,4 @@ SELECT arrayJoin([1, 2, 3] AS src) AS dst, 'Hello', src └─────┴───────────┴─────────┘ ``` - [Original article](https://clickhouse.tech/docs/en/query_language/functions/array_join/) diff --git a/docs/en/query_language/functions/bit_functions.md b/docs/en/query_language/functions/bit_functions.md index 5774375a546..759ae791a4c 100644 --- a/docs/en/query_language/functions/bit_functions.md +++ b/docs/en/query_language/functions/bit_functions.md @@ -1,39 +1,39 @@ -# Bit functions +# Bit functions {#bit-functions} Bit functions work for any pair of types from UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, or Float64. The result type is an integer with bits equal to the maximum bits of its arguments. If at least one of the arguments is signed, the result is a signed number. If an argument is a floating-point number, it is cast to Int64. 
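As a quick, hedged illustration of the type rules above, the sketch below applies a few of the bit functions described in the following sections and checks the inferred result type with `toTypeName`; the input values and column aliases are arbitrary and not part of the original reference.

``` sql
SELECT
    bitAnd(12, 10) AS and_result,   -- 1100 AND 1010 = 1000, i.e. 8
    bitOr(12, 10)  AS or_result,    -- 1100 OR  1010 = 1110, i.e. 14
    bitXor(12, 10) AS xor_result,   -- 1100 XOR 1010 = 0110, i.e. 6
    toTypeName(bitAnd(toUInt8(12), toInt32(10))) AS mixed_type
    -- one argument is a signed 32-bit integer, so by the rule above the result type should be Int32
```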
-## bitAnd(a, b) +## bitAnd(a, b) {#bitanda-b} -## bitOr(a, b) +## bitOr(a, b) {#bitora-b} -## bitXor(a, b) +## bitXor(a, b) {#bitxora-b} -## bitNot(a) +## bitNot(a) {#bitnota} -## bitShiftLeft(a, b) +## bitShiftLeft(a, b) {#bitshiftlefta-b} -## bitShiftRight(a, b) +## bitShiftRight(a, b) {#bitshiftrighta-b} -## bitRotateLeft(a, b) +## bitRotateLeft(a, b) {#bitrotatelefta-b} -## bitRotateRight(a, b) +## bitRotateRight(a, b) {#bitrotaterighta-b} ## bitTest {#bittest} Takes any integer and converts it into [binary form](https://en.wikipedia.org/wiki/Binary_number), returns the value of a bit at specified position. The countdown starts from 0 from the right to the left. -**Syntax** +**Syntax** -```sql +``` sql SELECT bitTest(number, index) ``` **Parameters** -- `number` – integer number. -- `index` – position of bit. +- `number` – integer number. +- `index` – position of bit. **Returned values** @@ -47,13 +47,13 @@ For example, the number 43 in base-2 (binary) numeral system is 101011. Query: -```sql +``` sql SELECT bitTest(43, 1) ``` Result: -```text +``` text ┌─bitTest(43, 1)─┐ │ 1 │ └────────────────┘ @@ -63,13 +63,13 @@ Another example: Query: -```sql +``` sql SELECT bitTest(43, 2) ``` Result: -```text +``` text ┌─bitTest(43, 2)─┐ │ 0 │ └────────────────┘ @@ -89,16 +89,16 @@ The conjuction for bitwise operations: 1 AND 1 = 1 -**Syntax** +**Syntax** -```sql +``` sql SELECT bitTestAll(number, index1, index2, index3, index4, ...) ``` -**Parameters** +**Parameters** -- `number` – integer number. -- `index1`, `index2`, `index3`, `index4` – positions of bit. For example, for set of positions (`index1`, `index2`, `index3`, `index4`) is true if and only if all of its positions are true (`index1` ⋀ `index2`, ⋀ `index3` ⋀ `index4`). +- `number` – integer number. +- `index1`, `index2`, `index3`, `index4` – positions of bit. For example, for set of positions (`index1`, `index2`, `index3`, `index4`) is true if and only if all of its positions are true (`index1` ⋀ `index2`, ⋀ `index3` ⋀ `index4`). **Returned values** @@ -112,13 +112,13 @@ For example, the number 43 in base-2 (binary) numeral system is 101011. Query: -```sql +``` sql SELECT bitTestAll(43, 0, 1, 3, 5) ``` Result: -```text +``` text ┌─bitTestAll(43, 0, 1, 3, 5)─┐ │ 1 │ └────────────────────────────┘ @@ -128,13 +128,13 @@ Another example: Query: -```sql +``` sql SELECT bitTestAll(43, 0, 1, 3, 5, 2) ``` Result: -```text +``` text ┌─bitTestAll(43, 0, 1, 3, 5, 2)─┐ │ 0 │ └───────────────────────────────┘ @@ -154,16 +154,16 @@ The disjunction for bitwise operations: 1 OR 1 = 1 -**Syntax** +**Syntax** -```sql +``` sql SELECT bitTestAny(number, index1, index2, index3, index4, ...) ``` -**Parameters** +**Parameters** -- `number` – integer number. -- `index1`, `index2`, `index3`, `index4` – positions of bit. +- `number` – integer number. +- `index1`, `index2`, `index3`, `index4` – positions of bit. **Returned values** @@ -177,13 +177,13 @@ For example, the number 43 in base-2 (binary) numeral system is 101011. 
Query: -```sql +``` sql SELECT bitTestAny(43, 0, 2) ``` Result: -```text +``` text ┌─bitTestAny(43, 0, 2)─┐ │ 1 │ └──────────────────────┘ @@ -193,13 +193,13 @@ Another example: Query: -```sql +``` sql SELECT bitTestAny(43, 4, 2) ``` Result: -```text +``` text ┌─bitTestAny(43, 4, 2)─┐ │ 0 │ └──────────────────────┘ diff --git a/docs/en/query_language/functions/bitmap_functions.md b/docs/en/query_language/functions/bitmap_functions.md index 6e6edc6aa08..4cd97246b42 100644 --- a/docs/en/query_language/functions/bitmap_functions.md +++ b/docs/en/query_language/functions/bitmap_functions.md @@ -1,4 +1,4 @@ -# Bitmap functions +# Bitmap functions {#bitmap-functions} Bitmap functions work for two bitmaps Object value calculation, it is to return new bitmap or cardinality while using formula calculation, such as and, or, xor, and not, etc. @@ -8,75 +8,75 @@ RoaringBitmap is wrapped into a data structure while actual storage of Bitmap ob For more information on RoaringBitmap, see: [CRoaring](https://github.com/RoaringBitmap/CRoaring). - -## bitmapBuild {#bitmap_functions-bitmapbuild} +## bitmapBuild {#bitmap-functions-bitmapbuild} Build a bitmap from unsigned integer array. -```sql +``` sql bitmapBuild(array) ``` **Parameters** -- `array` – unsigned integer array. +- `array` – unsigned integer array. **Example** -```sql +``` sql SELECT bitmapBuild([1, 2, 3, 4, 5]) AS res, toTypeName(res) ``` -```text + +``` text ┌─res─┬─toTypeName(bitmapBuild([1, 2, 3, 4, 5]))─────┐ │  │ AggregateFunction(groupBitmap, UInt8) │ └─────┴──────────────────────────────────────────────┘ ``` -## bitmapToArray +## bitmapToArray {#bitmaptoarray} Convert bitmap to integer array. -```sql +``` sql bitmapToArray(bitmap) ``` **Parameters** -- `bitmap` – bitmap object. +- `bitmap` – bitmap object. **Example** -```sql +``` sql SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res ``` -```text +``` text ┌─res─────────┐ │ [1,2,3,4,5] │ └─────────────┘ ``` -## bitmapSubsetInRange {#bitmap_functions-bitmapsubsetinrange} +## bitmapSubsetInRange {#bitmap-functions-bitmapsubsetinrange} -Return subset in specified range (not include the range_end). +Return subset in specified range (not include the range\_end). -```sql +``` sql bitmapSubsetInRange(bitmap, range_start, range_end) ``` **Parameters** -- `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). -- `range_start` – range start point. Type: [UInt32](../../data_types/int_uint.md). -- `range_end` – range end point(excluded). Type: [UInt32](../../data_types/int_uint.md). +- `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). +- `range_start` – range start point. Type: [UInt32](../../data_types/int_uint.md). +- `range_end` – range end point(excluded). Type: [UInt32](../../data_types/int_uint.md). **Example** -```sql +``` sql SELECT bitmapToArray(bitmapSubsetInRange(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res ``` -```text +``` text ┌─res───────────────┐ │ [30,31,32,33,100] │ └───────────────────┘ @@ -88,15 +88,15 @@ Creates a subset of bitmap with n elements taken between `range_start` and `card **Syntax** -```sql +``` sql bitmapSubsetLimit(bitmap, range_start, cardinality_limit) ``` **Parameters** -- `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). -- `range_start` – The subset starting point. Type: [UInt32](../../data_types/int_uint.md). -- `cardinality_limit` – The subset cardinality upper limit. Type: [UInt32](../../data_types/int_uint.md). 
+- `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). +- `range_start` – The subset starting point. Type: [UInt32](../../data_types/int_uint.md). +- `cardinality_limit` – The subset cardinality upper limit. Type: [UInt32](../../data_types/int_uint.md). **Returned value** @@ -108,54 +108,55 @@ Type: `Bitmap object`. Query: -```sql +``` sql SELECT bitmapToArray(bitmapSubsetLimit(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res ``` Result: -```text +``` text ┌─res───────────────────────┐ │ [30,31,32,33,100,200,500] │ └───────────────────────────┘ ``` -## bitmapContains {#bitmap_functions-bitmapcontains} +## bitmapContains {#bitmap-functions-bitmapcontains} Checks whether the bitmap contains an element. -```sql +``` sql bitmapContains(haystack, needle) ``` **Parameters** -- `haystack` – [Bitmap object](#bitmap_functions-bitmapbuild), where the function searches. -- `needle` – Value that the function searches. Type: [UInt32](../../data_types/int_uint.md). +- `haystack` – [Bitmap object](#bitmap_functions-bitmapbuild), where the function searches. +- `needle` – Value that the function searches. Type: [UInt32](../../data_types/int_uint.md). **Returned values** -- 0 — If `haystack` doesn't contain `needle`. -- 1 — If `haystack` contains `needle`. +- 0 — If `haystack` doesn’t contain `needle`. +- 1 — If `haystack` contains `needle`. Type: `UInt8`. **Example** -```sql +``` sql SELECT bitmapContains(bitmapBuild([1,5,7,9]), toUInt32(9)) AS res ``` -```text + +``` text ┌─res─┐ │ 1 │ └─────┘ ``` -## bitmapHasAny +## bitmapHasAny {#bitmaphasany} Checks whether two bitmaps have intersection by some elements. -```sql +``` sql bitmapHasAny(bitmap1, bitmap2) ``` @@ -163,348 +164,326 @@ If you are sure that `bitmap2` contains strictly one element, consider using the **Parameters** -- `bitmap*` – bitmap object. +- `bitmap*` – bitmap object. **Return values** -- `1`, if `bitmap1` and `bitmap2` have one similar element at least. -- `0`, otherwise. +- `1`, if `bitmap1` and `bitmap2` have one similar element at least. +- `0`, otherwise. **Example** -```sql +``` sql SELECT bitmapHasAny(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res ``` -```text +``` text ┌─res─┐ │ 1 │ └─────┘ ``` -## bitmapHasAll +## bitmapHasAll {#bitmaphasall} Analogous to `hasAll(array, array)` returns 1 if the first bitmap contains all the elements of the second one, 0 otherwise. If the second argument is an empty bitmap then returns 1. -```sql +``` sql bitmapHasAll(bitmap,bitmap) ``` **Parameters** -- `bitmap` – bitmap object. +- `bitmap` – bitmap object. **Example** -```sql +``` sql SELECT bitmapHasAll(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res ``` -```text +``` text ┌─res─┐ │ 0 │ └─────┘ ``` - -## bitmapCardinality +## bitmapCardinality {#bitmapcardinality} Retrun bitmap cardinality of type UInt64. - -```sql +``` sql bitmapCardinality(bitmap) ``` **Parameters** -- `bitmap` – bitmap object. +- `bitmap` – bitmap object. **Example** -```sql +``` sql SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res ``` -```text +``` text ┌─res─┐ │ 5 │ └─────┘ ``` -## bitmapMin +## bitmapMin {#bitmapmin} -Retrun the smallest value of type UInt64 in the set, UINT32_MAX if the set is empty. +Retrun the smallest value of type UInt64 in the set, UINT32\_MAX if the set is empty. - -``` -bitmapMin(bitmap) -``` + bitmapMin(bitmap) **Parameters** -- `bitmap` – bitmap object. +- `bitmap` – bitmap object. 
**Example** -```sql +``` sql SELECT bitmapMin(bitmapBuild([1, 2, 3, 4, 5])) AS res ``` -``` -┌─res─┐ -│ 1 │ -└─────┘ -``` + ┌─res─┐ + │ 1 │ + └─────┘ -## bitmapMax +## bitmapMax {#bitmapmax} Retrun the greatest value of type UInt64 in the set, 0 if the set is empty. - -``` -bitmapMax(bitmap) -``` + bitmapMax(bitmap) **Parameters** -- `bitmap` – bitmap object. +- `bitmap` – bitmap object. **Example** -```sql +``` sql SELECT bitmapMax(bitmapBuild([1, 2, 3, 4, 5])) AS res ``` -``` -┌─res─┐ -│ 5 │ -└─────┘ -``` + ┌─res─┐ + │ 5 │ + └─────┘ -## bitmapTransform +## bitmapTransform {#bitmaptransform} Transform an array of values in a bitmap to another array of values, the result is a new bitmap. - -``` -bitmapTransform(bitmap, from_array, to_array) -``` + bitmapTransform(bitmap, from_array, to_array) **Parameters** -- `bitmap` – bitmap object. -- `from_array` – UInt32 array. For idx in range [0, from_array.size()), if bitmap contains from_array[idx], then replace it with to_array[idx]. Note that the result depends on array ordering if there are common elements between from_array and to_array. -- `to_array` – UInt32 array, its size shall be the same to from_array. +- `bitmap` – bitmap object. +- `from_array` – UInt32 array. For idx in range \[0, from\_array.size()), if bitmap contains from\_array\[idx\], then replace it with to\_array\[idx\]. Note that the result depends on array ordering if there are common elements between from\_array and to\_array. +- `to_array` – UInt32 array, its size shall be the same to from\_array. **Example** -```sql +``` sql SELECT bitmapToArray(bitmapTransform(bitmapBuild([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), cast([5,999,2] as Array(UInt32)), cast([2,888,20] as Array(UInt32)))) AS res ``` -``` -┌─res───────────────────┐ -│ [1,3,4,6,7,8,9,10,20] │ -└───────────────────────┘ -``` + ┌─res───────────────────┐ + │ [1,3,4,6,7,8,9,10,20] │ + └───────────────────────┘ -## bitmapAnd +## bitmapAnd {#bitmapand} Two bitmap and calculation, the result is a new bitmap. -```sql +``` sql bitmapAnd(bitmap,bitmap) ``` **Parameters** -- `bitmap` – bitmap object. +- `bitmap` – bitmap object. **Example** -```sql +``` sql SELECT bitmapToArray(bitmapAnd(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res ``` -```text +``` text ┌─res─┐ │ [3] │ └─────┘ ``` - -## bitmapOr +## bitmapOr {#bitmapor} Two bitmap or calculation, the result is a new bitmap. -```sql +``` sql bitmapOr(bitmap,bitmap) ``` **Parameters** -- `bitmap` – bitmap object. +- `bitmap` – bitmap object. **Example** -```sql +``` sql SELECT bitmapToArray(bitmapOr(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res ``` -```text +``` text ┌─res─────────┐ │ [1,2,3,4,5] │ └─────────────┘ ``` -## bitmapXor +## bitmapXor {#bitmapxor} Two bitmap xor calculation, the result is a new bitmap. -```sql +``` sql bitmapXor(bitmap,bitmap) ``` **Parameters** -- `bitmap` – bitmap object. +- `bitmap` – bitmap object. **Example** -```sql +``` sql SELECT bitmapToArray(bitmapXor(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res ``` -```text +``` text ┌─res───────┐ │ [1,2,4,5] │ └───────────┘ ``` -## bitmapAndnot +## bitmapAndnot {#bitmapandnot} Two bitmap andnot calculation, the result is a new bitmap. -```sql +``` sql bitmapAndnot(bitmap,bitmap) ``` **Parameters** -- `bitmap` – bitmap object. +- `bitmap` – bitmap object. 
**Example** -```sql +``` sql SELECT bitmapToArray(bitmapAndnot(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res ``` -```text +``` text ┌─res───┐ │ [1,2] │ └───────┘ ``` -## bitmapAndCardinality +## bitmapAndCardinality {#bitmapandcardinality} Two bitmap and calculation, return cardinality of type UInt64. - -```sql +``` sql bitmapAndCardinality(bitmap,bitmap) ``` **Parameters** -- `bitmap` – bitmap object. +- `bitmap` – bitmap object. **Example** -```sql +``` sql SELECT bitmapAndCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` -```text +``` text ┌─res─┐ │ 1 │ └─────┘ ``` - -## bitmapOrCardinality +## bitmapOrCardinality {#bitmaporcardinality} Two bitmap or calculation, return cardinality of type UInt64. -```sql +``` sql bitmapOrCardinality(bitmap,bitmap) ``` **Parameters** -- `bitmap` – bitmap object. +- `bitmap` – bitmap object. **Example** -```sql +``` sql SELECT bitmapOrCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` -```text +``` text ┌─res─┐ │ 5 │ └─────┘ ``` -## bitmapXorCardinality +## bitmapXorCardinality {#bitmapxorcardinality} Two bitmap xor calculation, return cardinality of type UInt64. -```sql +``` sql bitmapXorCardinality(bitmap,bitmap) ``` **Parameters** -- `bitmap` – bitmap object. +- `bitmap` – bitmap object. **Example** -```sql +``` sql SELECT bitmapXorCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` -```text +``` text ┌─res─┐ │ 4 │ └─────┘ ``` - -## bitmapAndnotCardinality +## bitmapAndnotCardinality {#bitmapandnotcardinality} Two bitmap andnot calculation, return cardinality of type UInt64. -```sql +``` sql bitmapAndnotCardinality(bitmap,bitmap) ``` **Parameters** -- `bitmap` – bitmap object. +- `bitmap` – bitmap object. **Example** -```sql +``` sql SELECT bitmapAndnotCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` -```text +``` text ┌─res─┐ │ 2 │ └─────┘ ``` - [Original article](https://clickhouse.tech/docs/en/query_language/functions/bitmap_functions/) diff --git a/docs/en/query_language/functions/comparison_functions.md b/docs/en/query_language/functions/comparison_functions.md index 0c4bf8324f6..4505cc8cbf1 100644 --- a/docs/en/query_language/functions/comparison_functions.md +++ b/docs/en/query_language/functions/comparison_functions.md @@ -1,21 +1,21 @@ -# Comparison functions +# Comparison functions {#comparison-functions} Comparison functions always return 0 or 1 (Uint8). The following types can be compared: -- numbers -- strings and fixed strings -- dates -- dates with times +- numbers +- strings and fixed strings +- dates +- dates with times within each group, but not between different groups. -For example, you can't compare a date with a string. You have to use a function to convert the string to a date, or vice versa. +For example, you can’t compare a date with a string. You have to use a function to convert the string to a date, or vice versa. Strings are compared by bytes. A shorter string is smaller than all strings that start with it and that contain at least one more character. -Note. Up until version 1.1.54134, signed and unsigned numbers were compared the same way as in C++. In other words, you could get an incorrect result in cases like SELECT 9223372036854775807 > -1. This behavior changed in version 1.1.54134 and is now mathematically correct. +Note. Up until version 1.1.54134, signed and unsigned numbers were compared the same way as in C++. In other words, you could get an incorrect result in cases like SELECT 9223372036854775807 \> -1. 
This behavior changed in version 1.1.54134 and is now mathematically correct. ## equals, a = b and a == b operator {#function-equals} @@ -29,5 +29,4 @@ Note. Up until version 1.1.54134, signed and unsigned numbers were compared the ## greaterOrEquals, `>= operator` {#function-greaterorequals} - [Original article](https://clickhouse.tech/docs/en/query_language/functions/comparison_functions/) diff --git a/docs/en/query_language/functions/conditional_functions.md b/docs/en/query_language/functions/conditional_functions.md index 6822d40bb21..be55c96daf3 100644 --- a/docs/en/query_language/functions/conditional_functions.md +++ b/docs/en/query_language/functions/conditional_functions.md @@ -1,4 +1,4 @@ -# Conditional functions +# Conditional functions {#conditional-functions} ## if {#if} @@ -6,7 +6,7 @@ Controls conditional branching. Unlike most systems, ClickHouse always evaluate **Syntax** -```sql +``` sql SELECT if(cond, then, else) ``` @@ -14,9 +14,9 @@ If the condition `cond` evaluates to a non-zero value, returns the result of the **Parameters** -- `cond` – The condition for evaluation that can be zero or not. The type is UInt8, Nullable(UInt8) or NULL. -- `then` - The expression to return if condition is met. -- `else` - The expression to return if condition is not met. +- `cond` – The condition for evaluation that can be zero or not. The type is UInt8, Nullable(UInt8) or NULL. +- `then` - The expression to return if condition is met. +- `else` - The expression to return if condition is not met. **Returned values** @@ -26,13 +26,13 @@ The function executes `then` and `else` expressions and returns its result, depe Query: -```sql +``` sql SELECT if(1, plus(2, 2), plus(2, 6)) ``` Result: -```text +``` text ┌─plus(2, 2)─┐ │ 4 │ └────────────┘ @@ -40,25 +40,25 @@ Result: Query: -```sql +``` sql SELECT if(0, plus(2, 2), plus(2, 6)) ``` Result: -```text +``` text ┌─plus(2, 6)─┐ │ 8 │ └────────────┘ ``` -* `then` and `else` must have the lowest common type. +- `then` and `else` must have the lowest common type. **Example:** Take this `LEFT_RIGHT` table: -```sql +``` sql SELECT * FROM LEFT_RIGHT @@ -70,9 +70,10 @@ FROM LEFT_RIGHT │ 4 │ ᴺᵁᴸᴸ │ └──────┴───────┘ ``` + The following query compares `left` and `right` values: -```sql +``` sql SELECT left, right, @@ -86,6 +87,7 @@ WHERE isNotNull(left) AND isNotNull(right) │ 3 │ 1 │ right is greater or equal than left │ └──────┴───────┴─────────────────────────────────────┘ ``` + Note: `NULL` values are not used in this example, check [NULL values in conditionals](#null-values-in-conditionals) section. ## Ternary Operator {#ternary-operator} @@ -96,15 +98,15 @@ Syntax: `cond ? then : else` Returns `then` if the `cond` evaluates to be true (greater than zero), otherwise returns `else`. -* `cond` must be of type of `UInt8`, and `then` and `else` must have the lowest common type. +- `cond` must be of type of `UInt8`, and `then` and `else` must have the lowest common type. -* `then` and `else` can be `NULL` +- `then` and `else` can be `NULL` **See also** -- [ifNotFinite](other_functions.md#ifnotfinite). +- [ifNotFinite](other_functions.md#ifnotfinite). -## multiIf +## multiIf {#multiif} Allows you to write the [CASE](../operators.md#operator_case) operator more compactly in the query. @@ -112,9 +114,9 @@ Syntax: `multiIf(cond_1, then_1, cond_2, then_2, ..., else)` **Parameters:** -- `cond_N` — The condition for the function to return `then_N`. -- `then_N` — The result of the function when executed. 
-- `else` — The result of the function if none of the conditions is met. +- `cond_N` — The condition for the function to return `then_N`. +- `then_N` — The result of the function when executed. +- `else` — The result of the function if none of the conditions is met. The function accepts `2N+1` parameters. @@ -126,7 +128,7 @@ The function returns one of the values `then_N` or `else`, depending on the cond Again using `LEFT_RIGHT` table. -```sql +``` sql SELECT left, right, @@ -141,11 +143,12 @@ FROM LEFT_RIGHT │ 4 │ ᴺᵁᴸᴸ │ Null value │ └──────┴───────┴─────────────────┘ ``` -## Using conditional results directly + +## Using conditional results directly {#using-conditional-results-directly} Conditionals always result to `0`, `1` or `NULL`. So you can use conditional results directly like this: -```sql +``` sql SELECT left < right AS is_small FROM LEFT_RIGHT @@ -158,12 +161,11 @@ FROM LEFT_RIGHT └──────────┘ ``` - -## NULL values in conditionals +## NULL values in conditionals {#null-values-in-conditionals} When `NULL` values are involved in conditionals, the result will also be `NULL`. -```sql +``` sql SELECT NULL < 1, 2 < NULL, @@ -179,7 +181,7 @@ So you should construct your queries carefully if the types are `Nullable`. The following example demonstrates this by failing to add equals condition to `multiIf`. -```sql +``` sql SELECT left, right, @@ -195,5 +197,4 @@ FROM LEFT_RIGHT └──────┴───────┴──────────────────┘ ``` - [Original article](https://clickhouse.tech/docs/en/query_language/functions/conditional_functions/) diff --git a/docs/en/query_language/functions/date_time_functions.md b/docs/en/query_language/functions/date_time_functions.md index 3a3adba38a0..f01ddf2a82d 100644 --- a/docs/en/query_language/functions/date_time_functions.md +++ b/docs/en/query_language/functions/date_time_functions.md @@ -1,10 +1,10 @@ -# Functions for working with dates and times +# Functions for working with dates and times {#functions-for-working-with-dates-and-times} Support for time zones All functions for working with the date and time that have a logical use for the time zone can accept a second optional time zone argument. Example: Asia/Yekaterinburg. In this case, they use the specified time zone instead of the local (default) one. -```sql +``` sql SELECT toDateTime('2016-06-15 23:00:00') AS time, toDate(time) AS date_local, @@ -12,7 +12,7 @@ SELECT toString(time, 'US/Samoa') AS time_samoa ``` -```text +``` text ┌────────────────time─┬─date_local─┬─date_yekat─┬─time_samoa──────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-16 │ 2016-06-15 09:00:00 │ └─────────────────────┴────────────┴────────────┴─────────────────────┘ @@ -20,64 +20,64 @@ SELECT Only time zones that differ from UTC by a whole number of hours are supported. -## toTimeZone +## toTimeZone {#totimezone} Convert time or date and time to the specified time zone. -## toYear +## toYear {#toyear} Converts a date or date with time to a UInt16 number containing the year number (AD). -## toQuarter +## toQuarter {#toquarter} Converts a date or date with time to a UInt8 number containing the quarter number. -## toMonth +## toMonth {#tomonth} Converts a date or date with time to a UInt8 number containing the month number (1-12). -## toDayOfYear +## toDayOfYear {#todayofyear} Converts a date or date with time to a UInt16 number containing the number of the day of the year (1-366). -## toDayOfMonth +## toDayOfMonth {#todayofmonth} Converts a date or date with time to a UInt8 number containing the number of the day of the month (1-31). 
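As an illustrative sketch (not part of the original reference), the extraction functions above can be combined in a single query; the literal timestamp and the column aliases are arbitrary, and the expected values are given in the comments.

``` sql
SELECT
    toDateTime('2019-07-31 10:20:30') AS t,
    toYear(t)       AS y,    -- 2019
    toQuarter(t)    AS q,    -- 3
    toMonth(t)      AS m,    -- 7
    toDayOfMonth(t) AS dom   -- 31
```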
-## toDayOfWeek +## toDayOfWeek {#todayofweek} Converts a date or date with time to a UInt8 number containing the number of the day of the week (Monday is 1, and Sunday is 7). -## toHour +## toHour {#tohour} Converts a date with time to a UInt8 number containing the number of the hour in 24-hour time (0-23). This function assumes that if clocks are moved ahead, it is by one hour and occurs at 2 a.m., and if clocks are moved back, it is by one hour and occurs at 3 a.m. (which is not always true – even in Moscow the clocks were twice changed at a different time). -## toMinute +## toMinute {#tominute} Converts a date with time to a UInt8 number containing the number of the minute of the hour (0-59). -## toSecond +## toSecond {#tosecond} Converts a date with time to a UInt8 number containing the number of the second in the minute (0-59). Leap seconds are not accounted for. -## toUnixTimestamp {#to_unix_timestamp} +## toUnixTimestamp {#to-unix-timestamp} For DateTime argument: converts value to its internal numeric representation (Unix Timestamp). For String argument: parse datetime from string according to the timezone (optional second argument, server timezone is used by default) and returns the corresponding unix timestamp. For Date argument: the behaviour is unspecified. -**Syntax** +**Syntax** -```sql +``` sql toUnixTimestamp(datetime) toUnixTimestamp(str, [timezone]) ``` **Returned value** -- Returns the unix timestamp. +- Returns the unix timestamp. Type: `UInt32`. @@ -85,230 +85,235 @@ Type: `UInt32`. Query: -```sql +``` sql SELECT toUnixTimestamp('2017-11-05 08:07:47', 'Asia/Tokyo') AS unix_timestamp ``` Result: -```text +``` text ┌─unix_timestamp─┐ │ 1509836867 │ └────────────────┘ ``` -## toStartOfYear +## toStartOfYear {#tostartofyear} Rounds down a date or date with time to the first day of the year. Returns the date. -## toStartOfISOYear +## toStartOfISOYear {#tostartofisoyear} Rounds down a date or date with time to the first day of ISO year. Returns the date. -## toStartOfQuarter +## toStartOfQuarter {#tostartofquarter} Rounds down a date or date with time to the first day of the quarter. The first day of the quarter is either 1 January, 1 April, 1 July, or 1 October. Returns the date. -## toStartOfMonth +## toStartOfMonth {#tostartofmonth} Rounds down a date or date with time to the first day of the month. Returns the date. -!!! attention - The behavior of parsing incorrect dates is implementation specific. ClickHouse may return zero date, throw an exception or do "natural" overflow. +!!! attention "Attention" + The behavior of parsing incorrect dates is implementation specific. ClickHouse may return zero date, throw an exception or do “natural” overflow. -## toMonday +## toMonday {#tomonday} Rounds down a date or date with time to the nearest Monday. Returns the date. -## toStartOfWeek(t[,mode]) +## toStartOfWeek(t\[,mode\]) {#tostartofweektmode} Rounds down a date or date with time to the nearest Sunday or Monday by mode. Returns the date. -The mode argument works exactly like the mode argument to toWeek(). For the single-argument syntax, a mode value of 0 is used. +The mode argument works exactly like the mode argument to toWeek(). For the single-argument syntax, a mode value of 0 is used. -## toStartOfDay +## toStartOfDay {#tostartofday} Rounds down a date with time to the start of the day. -## toStartOfHour +## toStartOfHour {#tostartofhour} Rounds down a date with time to the start of the hour. 
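A hedged example of the rounding functions described above, using an arbitrary timestamp; the expected results are shown in the comments.

``` sql
SELECT
    toDateTime('2019-07-31 10:37:44') AS t,
    toStartOfHour(t),   -- 2019-07-31 10:00:00
    toStartOfDay(t),    -- 2019-07-31 00:00:00
    toStartOfMonth(t)   -- 2019-07-01 (returned as a Date)
```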
-## toStartOfMinute +## toStartOfMinute {#tostartofminute} Rounds down a date with time to the start of the minute. -## toStartOfFiveMinute +## toStartOfFiveMinute {#tostartoffiveminute} Rounds down a date with time to the start of the five-minute interval. -## toStartOfTenMinutes +## toStartOfTenMinutes {#tostartoftenminutes} + Rounds down a date with time to the start of the ten-minute interval. -## toStartOfFifteenMinutes +## toStartOfFifteenMinutes {#tostartoffifteenminutes} Rounds down the date with time to the start of the fifteen-minute interval. -## toStartOfInterval(time_or_data, INTERVAL x unit [, time_zone]) -This is a generalization of other functions named `toStartOf*`. For example, -`toStartOfInterval(t, INTERVAL 1 year)` returns the same as `toStartOfYear(t)`, -`toStartOfInterval(t, INTERVAL 1 month)` returns the same as `toStartOfMonth(t)`, -`toStartOfInterval(t, INTERVAL 1 day)` returns the same as `toStartOfDay(t)`, +## toStartOfInterval(time\_or\_data, INTERVAL x unit \[, time\_zone\]) {#tostartofintervaltime-or-data-interval-x-unit-time-zone} + +This is a generalization of other functions named `toStartOf*`. For example, +`toStartOfInterval(t, INTERVAL 1 year)` returns the same as `toStartOfYear(t)`, +`toStartOfInterval(t, INTERVAL 1 month)` returns the same as `toStartOfMonth(t)`, +`toStartOfInterval(t, INTERVAL 1 day)` returns the same as `toStartOfDay(t)`, `toStartOfInterval(t, INTERVAL 15 minute)` returns the same as `toStartOfFifteenMinutes(t)` etc. -## toTime +## toTime {#totime} Converts a date with time to a certain fixed date, while preserving the time. -## toRelativeYearNum +## toRelativeYearNum {#torelativeyearnum} Converts a date with time or date to the number of the year, starting from a certain fixed point in the past. -## toRelativeQuarterNum +## toRelativeQuarterNum {#torelativequarternum} Converts a date with time or date to the number of the quarter, starting from a certain fixed point in the past. -## toRelativeMonthNum +## toRelativeMonthNum {#torelativemonthnum} Converts a date with time or date to the number of the month, starting from a certain fixed point in the past. -## toRelativeWeekNum +## toRelativeWeekNum {#torelativeweeknum} Converts a date with time or date to the number of the week, starting from a certain fixed point in the past. -## toRelativeDayNum +## toRelativeDayNum {#torelativedaynum} Converts a date with time or date to the number of the day, starting from a certain fixed point in the past. -## toRelativeHourNum +## toRelativeHourNum {#torelativehournum} Converts a date with time or date to the number of the hour, starting from a certain fixed point in the past. -## toRelativeMinuteNum +## toRelativeMinuteNum {#torelativeminutenum} Converts a date with time or date to the number of the minute, starting from a certain fixed point in the past. -## toRelativeSecondNum +## toRelativeSecondNum {#torelativesecondnum} Converts a date with time or date to the number of the second, starting from a certain fixed point in the past. -## toISOYear +## toISOYear {#toisoyear} Converts a date or date with time to a UInt16 number containing the ISO Year number. -## toISOWeek +## toISOWeek {#toisoweek} Converts a date or date with time to a UInt8 number containing the ISO Week number. -## toWeek(date[,mode]) +## toWeek(date\[,mode\]) {#toweekdatemode} + This function returns the week number for date or datetime. 
The two-argument form of toWeek() enables you to specify whether the week starts on Sunday or Monday and whether the return value should be in the range from 0 to 53 or from 1 to 53. If the mode argument is omitted, the default mode is 0. -`toISOWeek() `is a compatibility function that is equivalent to `toWeek(date,3)`. +`toISOWeek()`is a compatibility function that is equivalent to `toWeek(date,3)`. The following table describes how the mode argument works. -| Mode | First day of week | Range | Week 1 is the first week … | -| ----------- | -------- | -------- | ------------------ | -|0|Sunday|0-53|with a Sunday in this year -|1|Monday|0-53|with 4 or more days this year -|2|Sunday|1-53|with a Sunday in this year -|3|Monday|1-53|with 4 or more days this year -|4|Sunday|0-53|with 4 or more days this year -|5|Monday|0-53|with a Monday in this year -|6|Sunday|1-53|with 4 or more days this year -|7|Monday|1-53|with a Monday in this year -|8|Sunday|1-53|contains January 1 -|9|Monday|1-53|contains January 1 +| Mode | First day of week | Range | Week 1 is the first week … | +|------|-------------------|-------|-------------------------------| +| 0 | Sunday | 0-53 | with a Sunday in this year | +| 1 | Monday | 0-53 | with 4 or more days this year | +| 2 | Sunday | 1-53 | with a Sunday in this year | +| 3 | Monday | 1-53 | with 4 or more days this year | +| 4 | Sunday | 0-53 | with 4 or more days this year | +| 5 | Monday | 0-53 | with a Monday in this year | +| 6 | Sunday | 1-53 | with 4 or more days this year | +| 7 | Monday | 1-53 | with a Monday in this year | +| 8 | Sunday | 1-53 | contains January 1 | +| 9 | Monday | 1-53 | contains January 1 | For mode values with a meaning of “with 4 or more days this year,” weeks are numbered according to ISO 8601:1988: -- If the week containing January 1 has 4 or more days in the new year, it is week 1. +- If the week containing January 1 has 4 or more days in the new year, it is week 1. -- Otherwise, it is the last week of the previous year, and the next week is week 1. +- Otherwise, it is the last week of the previous year, and the next week is week 1. -For mode values with a meaning of “contains January 1”, the week contains January 1 is week 1. It doesn't matter how many days in the new year the week contained, even if it contained only one day. +For mode values with a meaning of “contains January 1”, the week contains January 1 is week 1. It doesn’t matter how many days in the new year the week contained, even if it contained only one day. -```sql +``` sql toWeek(date, [, mode][, Timezone]) ``` + **Parameters** -- `date` – Date or DateTime. -- `mode` – Optional parameter, Range of values is [0,9], default is 0. -- `Timezone` – Optional parameter, it behaves like any other conversion function. +- `date` – Date or DateTime. +- `mode` – Optional parameter, Range of values is \[0,9\], default is 0. +- `Timezone` – Optional parameter, it behaves like any other conversion function. **Example** -```sql +``` sql SELECT toDate('2016-12-27') AS date, toWeek(date) AS week0, toWeek(date,1) AS week1, toWeek(date,9) AS week9; ``` -```text +``` text ┌───────date─┬─week0─┬─week1─┬─week9─┐ │ 2016-12-27 │ 52 │ 52 │ 1 │ └────────────┴───────┴───────┴───────┘ ``` -## toYearWeek(date[,mode]) +## toYearWeek(date\[,mode\]) {#toyearweekdatemode} + Returns year and week for a date. The year in the result may be different from the year in the date argument for the first and the last week of the year. -The mode argument works exactly like the mode argument to toWeek(). 
For the single-argument syntax, a mode value of 0 is used. +The mode argument works exactly like the mode argument to toWeek(). For the single-argument syntax, a mode value of 0 is used. -`toISOYear() `is a compatibility function that is equivalent to `intDiv(toYearWeek(date,3),100)`. +`toISOYear()`is a compatibility function that is equivalent to `intDiv(toYearWeek(date,3),100)`. **Example** -```sql +``` sql SELECT toDate('2016-12-27') AS date, toYearWeek(date) AS yearWeek0, toYearWeek(date,1) AS yearWeek1, toYearWeek(date,9) AS yearWeek9; ``` -```text +``` text ┌───────date─┬─yearWeek0─┬─yearWeek1─┬─yearWeek9─┐ │ 2016-12-27 │ 201652 │ 201652 │ 201701 │ └────────────┴───────────┴───────────┴───────────┘ ``` -## now +## now {#now} Accepts zero arguments and returns the current time at one of the moments of request execution. This function returns a constant, even if the request took a long time to complete. -## today +## today {#today} Accepts zero arguments and returns the current date at one of the moments of request execution. -The same as 'toDate(now())'. +The same as ‘toDate(now())’. -## yesterday +## yesterday {#yesterday} -Accepts zero arguments and returns yesterday's date at one of the moments of request execution. -The same as 'today() - 1'. +Accepts zero arguments and returns yesterday’s date at one of the moments of request execution. +The same as ‘today() - 1’. -## timeSlot +## timeSlot {#timeslot} Rounds the time to the half hour. -This function is specific to Yandex.Metrica, since half an hour is the minimum amount of time for breaking a session into two sessions if a tracking tag shows a single user's consecutive pageviews that differ in time by strictly more than this amount. This means that tuples (the tag ID, user ID, and time slot) can be used to search for pageviews that are included in the corresponding session. +This function is specific to Yandex.Metrica, since half an hour is the minimum amount of time for breaking a session into two sessions if a tracking tag shows a single user’s consecutive pageviews that differ in time by strictly more than this amount. This means that tuples (the tag ID, user ID, and time slot) can be used to search for pageviews that are included in the corresponding session. -## toYYYYMM +## toYYYYMM {#toyyyymm} -Converts a date or date with time to a UInt32 number containing the year and month number (YYYY * 100 + MM). +Converts a date or date with time to a UInt32 number containing the year and month number (YYYY \* 100 + MM). -## toYYYYMMDD +## toYYYYMMDD {#toyyyymmdd} -Converts a date or date with time to a UInt32 number containing the year and month number (YYYY * 10000 + MM * 100 + DD). +Converts a date or date with time to a UInt32 number containing the year and month number (YYYY \* 10000 + MM \* 100 + DD). -## toYYYYMMDDhhmmss +## toYYYYMMDDhhmmss {#toyyyymmddhhmmss} -Converts a date or date with time to a UInt64 number containing the year and month number (YYYY * 10000000000 + MM * 100000000 + DD * 1000000 + hh * 10000 + mm * 100 + ss). +Converts a date or date with time to a UInt64 number containing the year and month number (YYYY \* 10000000000 + MM \* 100000000 + DD \* 1000000 + hh \* 10000 + mm \* 100 + ss). 
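As a brief illustration of these three conversions (the dates and aliases are arbitrary and not from the original reference):

``` sql
SELECT
    toYYYYMM(toDate('2019-07-31'))                      AS yyyymm,          -- 201907
    toYYYYMMDD(toDate('2019-07-31'))                    AS yyyymmdd,        -- 20190731
    toYYYYMMDDhhmmss(toDateTime('2019-07-31 10:20:30')) AS yyyymmddhhmmss   -- 20190731102030
```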
-## addYears, addMonths, addWeeks, addDays, addHours, addMinutes, addSeconds, addQuarters +## addYears, addMonths, addWeeks, addDays, addHours, addMinutes, addSeconds, addQuarters {#addyears-addmonths-addweeks-adddays-addhours-addminutes-addseconds-addquarters} Function adds a Date/DateTime interval to a Date/DateTime and then return the Date/DateTime. For example: -```sql +``` sql WITH toDate('2018-01-01') AS date, toDateTime('2018-01-01 00:00:00') AS date_time @@ -317,17 +322,17 @@ SELECT addYears(date_time, 1) AS add_years_with_date_time ``` -```text +``` text ┌─add_years_with_date─┬─add_years_with_date_time─┐ │ 2019-01-01 │ 2019-01-01 00:00:00 │ └─────────────────────┴──────────────────────────┘ ``` -## subtractYears, subtractMonths, subtractWeeks, subtractDays, subtractHours, subtractMinutes, subtractSeconds, subtractQuarters +## subtractYears, subtractMonths, subtractWeeks, subtractDays, subtractHours, subtractMinutes, subtractSeconds, subtractQuarters {#subtractyears-subtractmonths-subtractweeks-subtractdays-subtracthours-subtractminutes-subtractseconds-subtractquarters} Function subtract a Date/DateTime interval to a Date/DateTime and then return the Date/DateTime. For example: -```sql +``` sql WITH toDate('2019-01-01') AS date, toDateTime('2019-01-01 00:00:00') AS date_time @@ -336,7 +341,7 @@ SELECT subtractYears(date_time, 1) AS subtract_years_with_date_time ``` -```text +``` text ┌─subtract_years_with_date─┬─subtract_years_with_date_time─┐ │ 2018-01-01 │ 2018-01-01 00:00:00 │ └──────────────────────────┴───────────────────────────────┘ @@ -348,30 +353,32 @@ Returns the difference between two Date or DateTime values. **Syntax** -```sql +``` sql dateDiff('unit', startdate, enddate, [timezone]) ``` **Parameters** -- `unit` — Time unit, in which the returned value is expressed. [String](../syntax.md#syntax-string-literal). +- `unit` — Time unit, in which the returned value is expressed. [String](../syntax.md#syntax-string-literal). - Supported values: + Supported values: - | unit | - | ------ | - |second | - |minute | - |hour | - |day | - |week | - |month | - |quarter | - |year | + | unit | + | ------ | + |second | + |minute | + |hour | + |day | + |week | + |month | + |quarter | + |year | -- `startdate` — The first time value to compare. [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). -- `enddate` — The second time value to compare. [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). -- `timezone` — Optional parameter. If specified, it is applied to both `startdate` and `enddate`. If not specified, timezones of `startdate` and `enddate` are used. If they are not the same, the result is unspecified. +- `startdate` — The first time value to compare. [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). + +- `enddate` — The second time value to compare. [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). + +- `timezone` — Optional parameter. If specified, it is applied to both `startdate` and `enddate`. If not specified, timezones of `startdate` and `enddate` are used. If they are not the same, the result is unspecified. **Returned value** @@ -383,54 +390,54 @@ Type: `int`. 
Query: -```sql +``` sql SELECT dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00')); ``` Result: -```text +``` text ┌─dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00'))─┐ │ 25 │ └────────────────────────────────────────────────────────────────────────────────────────┘ ``` -## timeSlots(StartTime, Duration,\[, Size\]) +## timeSlots(StartTime, Duration,\[, Size\]) {#timeslotsstarttime-duration-size} -For a time interval starting at 'StartTime' and continuing for 'Duration' seconds, it returns an array of moments in time, consisting of points from this interval rounded down to the 'Size' in seconds. 'Size' is an optional parameter: a constant UInt32, set to 1800 by default. +For a time interval starting at ‘StartTime’ and continuing for ‘Duration’ seconds, it returns an array of moments in time, consisting of points from this interval rounded down to the ‘Size’ in seconds. ‘Size’ is an optional parameter: a constant UInt32, set to 1800 by default. For example, `timeSlots(toDateTime('2012-01-01 12:20:00'), 600) = [toDateTime('2012-01-01 12:00:00'), toDateTime('2012-01-01 12:30:00')]`. This is necessary for searching for pageviews in the corresponding session. ## formatDateTime(Time, Format\[, Timezone\]) {#formatdatetime} -Function formats a Time according given Format string. N.B.: Format is a constant expression, e.g. you can not have multiple formats for single result column. +Function formats a Time according given Format string. N.B.: Format is a constant expression, e.g. you can not have multiple formats for single result column. Supported modifiers for Format: -("Example" column shows formatting result for time `2018-01-02 22:33:44`) +(“Example” column shows formatting result for time `2018-01-02 22:33:44`) -| Modifier | Description | Example | -| ----------- | -------- | --------------- | -|%C|year divided by 100 and truncated to integer (00-99)|20 -|%d|day of the month, zero-padded (01-31)|02 -|%D|Short MM/DD/YY date, equivalent to %m/%d/%y|01/02/18| -|%e|day of the month, space-padded ( 1-31)| 2| -|%F|short YYYY-MM-DD date, equivalent to %Y-%m-%d|2018-01-02 -|%H|hour in 24h format (00-23)|22| -|%I|hour in 12h format (01-12)|10| -|%j|day of the year (001-366)|002| -|%m|month as a decimal number (01-12)|01| -|%M|minute (00-59)|33| -|%n|new-line character ('\n')|| -|%p|AM or PM designation|PM| -|%R|24-hour HH:MM time, equivalent to %H:%M|22:33| -|%S|second (00-59)|44| -|%t|horizontal-tab character ('\t')|| -|%T|ISO 8601 time format (HH:MM:SS), equivalent to %H:%M:%S|22:33:44| -|%u|ISO 8601 weekday as number with Monday as 1 (1-7)|2| -|%V|ISO 8601 week number (01-53)|01| -|%w|weekday as a decimal number with Sunday as 0 (0-6)|2| -|%y|Year, last two digits (00-99)|18| -|%Y|Year|2018| -|%%|a % sign|%| +| Modifier | Description | Example | +|----------|---------------------------------------------------------|------------| +| %C | year divided by 100 and truncated to integer (00-99) | 20 | +| %d | day of the month, zero-padded (01-31) | 02 | +| %D | Short MM/DD/YY date, equivalent to %m/%d/%y | 01/02/18 | +| %e | day of the month, space-padded ( 1-31) | 2 | +| %F | short YYYY-MM-DD date, equivalent to %Y-%m-%d | 2018-01-02 | +| %H | hour in 24h format (00-23) | 22 | +| %I | hour in 12h format (01-12) | 10 | +| %j | day of the year (001-366) | 002 | +| %m | month as a decimal number (01-12) | 01 | +| %M | minute (00-59) | 33 | +| %n | new-line character (‘’) | | +| %p | AM or PM designation | PM | +| %R | 24-hour HH:MM 
time, equivalent to %H:%M | 22:33 | +| %S | second (00-59) | 44 | +| %t | horizontal-tab character (’) | | +| %T | ISO 8601 time format (HH:MM:SS), equivalent to %H:%M:%S | 22:33:44 | +| %u | ISO 8601 weekday as number with Monday as 1 (1-7) | 2 | +| %V | ISO 8601 week number (01-53) | 01 | +| %w | weekday as a decimal number with Sunday as 0 (0-6) | 2 | +| %y | Year, last two digits (00-99) | 18 | +| %Y | Year | 2018 | +| %% | a % sign | % | [Original article](https://clickhouse.tech/docs/en/query_language/functions/date_time_functions/) diff --git a/docs/en/query_language/functions/encoding_functions.md b/docs/en/query_language/functions/encoding_functions.md index 9adb69e2cf6..50b79a22b13 100644 --- a/docs/en/query_language/functions/encoding_functions.md +++ b/docs/en/query_language/functions/encoding_functions.md @@ -1,4 +1,4 @@ -# Encoding functions +# Encoding functions {#encoding-functions} ## char {#char} @@ -6,29 +6,31 @@ Returns the string with the length as the number of passed arguments and each by **Syntax** -```sql +``` sql char(number_1, [number_2, ..., number_n]); ``` **Parameters** -- `number_1, number_2, ..., number_n` — Numerical arguments interpreted as integers. Types: [Int](../../data_types/int_uint.md), [Float](../../data_types/float.md). +- `number_1, number_2, ..., number_n` — Numerical arguments interpreted as integers. Types: [Int](../../data_types/int_uint.md), [Float](../../data_types/float.md). **Returned value** -- a string of given bytes. +- a string of given bytes. Type: `String`. **Example** Query: -```sql + +``` sql SELECT char(104.1, 101, 108.9, 108.9, 111) AS hello ``` Result: -```text + +``` text ┌─hello─┐ │ hello │ └───────┘ @@ -37,24 +39,28 @@ Result: You can construct a string of arbitrary encoding by passing the corresponding bytes. Here is example for UTF-8: Query: -```sql + +``` sql SELECT char(0xD0, 0xBF, 0xD1, 0x80, 0xD0, 0xB8, 0xD0, 0xB2, 0xD0, 0xB5, 0xD1, 0x82) AS hello; ``` Result: -```text + +``` text ┌─hello──┐ │ привет │ └────────┘ ``` Query: -```sql + +``` sql SELECT char(0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD) AS hello; ``` Result: -```text + +``` text ┌─hello─┐ │ 你好 │ └───────┘ @@ -62,17 +68,17 @@ Result: ## hex {#hex} -Returns a string containing the argument's hexadecimal representation. +Returns a string containing the argument’s hexadecimal representation. **Syntax** -```sql +``` sql hex(arg) ``` The function is using uppercase letters `A-F` and not using any prefixes (like `0x`) or suffixes (like `h`). -For integer arguments, it prints hex digits ("nibbles") from the most significant to least significant (big endian or "human readable" order). It starts with the most significant non-zero byte (leading zero bytes are omitted) but always prints both digits of every byte even if leading digit is zero. +For integer arguments, it prints hex digits (“nibbles”) from the most significant to least significant (big endian or “human readable” order). It starts with the most significant non-zero byte (leading zero bytes are omitted) but always prints both digits of every byte even if leading digit is zero. Example: @@ -80,13 +86,13 @@ Example: Query: -```sql +``` sql SELECT hex(1); ``` Result: -```text +``` text 01 ``` @@ -98,11 +104,11 @@ Values of floating point and Decimal types are encoded as their representation i **Parameters** -- `arg` — A value to convert to hexadecimal. 
Types: [String](../../data_types/string.md), [UInt](../../data_types/int_uint.md), [Float](../../data_types/float.md), [Decimal](../../data_types/decimal.md), [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). +- `arg` — A value to convert to hexadecimal. Types: [String](../../data_types/string.md), [UInt](../../data_types/int_uint.md), [Float](../../data_types/float.md), [Decimal](../../data_types/decimal.md), [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). **Returned value** -- A string with the hexadecimal representation of the argument. +- A string with the hexadecimal representation of the argument. Type: `String`. @@ -110,13 +116,13 @@ Type: `String`. Query: -```sql +``` sql SELECT hex(toFloat32(number)) as hex_presentation FROM numbers(15, 2); ``` Result: -```text +``` text ┌─hex_presentation─┐ │ 00007041 │ │ 00008041 │ @@ -125,39 +131,38 @@ Result: Query: -```sql +``` sql SELECT hex(toFloat64(number)) as hex_presentation FROM numbers(15, 2); ``` Result: -```text +``` text ┌─hex_presentation─┐ │ 0000000000002E40 │ │ 0000000000003040 │ └──────────────────┘ ``` -## unhex(str) +## unhex(str) {#unhexstr} -Accepts a string containing any number of hexadecimal digits, and returns a string containing the corresponding bytes. Supports both uppercase and lowercase letters A-F. The number of hexadecimal digits does not have to be even. If it is odd, the last digit is interpreted as the least significant half of the 00-0F byte. If the argument string contains anything other than hexadecimal digits, some implementation-defined result is returned (an exception isn't thrown). -If you want to convert the result to a number, you can use the 'reverse' and 'reinterpretAsType' functions. +Accepts a string containing any number of hexadecimal digits, and returns a string containing the corresponding bytes. Supports both uppercase and lowercase letters A-F. The number of hexadecimal digits does not have to be even. If it is odd, the last digit is interpreted as the least significant half of the 00-0F byte. If the argument string contains anything other than hexadecimal digits, some implementation-defined result is returned (an exception isn’t thrown). +If you want to convert the result to a number, you can use the ‘reverse’ and ‘reinterpretAsType’ functions. -## UUIDStringToNum(str) +## UUIDStringToNum(str) {#uuidstringtonumstr} Accepts a string containing 36 characters in the format `123e4567-e89b-12d3-a456-426655440000`, and returns it as a set of bytes in a FixedString(16). -## UUIDNumToString(str) +## UUIDNumToString(str) {#uuidnumtostringstr} Accepts a FixedString(16) value. Returns a string containing 36 characters in text format. -## bitmaskToList(num) +## bitmaskToList(num) {#bitmasktolistnum} Accepts an integer. Returns a string containing the list of powers of two that total the source number when summed. They are comma-separated without spaces in text format, in ascending order. -## bitmaskToArray(num) +## bitmaskToArray(num) {#bitmasktoarraynum} Accepts an integer. Returns an array of UInt64 numbers containing the list of powers of two that total the source number when summed. Numbers in the array are in ascending order. 
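As a quick illustration of the two bitmask helpers above (a sketch: the expected values in the comments follow from 50 = 2 + 16 + 32 and the descriptions given here, they are not output captured from a live server):

``` sql
SELECT
    bitmaskToList(50) AS list,  -- expected: '2,16,32' (comma-separated string, ascending)
    bitmaskToArray(50) AS arr   -- expected: [2,16,32] (array of UInt64, ascending)
```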
- [Original article](https://clickhouse.tech/docs/en/query_language/functions/encoding_functions/) diff --git a/docs/en/query_language/functions/ext_dict_functions.md b/docs/en/query_language/functions/ext_dict_functions.md index 26e05721b6a..770d7ea215a 100644 --- a/docs/en/query_language/functions/ext_dict_functions.md +++ b/docs/en/query_language/functions/ext_dict_functions.md @@ -1,38 +1,39 @@ -# Functions for Working with External Dictionaries {#ext_dict_functions} +# Functions for Working with External Dictionaries {#ext-dict-functions} For information on connecting and configuring external dictionaries, see [External dictionaries](../dicts/external_dicts.md). -## dictGet +## dictGet {#dictget} Retrieves a value from an external dictionary. -```sql +``` sql dictGet('dict_name', 'attr_name', id_expr) dictGetOrDefault('dict_name', 'attr_name', id_expr, default_value_expr) ``` **Parameters** -- `dict_name` — Name of the dictionary. [String literal](../syntax.md#syntax-string-literal). -- `attr_name` — Name of the column of the dictionary. [String literal](../syntax.md#syntax-string-literal). -- `id_expr` — Key value. [Expression](../syntax.md#syntax-expressions) returning a [UInt64](../../data_types/int_uint.md) or [Tuple](../../data_types/tuple.md)-type value depending on the dictionary configuration. -- `default_value_expr` — Value returned if the dictionary doesn't contain a row with the `id_expr` key. [Expression](../syntax.md#syntax-expressions) returning the value in the data type configured for the `attr_name` attribute. +- `dict_name` — Name of the dictionary. [String literal](../syntax.md#syntax-string-literal). +- `attr_name` — Name of the column of the dictionary. [String literal](../syntax.md#syntax-string-literal). +- `id_expr` — Key value. [Expression](../syntax.md#syntax-expressions) returning a [UInt64](../../data_types/int_uint.md) or [Tuple](../../data_types/tuple.md)-type value depending on the dictionary configuration. +- `default_value_expr` — Value returned if the dictionary doesn’t contain a row with the `id_expr` key. [Expression](../syntax.md#syntax-expressions) returning the value in the data type configured for the `attr_name` attribute. **Returned value** -- If ClickHouse parses the attribute successfully in the [attribute's data type](../../query_language/dicts/external_dicts_dict_structure.md#ext_dict_structure-attributes), functions return the value of the dictionary attribute that corresponds to `id_expr`. -- If there is no the key, corresponding to `id_expr`, in the dictionary, then: +- If ClickHouse parses the attribute successfully in the [attribute’s data type](../../query_language/dicts/external_dicts_dict_structure.md#ext_dict_structure-attributes), functions return the value of the dictionary attribute that corresponds to `id_expr`. - - `dictGet` returns the content of the `` element specified for the attribute in the dictionary configuration. - - `dictGetOrDefault` returns the value passed as the `default_value_expr` parameter. +- If there is no the key, corresponding to `id_expr`, in the dictionary, then: -ClickHouse throws an exception if it cannot parse the value of the attribute or the value doesn't match the attribute data type. + - `dictGet` returns the content of the `` element specified for the attribute in the dictionary configuration. + - `dictGetOrDefault` returns the value passed as the `default_value_expr` parameter. 
+ +ClickHouse throws an exception if it cannot parse the value of the attribute or the value doesn’t match the attribute data type. **Example** Create a text file `ext-dict-text.csv` containing the following: -```text +``` text 1,1 2,2 ``` @@ -41,7 +42,7 @@ The first column is `id`, the second column is `c1`. Configure the external dictionary: -```xml +``` xml ext-dict-test @@ -71,14 +72,15 @@ Configure the external dictionary: Perform the query: -```sql +``` sql SELECT dictGetOrDefault('ext-dict-test', 'c1', number + 1, toUInt32(number * 10)) AS val, toTypeName(val) AS type FROM system.numbers LIMIT 3 ``` -```text + +``` text ┌─val─┬─type───┐ │ 1 │ UInt32 │ │ 2 │ UInt32 │ @@ -88,26 +90,25 @@ LIMIT 3 **See Also** -- [External Dictionaries](../dicts/external_dicts.md) +- [External Dictionaries](../dicts/external_dicts.md) - -## dictHas +## dictHas {#dicthas} Checks whether a key is present in a dictionary. -```sql +``` sql dictHas('dict_name', id_expr) ``` **Parameters** -- `dict_name` — Name of the dictionary. [String literal](../syntax.md#syntax-string-literal). -- `id_expr` — Key value. [Expression](../syntax.md#syntax-expressions) returning a [UInt64](../../data_types/int_uint.md)-type value. +- `dict_name` — Name of the dictionary. [String literal](../syntax.md#syntax-string-literal). +- `id_expr` — Key value. [Expression](../syntax.md#syntax-expressions) returning a [UInt64](../../data_types/int_uint.md)-type value. **Returned value** -- 0, if there is no key. -- 1, if there is a key. +- 0, if there is no key. +- 1, if there is a key. Type: `UInt8`. @@ -117,81 +118,81 @@ Creates an array, containing all the parents of a key in the [hierarchical dicti **Syntax** -```sql +``` sql dictGetHierarchy('dict_name', key) ``` **Parameters** -- `dict_name` — Name of the dictionary. [String literal](../syntax.md#syntax-string-literal). -- `key` — Key value. [Expression](../syntax.md#syntax-expressions) returning a [UInt64](../../data_types/int_uint.md)-type value. +- `dict_name` — Name of the dictionary. [String literal](../syntax.md#syntax-string-literal). +- `key` — Key value. [Expression](../syntax.md#syntax-expressions) returning a [UInt64](../../data_types/int_uint.md)-type value. **Returned value** -- Parents for the key. +- Parents for the key. Type: [Array(UInt64)](../../data_types/array.md). - -## dictIsIn +## dictIsIn {#dictisin} Checks the ancestor of a key through the whole hierarchical chain in the dictionary. -```sql +``` sql dictIsIn('dict_name', child_id_expr, ancestor_id_expr) ``` **Parameters** -- `dict_name` — Name of the dictionary. [String literal](../syntax.md#syntax-string-literal). -- `child_id_expr` — Key to be checked. [Expression](../syntax.md#syntax-expressions) returning a [UInt64](../../data_types/int_uint.md)-type value. -- `ancestor_id_expr` — Alleged ancestor of the `child_id_expr` key. [Expression](../syntax.md#syntax-expressions) returning a [UInt64](../../data_types/int_uint.md)-type value. +- `dict_name` — Name of the dictionary. [String literal](../syntax.md#syntax-string-literal). +- `child_id_expr` — Key to be checked. [Expression](../syntax.md#syntax-expressions) returning a [UInt64](../../data_types/int_uint.md)-type value. +- `ancestor_id_expr` — Alleged ancestor of the `child_id_expr` key. [Expression](../syntax.md#syntax-expressions) returning a [UInt64](../../data_types/int_uint.md)-type value. **Returned value** -- 0, if `child_id_expr` is not a child of `ancestor_id_expr`. 
-- 1, if `child_id_expr` is a child of `ancestor_id_expr` or if `child_id_expr` is an `ancestor_id_expr`. +- 0, if `child_id_expr` is not a child of `ancestor_id_expr`. +- 1, if `child_id_expr` is a child of `ancestor_id_expr` or if `child_id_expr` is an `ancestor_id_expr`. Type: `UInt8`. -## Other functions {#ext_dict_functions-other} +## Other functions {#ext-dict-functions-other} ClickHouse supports specialized functions that convert dictionary attribute values to a specific data type regardless of the dictionary configuration. Functions: -- `dictGetInt8`, `dictGetInt16`, `dictGetInt32`, `dictGetInt64` -- `dictGetUInt8`, `dictGetUInt16`, `dictGetUInt32`, `dictGetUInt64` -- `dictGetFloat32`, `dictGetFloat64` -- `dictGetDate` -- `dictGetDateTime` -- `dictGetUUID` -- `dictGetString` +- `dictGetInt8`, `dictGetInt16`, `dictGetInt32`, `dictGetInt64` +- `dictGetUInt8`, `dictGetUInt16`, `dictGetUInt32`, `dictGetUInt64` +- `dictGetFloat32`, `dictGetFloat64` +- `dictGetDate` +- `dictGetDateTime` +- `dictGetUUID` +- `dictGetString` All these functions have the `OrDefault` modification. For example, `dictGetDateOrDefault`. Syntax: -```sql +``` sql dictGet[Type]('dict_name', 'attr_name', id_expr) dictGet[Type]OrDefault('dict_name', 'attr_name', id_expr, default_value_expr) ``` **Parameters** -- `dict_name` — Name of the dictionary. [String literal](../syntax.md#syntax-string-literal). -- `attr_name` — Name of the column of the dictionary. [String literal](../syntax.md#syntax-string-literal). -- `id_expr` — Key value. [Expression](../syntax.md#syntax-expressions) returning a [UInt64](../../data_types/int_uint.md)-type value. -- `default_value_expr` — Value which is returned if the dictionary doesn't contain a row with the `id_expr` key. [Expression](../syntax.md#syntax-expressions) returning a value in the data type configured for the `attr_name` attribute. +- `dict_name` — Name of the dictionary. [String literal](../syntax.md#syntax-string-literal). +- `attr_name` — Name of the column of the dictionary. [String literal](../syntax.md#syntax-string-literal). +- `id_expr` — Key value. [Expression](../syntax.md#syntax-expressions) returning a [UInt64](../../data_types/int_uint.md)-type value. +- `default_value_expr` — Value which is returned if the dictionary doesn’t contain a row with the `id_expr` key. [Expression](../syntax.md#syntax-expressions) returning a value in the data type configured for the `attr_name` attribute. **Returned value** -- If ClickHouse parses the attribute successfully in the [attribute's data type](../../query_language/dicts/external_dicts_dict_structure.md#ext_dict_structure-attributes), functions return the value of the dictionary attribute that corresponds to `id_expr`. -- If there is no requested `id_expr` in the dictionary then: +- If ClickHouse parses the attribute successfully in the [attribute’s data type](../../query_language/dicts/external_dicts_dict_structure.md#ext_dict_structure-attributes), functions return the value of the dictionary attribute that corresponds to `id_expr`. - - `dictGet[Type]` returns the content of the `` element specified for the attribute in the dictionary configuration. - - `dictGet[Type]OrDefault` returns the value passed as the `default_value_expr` parameter. +- If there is no requested `id_expr` in the dictionary then: -ClickHouse throws an exception if it cannot parse the value of the attribute or the value doesn't match the attribute data type. 
+ - `dictGet[Type]` returns the content of the `` element specified for the attribute in the dictionary configuration. + - `dictGet[Type]OrDefault` returns the value passed as the `default_value_expr` parameter. + +ClickHouse throws an exception if it cannot parse the value of the attribute or the value doesn’t match the attribute data type. [Original article](https://clickhouse.tech/docs/en/query_language/functions/ext_dict_functions/) diff --git a/docs/en/query_language/functions/functions_for_nulls.md b/docs/en/query_language/functions/functions_for_nulls.md index 6abf94ad289..df6d2a6e98b 100644 --- a/docs/en/query_language/functions/functions_for_nulls.md +++ b/docs/en/query_language/functions/functions_for_nulls.md @@ -1,27 +1,27 @@ -# Functions for working with Nullable aggregates +# Functions for working with Nullable aggregates {#functions-for-working-with-nullable-aggregates} -## isNull +## isNull {#isnull} Checks whether the argument is [NULL](../syntax.md#null). -```sql +``` sql isNull(x) ``` **Parameters** -- `x` — A value with a non-compound data type. +- `x` — A value with a non-compound data type. **Returned value** -- `1` if `x` is `NULL`. -- `0` if `x` is not `NULL`. +- `1` if `x` is `NULL`. +- `0` if `x` is not `NULL`. **Example** Input table -```text +``` text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -30,38 +30,38 @@ Input table Query -```sql +``` sql SELECT x FROM t_null WHERE isNull(y) ``` -```text + +``` text ┌─x─┐ │ 1 │ └───┘ - ``` -## isNotNull +## isNotNull {#isnotnull} Checks whether the argument is [NULL](../syntax.md#null). -```sql +``` sql isNotNull(x) ``` **Parameters:** -- `x` — A value with a non-compound data type. +- `x` — A value with a non-compound data type. **Returned value** -- `0` if `x` is `NULL`. -- `1` if `x` is not `NULL`. +- `0` if `x` is `NULL`. +- `1` if `x` is not `NULL`. **Example** Input table -```text +``` text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -70,38 +70,38 @@ Input table Query -```sql +``` sql SELECT x FROM t_null WHERE isNotNull(y) ``` -```text + +``` text ┌─x─┐ │ 2 │ └───┘ - ``` -## coalesce +## coalesce {#coalesce} Checks from left to right whether `NULL` arguments were passed and returns the first non-`NULL` argument. -```sql +``` sql coalesce(x,...) ``` **Parameters:** -- Any number of parameters of a non-compound type. All parameters must be compatible by data type. +- Any number of parameters of a non-compound type. All parameters must be compatible by data type. **Returned values** -- The first non-`NULL` argument. -- `NULL`, if all arguments are `NULL`. +- The first non-`NULL` argument. +- `NULL`, if all arguments are `NULL`. **Example** Consider a list of contacts that may specify multiple ways to contact a customer. -```text +``` text ┌─name─────┬─mail─┬─phone─────┬──icq─┐ │ client 1 │ ᴺᵁᴸᴸ │ 123-45-67 │ 123 │ │ client 2 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ @@ -112,60 +112,62 @@ The `mail` and `phone` fields are of type String, but the `icq` field is `UInt32 Get the first available contact method for the customer from the contact list: -```sql +``` sql SELECT coalesce(mail, phone, CAST(icq,'Nullable(String)')) FROM aBook ``` -```text + +``` text ┌─name─────┬─coalesce(mail, phone, CAST(icq, 'Nullable(String)'))─┐ │ client 1 │ 123-45-67 │ │ client 2 │ ᴺᵁᴸᴸ │ └──────────┴──────────────────────────────────────────────────────┘ - ``` -## ifNull +## ifNull {#ifnull} Returns an alternative value if the main argument is `NULL`. -```sql +``` sql ifNull(x,alt) ``` **Parameters:** -- `x` — The value to check for `NULL`. 
-- `alt` — The value that the function returns if `x` is `NULL`. +- `x` — The value to check for `NULL`. +- `alt` — The value that the function returns if `x` is `NULL`. **Returned values** -- The value `x`, if `x` is not `NULL`. -- The value `alt`, if `x` is `NULL`. +- The value `x`, if `x` is not `NULL`. +- The value `alt`, if `x` is `NULL`. **Example** -```sql +``` sql SELECT ifNull('a', 'b') ``` -```text + +``` text ┌─ifNull('a', 'b')─┐ │ a │ └──────────────────┘ ``` -```sql +``` sql SELECT ifNull(NULL, 'b') ``` -```text + +``` text ┌─ifNull(NULL, 'b')─┐ │ b │ └───────────────────┘ ``` -## nullIf +## nullIf {#nullif} Returns `NULL` if the arguments are equal. -```sql +``` sql nullIf(x, y) ``` @@ -175,60 +177,63 @@ nullIf(x, y) **Returned values** -- `NULL`, if the arguments are equal. -- The `x` value, if the arguments are not equal. +- `NULL`, if the arguments are equal. +- The `x` value, if the arguments are not equal. **Example** -```sql +``` sql SELECT nullIf(1, 1) ``` -```text + +``` text ┌─nullIf(1, 1)─┐ │ ᴺᵁᴸᴸ │ └──────────────┘ ``` -```sql +``` sql SELECT nullIf(1, 2) ``` -```text + +``` text ┌─nullIf(1, 2)─┐ │ 1 │ └──────────────┘ ``` -## assumeNotNull +## assumeNotNull {#assumenotnull} Results in a value of type [Nullable](../../data_types/nullable.md) for a non- `Nullable`, if the value is not `NULL`. -```sql +``` sql assumeNotNull(x) ``` **Parameters:** -- `x` — The original value. +- `x` — The original value. **Returned values** -- The original value from the non-`Nullable` type, if it is not `NULL`. -- The default value for the non-`Nullable` type if the original value was `NULL`. +- The original value from the non-`Nullable` type, if it is not `NULL`. +- The default value for the non-`Nullable` type if the original value was `NULL`. **Example** Consider the `t_null` table. -```sql +``` sql SHOW CREATE TABLE t_null ``` -```text + +``` text ┌─statement─────────────────────────────────────────────────────────────────┐ │ CREATE TABLE default.t_null ( x Int8, y Nullable(Int8)) ENGINE = TinyLog │ └───────────────────────────────────────────────────────────────────────────┘ ``` -```text +``` text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -237,60 +242,64 @@ SHOW CREATE TABLE t_null Apply the `assumeNotNull` function to the `y` column. -```sql +``` sql SELECT assumeNotNull(y) FROM t_null ``` -```text + +``` text ┌─assumeNotNull(y)─┐ │ 0 │ │ 3 │ └──────────────────┘ ``` -```sql +``` sql SELECT toTypeName(assumeNotNull(y)) FROM t_null ``` -```text + +``` text ┌─toTypeName(assumeNotNull(y))─┐ │ Int8 │ │ Int8 │ └──────────────────────────────┘ ``` -## toNullable +## toNullable {#tonullable} Converts the argument type to `Nullable`. -```sql +``` sql toNullable(x) ``` **Parameters:** -- `x` — The value of any non-compound type. +- `x` — The value of any non-compound type. **Returned value** -- The input value with a `Nullable` type. +- The input value with a `Nullable` type. 
**Example** -```sql +``` sql SELECT toTypeName(10) ``` -```text + +``` text ┌─toTypeName(10)─┐ │ UInt8 │ └────────────────┘ ``` -```sql + +``` sql SELECT toTypeName(toNullable(10)) ``` -```text + +``` text ┌─toTypeName(toNullable(10))─┐ │ Nullable(UInt8) │ └────────────────────────────┘ ``` - [Original article](https://clickhouse.tech/docs/en/query_language/functions/functions_for_nulls/) diff --git a/docs/en/query_language/functions/geo.md b/docs/en/query_language/functions/geo.md index 81e43c93ad3..a12af7a0dbe 100644 --- a/docs/en/query_language/functions/geo.md +++ b/docs/en/query_language/functions/geo.md @@ -1,54 +1,54 @@ -# Functions for Working with Geographical Coordinates +# Functions for Working with Geographical Coordinates {#functions-for-working-with-geographical-coordinates} -## greatCircleDistance +## greatCircleDistance {#greatcircledistance} -Calculate the distance between two points on the Earth's surface using [the great-circle formula](https://en.wikipedia.org/wiki/Great-circle_distance). +Calculate the distance between two points on the Earth’s surface using [the great-circle formula](https://en.wikipedia.org/wiki/Great-circle_distance). -```sql +``` sql greatCircleDistance(lon1Deg, lat1Deg, lon2Deg, lat2Deg) ``` **Input parameters** -- `lon1Deg` — Longitude of the first point in degrees. Range: `[-180°, 180°]`. -- `lat1Deg` — Latitude of the first point in degrees. Range: `[-90°, 90°]`. -- `lon2Deg` — Longitude of the second point in degrees. Range: `[-180°, 180°]`. -- `lat2Deg` — Latitude of the second point in degrees. Range: `[-90°, 90°]`. +- `lon1Deg` — Longitude of the first point in degrees. Range: `[-180°, 180°]`. +- `lat1Deg` — Latitude of the first point in degrees. Range: `[-90°, 90°]`. +- `lon2Deg` — Longitude of the second point in degrees. Range: `[-180°, 180°]`. +- `lat2Deg` — Latitude of the second point in degrees. Range: `[-90°, 90°]`. Positive values correspond to North latitude and East longitude, and negative values correspond to South latitude and West longitude. **Returned value** -The distance between two points on the Earth's surface, in meters. +The distance between two points on the Earth’s surface, in meters. Generates an exception when the input parameter values fall outside of the range. **Example** -```sql +``` sql SELECT greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673) ``` -```text +``` text ┌─greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673)─┐ │ 14132374.194975413 │ └───────────────────────────────────────────────────────────────────┘ ``` -## pointInEllipses +## pointInEllipses {#pointinellipses} Checks whether the point belongs to at least one of the ellipses. Coordinates are geometric in the Cartesian coordinate system. -```sql +``` sql pointInEllipses(x, y, x₀, y₀, a₀, b₀,...,xₙ, yₙ, aₙ, bₙ) ``` **Input parameters** -- `x, y` — Coordinates of a point on the plane. -- `xᵢ, yᵢ` — Coordinates of the center of the `i`-th ellipsis. -- `aᵢ, bᵢ` — Axes of the `i`-th ellipsis in units of x, y coordinates. +- `x, y` — Coordinates of a point on the plane. +- `xᵢ, yᵢ` — Coordinates of the center of the `i`-th ellipsis. +- `aᵢ, bᵢ` — Axes of the `i`-th ellipsis in units of x, y coordinates. The input parameters must be `2+4⋅n`, where `n` is the number of ellipses. @@ -58,29 +58,29 @@ The input parameters must be `2+4⋅n`, where `n` is the number of ellipses. 
**Example** -```sql +``` sql SELECT pointInEllipses(10., 10., 10., 9.1, 1., 0.9999) ``` -```text +``` text ┌─pointInEllipses(10., 10., 10., 9.1, 1., 0.9999)─┐ │ 1 │ └─────────────────────────────────────────────────┘ ``` -## pointInPolygon +## pointInPolygon {#pointinpolygon} Checks whether the point belongs to the polygon on the plane. -```sql +``` sql pointInPolygon((x, y), [(a, b), (c, d) ...], ...) ``` **Input values** -- `(x, y)` — Coordinates of a point on the plane. Data type — [Tuple](../../data_types/tuple.md) — A tuple of two numbers. -- `[(a, b), (c, d) ...]` — Polygon vertices. Data type — [Array](../../data_types/array.md). Each vertex is represented by a pair of coordinates `(a, b)`. Vertices should be specified in a clockwise or counterclockwise order. The minimum number of vertices is 3. The polygon must be constant. -- The function also supports polygons with holes (cut out sections). In this case, add polygons that define the cut out sections using additional arguments of the function. The function does not support non-simply-connected polygons. +- `(x, y)` — Coordinates of a point on the plane. Data type — [Tuple](../../data_types/tuple.md) — A tuple of two numbers. +- `[(a, b), (c, d) ...]` — Polygon vertices. Data type — [Array](../../data_types/array.md). Each vertex is represented by a pair of coordinates `(a, b)`. Vertices should be specified in a clockwise or counterclockwise order. The minimum number of vertices is 3. The polygon must be constant. +- The function also supports polygons with holes (cut out sections). In this case, add polygons that define the cut out sections using additional arguments of the function. The function does not support non-simply-connected polygons. **Returned values** @@ -89,64 +89,65 @@ If the point is on the polygon boundary, the function may return either 0 or 1. **Example** -```sql +``` sql SELECT pointInPolygon((3., 3.), [(6, 0), (8, 4), (5, 8), (0, 2)]) AS res ``` -```text +``` text ┌─res─┐ │ 1 │ └─────┘ ``` -## geohashEncode +## geohashEncode {#geohashencode} Encodes latitude and longitude as a geohash-string, please see (http://geohash.org/, https://en.wikipedia.org/wiki/Geohash). -```sql + +``` sql geohashEncode(longitude, latitude, [precision]) ``` **Input values** -- longitude - longitude part of the coordinate you want to encode. Floating in range`[-180°, 180°]` -- latitude - latitude part of the coordinate you want to encode. Floating in range `[-90°, 90°]` -- precision - Optional, length of the resulting encoded string, defaults to `12`. Integer in range `[1, 12]`. Any value less than `1` or greater than `12` is silently converted to `12`. +- longitude - longitude part of the coordinate you want to encode. Floating in range`[-180°, 180°]` +- latitude - latitude part of the coordinate you want to encode. Floating in range `[-90°, 90°]` +- precision - Optional, length of the resulting encoded string, defaults to `12`. Integer in range `[1, 12]`. Any value less than `1` or greater than `12` is silently converted to `12`. **Returned values** -- alphanumeric `String` of encoded coordinate (modified version of the base32-encoding alphabet is used). +- alphanumeric `String` of encoded coordinate (modified version of the base32-encoding alphabet is used). 
**Example** -```sql +``` sql SELECT geohashEncode(-5.60302734375, 42.593994140625, 0) AS res ``` -```text +``` text ┌─res──────────┐ │ ezs42d000000 │ └──────────────┘ ``` -## geohashDecode +## geohashDecode {#geohashdecode} Decodes any geohash-encoded string into longitude and latitude. **Input values** -- encoded string - geohash-encoded string. +- encoded string - geohash-encoded string. **Returned values** -- (longitude, latitude) - 2-tuple of `Float64` values of longitude and latitude. +- (longitude, latitude) - 2-tuple of `Float64` values of longitude and latitude. **Example** -```sql +``` sql SELECT geohashDecode('ezs42') AS res ``` -```text +``` text ┌─res─────────────────────────────┐ │ (-5.60302734375,42.60498046875) │ └─────────────────────────────────┘ @@ -156,26 +157,26 @@ SELECT geohashDecode('ezs42') AS res Returns [H3](https://uber.github.io/h3/#/documentation/overview/introduction) point index `(lon, lat)` with specified resolution. -[H3](https://uber.github.io/h3/#/documentation/overview/introduction) is a geographical indexing system where Earth's surface divided into even hexagonal tiles. This system is hierarchical, i. e. each hexagon on the top level can be splitted into seven even but smaller ones and so on. +[H3](https://uber.github.io/h3/#/documentation/overview/introduction) is a geographical indexing system where Earth’s surface divided into even hexagonal tiles. This system is hierarchical, i. e. each hexagon on the top level can be splitted into seven even but smaller ones and so on. This index is used primarily for bucketing locations and other geospatial manipulations. -**Syntax** +**Syntax** -```sql +``` sql geoToH3(lon, lat, resolution) ``` -**Parameters** +**Parameters** -- `lon` — Longitude. Type: [Float64](../../data_types/float.md). -- `lat` — Latitude. Type: [Float64](../../data_types/float.md). -- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../data_types/int_uint.md). +- `lon` — Longitude. Type: [Float64](../../data_types/float.md). +- `lat` — Latitude. Type: [Float64](../../data_types/float.md). +- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../data_types/int_uint.md). **Returned values** -- Hexagon index number. -- 0 in case of error. +- Hexagon index number. +- 0 in case of error. Type: `UInt64`. @@ -183,317 +184,317 @@ Type: `UInt64`. Query: -```sql +``` sql SELECT geoToH3(37.79506683, 55.71290588, 15) as h3Index ``` Result: -```text +``` text ┌────────────h3Index─┐ │ 644325524701193974 │ └────────────────────┘ ``` -## geohashesInBox +## geohashesInBox {#geohashesinbox} Returns an array of geohash-encoded strings of given precision that fall inside and intersect boundaries of given box, basically a 2D grid flattened into array. 
**Input values** -- longitude_min - min longitude, floating value in range `[-180°, 180°]` -- latitude_min - min latitude, floating value in range `[-90°, 90°]` -- longitude_max - max longitude, floating value in range `[-180°, 180°]` -- latitude_max - max latitude, floating value in range `[-90°, 90°]` -- precision - geohash precision, `UInt8` in range `[1, 12]` +- longitude\_min - min longitude, floating value in range `[-180°, 180°]` +- latitude\_min - min latitude, floating value in range `[-90°, 90°]` +- longitude\_max - max longitude, floating value in range `[-180°, 180°]` +- latitude\_max - max latitude, floating value in range `[-90°, 90°]` +- precision - geohash precision, `UInt8` in range `[1, 12]` Please note that all coordinate parameters should be of the same type: either `Float32` or `Float64`. **Returned values** -- array of precision-long strings of geohash-boxes covering provided area, you should not rely on order of items. -- [] - empty array if *min* values of *latitude* and *longitude* aren't less than corresponding *max* values. +- array of precision-long strings of geohash-boxes covering provided area, you should not rely on order of items. +- \[\] - empty array if *min* values of *latitude* and *longitude* aren’t less than corresponding *max* values. -Please note that function will throw an exception if resulting array is over 10'000'000 items long. +Please note that function will throw an exception if resulting array is over 10’000’000 items long. **Example** -```sql +``` sql SELECT geohashesInBox(24.48, 40.56, 24.785, 40.81, 4) AS thasos ``` -```text + +``` text ┌─thasos──────────────────────────────────────┐ │ ['sx1q','sx1r','sx32','sx1w','sx1x','sx38'] │ └─────────────────────────────────────────────┘ ``` -## h3GetBaseCell +## h3GetBaseCell {#h3getbasecell} Returns the base cell number of the index. **Syntax** -```sql +``` sql h3GetBaseCell(index) ``` -**Parameters** +**Parameters** -- `index` — Hexagon index number. Type: [UInt64](../../data_types/int_uint.md). +- `index` — Hexagon index number. Type: [UInt64](../../data_types/int_uint.md). **Returned values** -- Hexagon base cell number. Type: [UInt8](../../data_types/int_uint.md). +- Hexagon base cell number. Type: [UInt8](../../data_types/int_uint.md). **Example** Query: -```sql +``` sql SELECT h3GetBaseCell(612916788725809151) as basecell ``` Result: -```text +``` text ┌─basecell─┐ │ 12 │ └──────────┘ ``` -## h3HexAreaM2 +## h3HexAreaM2 {#h3hexaream2} Average hexagon area in square meters at the given resolution. -**Syntax** +**Syntax** -```sql +``` sql h3HexAreaM2(resolution) ``` -**Parameters** +**Parameters** -- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../data_types/int_uint.md). +- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../data_types/int_uint.md). **Returned values** -- Area in m². Type: [Float64](../../data_types/float.md). +- Area in m². Type: [Float64](../../data_types/float.md). **Example** Query: -```sql +``` sql SELECT h3HexAreaM2(13) as area ``` Result: -```text +``` text ┌─area─┐ │ 43.9 │ └──────┘ ``` -## h3IndexesAreNeighbors +## h3IndexesAreNeighbors {#h3indexesareneighbors} Returns whether or not the provided H3Indexes are neighbors. **Syntax** -```sql +``` sql h3IndexesAreNeighbors(index1, index2) ``` -**Parameters** +**Parameters** -- `index1` — Hexagon index number. Type: [UInt64](../../data_types/int_uint.md). -- `index2` — Hexagon index number. Type: [UInt64](../../data_types/int_uint.md). +- `index1` — Hexagon index number. 
Type: [UInt64](../../data_types/int_uint.md). +- `index2` — Hexagon index number. Type: [UInt64](../../data_types/int_uint.md). **Returned values** -- Returns `1` if the indexes are neighbors, `0` otherwise. Type: [UInt8](../../data_types/int_uint.md). +- Returns `1` if the indexes are neighbors, `0` otherwise. Type: [UInt8](../../data_types/int_uint.md). **Example** Query: -```sql +``` sql SELECT h3IndexesAreNeighbors(617420388351344639, 617420388352655359) AS n ``` Result: -```text +``` text ┌─n─┐ │ 1 │ └───┘ ``` - -## h3ToChildren +## h3ToChildren {#h3tochildren} Returns an array with the child indexes of the given index. **Syntax** -```sql +``` sql h3ToChildren(index, resolution) ``` -**Parameters** +**Parameters** -- `index` — Hexagon index number. Type: [UInt64](../../data_types/int_uint.md). -- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../data_types/int_uint.md). +- `index` — Hexagon index number. Type: [UInt64](../../data_types/int_uint.md). +- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../data_types/int_uint.md). **Returned values** -- Array with the child H3 indexes. Array of type: [UInt64](../../data_types/int_uint.md). +- Array with the child H3 indexes. Array of type: [UInt64](../../data_types/int_uint.md). **Example** Query: -```sql +``` sql SELECT h3ToChildren(599405990164561919, 6) AS children ``` Result: -```text +``` text ┌─children───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ [603909588852408319,603909588986626047,603909589120843775,603909589255061503,603909589389279231,603909589523496959,603909589657714687] │ └────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` -## h3ToParent +## h3ToParent {#h3toparent} Returns the parent (coarser) index containing the given index. **Syntax** -```sql +``` sql h3ToParent(index, resolution) ``` -**Parameters** +**Parameters** -- `index` — Hexagon index number. Type: [UInt64](../../data_types/int_uint.md). -- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../data_types/int_uint.md). +- `index` — Hexagon index number. Type: [UInt64](../../data_types/int_uint.md). +- `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../data_types/int_uint.md). **Returned values** -- Parent H3 index. Type: [UInt64](../../data_types/int_uint.md). +- Parent H3 index. Type: [UInt64](../../data_types/int_uint.md). **Example** Query: -```sql +``` sql SELECT h3ToParent(599405990164561919, 3) as parent ``` Result: -```text +``` text ┌─────────────parent─┐ │ 590398848891879423 │ └────────────────────┘ ``` -## h3ToString +## h3ToString {#h3tostring} -Converts the H3Index representation of the index to the string representation. +Converts the H3Index representation of the index to the string representation. -```sql +``` sql h3ToString(index) ``` -**Parameters** +**Parameters** -- `index` — Hexagon index number. Type: [UInt64](../../data_types/int_uint.md). +- `index` — Hexagon index number. Type: [UInt64](../../data_types/int_uint.md). **Returned values** -- String representation of the H3 index. Type: [String](../../data_types/string.md). +- String representation of the H3 index. Type: [String](../../data_types/string.md). 
**Example** -Query: +Query: -```sql +``` sql SELECT h3ToString(617420388352917503) as h3_string ``` Result: -```text +``` text ┌─h3_string───────┐ │ 89184926cdbffff │ └─────────────────┘ ``` -## stringToH3 +## stringToH3 {#stringtoh3} Converts the string representation to H3Index (UInt64) representation. -```sql +``` sql stringToH3(index_str) ``` -**Parameters** +**Parameters** -- `index_str` — String representation of the H3 index. Type: [String](../../data_types/string.md). +- `index_str` — String representation of the H3 index. Type: [String](../../data_types/string.md). **Returned values** -- Hexagon index number. Returns 0 on error. Type: [UInt64](../../data_types/int_uint.md). +- Hexagon index number. Returns 0 on error. Type: [UInt64](../../data_types/int_uint.md). **Example** Query: -```sql +``` sql SELECT stringToH3('89184926cc3ffff') as index ``` Result: -```text +``` text ┌──────────────index─┐ │ 617420388351344639 │ └────────────────────┘ ``` -## h3GetResolution +## h3GetResolution {#h3getresolution} Returns the resolution of the index. **Syntax** -```sql +``` sql h3GetResolution(index) ``` -**Parameters** +**Parameters** -- `index` — Hexagon index number. Type: [UInt64](../../data_types/int_uint.md). +- `index` — Hexagon index number. Type: [UInt64](../../data_types/int_uint.md). **Returned values** -- Index resolution. Range: `[0, 15]`. Type: [UInt8](../../data_types/int_uint.md). +- Index resolution. Range: `[0, 15]`. Type: [UInt8](../../data_types/int_uint.md). **Example** Query: -```sql +``` sql SELECT h3GetResolution(617420388352917503) as res ``` Result: -```text +``` text ┌─res─┐ │ 9 │ └─────┘ diff --git a/docs/en/query_language/functions/hash_functions.md b/docs/en/query_language/functions/hash_functions.md index 5c51bf33b20..b957057b001 100644 --- a/docs/en/query_language/functions/hash_functions.md +++ b/docs/en/query_language/functions/hash_functions.md @@ -1,12 +1,12 @@ -# Hash functions +# Hash functions {#hash-functions} Hash functions can be used for the deterministic pseudo-random shuffling of elements. -## halfMD5 {#hash_functions-halfmd5} +## halfMD5 {#hash-functions-halfmd5} [Interprets](../../query_language/functions/type_conversion_functions.md#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the [MD5](https://en.wikipedia.org/wiki/MD5) hash value for each of them. Then combines hashes, takes the first 8 bytes of the hash of the resulting string, and interprets them as `UInt64` in big-endian byte order. -```sql +``` sql halfMD5(par1, ...) ``` @@ -23,26 +23,27 @@ A [UInt64](../../data_types/int_uint.md) data type hash value. **Example** -```sql +``` sql SELECT halfMD5(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS halfMD5hash, toTypeName(halfMD5hash) AS type ``` -```text + +``` text ┌────────halfMD5hash─┬─type───┐ │ 186182704141653334 │ UInt64 │ └────────────────────┴────────┘ ``` -## MD5 {#hash_functions-md5} +## MD5 {#hash-functions-md5} Calculates the MD5 from a string and returns the resulting set of bytes as FixedString(16). -If you don't need MD5 in particular, but you need a decent cryptographic 128-bit hash, use the 'sipHash128' function instead. +If you don’t need MD5 in particular, but you need a decent cryptographic 128-bit hash, use the ‘sipHash128’ function instead. If you want to get the same result as output by the md5sum utility, use lower(hex(MD5(s))). 
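A minimal sketch of the md5sum-compatible form mentioned above; the digest in the comment is the well-known MD5 test vector for the string 'abc', not output captured from ClickHouse:

``` sql
-- lower(hex(MD5(s))) matches what the md5sum utility prints for the same bytes
SELECT lower(hex(MD5('abc'))) AS md5_hex  -- expected: '900150983cd24fb0d6963f7d28e17f72'
```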
-## sipHash64 {#hash_functions-siphash64} +## sipHash64 {#hash-functions-siphash64} Produces a 64-bit [SipHash](https://131002.net/siphash/) hash value. -```sql +``` sql sipHash64(par1,...) ``` @@ -50,10 +51,10 @@ This is a cryptographic hash function. It works at least three times faster than Function [interprets](../../query_language/functions/type_conversion_functions.md#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the hash value for each of them. Then combines hashes by the following algorithm: -1. After hashing all the input parameters, the function gets the array of hashes. -2. Function takes the first and the second elements and calculates a hash for the array of them. -3. Then the function takes the hash value, calculated at the previous step, and the third element of the initial hash array, and calculates a hash for the array of them. -4. The previous step is repeated for all the remaining elements of the initial hash array. +1. After hashing all the input parameters, the function gets the array of hashes. +2. Function takes the first and the second elements and calculates a hash for the array of them. +3. Then the function takes the hash value, calculated at the previous step, and the third element of the initial hash array, and calculates a hash for the array of them. +4. The previous step is repeated for all the remaining elements of the initial hash array. **Parameters** @@ -65,26 +66,27 @@ A [UInt64](../../data_types/int_uint.md) data type hash value. **Example** -```sql +``` sql SELECT sipHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS SipHash, toTypeName(SipHash) AS type ``` -```text + +``` text ┌──────────────SipHash─┬─type───┐ │ 13726873534472839665 │ UInt64 │ └──────────────────────┴────────┘ ``` -## sipHash128 {#hash_functions-siphash128} +## sipHash128 {#hash-functions-siphash128} Calculates SipHash from a string. Accepts a String-type argument. Returns FixedString(16). Differs from sipHash64 in that the final xor-folding state is only done up to 128 bits. -## cityHash64 +## cityHash64 {#cityhash64} Produces a 64-bit [CityHash](https://github.com/google/cityhash) hash value. -```sql +``` sql cityHash64(par1,...) ``` @@ -102,10 +104,11 @@ A [UInt64](../../data_types/int_uint.md) data type hash value. Call example: -```sql +``` sql SELECT cityHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS CityHash, toTypeName(CityHash) AS type ``` -```text + +``` text ┌─────────────CityHash─┬─type───┐ │ 12072650598913549138 │ UInt64 │ └──────────────────────┴────────┘ @@ -113,44 +116,43 @@ SELECT cityHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:0 The following example shows how to compute the checksum of the entire table with accuracy up to the row order: -```sql +``` sql SELECT groupBitXor(cityHash64(*)) FROM table ``` - -## intHash32 +## intHash32 {#inthash32} Calculates a 32-bit hash code from any type of integer. This is a relatively fast non-cryptographic hash function of average quality for numbers. -## intHash64 +## intHash64 {#inthash64} Calculates a 64-bit hash code from any type of integer. It works faster than intHash32. Average quality. -## SHA1 +## SHA1 {#sha1} -## SHA224 +## SHA224 {#sha224} -## SHA256 +## SHA256 {#sha256} Calculates SHA-1, SHA-224, or SHA-256 from a string and returns the resulting set of bytes as FixedString(20), FixedString(28), or FixedString(32). 
The function works fairly slowly (SHA-1 processes about 5 million short strings per second per processor core, while SHA-224 and SHA-256 process about 2.2 million). -We recommend using this function only in cases when you need a specific hash function and you can't select it. +We recommend using this function only in cases when you need a specific hash function and you can’t select it. Even in these cases, we recommend applying the function offline and pre-calculating values when inserting them into the table, instead of applying it in SELECTS. -## URLHash(url\[, N\]) +## URLHash(url\[, N\]) {#urlhashurl-n} A fast, decent-quality non-cryptographic hash function for a string obtained from a URL using some type of normalization. `URLHash(s)` – Calculates a hash from a string without one of the trailing symbols `/`,`?` or `#` at the end, if present. `URLHash(s, N)` – Calculates a hash from a string up to the N level in the URL hierarchy, without one of the trailing symbols `/`,`?` or `#` at the end, if present. Levels are the same as in URLHierarchy. This function is specific to Yandex.Metrica. -## farmHash64 +## farmHash64 {#farmhash64} Produces a 64-bit [FarmHash](https://github.com/google/farmhash) hash value. -```sql +``` sql farmHash64(par1, ...) ``` @@ -166,22 +168,23 @@ A [UInt64](../../data_types/int_uint.md) data type hash value. **Example** -```sql +``` sql SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS FarmHash, toTypeName(FarmHash) AS type ``` -```text + +``` text ┌─────────────FarmHash─┬─type───┐ │ 17790458267262532859 │ UInt64 │ └──────────────────────┴────────┘ ``` -## javaHash {#hash_functions-javahash} +## javaHash {#hash-functions-javahash} Calculates [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) from a string. This hash function is neither fast nor having a good quality. The only reason to use it is when this algorithm is already used in another system and you have to calculate exactly the same result. -**Syntax** +**Syntax** -```sql +``` sql SELECT javaHash(''); ``` @@ -193,13 +196,13 @@ A `Int32` data type hash value. Query: -```sql +``` sql SELECT javaHash('Hello, world!'); ``` Result: -```text +``` text ┌─javaHash('Hello, world!')─┐ │ -1880044555 │ └───────────────────────────┘ @@ -209,15 +212,15 @@ Result: Calculates [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) from a string, assuming it contains bytes representing a string in UTF-16LE encoding. -**Syntax** +**Syntax** -```sql +``` sql javaHashUTF16LE(stringUtf16le) ``` -**Parameters** +**Parameters** -- `stringUtf16le` — a string in UTF-16LE encoding. +- `stringUtf16le` — a string in UTF-16LE encoding. **Returned value** @@ -229,23 +232,23 @@ Correct query with UTF-16LE encoded string. Query: -```sql +``` sql SELECT javaHashUTF16LE(convertCharset('test', 'utf-8', 'utf-16le')) ``` Result: -```text +``` text ┌─javaHashUTF16LE(convertCharset('test', 'utf-8', 'utf-16le'))─┐ │ 3556498 │ └──────────────────────────────────────────────────────────────┘ ``` -## hiveHash {#hash_functions-hivehash} +## hiveHash {#hash-functions-hivehash} Calculates `HiveHash` from a string. -```sql +``` sql SELECT hiveHash(''); ``` @@ -261,23 +264,23 @@ Type: `hiveHash`. 
Query: -```sql +``` sql SELECT hiveHash('Hello, world!'); ``` Result: -```text +``` text ┌─hiveHash('Hello, world!')─┐ │ 267439093 │ └───────────────────────────┘ ``` -## metroHash64 +## metroHash64 {#metrohash64} Produces a 64-bit [MetroHash](http://www.jandrewrogers.com/2015/05/27/metrohash/) hash value. -```sql +``` sql metroHash64(par1, ...) ``` @@ -291,26 +294,27 @@ A [UInt64](../../data_types/int_uint.md) data type hash value. **Example** -```sql +``` sql SELECT metroHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MetroHash, toTypeName(MetroHash) AS type ``` -```text + +``` text ┌────────────MetroHash─┬─type───┐ │ 14235658766382344533 │ UInt64 │ └──────────────────────┴────────┘ ``` -## jumpConsistentHash +## jumpConsistentHash {#jumpconsistenthash} Calculates JumpConsistentHash form a UInt64. Accepts two arguments: a UInt64-type key and the number of buckets. Returns Int32. For more information, see the link: [JumpConsistentHash](https://arxiv.org/pdf/1406.2294.pdf) -## murmurHash2_32, murmurHash2_64 +## murmurHash2\_32, murmurHash2\_64 {#murmurhash2-32-murmurhash2-64} Produces a [MurmurHash2](https://github.com/aappleby/smhasher) hash value. -```sql +``` sql murmurHash2_32(par1, ...) murmurHash2_64(par1, ...) ``` @@ -321,25 +325,26 @@ Both functions take a variable number of input parameters. Parameters can be any **Returned Value** -- The `murmurHash2_32` function returns hash value having the [UInt32](../../data_types/int_uint.md) data type. -- The `murmurHash2_64` function returns hash value having the [UInt64](../../data_types/int_uint.md) data type. +- The `murmurHash2_32` function returns hash value having the [UInt32](../../data_types/int_uint.md) data type. +- The `murmurHash2_64` function returns hash value having the [UInt64](../../data_types/int_uint.md) data type. **Example** -```sql +``` sql SELECT murmurHash2_64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MurmurHash2, toTypeName(MurmurHash2) AS type ``` -```text + +``` text ┌──────────MurmurHash2─┬─type───┐ │ 11832096901709403633 │ UInt64 │ └──────────────────────┴────────┘ ``` -## murmurHash3_32, murmurHash3_64 +## murmurHash3\_32, murmurHash3\_64 {#murmurhash3-32-murmurhash3-64} Produces a [MurmurHash3](https://github.com/aappleby/smhasher) hash value. -```sql +``` sql murmurHash3_32(par1, ...) murmurHash3_64(par1, ...) ``` @@ -350,31 +355,32 @@ Both functions take a variable number of input parameters. Parameters can be any **Returned Value** -- The `murmurHash3_32` function returns a [UInt32](../../data_types/int_uint.md) data type hash value. -- The `murmurHash3_64` function returns a [UInt64](../../data_types/int_uint.md) data type hash value. +- The `murmurHash3_32` function returns a [UInt32](../../data_types/int_uint.md) data type hash value. +- The `murmurHash3_64` function returns a [UInt64](../../data_types/int_uint.md) data type hash value. **Example** -```sql +``` sql SELECT murmurHash3_32(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MurmurHash3, toTypeName(MurmurHash3) AS type ``` -```text + +``` text ┌─MurmurHash3─┬─type───┐ │ 2152717 │ UInt32 │ └─────────────┴────────┘ ``` -## murmurHash3_128 +## murmurHash3\_128 {#murmurhash3-128} Produces a 128-bit [MurmurHash3](https://github.com/aappleby/smhasher) hash value. -```sql +``` sql murmurHash3_128( expr ) ``` **Parameters** -- `expr` — [Expressions](../syntax.md#syntax-expressions) returning a [String](../../data_types/string.md)-type value. 
+- `expr` — [Expressions](../syntax.md#syntax-expressions) returning a [String](../../data_types/string.md)-type value. **Returned Value** @@ -382,20 +388,21 @@ A [FixedString(16)](../../data_types/fixedstring.md) data type hash value. **Example** -```sql +``` sql SELECT murmurHash3_128('example_string') AS MurmurHash3, toTypeName(MurmurHash3) AS type ``` -```text + +``` text ┌─MurmurHash3──────┬─type────────────┐ │ 6�1�4"S5KT�~~q │ FixedString(16) │ └──────────────────┴─────────────────┘ ``` -## xxHash32, xxHash64 {#hash_functions-xxhash32} +## xxHash32, xxHash64 {#hash-functions-xxhash32} Calculates `xxHash` from a string. It is proposed in two flavors, 32 and 64 bits. -```sql +``` sql SELECT xxHash32(''); OR @@ -413,13 +420,13 @@ Type: `xxHash`. Query: -```sql +``` sql SELECT xxHash32('Hello, world!'); ``` Result: -```text +``` text ┌─xxHash32('Hello, world!')─┐ │ 834093149 │ └───────────────────────────┘ @@ -427,6 +434,6 @@ Result: **See Also** -- [xxHash](http://cyan4973.github.io/xxHash/). +- [xxHash](http://cyan4973.github.io/xxHash/). [Original article](https://clickhouse.tech/docs/en/query_language/functions/hash_functions/) diff --git a/docs/en/query_language/functions/higher_order_functions.md b/docs/en/query_language/functions/higher_order_functions.md index ef5fd0c398a..0b3dbb7e7bc 100644 --- a/docs/en/query_language/functions/higher_order_functions.md +++ b/docs/en/query_language/functions/higher_order_functions.md @@ -1,6 +1,6 @@ -# Higher-order functions +# Higher-order functions {#higher-order-functions} -## `->` operator, lambda(params, expr) function +## `->` operator, lambda(params, expr) function {#operator-lambdaparams-expr-function} Allows describing a lambda function for passing to a higher-order function. The left side of the arrow has a formal parameter, which is any ID, or multiple formal parameters – any IDs in a tuple. The right side of the arrow has an expression that can use these formal parameters, as well as any table columns. @@ -12,61 +12,64 @@ A lambda function that accepts multiple arguments can be passed to a higher-orde For some functions, such as [arrayCount](#higher_order_functions-array-count) or [arraySum](#higher_order_functions-array-count), the first argument (the lambda function) can be omitted. In this case, identical mapping is assumed. -A lambda function can't be omitted for the following functions: +A lambda function can’t be omitted for the following functions: -- [arrayMap](#higher_order_functions-array-map) -- [arrayFilter](#higher_order_functions-array-filter) -- [arrayFill](#higher_order_functions-array-fill) -- [arrayReverseFill](#higher_order_functions-array-reverse-fill) -- [arraySplit](#higher_order_functions-array-split) -- [arrayReverseSplit](#higher_order_functions-array-reverse-split) -- [arrayFirst](#higher_order_functions-array-first) -- [arrayFirstIndex](#higher_order_functions-array-first-index) +- [arrayMap](#higher_order_functions-array-map) +- [arrayFilter](#higher_order_functions-array-filter) +- [arrayFill](#higher_order_functions-array-fill) +- [arrayReverseFill](#higher_order_functions-array-reverse-fill) +- [arraySplit](#higher_order_functions-array-split) +- [arrayReverseSplit](#higher_order_functions-array-reverse-split) +- [arrayFirst](#higher_order_functions-array-first) +- [arrayFirstIndex](#higher_order_functions-array-first-index) -### arrayMap(func, arr1, ...) 
{#higher_order_functions-array-map} +### arrayMap(func, arr1, …) {#higher-order-functions-array-map} Returns an array obtained from the original application of the `func` function to each element in the `arr` array. Examples: -```sql +``` sql SELECT arrayMap(x -> (x + 2), [1, 2, 3]) as res; ``` -```text + +``` text ┌─res─────┐ │ [3,4,5] │ └─────────┘ ``` + The following example shows how to create a tuple of elements from different arrays: -```sql +``` sql SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res ``` -```text + +``` text ┌─res─────────────────┐ │ [(1,4),(2,5),(3,6)] │ └─────────────────────┘ ``` -Note that the first argument (lambda function) can't be omitted in the `arrayMap` function. +Note that the first argument (lambda function) can’t be omitted in the `arrayMap` function. -### arrayFilter(func, arr1, ...) {#higher_order_functions-array-filter} +### arrayFilter(func, arr1, …) {#higher-order-functions-array-filter} Returns an array containing only the elements in `arr1` for which `func` returns something other than 0. Examples: -```sql +``` sql SELECT arrayFilter(x -> x LIKE '%World%', ['Hello', 'abc World']) AS res ``` -```text +``` text ┌─res───────────┐ │ ['abc World'] │ └───────────────┘ ``` -```sql +``` sql SELECT arrayFilter( (i, x) -> x LIKE '%World%', @@ -75,145 +78,145 @@ SELECT AS res ``` -```text +``` text ┌─res─┐ │ [2] │ └─────┘ ``` -Note that the first argument (lambda function) can't be omitted in the `arrayFilter` function. +Note that the first argument (lambda function) can’t be omitted in the `arrayFilter` function. -### arrayFill(func, arr1, ...) {#higher_order_functions-array-fill} +### arrayFill(func, arr1, …) {#higher-order-functions-array-fill} Scan through `arr1` from the first element to the last element and replace `arr1[i]` by `arr1[i - 1]` if `func` returns 0. The first element of `arr1` will not be replaced. Examples: -```sql +``` sql SELECT arrayFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, null, null]) AS res ``` -```text +``` text ┌─res──────────────────────────────┐ │ [1,1,3,11,12,12,12,5,6,14,14,14] │ └──────────────────────────────────┘ ``` -Note that the first argument (lambda function) can't be omitted in the `arrayFill` function. +Note that the first argument (lambda function) can’t be omitted in the `arrayFill` function. -### arrayReverseFill(func, arr1, ...) {#higher_order_functions-array-reverse-fill} +### arrayReverseFill(func, arr1, …) {#higher-order-functions-array-reverse-fill} Scan through `arr1` from the last element to the first element and replace `arr1[i]` by `arr1[i + 1]` if `func` returns 0. The last element of `arr1` will not be replaced. Examples: -```sql +``` sql SELECT arrayReverseFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, null, null]) AS res ``` -```text +``` text ┌─res────────────────────────────────┐ │ [1,3,3,11,12,5,5,5,6,14,NULL,NULL] │ └────────────────────────────────────┘ ``` -Note that the first argument (lambda function) can't be omitted in the `arrayReverseFill` function. +Note that the first argument (lambda function) can’t be omitted in the `arrayReverseFill` function. -### arraySplit(func, arr1, ...) {#higher_order_functions-array-split} +### arraySplit(func, arr1, …) {#higher-order-functions-array-split} Split `arr1` into multiple arrays. When `func` returns something other than 0, the array will be split on the left hand side of the element. The array will not be split before the first element. 
Examples: -```sql +``` sql SELECT arraySplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res ``` -```text +``` text ┌─res─────────────┐ │ [[1,2,3],[4,5]] │ └─────────────────┘ ``` -Note that the first argument (lambda function) can't be omitted in the `arraySplit` function. +Note that the first argument (lambda function) can’t be omitted in the `arraySplit` function. -### arrayReverseSplit(func, arr1, ...) {#higher_order_functions-array-reverse-split} +### arrayReverseSplit(func, arr1, …) {#higher-order-functions-array-reverse-split} Split `arr1` into multiple arrays. When `func` returns something other than 0, the array will be split on the right hand side of the element. The array will not be split after the last element. Examples: -```sql +``` sql SELECT arrayReverseSplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res ``` -```text +``` text ┌─res───────────────┐ │ [[1],[2,3,4],[5]] │ └───────────────────┘ ``` -Note that the first argument (lambda function) can't be omitted in the `arraySplit` function. +Note that the first argument (lambda function) can’t be omitted in the `arraySplit` function. -### arrayCount(\[func,\] arr1, ...) {#higher_order_functions-array-count} +### arrayCount(\[func,\] arr1, …) {#higher-order-functions-array-count} -Returns the number of elements in the arr array for which func returns something other than 0. If 'func' is not specified, it returns the number of non-zero elements in the array. +Returns the number of elements in the arr array for which func returns something other than 0. If ‘func’ is not specified, it returns the number of non-zero elements in the array. -### arrayExists(\[func,\] arr1, ...) +### arrayExists(\[func,\] arr1, …) {#arrayexistsfunc-arr1} -Returns 1 if there is at least one element in 'arr' for which 'func' returns something other than 0. Otherwise, it returns 0. +Returns 1 if there is at least one element in ‘arr’ for which ‘func’ returns something other than 0. Otherwise, it returns 0. -### arrayAll(\[func,\] arr1, ...) +### arrayAll(\[func,\] arr1, …) {#arrayallfunc-arr1} -Returns 1 if 'func' returns something other than 0 for all the elements in 'arr'. Otherwise, it returns 0. +Returns 1 if ‘func’ returns something other than 0 for all the elements in ‘arr’. Otherwise, it returns 0. -### arraySum(\[func,\] arr1, ...) {#higher_order_functions-array-sum} +### arraySum(\[func,\] arr1, …) {#higher-order-functions-array-sum} -Returns the sum of the 'func' values. If the function is omitted, it just returns the sum of the array elements. +Returns the sum of the ‘func’ values. If the function is omitted, it just returns the sum of the array elements. -### arrayFirst(func, arr1, ...) {#higher_order_functions-array-first} +### arrayFirst(func, arr1, …) {#higher-order-functions-array-first} -Returns the first element in the 'arr1' array for which 'func' returns something other than 0. +Returns the first element in the ‘arr1’ array for which ‘func’ returns something other than 0. -Note that the first argument (lambda function) can't be omitted in the `arrayFirst` function. +Note that the first argument (lambda function) can’t be omitted in the `arrayFirst` function. -### arrayFirstIndex(func, arr1, ...) {#higher_order_functions-array-first-index} +### arrayFirstIndex(func, arr1, …) {#higher-order-functions-array-first-index} -Returns the index of the first element in the 'arr1' array for which 'func' returns something other than 0. 
+Returns the index of the first element in the ‘arr1’ array for which ‘func’ returns something other than 0. -Note that the first argument (lambda function) can't be omitted in the `arrayFirstIndex` function. +Note that the first argument (lambda function) can’t be omitted in the `arrayFirstIndex` function. -### arrayCumSum(\[func,\] arr1, ...) +### arrayCumSum(\[func,\] arr1, …) {#arraycumsumfunc-arr1} Returns an array of partial sums of elements in the source array (a running sum). If the `func` function is specified, then the values of the array elements are converted by this function before summing. Example: -```sql +``` sql SELECT arrayCumSum([1, 1, 1, 1]) AS res ``` -```text +``` text ┌─res──────────┐ │ [1, 2, 3, 4] │ └──────────────┘ ``` -### arrayCumSumNonNegative(arr) +### arrayCumSumNonNegative(arr) {#arraycumsumnonnegativearr} Same as `arrayCumSum`, returns an array of partial sums of elements in the source array (a running sum). Different `arrayCumSum`, when then returned value contains a value less than zero, the value is replace with zero and the subsequent calculation is performed with zero parameters. For example: -```sql +``` sql SELECT arrayCumSumNonNegative([1, 1, -4, 1]) AS res ``` -```text +``` text ┌─res───────┐ │ [1,2,0,1] │ └───────────┘ ``` -### arraySort(\[func,\] arr1, ...) +### arraySort(\[func,\] arr1, …) {#arraysortfunc-arr1} Returns an array as result of sorting the elements of `arr1` in ascending order. If the `func` function is specified, sorting order is determined by the result of the function `func` applied to the elements of array (arrays) @@ -221,11 +224,11 @@ The [Schwartzian transform](https://en.wikipedia.org/wiki/Schwartzian_transform) Example: -```sql +``` sql SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]); ``` -```text +``` text ┌─res────────────────┐ │ ['world', 'hello'] │ └────────────────────┘ @@ -233,16 +236,17 @@ SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]); For more information about the `arraySort` method, see the [Functions for Working With Arrays](array_functions.md#array_functions-sort) section. -### arrayReverseSort(\[func,\] arr1, ...) +### arrayReverseSort(\[func,\] arr1, …) {#arrayreversesortfunc-arr1} Returns an array as result of sorting the elements of `arr1` in descending order. If the `func` function is specified, sorting order is determined by the result of the function `func` applied to the elements of array (arrays). Example: -```sql +``` sql SELECT arrayReverseSort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; ``` -```text + +``` text ┌─res───────────────┐ │ ['hello','world'] │ └───────────────────┘ diff --git a/docs/en/query_language/functions/in_functions.md b/docs/en/query_language/functions/in_functions.md index 287d401a0a5..806ce20ef2c 100644 --- a/docs/en/query_language/functions/in_functions.md +++ b/docs/en/query_language/functions/in_functions.md @@ -1,20 +1,19 @@ -# Functions for implementing the IN operator +# Functions for implementing the IN operator {#functions-for-implementing-the-in-operator} ## in, notIn, globalIn, globalNotIn {#in-functions} See the section [IN operators](../select.md#select-in-operators). -## tuple(x, y, ...), operator (x, y, ...) +## tuple(x, y, …), operator (x, y, …) {#tuplex-y-operator-x-y} A function that allows grouping multiple columns. -For columns with the types T1, T2, ..., it returns a Tuple(T1, T2, ...) type tuple containing these columns. There is no cost to execute the function. 
-Tuples are normally used as intermediate values for an argument of IN operators, or for creating a list of formal parameters of lambda functions. Tuples can't be written to a table. +For columns with the types T1, T2, …, it returns a Tuple(T1, T2, …) type tuple containing these columns. There is no cost to execute the function. +Tuples are normally used as intermediate values for an argument of IN operators, or for creating a list of formal parameters of lambda functions. Tuples can’t be written to a table. -## tupleElement(tuple, n), operator x.N +## tupleElement(tuple, n), operator x.N {#tupleelementtuple-n-operator-x-n} A function that allows getting a column from a tuple. -'N' is the column index, starting from 1. N must be a constant. 'N' must be a constant. 'N' must be a strict postive integer no greater than the size of the tuple. +‘N’ is the column index, starting from 1. N must be a constant. ‘N’ must be a constant. ‘N’ must be a strict postive integer no greater than the size of the tuple. There is no cost to execute the function. - [Original article](https://clickhouse.tech/docs/en/query_language/functions/in_functions/) diff --git a/docs/en/query_language/functions/index.md b/docs/en/query_language/functions/index.md index a25ee88dea1..c52e56ffdb1 100644 --- a/docs/en/query_language/functions/index.md +++ b/docs/en/query_language/functions/index.md @@ -1,67 +1,66 @@ -# Functions +# Functions {#functions} -There are at least\* two types of functions - regular functions (they are just called "functions") and aggregate functions. These are completely different concepts. Regular functions work as if they are applied to each row separately (for each row, the result of the function doesn't depend on the other rows). Aggregate functions accumulate a set of values from various rows (i.e. they depend on the entire set of rows). +There are at least\* two types of functions - regular functions (they are just called “functions”) and aggregate functions. These are completely different concepts. Regular functions work as if they are applied to each row separately (for each row, the result of the function doesn’t depend on the other rows). Aggregate functions accumulate a set of values from various rows (i.e. they depend on the entire set of rows). -In this section we discuss regular functions. For aggregate functions, see the section "Aggregate functions". +In this section we discuss regular functions. For aggregate functions, see the section “Aggregate functions”. -\* - There is a third type of function that the 'arrayJoin' function belongs to; table functions can also be mentioned separately.\* +\* - There is a third type of function that the ‘arrayJoin’ function belongs to; table functions can also be mentioned separately.\* -## Strong typing +## Strong typing {#strong-typing} -In contrast to standard SQL, ClickHouse has strong typing. In other words, it doesn't make implicit conversions between types. Each function works for a specific set of types. This means that sometimes you need to use type conversion functions. +In contrast to standard SQL, ClickHouse has strong typing. In other words, it doesn’t make implicit conversions between types. Each function works for a specific set of types. This means that sometimes you need to use type conversion functions. 
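+
+As a minimal illustrative sketch of this point (the alias `result` is ours and the formatted output is not reproduced), a string value has to be converted explicitly, for example with `toUInt32`, before it can take part in arithmetic:
+
+``` sql
+-- Illustration only: an explicit conversion is required; no implicit String-to-number cast is applied
+SELECT toUInt32('42') + 1 AS result
+```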
-## Common subexpression elimination +## Common subexpression elimination {#common-subexpression-elimination} All expressions in a query that have the same AST (the same record or same result of syntactic parsing) are considered to have identical values. Such expressions are concatenated and executed once. Identical subqueries are also eliminated this way. -## Types of results +## Types of results {#types-of-results} All functions return a single return as the result (not several values, and not zero values). The type of result is usually defined only by the types of arguments, not by the values. Exceptions are the tupleElement function (the a.N operator), and the toFixedString function. -## Constants +## Constants {#constants} For simplicity, certain functions can only work with constants for some arguments. For example, the right argument of the LIKE operator must be a constant. Almost all functions return a constant for constant arguments. The exception is functions that generate random numbers. -The 'now' function returns different values for queries that were run at different times, but the result is considered a constant, since constancy is only important within a single query. +The ‘now’ function returns different values for queries that were run at different times, but the result is considered a constant, since constancy is only important within a single query. A constant expression is also considered a constant (for example, the right half of the LIKE operator can be constructed from multiple constants). Functions can be implemented in different ways for constant and non-constant arguments (different code is executed). But the results for a constant and for a true column containing only the same value should match each other. -## NULL processing +## NULL processing {#null-processing} Functions have the following behaviors: -- If at least one of the arguments of the function is `NULL`, the function result is also `NULL`. -- Special behavior that is specified individually in the description of each function. In the ClickHouse source code, these functions have `UseDefaultImplementationForNulls=false`. +- If at least one of the arguments of the function is `NULL`, the function result is also `NULL`. +- Special behavior that is specified individually in the description of each function. In the ClickHouse source code, these functions have `UseDefaultImplementationForNulls=false`. -## Constancy +## Constancy {#constancy} -Functions can't change the values of their arguments – any changes are returned as the result. Thus, the result of calculating separate functions does not depend on the order in which the functions are written in the query. +Functions can’t change the values of their arguments – any changes are returned as the result. Thus, the result of calculating separate functions does not depend on the order in which the functions are written in the query. -## Error handling +## Error handling {#error-handling} Some functions might throw an exception if the data is invalid. In this case, the query is canceled and an error text is returned to the client. For distributed processing, when an exception occurs on one of the servers, the other servers also attempt to abort the query. -## Evaluation of argument expressions +## Evaluation of argument expressions {#evaluation-of-argument-expressions} In almost all programming languages, one of the arguments might not be evaluated for certain operators. This is usually the operators `&&`, `||`, and `?:`. 
But in ClickHouse, arguments of functions (operators) are always evaluated. This is because entire parts of columns are evaluated at once, instead of calculating each row separately. -## Performing functions for distributed query processing +## Performing functions for distributed query processing {#performing-functions-for-distributed-query-processing} For distributed query processing, as many stages of query processing as possible are performed on remote servers, and the rest of the stages (merging intermediate results and everything after that) are performed on the requestor server. This means that functions can be performed on different servers. For example, in the query `SELECT f(sum(g(x))) FROM distributed_table GROUP BY h(y),` -- if a `distributed_table` has at least two shards, the functions 'g' and 'h' are performed on remote servers, and the function 'f' is performed on the requestor server. -- if a `distributed_table` has only one shard, all the 'f', 'g', and 'h' functions are performed on this shard's server. +- if a `distributed_table` has at least two shards, the functions ‘g’ and ‘h’ are performed on remote servers, and the function ‘f’ is performed on the requestor server. +- if a `distributed_table` has only one shard, all the ‘f’, ‘g’, and ‘h’ functions are performed on this shard’s server. -The result of a function usually doesn't depend on which server it is performed on. However, sometimes this is important. +The result of a function usually doesn’t depend on which server it is performed on. However, sometimes this is important. For example, functions that work with dictionaries use the dictionary that exists on the server they are running on. Another example is the `hostName` function, which returns the name of the server it is running on in order to make `GROUP BY` by servers in a `SELECT` query. -If a function in a query is performed on the requestor server, but you need to perform it on remote servers, you can wrap it in an 'any' aggregate function or add it to a key in `GROUP BY`. - +If a function in a query is performed on the requestor server, but you need to perform it on remote servers, you can wrap it in an ‘any’ aggregate function or add it to a key in `GROUP BY`. [Original article](https://clickhouse.tech/docs/en/query_language/functions/) diff --git a/docs/en/query_language/functions/introspection.md b/docs/en/query_language/functions/introspection.md index 520c89feaeb..389a0555315 100644 --- a/docs/en/query_language/functions/introspection.md +++ b/docs/en/query_language/functions/introspection.md @@ -1,4 +1,4 @@ -# Introspection Functions +# Introspection Functions {#introspection-functions} You can use functions described in this chapter to introspect [ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) and [DWARF](https://en.wikipedia.org/wiki/DWARF) for query profiling. @@ -7,12 +7,13 @@ You can use functions described in this chapter to introspect [ELF](https://en.w For proper operation of introspection functions: -- Install the `clickhouse-common-static-dbg` package. -- Set the [allow_introspection_functions](../../operations/settings/settings.md#settings-allow_introspection_functions) setting to 1. +- Install the `clickhouse-common-static-dbg` package. - For security reasons introspection functions are disabled by default. +- Set the [allow\_introspection\_functions](../../operations/settings/settings.md#settings-allow_introspection_functions) setting to 1. 
-ClickHouse saves profiler reports to the [trace_log](../../operations/system_tables.md#system_tables-trace_log) system table. Make sure the table and profiler are configured properly. + For security reasons introspection functions are disabled by default. + +ClickHouse saves profiler reports to the [trace\_log](../../operations/system_tables.md#system_tables-trace_log) system table. Make sure the table and profiler are configured properly. ## addressToLine {#addresstoline} @@ -22,22 +23,23 @@ If you use official ClickHouse packages, you need to install the `clickhouse-com **Syntax** -```sql +``` sql addressToLine(address_of_binary_instruction) ``` **Parameters** -- `address_of_binary_instruction` ([UInt64](../../data_types/int_uint.md)) — Address of instruction in a running process. +- `address_of_binary_instruction` ([UInt64](../../data_types/int_uint.md)) — Address of instruction in a running process. **Returned value** -- Source code filename and the line number in this file delimited by colon. +- Source code filename and the line number in this file delimited by colon. - For example, `/build/obj-x86_64-linux-gnu/../dbms/src/Common/ThreadPool.cpp:199`, where `199` is a line number. + For example, `/build/obj-x86_64-linux-gnu/../dbms/src/Common/ThreadPool.cpp:199`, where `199` is a line number. -- Name of a binary, if the function couldn't find the debug information. -- Empty string, if the address is not valid. +- Name of a binary, if the function couldn’t find the debug information. + +- Empty string, if the address is not valid. Type: [String](../../data_types/string.md). @@ -45,16 +47,17 @@ Type: [String](../../data_types/string.md). Enabling introspection functions: -```sql +``` sql SET allow_introspection_functions=1 ``` Selecting the first string from the `trace_log` system table: -```sql +``` sql SELECT * FROM system.trace_log LIMIT 1 \G ``` -```text + +``` text Row 1: ────── event_date: 2019-11-19 @@ -70,10 +73,11 @@ The `trace` field contains the stack trace at the moment of sampling. Getting the source code filename and the line number for a single address: -```sql +``` sql SELECT addressToLine(94784076370703) \G ``` -```text + +``` text Row 1: ────── addressToLine(94784076370703): /build/obj-x86_64-linux-gnu/../dbms/src/Common/ThreadPool.cpp:199 @@ -81,17 +85,17 @@ addressToLine(94784076370703): /build/obj-x86_64-linux-gnu/../dbms/src/Common/Th Applying the function to the whole stack trace: -```sql -SELECT - arrayStringConcat(arrayMap(x -> addressToLine(x), trace), '\n') AS trace_source_code_lines -FROM system.trace_log -LIMIT 1 +``` sql +SELECT + arrayStringConcat(arrayMap(x -> addressToLine(x), trace), '\n') AS trace_source_code_lines +FROM system.trace_log +LIMIT 1 \G ``` The [arrayMap](higher_order_functions.md#higher_order_functions-array-map) function allows to process each individual element of the `trace` array by the `addressToLine` function. The result of this processing you see in the `trace_source_code_lines` column of output. -```text +``` text Row 1: ────── trace_source_code_lines: /lib/x86_64-linux-gnu/libpthread-2.27.so @@ -108,21 +112,20 @@ trace_source_code_lines: /lib/x86_64-linux-gnu/libpthread-2.27.so Converts virtual memory address inside ClickHouse server process to the symbol from ClickHouse object files. - **Syntax** -```sql +``` sql addressToSymbol(address_of_binary_instruction) ``` **Parameters** -- `address_of_binary_instruction` ([UInt64](../../data_types/int_uint.md)) — Address of instruction in a running process. 
+- `address_of_binary_instruction` ([UInt64](../../data_types/int_uint.md)) — Address of instruction in a running process. **Returned value** -- Symbol from ClickHouse object files. -- Empty string, if the address is not valid. +- Symbol from ClickHouse object files. +- Empty string, if the address is not valid. Type: [String](../../data_types/string.md). @@ -130,16 +133,17 @@ Type: [String](../../data_types/string.md). Enabling introspection functions: -```sql +``` sql SET allow_introspection_functions=1 ``` Selecting the first string from the `trace_log` system table: -```sql +``` sql SELECT * FROM system.trace_log LIMIT 1 \G ``` -```text + +``` text Row 1: ────── event_date: 2019-11-20 @@ -155,10 +159,11 @@ The `trace` field contains the stack trace at the moment of sampling. Getting a symbol for a single address: -```sql +``` sql SELECT addressToSymbol(94138803686098) \G ``` -```text + +``` text Row 1: ────── addressToSymbol(94138803686098): _ZNK2DB24IAggregateFunctionHelperINS_20AggregateFunctionSumImmNS_24AggregateFunctionSumDataImEEEEE19addBatchSinglePlaceEmPcPPKNS_7IColumnEPNS_5ArenaE @@ -166,8 +171,8 @@ addressToSymbol(94138803686098): _ZNK2DB24IAggregateFunctionHelperINS_20Aggregat Applying the function to the whole stack trace: -```sql -SELECT +``` sql +SELECT arrayStringConcat(arrayMap(x -> addressToSymbol(x), trace), '\n') AS trace_symbols FROM system.trace_log LIMIT 1 @@ -176,8 +181,7 @@ LIMIT 1 The [arrayMap](higher_order_functions.md#higher_order_functions-array-map) function allows to process each individual element of the `trace` array by the `addressToSymbols` function. The result of this processing you see in the `trace_symbols` column of output. - -```text +``` text Row 1: ────── trace_symbols: _ZNK2DB24IAggregateFunctionHelperINS_20AggregateFunctionSumImmNS_24AggregateFunctionSumDataImEEEEE19addBatchSinglePlaceEmPcPPKNS_7IColumnEPNS_5ArenaE @@ -205,21 +209,20 @@ clone Converts a symbol that you can get using the [addressToSymbol](#addresstosymbol) function to the C++ function name. - **Syntax** -```sql +``` sql demangle(symbol) ``` **Parameters** -- `symbol` ([String](../../data_types/string.md)) — Symbol from an object file. +- `symbol` ([String](../../data_types/string.md)) — Symbol from an object file. **Returned value** -- Name of the C++ function. -- Empty string if a symbol is not valid. +- Name of the C++ function. +- Empty string if a symbol is not valid. Type: [String](../../data_types/string.md). @@ -227,16 +230,17 @@ Type: [String](../../data_types/string.md). Enabling introspection functions: -```sql +``` sql SET allow_introspection_functions=1 ``` Selecting the first string from the `trace_log` system table: -```sql +``` sql SELECT * FROM system.trace_log LIMIT 1 \G ``` -```text + +``` text Row 1: ────── event_date: 2019-11-20 @@ -252,10 +256,11 @@ The `trace` field contains the stack trace at the moment of sampling. 
Getting a function name for a single address: -```sql +``` sql SELECT demangle(addressToSymbol(94138803686098)) \G ``` -```text + +``` text Row 1: ────── demangle(addressToSymbol(94138803686098)): DB::IAggregateFunctionHelper > >::addBatchSinglePlace(unsigned long, char*, DB::IColumn const**, DB::Arena*) const @@ -263,8 +268,8 @@ demangle(addressToSymbol(94138803686098)): DB::IAggregateFunctionHelper demangle(addressToSymbol(x)), trace), '\n') AS trace_functions FROM system.trace_log LIMIT 1 @@ -273,7 +278,7 @@ LIMIT 1 The [arrayMap](higher_order_functions.md#higher_order_functions-array-map) function allows to process each individual element of the `trace` array by the `demangle` function. The result of this processing you see in the `trace_functions` column of output. -```text +``` text Row 1: ────── trace_functions: DB::IAggregateFunctionHelper > >::addBatchSinglePlace(unsigned long, char*, DB::IColumn const**, DB::Arena*) const diff --git a/docs/en/query_language/functions/ip_address_functions.md b/docs/en/query_language/functions/ip_address_functions.md index 89fabc04b0d..91a896cbf8a 100644 --- a/docs/en/query_language/functions/ip_address_functions.md +++ b/docs/en/query_language/functions/ip_address_functions.md @@ -1,20 +1,20 @@ -# Functions for working with IP addresses +# Functions for working with IP addresses {#functions-for-working-with-ip-addresses} -## IPv4NumToString(num) +## IPv4NumToString(num) {#ipv4numtostringnum} Takes a UInt32 number. Interprets it as an IPv4 address in big endian. Returns a string containing the corresponding IPv4 address in the format A.B.C.d (dot-separated numbers in decimal form). -## IPv4StringToNum(s) +## IPv4StringToNum(s) {#ipv4stringtonums} The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it returns 0. -## IPv4NumToStringClassC(num) +## IPv4NumToStringClassC(num) {#ipv4numtostringclasscnum} Similar to IPv4NumToString, but using xxx instead of the last octet. Example: -```sql +``` sql SELECT IPv4NumToStringClassC(ClientIP) AS k, count() AS c @@ -24,7 +24,7 @@ ORDER BY c DESC LIMIT 10 ``` -```text +``` text ┌─k──────────────┬─────c─┐ │ 83.149.9.xxx │ 26238 │ │ 217.118.81.xxx │ 26074 │ @@ -39,24 +39,24 @@ LIMIT 10 └────────────────┴───────┘ ``` -Since using 'xxx' is highly unusual, this may be changed in the future. We recommend that you don't rely on the exact format of this fragment. +Since using ‘xxx’ is highly unusual, this may be changed in the future. We recommend that you don’t rely on the exact format of this fragment. -### IPv6NumToString(x) +### IPv6NumToString(x) {#ipv6numtostringx} Accepts a FixedString(16) value containing the IPv6 address in binary format. Returns a string containing this address in text format. IPv6-mapped IPv4 addresses are output in the format ::ffff:111.222.33.44. 
Examples: -```sql +``` sql SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr ``` -```text +``` text ┌─addr─────────┐ │ 2a02:6b8::11 │ └──────────────┘ ``` -```sql +``` sql SELECT IPv6NumToString(ClientIP6 AS k), count() AS c @@ -67,7 +67,7 @@ ORDER BY c DESC LIMIT 10 ``` -```text +``` text ┌─IPv6NumToString(ClientIP6)──────────────┬─────c─┐ │ 2a02:2168:aaa:bbbb::2 │ 24695 │ │ 2a02:2698:abcd:abcd:abcd:abcd:8888:5555 │ 22408 │ @@ -82,7 +82,7 @@ LIMIT 10 └─────────────────────────────────────────┴───────┘ ``` -```sql +``` sql SELECT IPv6NumToString(ClientIP6 AS k), count() AS c @@ -93,7 +93,7 @@ ORDER BY c DESC LIMIT 10 ``` -```text +``` text ┌─IPv6NumToString(ClientIP6)─┬──────c─┐ │ ::ffff:94.26.111.111 │ 747440 │ │ ::ffff:37.143.222.4 │ 529483 │ @@ -108,80 +108,77 @@ LIMIT 10 └────────────────────────────┴────────┘ ``` -## IPv6StringToNum(s) +## IPv6StringToNum(s) {#ipv6stringtonums} The reverse function of IPv6NumToString. If the IPv6 address has an invalid format, it returns a string of null bytes. HEX can be uppercase or lowercase. -## IPv4ToIPv6(x) +## IPv4ToIPv6(x) {#ipv4toipv6x} Takes a `UInt32` number. Interprets it as an IPv4 address in [big endian](https://en.wikipedia.org/wiki/Endianness). Returns a `FixedString(16)` value containing the IPv6 address in binary format. Examples: -```sql +``` sql SELECT IPv6NumToString(IPv4ToIPv6(IPv4StringToNum('192.168.0.1'))) AS addr ``` -```text +``` text ┌─addr───────────────┐ │ ::ffff:192.168.0.1 │ └────────────────────┘ ``` -## cutIPv6(x, bitsToCutForIPv6, bitsToCutForIPv4) +## cutIPv6(x, bitsToCutForIPv6, bitsToCutForIPv4) {#cutipv6x-bitstocutforipv6-bitstocutforipv4} Accepts a FixedString(16) value containing the IPv6 address in binary format. Returns a string containing the address of the specified number of bits removed in text format. For example: -```sql +``` sql WITH IPv6StringToNum('2001:0DB8:AC10:FE01:FEED:BABE:CAFE:F00D') AS ipv6, IPv4ToIPv6(IPv4StringToNum('192.168.0.1')) AS ipv4 SELECT cutIPv6(ipv6, 2, 0), cutIPv6(ipv4, 0, 2) - ``` -```text +``` text ┌─cutIPv6(ipv6, 2, 0)─────────────────┬─cutIPv6(ipv4, 0, 2)─┐ │ 2001:db8:ac10:fe01:feed:babe:cafe:0 │ ::ffff:192.168.0.0 │ └─────────────────────────────────────┴─────────────────────┘ ``` -## IPv4CIDRToRange(ipv4, cidr), +## IPv4CIDRToRange(ipv4, cidr), {#ipv4cidrtorangeipv4-cidr} Accepts an IPv4 and an UInt8 value containing the [CIDR](https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing). Return a tuple with two IPv4 containing the lower range and the higher range of the subnet. - -```sql +``` sql SELECT IPv4CIDRToRange(toIPv4('192.168.5.2'), 16) ``` -```text + +``` text ┌─IPv4CIDRToRange(toIPv4('192.168.5.2'), 16)─┐ │ ('192.168.0.0','192.168.255.255') │ └────────────────────────────────────────────┘ ``` - -## IPv6CIDRToRange(ipv6, cidr), +## IPv6CIDRToRange(ipv6, cidr), {#ipv6cidrtorangeipv6-cidr} Accepts an IPv6 and an UInt8 value containing the CIDR. Return a tuple with two IPv6 containing the lower range and the higher range of the subnet. 
- -```sql +``` sql SELECT IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32); ``` -```text +``` text ┌─IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32)─┐ │ ('2001:db8::','2001:db8:ffff:ffff:ffff:ffff:ffff:ffff') │ └────────────────────────────────────────────────────────────────────────┘ ``` -## toIPv4(string) +## toIPv4(string) {#toipv4string} An alias to `IPv4StringToNum()` that takes a string form of IPv4 address and returns value of [IPv4](../../data_types/domains/ipv4.md) type, which is binary equal to value returned by `IPv4StringToNum()`. -```sql +``` sql WITH '171.225.130.45' as IPv4_string SELECT @@ -189,13 +186,13 @@ SELECT toTypeName(toIPv4(IPv4_string)) ``` -```text +``` text ┌─toTypeName(IPv4StringToNum(IPv4_string))─┬─toTypeName(toIPv4(IPv4_string))─┐ │ UInt32 │ IPv4 │ └──────────────────────────────────────────┴─────────────────────────────────┘ ``` -```sql +``` sql WITH '171.225.130.45' as IPv4_string SELECT @@ -203,17 +200,17 @@ SELECT hex(toIPv4(IPv4_string)) ``` -```text +``` text ┌─hex(IPv4StringToNum(IPv4_string))─┬─hex(toIPv4(IPv4_string))─┐ │ ABE1822D │ ABE1822D │ └───────────────────────────────────┴──────────────────────────┘ ``` -## toIPv6(string) +## toIPv6(string) {#toipv6string} An alias to `IPv6StringToNum()` that takes a string form of IPv6 address and returns value of [IPv6](../../data_types/domains/ipv6.md) type, which is binary equal to value returned by `IPv6StringToNum()`. -```sql +``` sql WITH '2001:438:ffff::407d:1bc1' as IPv6_string SELECT @@ -221,13 +218,13 @@ SELECT toTypeName(toIPv6(IPv6_string)) ``` -```text +``` text ┌─toTypeName(IPv6StringToNum(IPv6_string))─┬─toTypeName(toIPv6(IPv6_string))─┐ │ FixedString(16) │ IPv6 │ └──────────────────────────────────────────┴─────────────────────────────────┘ ``` -```sql +``` sql WITH '2001:438:ffff::407d:1bc1' as IPv6_string SELECT @@ -235,7 +232,7 @@ SELECT hex(toIPv6(IPv6_string)) ``` -```text +``` text ┌─hex(IPv6StringToNum(IPv6_string))─┬─hex(toIPv6(IPv6_string))─────────┐ │ 20010438FFFF000000000000407D1BC1 │ 20010438FFFF000000000000407D1BC1 │ └───────────────────────────────────┴──────────────────────────────────┘ diff --git a/docs/en/query_language/functions/json_functions.md b/docs/en/query_language/functions/json_functions.md index 90fd687f1ed..14098bc72a7 100644 --- a/docs/en/query_language/functions/json_functions.md +++ b/docs/en/query_language/functions/json_functions.md @@ -1,52 +1,52 @@ -# Functions for working with JSON +# Functions for working with JSON {#functions-for-working-with-json} In Yandex.Metrica, JSON is transmitted by users as session parameters. There are some special functions for working with this JSON. (Although in most of the cases, the JSONs are additionally pre-processed, and the resulting values are put in separate columns in their processed format.) All these functions are based on strong assumptions about what the JSON can be, but they try to do as little as possible to get the job done. The following assumptions are made: -1. The field name (function argument) must be a constant. -2. The field name is somehow canonically encoded in JSON. For example: `visitParamHas('{"abc":"def"}', 'abc') = 1`, but `visitParamHas('{"\\u0061\\u0062\\u0063":"def"}', 'abc') = 0` -3. Fields are searched for on any nesting level, indiscriminately. If there are multiple matching fields, the first occurrence is used. -4. The JSON doesn't have space characters outside of string literals. +1. The field name (function argument) must be a constant. +2. 
The field name is somehow canonically encoded in JSON. For example: `visitParamHas('{"abc":"def"}', 'abc') = 1`, but `visitParamHas('{"\\u0061\\u0062\\u0063":"def"}', 'abc') = 0` +3. Fields are searched for on any nesting level, indiscriminately. If there are multiple matching fields, the first occurrence is used. +4. The JSON doesn’t have space characters outside of string literals. -## visitParamHas(params, name) +## visitParamHas(params, name) {#visitparamhasparams-name} -Checks whether there is a field with the 'name' name. +Checks whether there is a field with the ‘name’ name. -## visitParamExtractUInt(params, name) +## visitParamExtractUInt(params, name) {#visitparamextractuintparams-name} -Parses UInt64 from the value of the field named 'name'. If this is a string field, it tries to parse a number from the beginning of the string. If the field doesn't exist, or it exists but doesn't contain a number, it returns 0. +Parses UInt64 from the value of the field named ‘name’. If this is a string field, it tries to parse a number from the beginning of the string. If the field doesn’t exist, or it exists but doesn’t contain a number, it returns 0. -## visitParamExtractInt(params, name) +## visitParamExtractInt(params, name) {#visitparamextractintparams-name} The same as for Int64. -## visitParamExtractFloat(params, name) +## visitParamExtractFloat(params, name) {#visitparamextractfloatparams-name} The same as for Float64. -## visitParamExtractBool(params, name) +## visitParamExtractBool(params, name) {#visitparamextractboolparams-name} Parses a true/false value. The result is UInt8. -## visitParamExtractRaw(params, name) +## visitParamExtractRaw(params, name) {#visitparamextractrawparams-name} Returns the value of a field, including separators. Examples: -```sql +``` sql visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"' visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}' ``` -## visitParamExtractString(params, name) +## visitParamExtractString(params, name) {#visitparamextractstringparams-name} Parses the string in double quotes. The value is unescaped. If unescaping failed, it returns an empty string. Examples: -```sql +``` sql visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺' visitParamExtractString('{"abc":"\\u263"}', 'abc') = '' @@ -57,18 +57,18 @@ There is currently no support for code points in the format `\uXXXX\uYYYY` that The following functions are based on [simdjson](https://github.com/lemire/simdjson) designed for more complex JSON parsing requirements. The assumption 2 mentioned above still applies. -## isValidJSON(json) +## isValidJSON(json) {#isvalidjsonjson} Checks that passed string is a valid json. Examples: -```sql +``` sql SELECT isValidJSON('{"a": "hello", "b": [-100, 200.0, 300]}') = 1 SELECT isValidJSON('not a json') = 0 ``` -## JSONHas(json[, indices_or_keys]...) +## JSONHas(json\[, indices\_or\_keys\]…) {#jsonhasjson-indices-or-keys} If the value exists in the JSON document, `1` will be returned. @@ -76,24 +76,24 @@ If the value does not exist, `0` will be returned. Examples: -```sql +``` sql SELECT JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 1 SELECT JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4) = 0 ``` `indices_or_keys` is a list of zero or more arguments each of them can be either string or integer. -* String = access object member by key. -* Positive integer = access the n-th member/key from the beginning. 
-* Negative integer = access the n-th member/key from the end. +- String = access object member by key. +- Positive integer = access the n-th member/key from the beginning. +- Negative integer = access the n-th member/key from the end. -Minimum index of the element is 1. Thus the element 0 doesn't exist. +Minimum index of the element is 1. Thus the element 0 doesn’t exist. You may use integers to access both JSON arrays and JSON objects. So, for example: -```sql +``` sql SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'a' SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 2) = 'b' SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -1) = 'b' @@ -101,7 +101,7 @@ SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'a' SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' ``` -## JSONLength(json[, indices_or_keys]...) +## JSONLength(json\[, indices\_or\_keys\]…) {#jsonlengthjson-indices-or-keys} Return the length of a JSON array or a JSON object. @@ -109,12 +109,12 @@ If the value does not exist or has a wrong type, `0` will be returned. Examples: -```sql +``` sql SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3 SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 ``` -## JSONType(json[, indices_or_keys]...) +## JSONType(json\[, indices\_or\_keys\]…) {#jsontypejson-indices-or-keys} Return the type of a JSON value. @@ -122,16 +122,19 @@ If the value does not exist, `Null` will be returned. Examples: -```sql +``` sql SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}') = 'Object' SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String' SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' ``` -## JSONExtractUInt(json[, indices_or_keys]...) -## JSONExtractInt(json[, indices_or_keys]...) -## JSONExtractFloat(json[, indices_or_keys]...) -## JSONExtractBool(json[, indices_or_keys]...) +## JSONExtractUInt(json\[, indices\_or\_keys\]…) {#jsonextractuintjson-indices-or-keys} + +## JSONExtractInt(json\[, indices\_or\_keys\]…) {#jsonextractintjson-indices-or-keys} + +## JSONExtractFloat(json\[, indices\_or\_keys\]…) {#jsonextractfloatjson-indices-or-keys} + +## JSONExtractBool(json\[, indices\_or\_keys\]…) {#jsonextractbooljson-indices-or-keys} Parses a JSON and extract a value. These functions are similar to `visitParam` functions. @@ -139,13 +142,13 @@ If the value does not exist or has a wrong type, `0` will be returned. Examples: -```sql +``` sql SELECT JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100 SELECT JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200.0 SELECT JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 ``` -## JSONExtractString(json[, indices_or_keys]...) +## JSONExtractString(json\[, indices\_or\_keys\]…) {#jsonextractstringjson-indices-or-keys} Parses a JSON and extract a string. This function is similar to `visitParamExtractString` functions. @@ -155,7 +158,7 @@ The value is unescaped. If unescaping failed, it returns an empty string. 
Examples: -```sql +``` sql SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'hello' SELECT JSONExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' SELECT JSONExtractString('{"abc":"\\u263a"}', 'abc') = '☺' @@ -163,7 +166,7 @@ SELECT JSONExtractString('{"abc":"\\u263"}', 'abc') = '' SELECT JSONExtractString('{"abc":"hello}', 'abc') = '' ``` -## JSONExtract(json[, indices_or_keys...], return_type) +## JSONExtract(json\[, indices\_or\_keys…\], return\_type) {#jsonextractjson-indices-or-keys-return-type} Parses a JSON and extract a value of the given ClickHouse data type. @@ -174,7 +177,7 @@ This means Examples: -```sql +``` sql SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(String, Array(Float64))') = ('hello',[-100,200,300]) SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(b Array(Float64), a String)') = ([-100,200,300],'hello') SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 'Array(Nullable(Int8))') = [-100, NULL, NULL] @@ -184,17 +187,17 @@ SELECT JSONExtract('{"day": "Thursday"}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday SELECT JSONExtract('{"day": 5}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Friday' ``` -## JSONExtractKeysAndValues(json[, indices_or_keys...], value_type) +## JSONExtractKeysAndValues(json\[, indices\_or\_keys…\], value\_type) {#jsonextractkeysandvaluesjson-indices-or-keys-value-type} Parse key-value pairs from a JSON where the values are of the given ClickHouse data type. Example: -```sql +``` sql SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8') = [('a',5),('b',7),('c',11)]; ``` -## JSONExtractRaw(json[, indices_or_keys]...) +## JSONExtractRaw(json\[, indices\_or\_keys\]…) {#jsonextractrawjson-indices-or-keys} Returns a part of JSON. @@ -202,19 +205,19 @@ If the part does not exist or has a wrong type, an empty string will be returned Example: -```sql +``` sql SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]' ``` -## JSONExtractArrayRaw(json[, indices_or_keys]...) +## JSONExtractArrayRaw(json\[, indices\_or\_keys\]…) {#jsonextractarrayrawjson-indices-or-keys} Returns an array with elements of JSON array, each represented as unparsed string. -If the part does not exist or isn't array, an empty array will be returned. +If the part does not exist or isn’t array, an empty array will be returned. Example: -```sql +``` sql SELECT JSONExtractArrayRaw('{"a": "hello", "b": [-100, 200.0, "hello"]}', 'b') = ['-100', '200.0', '"hello"']' ``` diff --git a/docs/en/query_language/functions/logical_functions.md b/docs/en/query_language/functions/logical_functions.md index e9c6d2864c0..a1d9c41ec7e 100644 --- a/docs/en/query_language/functions/logical_functions.md +++ b/docs/en/query_language/functions/logical_functions.md @@ -1,16 +1,15 @@ -# Logical functions +# Logical functions {#logical-functions} Logical functions accept any numeric types, but return a UInt8 number equal to 0 or 1. -Zero as an argument is considered "false," while any non-zero value is considered "true". +Zero as an argument is considered “false,” while any non-zero value is considered “true”. 
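+
+As an illustrative sketch of this convention (output rows are not reproduced here), the functions documented below can be called directly on numeric arguments:
+
+``` sql
+-- Illustration: any non-zero argument is treated as true, zero as false
+SELECT and(1, 5), or(0, 0), not(3), xor(1, 1)
+```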
-## and, AND operator +## and, AND operator {#and-and-operator} -## or, OR operator +## or, OR operator {#or-or-operator} -## not, NOT operator - -## xor +## not, NOT operator {#not-not-operator} +## xor {#xor} [Original article](https://clickhouse.tech/docs/en/query_language/functions/logical_functions/) diff --git a/docs/en/query_language/functions/machine_learning_functions.md b/docs/en/query_language/functions/machine_learning_functions.md index 5d9983f015f..92781bbfea7 100644 --- a/docs/en/query_language/functions/machine_learning_functions.md +++ b/docs/en/query_language/functions/machine_learning_functions.md @@ -1,13 +1,13 @@ -# Machine learning functions +# Machine learning functions {#machine-learning-functions} -## evalMLMethod (prediction) {#machine_learning_methods-evalmlmethod} +## evalMLMethod (prediction) {#machine-learning-methods-evalmlmethod} Prediction using fitted regression models uses `evalMLMethod` function. See link in `linearRegression`. -### Stochastic Linear Regression +### Stochastic Linear Regression {#stochastic-linear-regression} The [stochasticLinearRegression](../agg_functions/reference.md#agg_functions-stochasticlinearregression) aggregate function implements stochastic gradient descent method using linear model and MSE loss function. Uses `evalMLMethod` to predict on new data. -### Stochastic Logistic Regression +### Stochastic Logistic Regression {#stochastic-logistic-regression} The [stochasticLogisticRegression](../agg_functions/reference.md#agg_functions-stochasticlogisticregression) aggregate function implements stochastic gradient descent method for binary classification problem. Uses `evalMLMethod` to predict on new data. diff --git a/docs/en/query_language/functions/math_functions.md b/docs/en/query_language/functions/math_functions.md index 6da8b4fb55f..cc89d22d786 100644 --- a/docs/en/query_language/functions/math_functions.md +++ b/docs/en/query_language/functions/math_functions.md @@ -1,108 +1,108 @@ -# Mathematical functions +# Mathematical functions {#mathematical-functions} All the functions return a Float64 number. The accuracy of the result is close to the maximum precision possible, but the result might not coincide with the machine representable number nearest to the corresponding real number. -## e() +## e() {#e} Returns a Float64 number that is close to the number e. -## pi() +## pi() {#pi} Returns a Float64 number that is close to the number π. -## exp(x) +## exp(x) {#expx} Accepts a numeric argument and returns a Float64 number close to the exponent of the argument. -## log(x), ln(x) +## log(x), ln(x) {#logx-lnx} Accepts a numeric argument and returns a Float64 number close to the natural logarithm of the argument. -## exp2(x) +## exp2(x) {#exp2x} Accepts a numeric argument and returns a Float64 number close to 2 to the power of x. -## log2(x) +## log2(x) {#log2x} Accepts a numeric argument and returns a Float64 number close to the binary logarithm of the argument. -## exp10(x) +## exp10(x) {#exp10x} Accepts a numeric argument and returns a Float64 number close to 10 to the power of x. -## log10(x) +## log10(x) {#log10x} Accepts a numeric argument and returns a Float64 number close to the decimal logarithm of the argument. -## sqrt(x) +## sqrt(x) {#sqrtx} Accepts a numeric argument and returns a Float64 number close to the square root of the argument. -## cbrt(x) +## cbrt(x) {#cbrtx} Accepts a numeric argument and returns a Float64 number close to the cubic root of the argument. 
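+
+For illustration (a minimal sketch; the results are Float64 values close to, but not necessarily exactly equal to, the mathematical values), several of the functions above can be combined in a single query:
+
+``` sql
+-- Each expression returns a Float64 close to the exact value (e.g. exp2(10) ≈ 1024, cbrt(27) ≈ 3)
+SELECT e(), pi(), exp2(10), log2(1024), sqrt(16), cbrt(27)
+```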
-## erf(x) +## erf(x) {#erfx} -If 'x' is non-negative, then erf(x / σ√2) is the probability that a random variable having a normal distribution with standard deviation 'σ' takes the value that is separated from the expected value by more than 'x'. +If ‘x’ is non-negative, then erf(x / σ√2) is the probability that a random variable having a normal distribution with standard deviation ‘σ’ takes the value that is separated from the expected value by more than ‘x’. Example (three sigma rule): -```sql +``` sql SELECT erf(3 / sqrt(2)) ``` -```text +``` text ┌─erf(divide(3, sqrt(2)))─┐ │ 0.9973002039367398 │ └─────────────────────────┘ ``` -## erfc(x) +## erfc(x) {#erfcx} -Accepts a numeric argument and returns a Float64 number close to 1 - erf(x), but without loss of precision for large 'x' values. +Accepts a numeric argument and returns a Float64 number close to 1 - erf(x), but without loss of precision for large ‘x’ values. -## lgamma(x) +## lgamma(x) {#lgammax} The logarithm of the gamma function. -## tgamma(x) +## tgamma(x) {#tgammax} Gamma function. -## sin(x) +## sin(x) {#sinx} The sine. -## cos(x) +## cos(x) {#cosx} The cosine. -## tan(x) +## tan(x) {#tanx} The tangent. -## asin(x) +## asin(x) {#asinx} The arc sine. -## acos(x) +## acos(x) {#acosx} The arc cosine. -## atan(x) +## atan(x) {#atanx} The arc tangent. -## pow(x, y), power(x, y) +## pow(x, y), power(x, y) {#powx-y-powerx-y} Takes two numeric arguments x and y. Returns a Float64 number close to x to the power of y. -## intExp2 +## intExp2 {#intexp2} Accepts a numeric argument and returns a UInt64 number close to 2 to the power of x. -## intExp10 +## intExp10 {#intexp10} Accepts a numeric argument and returns a UInt64 number close to 10 to the power of x. diff --git a/docs/en/query_language/functions/other_functions.md b/docs/en/query_language/functions/other_functions.md index 24b6906b57e..cd67541423d 100644 --- a/docs/en/query_language/functions/other_functions.md +++ b/docs/en/query_language/functions/other_functions.md @@ -1,6 +1,6 @@ -# Other functions +# Other functions {#other-functions} -## hostName() +## hostName() {#hostname} Returns a string with the name of the host that this function was performed on. For distributed processing, this is the name of the remote server host, if the function is performed on a remote server. @@ -10,7 +10,7 @@ Returns the fully qualified domain name. **Syntax** -```sql +``` sql fqdn(); ``` @@ -18,7 +18,7 @@ This function is case-insensitive. **Returned value** -- String with the fully qualified domain name. +- String with the fully qualified domain name. Type: `String`. @@ -26,84 +26,90 @@ Type: `String`. Query: -```sql +``` sql SELECT FQDN(); ``` Result: -```text +``` text ┌─FQDN()──────────────────────────┐ │ clickhouse.ru-central1.internal │ └─────────────────────────────────┘ ``` -## basename +## basename {#basename} Extracts the trailing part of a string after the last slash or backslash. This function if often used to extract the filename from a path. -```sql +``` sql basename( expr ) ``` **Parameters** -- `expr` — Expression resulting in a [String](../../data_types/string.md) type value. All the backslashes must be escaped in the resulting value. +- `expr` — Expression resulting in a [String](../../data_types/string.md) type value. All the backslashes must be escaped in the resulting value. **Returned Value** A string that contains: -- The trailing part of a string after the last slash or backslash. +- The trailing part of a string after the last slash or backslash. 
- If the input string contains a path ending with slash or backslash, for example, `/` or `c:\`, the function returns an empty string. + If the input string contains a path ending with slash or backslash, for example, `/` or `c:\`, the function returns an empty string. -- The original string if there are no slashes or backslashes. +- The original string if there are no slashes or backslashes. **Example** -```sql +``` sql SELECT 'some/long/path/to/file' AS a, basename(a) ``` -```text + +``` text ┌─a──────────────────────┬─basename('some\\long\\path\\to\\file')─┐ │ some\long\path\to\file │ file │ └────────────────────────┴────────────────────────────────────────┘ ``` -```sql + +``` sql SELECT 'some\\long\\path\\to\\file' AS a, basename(a) ``` -```text + +``` text ┌─a──────────────────────┬─basename('some\\long\\path\\to\\file')─┐ │ some\long\path\to\file │ file │ └────────────────────────┴────────────────────────────────────────┘ ``` -```sql + +``` sql SELECT 'some-file-name' AS a, basename(a) ``` -```text + +``` text ┌─a──────────────┬─basename('some-file-name')─┐ │ some-file-name │ some-file-name │ └────────────────┴────────────────────────────┘ ``` -## visibleWidth(x) +## visibleWidth(x) {#visiblewidthx} Calculates the approximate width when outputting values to the console in text format (tab-separated). This function is used by the system for implementing Pretty formats. `NULL` is represented as a string corresponding to `NULL` in `Pretty` formats. -```sql +``` sql SELECT visibleWidth(NULL) ``` -```text + +``` text ┌─visibleWidth(NULL)─┐ │ 4 │ └────────────────────┘ ``` -## toTypeName(x) +## toTypeName(x) {#totypenamex} Returns a string containing the type name of the passed argument. @@ -114,34 +120,34 @@ If `NULL` is passed to the function as input, then it returns the `Nullable(Noth Gets the size of the block. In ClickHouse, queries are always run on blocks (sets of column parts). This function allows getting the size of the block that you called it for. -## materialize(x) +## materialize(x) {#materializex} Turns a constant into a full column containing just one value. In ClickHouse, full columns and constants are represented differently in memory. Functions work differently for constant arguments and normal arguments (different code is executed), although the result is almost always the same. This function is for debugging this behavior. -## ignore(...) +## ignore(…) {#ignore} Accepts any arguments, including `NULL`. Always returns 0. However, the argument is still evaluated. This can be used for benchmarks. -## sleep(seconds) +## sleep(seconds) {#sleepseconds} -Sleeps 'seconds' seconds on each data block. You can specify an integer or a floating-point number. +Sleeps ‘seconds’ seconds on each data block. You can specify an integer or a floating-point number. -## sleepEachRow(seconds) +## sleepEachRow(seconds) {#sleepeachrowseconds} -Sleeps 'seconds' seconds on each row. You can specify an integer or a floating-point number. +Sleeps ‘seconds’ seconds on each row. You can specify an integer or a floating-point number. -## currentDatabase() +## currentDatabase() {#currentdatabase} Returns the name of the current database. You can use this function in table engine parameters in a CREATE TABLE query where you need to specify the database. -## currentUser() {#other_function-currentuser} +## currentUser() {#other-function-currentuser} Returns the login of current user. Login of user, that initiated query, will be returned in case distibuted query. 
-```sql +``` sql SELECT currentUser(); ``` @@ -149,8 +155,8 @@ Alias: `user()`, `USER()`. **Returned values** -- Login of current user. -- Login of user that initiated query in case of disributed query. +- Login of current user. +- Login of user that initiated query in case of disributed query. Type: `String`. @@ -158,23 +164,23 @@ Type: `String`. Query: -```sql +``` sql SELECT currentUser(); ``` Result: -```text +``` text ┌─currentUser()─┐ │ default │ └───────────────┘ ``` -## isFinite(x) +## isFinite(x) {#isfinitex} Accepts Float32 and Float64 and returns UInt8 equal to 1 if the argument is not infinite and not a NaN, otherwise 0. -## isInfinite(x) +## isInfinite(x) {#isinfinitex} Accepts Float32 and Float64 and returns UInt8 equal to 1 if the argument is infinite, otherwise 0. Note that 0 is returned for a NaN. @@ -182,44 +188,39 @@ Accepts Float32 and Float64 and returns UInt8 equal to 1 if the argument is infi Checks whether floating point value is finite. -**Syntax** +**Syntax** -``` -ifNotFinite(x,y) -``` -**Parameters** + ifNotFinite(x,y) -- `x` — Value to be checked for infinity. Type: [Float*](../../data_types/float.md). -- `y` — Fallback value. Type: [Float*](../../data_types/float.md). +**Parameters** + +- `x` — Value to be checked for infinity. Type: [Float\*](../../data_types/float.md). +- `y` — Fallback value. Type: [Float\*](../../data_types/float.md). **Returned value** -- `x` if `x` is finite. -- `y` if `x` is not finite. +- `x` if `x` is finite. +- `y` if `x` is not finite. **Example** Query: -``` -SELECT 1/0 as infimum, ifNotFinite(infimum,42) -``` + SELECT 1/0 as infimum, ifNotFinite(infimum,42) Result: -``` -┌─infimum─┬─ifNotFinite(divide(1, 0), 42)─┐ -│ inf │ 42 │ -└─────────┴───────────────────────────────┘ -``` + ┌─infimum─┬─ifNotFinite(divide(1, 0), 42)─┐ + │ inf │ 42 │ + └─────────┴───────────────────────────────┘ You can get similar result by using [ternary operator](conditional_functions.md#ternary-operator): `isFinite(x) ? x : y`. -## isNaN(x) +## isNaN(x) {#isnanx} Accepts Float32 and Float64 and returns UInt8 equal to 1 if the argument is a NaN, otherwise 0. -## hasColumnInTable(\['hostname'\[, 'username'\[, 'password'\]\],\] 'database', 'table', 'column') +## hasColumnInTable(\[‘hostname’\[, ‘username’\[, ‘password’\]\],\] ‘database’, ‘table’, ‘column’) {#hascolumnintablehostname-username-password-database-table-column} Accepts constant strings: database name, table name, and column name. Returns a UInt8 constant expression equal to 1 if there is a column, otherwise 0. If the hostname parameter is set, the test will run on a remote server. The function throws an exception if the table does not exist. @@ -233,15 +234,15 @@ Allows building a unicode-art diagram. Parameters: -- `x` — Size to display. -- `min, max` — Integer constants. The value must fit in `Int64`. -- `width` — Constant, positive integer, can be fractional. +- `x` — Size to display. +- `min, max` — Integer constants. The value must fit in `Int64`. +- `width` — Constant, positive integer, can be fractional. The band is drawn with accuracy to one eighth of a symbol. 
Example: -```sql +``` sql SELECT toHour(EventTime) AS h, count() AS c, @@ -251,7 +252,7 @@ GROUP BY h ORDER BY h ASC ``` -```text +``` text ┌──h─┬──────c─┬─bar────────────────┐ │ 0 │ 292907 │ █████████▋ │ │ 1 │ 180563 │ ██████ │ @@ -280,21 +281,20 @@ ORDER BY h ASC └────┴────────┴────────────────────┘ ``` - -## transform +## transform {#transform} Transforms a value according to the explicitly defined mapping of some elements to other ones. There are two variations of this function: -### transform(x, array_from, array_to, default) +### transform(x, array\_from, array\_to, default) {#transformx-array-from-array-to-default} `x` – What to transform. `array_from` – Constant array of values for converting. -`array_to` – Constant array of values to convert the values in 'from' to. +`array_to` – Constant array of values to convert the values in ‘from’ to. -`default` – Which value to use if 'x' is not equal to any of the values in 'from'. +`default` – Which value to use if ‘x’ is not equal to any of the values in ‘from’. `array_from` and `array_to` – Arrays of the same size. @@ -306,11 +306,11 @@ Types: Where the same letter is indicated (T or U), for numeric types these might not be matching types, but types that have a common type. For example, the first argument can have the Int64 type, while the second has the Array(UInt16) type. -If the 'x' value is equal to one of the elements in the 'array_from' array, it returns the existing element (that is numbered the same) from the 'array_to' array. Otherwise, it returns 'default'. If there are multiple matching elements in 'array_from', it returns one of the matches. +If the ‘x’ value is equal to one of the elements in the ‘array\_from’ array, it returns the existing element (that is numbered the same) from the ‘array\_to’ array. Otherwise, it returns ‘default’. If there are multiple matching elements in ‘array\_from’, it returns one of the matches. Example: -```sql +``` sql SELECT transform(SearchEngineID, [2, 3], ['Yandex', 'Google'], 'Other') AS title, count() AS c @@ -320,7 +320,7 @@ GROUP BY title ORDER BY c DESC ``` -```text +``` text ┌─title─────┬──────c─┐ │ Yandex │ 498635 │ │ Google │ 229872 │ @@ -328,10 +328,10 @@ ORDER BY c DESC └───────────┴────────┘ ``` -### transform(x, array_from, array_to) +### transform(x, array\_from, array\_to) {#transformx-array-from-array-to} -Differs from the first variation in that the 'default' argument is omitted. -If the 'x' value is equal to one of the elements in the 'array_from' array, it returns the matching element (that is numbered the same) from the 'array_to' array. Otherwise, it returns 'x'. +Differs from the first variation in that the ‘default’ argument is omitted. +If the ‘x’ value is equal to one of the elements in the ‘array\_from’ array, it returns the matching element (that is numbered the same) from the ‘array\_to’ array. Otherwise, it returns ‘x’. Types: @@ -339,7 +339,7 @@ Types: Example: -```sql +``` sql SELECT transform(domain(Referer), ['yandex.ru', 'google.ru', 'vk.com'], ['www.yandex', 'example.com']) AS s, count() AS c @@ -349,7 +349,7 @@ ORDER BY count() DESC LIMIT 10 ``` -```text +``` text ┌─s──────────────┬───────c─┐ │ │ 2906259 │ │ www.yandex │ 867767 │ @@ -363,19 +363,19 @@ LIMIT 10 └────────────────┴─────────┘ ``` -## formatReadableSize(x) +## formatReadableSize(x) {#formatreadablesizex} Accepts the size (number of bytes). Returns a rounded size with a suffix (KiB, MiB, etc.) as a string. 
Example: -```sql +``` sql SELECT arrayJoin([1, 1024, 1024*1024, 192851925]) AS filesize_bytes, formatReadableSize(filesize_bytes) AS filesize ``` -```text +``` text ┌─filesize_bytes─┬─filesize───┐ │ 1 │ 1.00 B │ │ 1024 │ 1.00 KiB │ @@ -384,27 +384,27 @@ SELECT └────────────────┴────────────┘ ``` -## least(a, b) +## least(a, b) {#leasta-b} Returns the smallest value from a and b. -## greatest(a, b) +## greatest(a, b) {#greatesta-b} Returns the largest value of a and b. -## uptime() +## uptime() {#uptime} -Returns the server's uptime in seconds. +Returns the server’s uptime in seconds. -## version() +## version() {#version} Returns the version of the server as a string. -## timezone() +## timezone() {#timezone} Returns the timezone of the server. -## blockNumber +## blockNumber {#blocknumber} Returns the sequence number of the data block where the row is located. @@ -412,7 +412,7 @@ Returns the sequence number of the data block where the row is located. Returns the ordinal number of the row in the data block. Different data blocks are always recalculated. -## rowNumberInAllBlocks() +## rowNumberInAllBlocks() {#rownumberinallblocks} Returns the ordinal number of the row in the data block. This function only considers the affected data blocks. @@ -422,7 +422,7 @@ The window function that provides access to a row at a specified offset which co **Syntax** -```sql +``` sql neighbor(column, offset[, default_value]) ``` @@ -431,14 +431,14 @@ If you make a subquery with ORDER BY and call the function from outside the subq **Parameters** -- `column` — A column name or scalar expression. -- `offset` — The number of rows forwards or backwards from the current row of `column`. [Int64](../../data_types/int_uint.md). -- `default_value` — Optional. The value to be returned if offset goes beyond the scope of the block. Type of data blocks affected. +- `column` — A column name or scalar expression. +- `offset` — The number of rows forwards or backwards from the current row of `column`. [Int64](../../data_types/int_uint.md). +- `default_value` — Optional. The value to be returned if offset goes beyond the scope of the block. Type of data blocks affected. **Returned values** -- Value for `column` in `offset` distance from current row if `offset` value is not outside block bounds. -- Default value for `column` if `offset` value is outside block bounds. If `default_value` is given, then it will be used. +- Value for `column` in `offset` distance from current row if `offset` value is not outside block bounds. +- Default value for `column` if `offset` value is outside block bounds. If `default_value` is given, then it will be used. Type: type of data blocks affected or default value type. @@ -446,13 +446,13 @@ Type: type of data blocks affected or default value type. 
Query: -```sql +``` sql SELECT number, neighbor(number, 2) FROM system.numbers LIMIT 10; ``` Result: -```text +``` text ┌─number─┬─neighbor(number, 2)─┐ │ 0 │ 2 │ │ 1 │ 3 │ @@ -469,13 +469,13 @@ Result: Query: -```sql +``` sql SELECT number, neighbor(number, 2, 999) FROM system.numbers LIMIT 10; ``` Result: -```text +``` text ┌─number─┬─neighbor(number, 2, 999)─┐ │ 0 │ 2 │ │ 1 │ 3 │ @@ -494,7 +494,7 @@ This function can be used to compute year-over-year metric value: Query: -```sql +``` sql WITH toDate('2018-01-01') AS start_date SELECT toStartOfMonth(start_date + (number * 32)) AS month, @@ -506,7 +506,7 @@ FROM numbers(16) Result: -```text +``` text ┌──────month─┬─money─┬─prev_year─┬─year_over_year─┐ │ 2018-01-01 │ 32 │ 0 │ 0 │ │ 2018-02-01 │ 63 │ 0 │ 0 │ @@ -527,7 +527,7 @@ Result: └────────────┴───────┴───────────┴────────────────┘ ``` -## runningDifference(x) {#other_functions-runningdifference} +## runningDifference(x) {#other-functions-runningdifference} Calculates the difference between successive row values ​​in the data block. Returns 0 for the first row and the difference from the previous row for each subsequent row. @@ -537,7 +537,7 @@ If you make a subquery with ORDER BY and call the function from outside the subq Example: -```sql +``` sql SELECT EventID, EventTime, @@ -554,7 +554,7 @@ FROM ) ``` -```text +``` text ┌─EventID─┬───────────EventTime─┬─delta─┐ │ 1106 │ 2016-11-24 00:00:04 │ 0 │ │ 1107 │ 2016-11-24 00:00:05 │ 1 │ @@ -566,14 +566,15 @@ FROM Please note - block size affects the result. With each new block, the `runningDifference` state is reset. -```sql +``` sql SELECT number, runningDifference(number + 1) AS diff FROM numbers(100000) WHERE diff != 1 ``` -```text + +``` text ┌─number─┬─diff─┐ │ 0 │ 0 │ └────────┴──────┘ @@ -581,7 +582,8 @@ WHERE diff != 1 │ 65536 │ 0 │ └────────┴──────┘ ``` -```sql + +``` sql set max_block_size=100000 -- default value is 65536! SELECT @@ -590,114 +592,119 @@ SELECT FROM numbers(100000) WHERE diff != 1 ``` -```text + +``` text ┌─number─┬─diff─┐ │ 0 │ 0 │ └────────┴──────┘ ``` -## runningDifferenceStartingWithFirstValue +## runningDifferenceStartingWithFirstValue {#runningdifferencestartingwithfirstvalue} Same as for [runningDifference](./other_functions.md#other_functions-runningdifference), the difference is the value of the first row, returned the value of the first row, and each subsequent row returns the difference from the previous row. -## MACNumToString(num) +## MACNumToString(num) {#macnumtostringnum} Accepts a UInt64 number. Interprets it as a MAC address in big endian. Returns a string containing the corresponding MAC address in the format AA:BB:CC:DD:EE:FF (colon-separated numbers in hexadecimal form). -## MACStringToNum(s) +## MACStringToNum(s) {#macstringtonums} The inverse function of MACNumToString. If the MAC address has an invalid format, it returns 0. -## MACStringToOUI(s) +## MACStringToOUI(s) {#macstringtoouis} Accepts a MAC address in the format AA:BB:CC:DD:EE:FF (colon-separated numbers in hexadecimal form). Returns the first three octets as a UInt64 number. If the MAC address has an invalid format, it returns 0. -## getSizeOfEnumType +## getSizeOfEnumType {#getsizeofenumtype} Returns the number of fields in [Enum](../../data_types/enum.md). -```sql +``` sql getSizeOfEnumType(value) ``` **Parameters:** -- `value` — Value of type `Enum`. +- `value` — Value of type `Enum`. **Returned values** -- The number of fields with `Enum` input values. -- An exception is thrown if the type is not `Enum`. 
+- The number of fields with `Enum` input values. +- An exception is thrown if the type is not `Enum`. **Example** -```sql +``` sql SELECT getSizeOfEnumType( CAST('a' AS Enum8('a' = 1, 'b' = 2) ) ) AS x ``` -```text + +``` text ┌─x─┐ │ 2 │ └───┘ ``` -## blockSerializedSize +## blockSerializedSize {#blockserializedsize} Returns size on disk (without taking into account compression). - -```sql +``` sql blockSerializedSize(value[, value[, ...]]) ``` **Parameters:** -- `value` — Any value. +- `value` — Any value. **Returned values** -- The number of bytes that will be written to disk for block of values (without compression). +- The number of bytes that will be written to disk for block of values (without compression). **Example** -```sql +``` sql SELECT blockSerializedSize(maxState(1)) as x ``` -```text + +``` text ┌─x─┐ │ 2 │ └───┘ ``` -## toColumnTypeName +## toColumnTypeName {#tocolumntypename} Returns the name of the class that represents the data type of the column in RAM. -```sql +``` sql toColumnTypeName(value) ``` **Parameters:** -- `value` — Any type of value. +- `value` — Any type of value. **Returned values** -- A string with the name of the class that is used for representing the `value` data type in RAM. +- A string with the name of the class that is used for representing the `value` data type in RAM. -**Example of the difference between` toTypeName ' and ' toColumnTypeName`** +**Example of the difference between`toTypeName ' and ' toColumnTypeName`** -```sql +``` sql SELECT toTypeName(CAST('2018-01-01 01:02:03' AS DateTime)) ``` -```text + +``` text ┌─toTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ │ DateTime │ └─────────────────────────────────────────────────────┘ ``` -```sql + +``` sql SELECT toColumnTypeName(CAST('2018-01-01 01:02:03' AS DateTime)) ``` -```text + +``` text ┌─toColumnTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ │ Const(UInt32) │ └───────────────────────────────────────────────────────────┘ @@ -705,87 +712,90 @@ SELECT toColumnTypeName(CAST('2018-01-01 01:02:03' AS DateTime)) The example shows that the `DateTime` data type is stored in memory as `Const(UInt32)`. -## dumpColumnStructure +## dumpColumnStructure {#dumpcolumnstructure} Outputs a detailed description of data structures in RAM -```sql +``` sql dumpColumnStructure(value) ``` **Parameters:** -- `value` — Any type of value. +- `value` — Any type of value. **Returned values** -- A string describing the structure that is used for representing the `value` data type in RAM. +- A string describing the structure that is used for representing the `value` data type in RAM. **Example** -```sql +``` sql SELECT dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime')) ``` -```text + +``` text ┌─dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ │ DateTime, Const(size = 1, UInt32(size = 1)) │ └──────────────────────────────────────────────────────────────┘ ``` -## defaultValueOfArgumentType +## defaultValueOfArgumentType {#defaultvalueofargumenttype} Outputs the default value for the data type. Does not include default values for custom columns set by the user. -```sql +``` sql defaultValueOfArgumentType(expression) ``` **Parameters:** -- `expression` — Arbitrary type of value or an expression that results in a value of an arbitrary type. +- `expression` — Arbitrary type of value or an expression that results in a value of an arbitrary type. **Returned values** -- `0` for numbers. -- Empty string for strings. -- `ᴺᵁᴸᴸ` for [Nullable](../../data_types/nullable.md). +- `0` for numbers. 
+- Empty string for strings. +- `ᴺᵁᴸᴸ` for [Nullable](../../data_types/nullable.md). **Example** -```sql +``` sql SELECT defaultValueOfArgumentType( CAST(1 AS Int8) ) ``` -```text + +``` text ┌─defaultValueOfArgumentType(CAST(1, 'Int8'))─┐ │ 0 │ └─────────────────────────────────────────────┘ ``` -```sql + +``` sql SELECT defaultValueOfArgumentType( CAST(1 AS Nullable(Int8) ) ) ``` -```text + +``` text ┌─defaultValueOfArgumentType(CAST(1, 'Nullable(Int8)'))─┐ │ ᴺᵁᴸᴸ │ └───────────────────────────────────────────────────────┘ ``` - -## replicate {#other_functions-replicate} +## replicate {#other-functions-replicate} Creates an array with a single value. Used for internal implementation of [arrayJoin](array_join.md#functions_arrayjoin). -```sql +``` sql SELECT replicate(x, arr); ``` **Parameters:** -- `arr` — Original array. ClickHouse creates a new array of the same length as the original and fills it with the value `x`. -- `x` — The value that the resulting array will be filled with. +- `arr` — Original array. ClickHouse creates a new array of the same length as the original and fills it with the value `x`. +- `x` — The value that the resulting array will be filled with. **Returned value** @@ -797,13 +807,13 @@ Type: `Array`. Query: -```sql +``` sql SELECT replicate(1, ['a', 'b', 'c']) ``` Result: -```text +``` text ┌─replicate(1, ['a', 'b', 'c'])─┐ │ [1,1,1] │ └───────────────────────────────┘ @@ -815,13 +825,13 @@ Returns amount of remaining space on the filesystem where the files of the datab **Syntax** -```sql +``` sql filesystemAvailable() ``` **Returned value** -- The amount of remaining space available in bytes. +- The amount of remaining space available in bytes. Type: [UInt64](../../data_types/int_uint.md). @@ -829,13 +839,13 @@ Type: [UInt64](../../data_types/int_uint.md). Query: -```sql +``` sql SELECT formatReadableSize(filesystemAvailable()) AS "Available space", toTypeName(filesystemAvailable()) AS "Type"; ``` Result: -```text +``` text ┌─Available space─┬─Type───┐ │ 30.75 GiB │ UInt64 │ └─────────────────┴────────┘ @@ -847,13 +857,13 @@ Returns total amount of the free space on the filesystem where the files of the **Syntax** -```sql +``` sql filesystemFree() ``` **Returned value** -- Amount of free space in bytes. +- Amount of free space in bytes. Type: [UInt64](../../data_types/int_uint.md). @@ -861,13 +871,13 @@ Type: [UInt64](../../data_types/int_uint.md). Query: -```sql +``` sql SELECT formatReadableSize(filesystemFree()) AS "Free space", toTypeName(filesystemFree()) AS "Type"; ``` Result: -```text +``` text ┌─Free space─┬─Type───┐ │ 32.39 GiB │ UInt64 │ └────────────┴────────┘ @@ -879,13 +889,13 @@ Returns the capacity of the filesystem in bytes. For evaluation, the [path](../. **Syntax** -```sql +``` sql filesystemCapacity() ``` **Returned value** -- Capacity information of the filesystem in bytes. +- Capacity information of the filesystem in bytes. Type: [UInt64](../../data_types/int_uint.md). @@ -893,13 +903,13 @@ Type: [UInt64](../../data_types/int_uint.md). 
Query: -```sql +``` sql SELECT formatReadableSize(filesystemCapacity()) AS "Capacity", toTypeName(filesystemCapacity()) AS "Type" ``` Result: -```text +``` text ┌─Capacity──┬─Type───┐ │ 39.32 GiB │ UInt64 │ └───────────┴────────┘ @@ -925,21 +935,21 @@ Only supports tables created with the `ENGINE = Join(ANY, LEFT, )` st **Syntax** -```sql +``` sql joinGet(join_storage_table_name, `value_column`, join_keys) ``` **Parameters** -- `join_storage_table_name` — an [identifier](../syntax.md#syntax-identifiers) indicates where search is performed. The identifier is searched in the default database (see parameter `default_database` in the config file). To override the default database, use the `USE db_name` or specify the database and the table through the separator `db_name.db_table`, see the example. -- `value_column` — name of the column of the table that contains required data. -- `join_keys` — list of keys. +- `join_storage_table_name` — an [identifier](../syntax.md#syntax-identifiers) indicates where search is performed. The identifier is searched in the default database (see parameter `default_database` in the config file). To override the default database, use the `USE db_name` or specify the database and the table through the separator `db_name.db_table`, see the example. +- `value_column` — name of the column of the table that contains required data. +- `join_keys` — list of keys. **Returned value** Returns list of values corresponded to list of keys. -If certain doesn't exist in source table then `0` or `null` will be returned based on [join_use_nulls](../../operations/settings/settings.md#join_use_nulls) setting. +If certain doesn’t exist in source table then `0` or `null` will be returned based on [join\_use\_nulls](../../operations/settings/settings.md#join_use_nulls) setting. More info about `join_use_nulls` in [Join operation](../../operations/table_engines/join.md). @@ -947,13 +957,13 @@ More info about `join_use_nulls` in [Join operation](../../operations/table_engi Input table: -```sql +``` sql CREATE DATABASE db_test CREATE TABLE db_test.id_val(`id` UInt32, `val` UInt32) ENGINE = Join(ANY, LEFT, id) SETTINGS join_use_nulls = 1 INSERT INTO db_test.id_val VALUES (1,11)(2,12)(4,13) ``` -```text +``` text ┌─id─┬─val─┐ │ 4 │ 13 │ │ 2 │ 12 │ @@ -963,13 +973,13 @@ INSERT INTO db_test.id_val VALUES (1,11)(2,12)(4,13) Query: -```sql +``` sql SELECT joinGet(db_test.id_val,'val',toUInt32(number)) from numbers(4) SETTINGS join_use_nulls = 1 ``` Result: -```text +``` text ┌─joinGet(db_test.id_val, 'val', toUInt32(number))─┐ │ 0 │ │ 11 │ @@ -978,30 +988,32 @@ Result: └──────────────────────────────────────────────────┘ ``` -## modelEvaluate(model_name, ...) {#function-modelevaluate} +## modelEvaluate(model\_name, …) {#function-modelevaluate} + Evaluate external model. Accepts a model name and model arguments. Returns Float64. -## throwIf(x\[, custom_message\]) +## throwIf(x\[, custom\_message\]) {#throwifx-custom-message} Throw an exception if the argument is non zero. -custom_message - is an optional parameter: a constant string, provides an error message +custom\_message - is an optional parameter: a constant string, provides an error message -```sql +``` sql SELECT throwIf(number = 3, 'Too many') FROM numbers(10); ``` -```text + +``` text ↙ Progress: 0.00 rows, 0.00 B (0.00 rows/s., 0.00 B/s.) Received exception from server (version 19.14.1): Code: 395. DB::Exception: Received from localhost:9000. DB::Exception: Too many. 
``` ## identity {#identity} -Returns the same value that was used as its argument. Used for debugging and testing, allows to cancel using index, and get the query performance of a full scan. When query is analyzed for possible use of index, the analyzer doesn't look inside `identity` functions. +Returns the same value that was used as its argument. Used for debugging and testing, allows to cancel using index, and get the query performance of a full scan. When query is analyzed for possible use of index, the analyzer doesn’t look inside `identity` functions. **Syntax** -```sql +``` sql identity(x) ``` @@ -1009,13 +1021,13 @@ identity(x) Query: -```sql +``` sql SELECT identity(42) ``` Result: -```text +``` text ┌─identity(42)─┐ │ 42 │ └──────────────┘ @@ -1027,28 +1039,29 @@ Generates a string with a random set of [ASCII](https://en.wikipedia.org/wiki/AS **Syntax** -```sql +``` sql randomPrintableASCII(length) ``` **Parameters** -- `length` — Resulting string length. Positive integer. +- `length` — Resulting string length. Positive integer. - If you pass `length < 0`, behavior of the function is undefined. + If you pass `length < 0`, behavior of the function is undefined. **Returned value** - - String with a random set of [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters) printable characters. +- String with a random set of [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters) printable characters. Type: [String](../../data_types/string.md) **Example** -```sql +``` sql SELECT number, randomPrintableASCII(30) as str, length(str) FROM system.numbers LIMIT 3 ``` -```text + +``` text ┌─number─┬─str────────────────────────────┬─length(randomPrintableASCII(30))─┐ │ 0 │ SuiCOSTvC0csfABSw=UcSzp2.`rv8x │ 30 │ │ 1 │ 1Ag NlJ &RCN:*>HVPG;PE-nO"SUFD │ 30 │ diff --git a/docs/en/query_language/functions/random_functions.md b/docs/en/query_language/functions/random_functions.md index 91bd5a5a725..744eca48a01 100644 --- a/docs/en/query_language/functions/random_functions.md +++ b/docs/en/query_language/functions/random_functions.md @@ -1,4 +1,4 @@ -# Functions for generating pseudo-random numbers +# Functions for generating pseudo-random numbers {#functions-for-generating-pseudo-random-numbers} Non-cryptographic generators of pseudo-random numbers are used. @@ -6,17 +6,17 @@ All the functions accept zero arguments or one argument. If an argument is passed, it can be any type, and its value is not used for anything. The only purpose of this argument is to prevent common subexpression elimination, so that two different instances of the same function return different columns with different random numbers. -## rand +## rand {#rand} Returns a pseudo-random UInt32 number, evenly distributed among all UInt32-type numbers. Uses a linear congruential generator. -## rand64 +## rand64 {#rand64} Returns a pseudo-random UInt64 number, evenly distributed among all UInt64-type numbers. Uses a linear congruential generator. -## randConstant +## randConstant {#randconstant} Returns a pseudo-random UInt32 number, The value is one for different blocks. 
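The generator functions above are documented without a usage sample. A minimal sketch follows (the output values are pseudo-random, so any concrete numbers would only be indicative; `numbers(3)` is used merely as a convenient three-row source):

``` sql
SELECT
    rand()         AS r32,  -- pseudo-random UInt32, differs per row
    rand64()       AS r64,  -- pseudo-random UInt64, differs per row
    randConstant() AS rc    -- expected to repeat the same value for the rows of a block
FROM numbers(3)
```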
diff --git a/docs/en/query_language/functions/rounding_functions.md b/docs/en/query_language/functions/rounding_functions.md index 740ae3112fb..538851b6dfc 100644 --- a/docs/en/query_language/functions/rounding_functions.md +++ b/docs/en/query_language/functions/rounding_functions.md @@ -1,55 +1,56 @@ -# Rounding functions +# Rounding functions {#rounding-functions} -## floor(x\[, N\]) +## floor(x\[, N\]) {#floorx-n} -Returns the largest round number that is less than or equal to `x`. A round number is a multiple of 1/10N, or the nearest number of the appropriate data type if 1 / 10N isn't exact. -'N' is an integer constant, optional parameter. By default it is zero, which means to round to an integer. -'N' may be negative. +Returns the largest round number that is less than or equal to `x`. A round number is a multiple of 1/10N, or the nearest number of the appropriate data type if 1 / 10N isn’t exact. +‘N’ is an integer constant, optional parameter. By default it is zero, which means to round to an integer. +‘N’ may be negative. Examples: `floor(123.45, 1) = 123.4, floor(123.45, -1) = 120.` `x` is any numeric type. The result is a number of the same type. -For integer arguments, it makes sense to round with a negative `N` value (for non-negative `N`, the function doesn't do anything). +For integer arguments, it makes sense to round with a negative `N` value (for non-negative `N`, the function doesn’t do anything). If rounding causes overflow (for example, floor(-128, -1)), an implementation-specific result is returned. -## ceil(x\[, N\]), ceiling(x\[, N\]) +## ceil(x\[, N\]), ceiling(x\[, N\]) {#ceilx-n-ceilingx-n} Returns the smallest round number that is greater than or equal to `x`. In every other way, it is the same as the `floor` function (see above). -## trunc(x\[, N\]), truncate(x\[, N\]) +## trunc(x\[, N\]), truncate(x\[, N\]) {#truncx-n-truncatex-n} -Returns the round number with largest absolute value that has an absolute value less than or equal to `x`'s. In every other way, it is the same as the 'floor' function (see above). +Returns the round number with largest absolute value that has an absolute value less than or equal to `x`‘s. In every other way, it is the same as the ’floor’ function (see above). -## round(x\[, N\]) {#rounding_functions-round} +## round(x\[, N\]) {#rounding-functions-round} Rounds a value to a specified number of decimal places. -The function returns the nearest number of the specified order. In case when given number has equal distance to surrounding numbers, the function uses banker's rounding for float number types and rounds away from zero for the other number types. +The function returns the nearest number of the specified order. In case when given number has equal distance to surrounding numbers, the function uses banker’s rounding for float number types and rounds away from zero for the other number types. -```sql +``` sql round(expression [, decimal_places]) ``` **Parameters:** -- `expression` — A number to be rounded. Can be any [expression](../syntax.md#syntax-expressions) returning the numeric [data type](../../data_types/index.md#data_types). -- `decimal-places` — An integer value. - - If `decimal-places > 0` then the function rounds the value to the right of the decimal point. - - If `decimal-places < 0` then the function rounds the value to the left of the decimal point. - - If `decimal-places = 0` then the function rounds the value to integer. In this case the argument can be omitted. +- `expression` — A number to be rounded. 
Can be any [expression](../syntax.md#syntax-expressions) returning the numeric [data type](../../data_types/index.md#data_types). +- `decimal-places` — An integer value. + - If `decimal-places > 0` then the function rounds the value to the right of the decimal point. + - If `decimal-places < 0` then the function rounds the value to the left of the decimal point. + - If `decimal-places = 0` then the function rounds the value to integer. In this case the argument can be omitted. **Returned value:** The rounded number of the same type as the input number. -### Examples +### Examples {#examples} **Example of use** -```sql +``` sql SELECT number / 2 AS x, round(x) FROM system.numbers LIMIT 3 ``` -```text + +``` text ┌───x─┬─round(divide(number, 2))─┐ │ 0 │ 0 │ │ 0.5 │ 0 │ @@ -61,7 +62,7 @@ SELECT number / 2 AS x, round(x) FROM system.numbers LIMIT 3 Rounding to the nearest number. -```text +``` text round(3.2, 0) = 3 round(4.1267, 2) = 4.13 round(22,-1) = 20 @@ -69,70 +70,70 @@ round(467,-2) = 500 round(-467,-2) = -500 ``` -Banker's rounding. +Banker’s rounding. -```text +``` text round(3.5) = 4 round(4.5) = 4 round(3.55, 1) = 3.6 round(3.65, 1) = 3.6 ``` -**See Also** +**See Also** -- [roundBankers](#roundbankers) +- [roundBankers](#roundbankers) ## roundBankers {#roundbankers} Rounds a number to a specified decimal position. -- If the rounding number is halfway between two numbers, the function uses banker's rounding. +- If the rounding number is halfway between two numbers, the function uses banker’s rounding. - Banker's rounding is a method of rounding fractional numbers. When the rounding number is halfway between two numbers, it's rounded to the nearest even digit at the specified decimal position. For example: 3.5 rounds up to 4, 2.5 rounds down to 2. - - It's the default rounding method for floating point numbers defined in [IEEE 754](https://en.wikipedia.org/wiki/IEEE_754#Roundings_to_nearest). The [round](#rounding_functions-round) function performs the same rounding for floating point numbers. The `roundBankers` function also rounds integers the same way, for example, `roundBankers(45, -1) = 40`. + Banker's rounding is a method of rounding fractional numbers. When the rounding number is halfway between two numbers, it's rounded to the nearest even digit at the specified decimal position. For example: 3.5 rounds up to 4, 2.5 rounds down to 2. -- In other cases, the function rounds numbers to the nearest integer. + It's the default rounding method for floating point numbers defined in [IEEE 754](https://en.wikipedia.org/wiki/IEEE_754#Roundings_to_nearest). The [round](#rounding_functions-round) function performs the same rounding for floating point numbers. The `roundBankers` function also rounds integers the same way, for example, `roundBankers(45, -1) = 40`. -Using banker's rounding, you can reduce the effect that rounding numbers has on the results of summing or subtracting these numbers. +- In other cases, the function rounds numbers to the nearest integer. + +Using banker’s rounding, you can reduce the effect that rounding numbers has on the results of summing or subtracting these numbers. For example, sum numbers 1.5, 2.5, 3.5, 4.5 with different rounding: -- No rounding: 1.5 + 2.5 + 3.5 + 4.5 = 12. -- Banker's rounding: 2 + 2 + 4 + 4 = 12. -- Rounding to the nearest integer: 2 + 3 + 4 + 5 = 14. +- No rounding: 1.5 + 2.5 + 3.5 + 4.5 = 12. +- Banker’s rounding: 2 + 2 + 4 + 4 = 12. +- Rounding to the nearest integer: 2 + 3 + 4 + 5 = 14. 
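The first two items of the comparison above can be checked directly. A minimal sketch, using `arrayJoin` only to produce the four sample values (the per-row results in the comments follow from the rules listed):

``` sql
SELECT
    sum(x)               AS no_rounding,      -- 1.5 + 2.5 + 3.5 + 4.5 = 12
    sum(roundBankers(x)) AS bankers_rounding  -- 2 + 2 + 4 + 4 = 12
FROM (SELECT arrayJoin([1.5, 2.5, 3.5, 4.5]) AS x)
```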
-**Syntax** +**Syntax** -```sql +``` sql roundBankers(expression [, decimal_places]) ``` -**Parameters** +**Parameters** -- `expression` — A number to be rounded. Can be any [expression](../syntax.md#syntax-expressions) returning the numeric [data type](../../data_types/index.md#data_types). -- `decimal-places` — Decimal places. An integer number. - - `decimal-places > 0` — The function rounds the number to the given position right of the decimal point. Example: `roundBankers(3.55, 1) = 3.6`. - - `decimal-places < 0` — The function rounds the number to the given position left of the decimal point. Example: `roundBankers(24.55, -1) = 20`. - - `decimal-places = 0` — The function rounds the number to an integer. In this case the argument can be omitted. Example: `roundBankers(2.5) = 2`. +- `expression` — A number to be rounded. Can be any [expression](../syntax.md#syntax-expressions) returning the numeric [data type](../../data_types/index.md#data_types). +- `decimal-places` — Decimal places. An integer number. + - `decimal-places > 0` — The function rounds the number to the given position right of the decimal point. Example: `roundBankers(3.55, 1) = 3.6`. + - `decimal-places < 0` — The function rounds the number to the given position left of the decimal point. Example: `roundBankers(24.55, -1) = 20`. + - `decimal-places = 0` — The function rounds the number to an integer. In this case the argument can be omitted. Example: `roundBankers(2.5) = 2`. **Returned value** -A value rounded by the banker's rounding method. +A value rounded by the banker’s rounding method. -### Examples +### Examples {#examples-1} **Example of use** Query: -```sql +``` sql SELECT number / 2 AS x, roundBankers(x, 0) AS b fROM system.numbers limit 10 ``` Result: -```text +``` text ┌───x─┬─b─┐ │ 0 │ 0 │ │ 0.5 │ 0 │ @@ -147,9 +148,9 @@ Result: └─────┴───┘ ``` -**Examples of Banker's rounding** +**Examples of Banker’s rounding** -```text +``` text roundBankers(0.4) = 0 roundBankers(-3.5) = -4 roundBankers(4.5) = 4 @@ -159,23 +160,23 @@ roundBankers(10.35, 1) = 10.4 roundBankers(10.755, 2) = 11,76 ``` -**See Also** +**See Also** -- [round](#rounding_functions-round) +- [round](#rounding_functions-round) -## roundToExp2(num) +## roundToExp2(num) {#roundtoexp2num} Accepts a number. If the number is less than one, it returns 0. Otherwise, it rounds the number down to the nearest (whole non-negative) degree of two. -## roundDuration(num) +## roundDuration(num) {#rounddurationnum} Accepts a number. If the number is less than one, it returns 0. Otherwise, it rounds the number down to numbers from the set: 1, 10, 30, 60, 120, 180, 240, 300, 600, 1200, 1800, 3600, 7200, 18000, 36000. This function is specific to Yandex.Metrica and used for implementing the report on session length. -## roundAge(num) +## roundAge(num) {#roundagenum} Accepts a number. If the number is less than 18, it returns 0. Otherwise, it rounds the number down to a number from the set: 18, 25, 35, 45, 55. This function is specific to Yandex.Metrica and used for implementing the report on user age. -## roundDown(num, arr) +## roundDown(num, arr) {#rounddownnum-arr} Accepts a number and rounds it down to an element in the specified array. If the value is less than the lowest bound, the lowest bound is returned. 
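`roundDown` is described without an example. A small sketch with arbitrary bucket boundaries (the expected buckets follow from the rule above):

``` sql
SELECT
    arrayJoin([1, 5, 12, 50])        AS num,
    roundDown(num, [3, 4.5, 10, 20]) AS bucket
```

With these inputs the expected buckets are 3, 4.5, 10 and 20: the value 1 is below the lowest bound, so the lowest bound is returned.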
diff --git a/docs/en/query_language/functions/splitting_merging_functions.md b/docs/en/query_language/functions/splitting_merging_functions.md
index 514c2165376..b91cdd4ed59 100644
--- a/docs/en/query_language/functions/splitting_merging_functions.md
+++ b/docs/en/query_language/functions/splitting_merging_functions.md
@@ -1,31 +1,109 @@
-# Functions for splitting and merging strings and arrays
+# Functions for splitting and merging strings and arrays {#functions-for-splitting-and-merging-strings-and-arrays}
-## splitByChar(separator, s)
+## splitByChar(separator, s) {#splitbycharseparator-s}
-Splits a string into substrings separated by 'separator'.'separator' must be a string constant consisting of exactly one character.
+Splits a string into substrings separated by a specified character. It uses a constant string `separator` consisting of exactly one character.
 Returns an array of selected substrings. Empty substrings may be selected if the separator occurs at the beginning or end of the string, or if there are multiple consecutive separators.
-## splitByString(separator, s)
+**Syntax**
-The same as above, but it uses a string of multiple characters as the separator. The string must be non-empty.
+``` sql
+splitByChar(separator, s)
+```
-## arrayStringConcat(arr\[, separator\])
+**Parameters**
-Concatenates the strings listed in the array with the separator.'separator' is an optional parameter: a constant string, set to an empty string by default.
+- `separator` — The separator which should contain exactly one character. [String](../../data_types/string.md).
+- `s` — The string to split. [String](../../data_types/string.md).
+
+**Returned value(s)**
+
+Returns an array of selected substrings. Empty substrings may be selected when:
+
+* A separator occurs at the beginning or end of the string;
+* There are multiple consecutive separators;
+* The original string `s` is empty.
+
+Type: [Array](../../data_types/array.md) of [String](../../data_types/string.md).
+
+**Example**
+
+``` sql
+SELECT splitByChar(',', '1,2,3,abcde')
+```
+
+``` text
+┌─splitByChar(',', '1,2,3,abcde')─┐
+│ ['1','2','3','abcde']           │
+└─────────────────────────────────┘
+```
+
+## splitByString(separator, s) {#splitbystringseparator-s}
+
+Splits a string into substrings separated by a string. It uses a constant string `separator` of multiple characters as the separator. If the string `separator` is empty, it will split the string `s` into an array of single characters.
+
+**Syntax**
+
+``` sql
+splitByString(separator, s)
+```
+
+**Parameters**
+
+- `separator` — The separator. [String](../../data_types/string.md).
+- `s` — The string to split. [String](../../data_types/string.md).
+
+**Returned value(s)**
+
+Returns an array of selected substrings. Empty substrings may be selected when:
+
+* A non-empty separator occurs at the beginning or end of the string;
+* There are multiple consecutive non-empty separators;
+* The original string `s` is empty while the separator is not empty.
+
+Type: [Array](../../data_types/array.md) of [String](../../data_types/string.md).
+ +**Example** + +``` sql +SELECT splitByString(', ', '1, 2 3, 4,5, abcde') +``` + +``` text +┌─splitByString(', ', '1, 2 3, 4,5, abcde')─┐ +│ ['1','2 3','4,5','abcde'] │ +└───────────────────────────────────────────┘ +``` + +``` sql +SELECT splitByString('', 'abcde') +``` + +``` text +┌─splitByString('', 'abcde')─┐ +│ ['a','b','c','d','e'] │ +└────────────────────────────┘ +``` + +## arrayStringConcat(arr\[, separator\]) {#arraystringconcatarr-separator} + +Concatenates the strings listed in the array with the separator.’separator’ is an optional parameter: a constant string, set to an empty string by default. Returns the string. -## alphaTokens(s) +## alphaTokens(s) {#alphatokenss} Selects substrings of consecutive bytes from the ranges a-z and A-Z.Returns an array of substrings. -**Example:** +**Example** -```sql +``` sql SELECT alphaTokens('abca1abc') ``` -```text + +``` text ┌─alphaTokens('abca1abc')─┐ │ ['abca','abc'] │ └─────────────────────────┘ ``` + [Original article](https://clickhouse.tech/docs/en/query_language/functions/splitting_merging_functions/) diff --git a/docs/en/query_language/functions/string_functions.md b/docs/en/query_language/functions/string_functions.md index 18ad0c38093..b17b4e8d801 100644 --- a/docs/en/query_language/functions/string_functions.md +++ b/docs/en/query_language/functions/string_functions.md @@ -1,85 +1,86 @@ -# Functions for working with strings +# Functions for working with strings {#functions-for-working-with-strings} -## empty {#string_functions-empty} +## empty {#empty} Returns 1 for an empty string or 0 for a non-empty string. The result type is UInt8. A string is considered non-empty if it contains at least one byte, even if this is a space or a null byte. The function also works for arrays. -## notEmpty +## notEmpty {#notempty} Returns 0 for an empty string or 1 for a non-empty string. The result type is UInt8. The function also works for arrays. -## length +## length {#length} Returns the length of a string in bytes (not in characters, and not in code points). The result type is UInt64. The function also works for arrays. -## lengthUTF8 +## lengthUTF8 {#lengthutf8} -Returns the length of a string in Unicode code points (not in characters), assuming that the string contains a set of bytes that make up UTF-8 encoded text. If this assumption is not met, it returns some result (it doesn't throw an exception). +Returns the length of a string in Unicode code points (not in characters), assuming that the string contains a set of bytes that make up UTF-8 encoded text. If this assumption is not met, it returns some result (it doesn’t throw an exception). The result type is UInt64. -## char_length, CHAR_LENGTH +## char\_length, CHAR\_LENGTH {#char-length} -Returns the length of a string in Unicode code points (not in characters), assuming that the string contains a set of bytes that make up UTF-8 encoded text. If this assumption is not met, it returns some result (it doesn't throw an exception). +Returns the length of a string in Unicode code points (not in characters), assuming that the string contains a set of bytes that make up UTF-8 encoded text. If this assumption is not met, it returns some result (it doesn’t throw an exception). The result type is UInt64. -## character_length, CHARACTER_LENGTH +## character\_length, CHARACTER\_LENGTH {#character-length} -Returns the length of a string in Unicode code points (not in characters), assuming that the string contains a set of bytes that make up UTF-8 encoded text. 
If this assumption is not met, it returns some result (it doesn't throw an exception). +Returns the length of a string in Unicode code points (not in characters), assuming that the string contains a set of bytes that make up UTF-8 encoded text. If this assumption is not met, it returns some result (it doesn’t throw an exception). The result type is UInt64. -## lower, lcase +## lower, lcase {#lower} Converts ASCII Latin symbols in a string to lowercase. -## upper, ucase +## upper, ucase {#upper} Converts ASCII Latin symbols in a string to uppercase. -## lowerUTF8 +## lowerUTF8 {#lowerutf8} Converts a string to lowercase, assuming the string contains a set of bytes that make up a UTF-8 encoded text. -It doesn't detect the language. So for Turkish the result might not be exactly correct. +It doesn’t detect the language. So for Turkish the result might not be exactly correct. If the length of the UTF-8 byte sequence is different for upper and lower case of a code point, the result may be incorrect for this code point. If the string contains a set of bytes that is not UTF-8, then the behavior is undefined. -## upperUTF8 +## upperUTF8 {#upperutf8} Converts a string to uppercase, assuming the string contains a set of bytes that make up a UTF-8 encoded text. -It doesn't detect the language. So for Turkish the result might not be exactly correct. +It doesn’t detect the language. So for Turkish the result might not be exactly correct. If the length of the UTF-8 byte sequence is different for upper and lower case of a code point, the result may be incorrect for this code point. If the string contains a set of bytes that is not UTF-8, then the behavior is undefined. -## isValidUTF8 +## isValidUTF8 {#isvalidutf8} Returns 1, if the set of bytes is valid UTF-8 encoded, otherwise 0. -## toValidUTF8 +## toValidUTF8 {#tovalidutf8} Replaces invalid UTF-8 characters by the `�` (U+FFFD) character. All running in a row invalid characters are collapsed into the one replacement character. -```sql +``` sql toValidUTF8( input_string ) ``` Parameters: -- input_string — Any set of bytes represented as the [String](../../data_types/string.md) data type object. +- input\_string — Any set of bytes represented as the [String](../../data_types/string.md) data type object. Returned value: Valid UTF-8 string. -### Example +**Example** -```sql +``` sql SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') ``` -```text + +``` text ┌─toValidUTF8('a����b')─┐ │ a�b │ └───────────────────────┘ @@ -91,18 +92,18 @@ Repeats a string as many times as specified and concatenates the replicated valu **Syntax** -```sql +``` sql repeat(s, n) ``` **Parameters** -- `s` — The string to repeat. [String](../../data_types/string.md). -- `n` — The number of times to repeat the string. [UInt](../../data_types/int_uint.md). +- `s` — The string to repeat. [String](../../data_types/string.md). +- `n` — The number of times to repeat the string. [UInt](../../data_types/int_uint.md). **Returned value** -The single string, which contains the string `s` repeated `n` times. If `n` < 1, the function returns empty string. +The single string, which contains the string `s` repeated `n` times. If `n` \< 1, the function returns empty string. Type: `String`. @@ -110,42 +111,45 @@ Type: `String`. Query: -```sql +``` sql SELECT repeat('abc', 10) ``` Result: -```text +``` text ┌─repeat('abc', 10)──────────────┐ │ abcabcabcabcabcabcabcabcabcabc │ └────────────────────────────────┘ ``` -## reverse +## reverse {#reverse} Reverses the string (as a sequence of bytes). 
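`reverse` has no sample on this page. A trivial sketch (byte-wise reversal, so it is only safe for ASCII input; use `reverseUTF8`, described next, for UTF-8 text):

``` sql
SELECT reverse('clickhouse') AS r
```

``` text
┌─r──────────┐
│ esuohkcilc │
└────────────┘
```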
-## reverseUTF8 +## reverseUTF8 {#reverseutf8} -Reverses a sequence of Unicode code points, assuming that the string contains a set of bytes representing a UTF-8 text. Otherwise, it does something else (it doesn't throw an exception). +Reverses a sequence of Unicode code points, assuming that the string contains a set of bytes representing a UTF-8 text. Otherwise, it does something else (it doesn’t throw an exception). -## format(pattern, s0, s1, ...) +## format(pattern, s0, s1, …) {#format} -Formatting constant pattern with the string listed in the arguments. `pattern` is a simplified Python format pattern. Format string contains "replacement fields" surrounded by curly braces `{}`. Anything that is not contained in braces is considered literal text, which is copied unchanged to the output. If you need to include a brace character in the literal text, it can be escaped by doubling: `{{` and `}}`. Field names can be numbers (starting from zero) or empty (then they are treated as consequence numbers). +Formatting constant pattern with the string listed in the arguments. `pattern` is a simplified Python format pattern. Format string contains “replacement fields” surrounded by curly braces `{}`. Anything that is not contained in braces is considered literal text, which is copied unchanged to the output. If you need to include a brace character in the literal text, it can be escaped by doubling: `{{ '{{' }}` and `{{ '}}' }}`. Field names can be numbers (starting from zero) or empty (then they are treated as consequence numbers). -```sql +``` sql SELECT format('{1} {0} {1}', 'World', 'Hello') ``` -```text + +``` text ┌─format('{1} {0} {1}', 'World', 'Hello')─┐ │ Hello World Hello │ └─────────────────────────────────────────┘ ``` -```sql + +``` sql SELECT format('{} {}', 'Hello', 'World') ``` -```text + +``` text ┌─format('{} {}', 'Hello', 'World')─┐ │ Hello World │ └───────────────────────────────────┘ @@ -155,9 +159,9 @@ SELECT format('{} {}', 'Hello', 'World') Concatenates the strings listed in the arguments, without a separator. -**Syntax** +**Syntax** -```sql +``` sql concat(s1, s2, ...) ``` @@ -167,21 +171,21 @@ Values of type String or FixedString. **Returned values** -Returns the String that results from concatenating the arguments. +Returns the String that results from concatenating the arguments. -If any of argument values is `NULL`, `concat` returns `NULL`. +If any of argument values is `NULL`, `concat` returns `NULL`. **Example** Query: -```sql +``` sql SELECT concat('Hello, ', 'World!') ``` Result: -```text +``` text ┌─concat('Hello, ', 'World!')─┐ │ Hello, World! │ └─────────────────────────────┘ @@ -191,19 +195,35 @@ Result: Same as [concat](#concat), the difference is that you need to ensure that `concat(s1, s2, ...) → sn` is injective, it will be used for optimization of GROUP BY. -The function is named "injective" if it always returns different result for different values of arguments. In other words: different arguments never yield identical result. +The function is named “injective” if it always returns different result for different values of arguments. In other words: different arguments never yield identical result. + +**Syntax** + +``` sql +concatAssumeInjective(s1, s2, ...) +``` + +**Parameters** + +Values of type String or FixedString. + +**Returned values** + +Returns the String that results from concatenating the arguments. + +If any of argument values is `NULL`, `concatAssumeInjective` returns `NULL`. 
**Example** Input table: -```sql +``` sql CREATE TABLE key_val(`key1` String, `key2` String, `value` UInt32) ENGINE = TinyLog; INSERT INTO key_val VALUES ('Hello, ','World',1), ('Hello, ','World',2), ('Hello, ','World!',3), ('Hello',', World!',2); SELECT * from key_val; ``` -```text +``` text ┌─key1────┬─key2─────┬─value─┐ │ Hello, │ World │ 1 │ │ Hello, │ World │ 2 │ @@ -214,13 +234,13 @@ SELECT * from key_val; Query: -```sql +``` sql SELECT concat(key1, key2), sum(value) FROM key_val GROUP BY concatAssumeInjective(key1, key2) ``` Result: -```text +``` text ┌─concat(key1, key2)─┬─sum(value)─┐ │ Hello, World! │ 3 │ │ Hello, World! │ 2 │ @@ -228,59 +248,62 @@ Result: └────────────────────┴────────────┘ ``` -## substring(s, offset, length), mid(s, offset, length), substr(s, offset, length) +## substring(s, offset, length), mid(s, offset, length), substr(s, offset, length) {#substring} -Returns a substring starting with the byte from the 'offset' index that is 'length' bytes long. Character indexing starts from one (as in standard SQL). The 'offset' and 'length' arguments must be constants. +Returns a substring starting with the byte from the ‘offset’ index that is ‘length’ bytes long. Character indexing starts from one (as in standard SQL). The ‘offset’ and ‘length’ arguments must be constants. -## substringUTF8(s, offset, length) +## substringUTF8(s, offset, length) {#substringutf8} -The same as 'substring', but for Unicode code points. Works under the assumption that the string contains a set of bytes representing a UTF-8 encoded text. If this assumption is not met, it returns some result (it doesn't throw an exception). +The same as ‘substring’, but for Unicode code points. Works under the assumption that the string contains a set of bytes representing a UTF-8 encoded text. If this assumption is not met, it returns some result (it doesn’t throw an exception). -## appendTrailingCharIfAbsent(s, c) +## appendTrailingCharIfAbsent(s, c) {#appendtrailingcharifabsent} -If the 's' string is non-empty and does not contain the 'c' character at the end, it appends the 'c' character to the end. +If the ‘s’ string is non-empty and does not contain the ‘c’ character at the end, it appends the ‘c’ character to the end. -## convertCharset(s, from, to) +## convertCharset(s, from, to) {#convertcharset} -Returns the string 's' that was converted from the encoding in 'from' to the encoding in 'to'. +Returns the string ‘s’ that was converted from the encoding in ‘from’ to the encoding in ‘to’. -## base64Encode(s) -Encodes 's' string into base64 +## base64Encode(s) {#base64encode} -## base64Decode(s) -Decode base64-encoded string 's' into original string. In case of failure raises an exception. +Encodes ‘s’ string into base64 + +## base64Decode(s) {#base64decode} + +Decode base64-encoded string ‘s’ into original string. In case of failure raises an exception. + +## tryBase64Decode(s) {#trybase64decode} -## tryBase64Decode(s) Similar to base64Decode, but in case of error an empty string would be returned. -## endsWith(s, suffix) {#function-endswith} +## endsWith(s, suffix) {#endswith} Returns whether to end with the specified suffix. Returns 1 if the string ends with the specified suffix, otherwise it returns 0. -## startsWith(str, prefix) {#function-startswith} +## startsWith(str, prefix) {#startswith} Returns 1 whether string starts with the specified prefix, otherwise it returns 0. 
-```sql +``` sql SELECT startsWith('Spider-Man', 'Spi'); ``` **Returned values** -- 1, if the string starts with the specified prefix. -- 0, if the string doesn't start with the specified prefix. +- 1, if the string starts with the specified prefix. +- 0, if the string doesn’t start with the specified prefix. **Example** Query: -```sql +``` sql SELECT startsWith('Hello, world!', 'He'); ``` Result: -```text +``` text ┌─startsWith('Hello, world!', 'He')─┐ │ 1 │ └───────────────────────────────────┘ @@ -293,14 +316,14 @@ By default removes all consecutive occurrences of common whitespace (ASCII chara **Syntax** -```sql +``` sql trim([[LEADING|TRAILING|BOTH] trim_character FROM] input_string) ``` **Parameters** -- `trim_character` — specified characters for trim. [String](../../data_types/string.md). -- `input_string` — string for trim. [String](../../data_types/string.md). +- `trim_character` — specified characters for trim. [String](../../data_types/string.md). +- `input_string` — string for trim. [String](../../data_types/string.md). **Returned value** @@ -312,13 +335,13 @@ Type: `String`. Query: -```sql +``` sql SELECT trim(BOTH ' ()' FROM '( Hello, world! )') ``` Result: -```text +``` text ┌─trim(BOTH ' ()' FROM '( Hello, world! )')─┐ │ Hello, world! │ └───────────────────────────────────────────────┘ @@ -326,19 +349,19 @@ Result: ## trimLeft {#trimleft} -Removes all consecutive occurrences of common whitespace (ASCII character 32) from the beginning of a string. It doesn't remove other kinds of whitespace characters (tab, no-break space, etc.). +Removes all consecutive occurrences of common whitespace (ASCII character 32) from the beginning of a string. It doesn’t remove other kinds of whitespace characters (tab, no-break space, etc.). -**Syntax** +**Syntax** -```sql +``` sql trimLeft(input_string) ``` Alias: `ltrim(input_string)`. -**Parameters** +**Parameters** -- `input_string` — string to trim. [String](../../data_types/string.md). +- `input_string` — string to trim. [String](../../data_types/string.md). **Returned value** @@ -350,13 +373,13 @@ Type: `String`. Query: -```sql +``` sql SELECT trimLeft(' Hello, world! ') ``` Result: -```text +``` text ┌─trimLeft(' Hello, world! ')─┐ │ Hello, world! │ └─────────────────────────────────────┘ @@ -364,11 +387,11 @@ Result: ## trimRight {#trimright} -Removes all consecutive occurrences of common whitespace (ASCII character 32) from the end of a string. It doesn't remove other kinds of whitespace characters (tab, no-break space, etc.). +Removes all consecutive occurrences of common whitespace (ASCII character 32) from the end of a string. It doesn’t remove other kinds of whitespace characters (tab, no-break space, etc.). -**Syntax** +**Syntax** -```sql +``` sql trimRight(input_string) ``` @@ -376,7 +399,7 @@ Alias: `rtrim(input_string)`. **Parameters** -- `input_string` — string to trim. [String](../../data_types/string.md). +- `input_string` — string to trim. [String](../../data_types/string.md). **Returned value** @@ -388,25 +411,25 @@ Type: `String`. Query: -```sql +``` sql SELECT trimRight(' Hello, world! ') ``` Result: -```text +``` text ┌─trimRight(' Hello, world! ')─┐ │ Hello, world! │ └──────────────────────────────────────┘ ``` -## trimBoth {#trimboth} +## trimBoth {#trimboth} -Removes all consecutive occurrences of common whitespace (ASCII character 32) from both ends of a string. It doesn't remove other kinds of whitespace characters (tab, no-break space, etc.). 
+Removes all consecutive occurrences of common whitespace (ASCII character 32) from both ends of a string. It doesn’t remove other kinds of whitespace characters (tab, no-break space, etc.). -**Syntax** +**Syntax** -```sql +``` sql trimBoth(input_string) ``` @@ -414,7 +437,7 @@ Alias: `trim(input_string)`. **Parameters** -- `input_string` — string to trim. [String](../../data_types/string.md). +- `input_string` — string to trim. [String](../../data_types/string.md). **Returned value** @@ -426,31 +449,31 @@ Type: `String`. Query: -```sql +``` sql SELECT trimBoth(' Hello, world! ') ``` Result: -```text +``` text ┌─trimBoth(' Hello, world! ')─┐ │ Hello, world! │ └─────────────────────────────────────┘ ``` -## CRC32(s) +## CRC32(s) {#crc32} Returns the CRC32 checksum of a string, using CRC-32-IEEE 802.3 polynomial and initial value `0xffffffff` (zlib implementation). The result type is UInt32. -## CRC32IEEE(s) +## CRC32IEEE(s) {#crc32ieee} Returns the CRC32 checksum of a string, using CRC-32-IEEE 802.3 polynomial. The result type is UInt32. -## CRC64(s) +## CRC64(s) {#crc64} Returns the CRC64 checksum of a string, using CRC-64-ECMA polynomial. diff --git a/docs/en/query_language/functions/string_replace_functions.md b/docs/en/query_language/functions/string_replace_functions.md index 258c4459f01..c0709fd3090 100644 --- a/docs/en/query_language/functions/string_replace_functions.md +++ b/docs/en/query_language/functions/string_replace_functions.md @@ -1,25 +1,25 @@ -# Functions for searching and replacing in strings +# Functions for searching and replacing in strings {#functions-for-searching-and-replacing-in-strings} -## replaceOne(haystack, pattern, replacement) +## replaceOne(haystack, pattern, replacement) {#replaceonehaystack-pattern-replacement} -Replaces the first occurrence, if it exists, of the 'pattern' substring in 'haystack' with the 'replacement' substring. -Hereafter, 'pattern' and 'replacement' must be constants. +Replaces the first occurrence, if it exists, of the ‘pattern’ substring in ‘haystack’ with the ‘replacement’ substring. +Hereafter, ‘pattern’ and ‘replacement’ must be constants. -## replaceAll(haystack, pattern, replacement), replace(haystack, pattern, replacement) +## replaceAll(haystack, pattern, replacement), replace(haystack, pattern, replacement) {#replaceallhaystack-pattern-replacement-replacehaystack-pattern-replacement} -Replaces all occurrences of the 'pattern' substring in 'haystack' with the 'replacement' substring. +Replaces all occurrences of the ‘pattern’ substring in ‘haystack’ with the ‘replacement’ substring. -## replaceRegexpOne(haystack, pattern, replacement) +## replaceRegexpOne(haystack, pattern, replacement) {#replaceregexponehaystack-pattern-replacement} -Replacement using the 'pattern' regular expression. A re2 regular expression. +Replacement using the ‘pattern’ regular expression. A re2 regular expression. Replaces only the first occurrence, if it exists. -A pattern can be specified as 'replacement'. This pattern can include substitutions `\0-\9`. +A pattern can be specified as ‘replacement’. This pattern can include substitutions `\0-\9`. The substitution `\0` includes the entire regular expression. Substitutions `\1-\9` correspond to the subpattern numbers.To use the `\` character in a template, escape it using `\`. Also keep in mind that a string literal requires an extra escape. Example 1. 
Converting the date to American format: -```sql +``` sql SELECT DISTINCT EventDate, replaceRegexpOne(toString(EventDate), '(\\d{4})-(\\d{2})-(\\d{2})', '\\2/\\3/\\1') AS res @@ -28,7 +28,7 @@ LIMIT 7 FORMAT TabSeparated ``` -```text +``` text 2014-03-17 03/17/2014 2014-03-18 03/18/2014 2014-03-19 03/19/2014 @@ -40,25 +40,25 @@ FORMAT TabSeparated Example 2. Copying a string ten times: -```sql +``` sql SELECT replaceRegexpOne('Hello, World!', '.*', '\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0') AS res ``` -```text +``` text ┌─res────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World! │ └────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` -## replaceRegexpAll(haystack, pattern, replacement) +## replaceRegexpAll(haystack, pattern, replacement) {#replaceregexpallhaystack-pattern-replacement} This does the same thing, but replaces all the occurrences. Example: -```sql +``` sql SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res ``` -```text +``` text ┌─res────────────────────────┐ │ HHeelllloo,, WWoorrlldd!! │ └────────────────────────────┘ @@ -67,23 +67,21 @@ SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res As an exception, if a regular expression worked on an empty substring, the replacement is not made more than once. Example: -```sql +``` sql SELECT replaceRegexpAll('Hello, World!', '^', 'here: ') AS res ``` -```text +``` text ┌─res─────────────────┐ │ here: Hello, World! │ └─────────────────────┘ ``` - -## regexpQuoteMeta(s) +## regexpQuoteMeta(s) {#regexpquotemetas} The function adds a backslash before some predefined characters in the string. -Predefined characters: '0', '\\', '|', '(', ')', '^', '$', '.', '[', ']', '?', '*', '+', '{', ':', '-'. -This implementation slightly differs from re2::RE2::QuoteMeta. It escapes zero byte as \0 instead of \x00 and it escapes only required characters. +Predefined characters: ‘0’, ‘\\’, ‘\|’, ‘(’, ‘)’, ‘^’, ‘$’, ‘.’, ‘\[’, ’\]’, ‘?’, ’\*‘,’+‘,’{‘,’:‘,’-’. +This implementation slightly differs from re2::RE2::QuoteMeta. It escapes zero byte as \\0 instead of 00 and it escapes only required characters. For more information, see the link: [RE2](https://github.com/google/re2/blob/master/re2/re2.cc#L473) - [Original article](https://clickhouse.tech/docs/en/query_language/functions/string_replace_functions/) diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md index b3db647a1b5..17ccafea7bd 100644 --- a/docs/en/query_language/functions/string_search_functions.md +++ b/docs/en/query_language/functions/string_search_functions.md @@ -1,4 +1,4 @@ -# Functions for Searching Strings +# Functions for Searching Strings {#functions-for-searching-strings} The search is case-sensitive by default in all these functions. There are separate variants for case insensitive search. @@ -6,13 +6,13 @@ The search is case-sensitive by default in all these functions. There are separa Returns the position (in bytes) of the found substring in the string, starting from 1. -Works under the assumption that the string contains a set of bytes representing a single-byte encoded text. 
If this assumption is not met and a character can't be represented using a single byte, the function doesn't throw an exception and returns some unexpected result. If character can be represented using two bytes, it will use two bytes and so on. +Works under the assumption that the string contains a set of bytes representing a single-byte encoded text. If this assumption is not met and a character can’t be represented using a single byte, the function doesn’t throw an exception and returns some unexpected result. If character can be represented using two bytes, it will use two bytes and so on. For a case-insensitive search, use the function [positionCaseInsensitive](#positioncaseinsensitive). **Syntax** -```sql +``` sql position(haystack, needle) ``` @@ -20,45 +20,45 @@ Alias: `locate(haystack, needle)`. **Parameters** -- `haystack` — string, in which substring will to be searched. [String](../syntax.md#syntax-string-literal). -- `needle` — substring to be searched. [String](../syntax.md#syntax-string-literal). +- `haystack` — string, in which substring will to be searched. [String](../syntax.md#syntax-string-literal). +- `needle` — substring to be searched. [String](../syntax.md#syntax-string-literal). **Returned values** -- Starting position in bytes (counting from 1), if substring was found. -- 0, if the substring was not found. +- Starting position in bytes (counting from 1), if substring was found. +- 0, if the substring was not found. Type: `Integer`. **Examples** -The phrase "Hello, world!" contains a set of bytes representing a single-byte encoded text. The function returns some expected result: +The phrase “Hello, world!” contains a set of bytes representing a single-byte encoded text. The function returns some expected result: Query: -```sql +``` sql SELECT position('Hello, world!', '!') ``` Result: -```text +``` text ┌─position('Hello, world!', '!')─┐ │ 13 │ └────────────────────────────────┘ ``` -The same phrase in Russian contains characters which can't be represented using a single byte. The function returns some unexpected result (use [positionUTF8](#positionutf8) function for multi-byte encoded text): +The same phrase in Russian contains characters which can’t be represented using a single byte. The function returns some unexpected result (use [positionUTF8](#positionutf8) function for multi-byte encoded text): Query: -```sql +``` sql SELECT position('Привет, мир!', '!') ``` Result: -```text +``` text ┌─position('Привет, мир!', '!')─┐ │ 21 │ └───────────────────────────────┘ @@ -68,23 +68,23 @@ Result: The same as [position](#position) returns the position (in bytes) of the found substring in the string, starting from 1. Use the function for a case-insensitive search. -Works under the assumption that the string contains a set of bytes representing a single-byte encoded text. If this assumption is not met and a character can't be represented using a single byte, the function doesn't throw an exception and returns some unexpected result. If character can be represented using two bytes, it will use two bytes and so on. +Works under the assumption that the string contains a set of bytes representing a single-byte encoded text. If this assumption is not met and a character can’t be represented using a single byte, the function doesn’t throw an exception and returns some unexpected result. If character can be represented using two bytes, it will use two bytes and so on. 
**Syntax** -```sql +``` sql positionCaseInsensitive(haystack, needle) ``` **Parameters** -- `haystack` — string, in which substring will to be searched. [String](../syntax.md#syntax-string-literal). -- `needle` — substring to be searched. [String](../syntax.md#syntax-string-literal). +- `haystack` — string, in which substring will to be searched. [String](../syntax.md#syntax-string-literal). +- `needle` — substring to be searched. [String](../syntax.md#syntax-string-literal). **Returned values** -- Starting position in bytes (counting from 1), if substring was found. -- 0, if the substring was not found. +- Starting position in bytes (counting from 1), if substring was found. +- 0, if the substring was not found. Type: `Integer`. @@ -92,13 +92,13 @@ Type: `Integer`. Query: -```sql +``` sql SELECT positionCaseInsensitive('Hello, world!', 'hello') ``` Result: -```text +``` text ┌─positionCaseInsensitive('Hello, world!', 'hello')─┐ │ 1 │ └───────────────────────────────────────────────────┘ @@ -108,57 +108,57 @@ Result: Returns the position (in Unicode points) of the found substring in the string, starting from 1. -Works under the assumption that the string contains a set of bytes representing a UTF-8 encoded text. If this assumption is not met, the function doesn't throw an exception and returns some unexpected result. If character can be represented using two Unicode points, it will use two and so on. +Works under the assumption that the string contains a set of bytes representing a UTF-8 encoded text. If this assumption is not met, the function doesn’t throw an exception and returns some unexpected result. If character can be represented using two Unicode points, it will use two and so on. For a case-insensitive search, use the function [positionCaseInsensitiveUTF8](#positioncaseinsensitiveutf8). **Syntax** -```sql +``` sql positionUTF8(haystack, needle) ``` **Parameters** -- `haystack` — string, in which substring will to be searched. [String](../syntax.md#syntax-string-literal). -- `needle` — substring to be searched. [String](../syntax.md#syntax-string-literal). +- `haystack` — string, in which substring will to be searched. [String](../syntax.md#syntax-string-literal). +- `needle` — substring to be searched. [String](../syntax.md#syntax-string-literal). **Returned values** -- Starting position in Unicode points (counting from 1), if substring was found. -- 0, if the substring was not found. +- Starting position in Unicode points (counting from 1), if substring was found. +- 0, if the substring was not found. Type: `Integer`. **Examples** -The phrase "Hello, world!" in Russian contains a set of Unicode points representing a single-point encoded text. The function returns some expected result: +The phrase “Hello, world!” in Russian contains a set of Unicode points representing a single-point encoded text. 
The function returns some expected result: Query: -```sql +``` sql SELECT positionUTF8('Привет, мир!', '!') ``` Result: -```text +``` text ┌─positionUTF8('Привет, мир!', '!')─┐ │ 12 │ └───────────────────────────────────┘ ``` -The phrase "Salut, étudiante!", where character `é` can be represented using a one point (`U+00E9`) or two points (`U+0065U+0301`) the function can be returned some unexpected result: +The phrase “Salut, étudiante!”, where character `é` can be represented using a one point (`U+00E9`) or two points (`U+0065U+0301`) the function can be returned some unexpected result: Query for the letter `é`, which is represented one Unicode point `U+00E9`: -```sql +``` sql SELECT positionUTF8('Salut, étudiante!', '!') ``` Result: -```text +``` text ┌─positionUTF8('Salut, étudiante!', '!')─┐ │ 17 │ └────────────────────────────────────────┘ @@ -166,13 +166,13 @@ Result: Query for the letter `é`, which is represented two Unicode points `U+0065U+0301`: -```sql +``` sql SELECT positionUTF8('Salut, étudiante!', '!') ``` Result: -```text +``` text ┌─positionUTF8('Salut, étudiante!', '!')─┐ │ 18 │ └────────────────────────────────────────┘ @@ -182,23 +182,23 @@ Result: The same as [positionUTF8](#positionutf8), but is case-insensitive. Returns the position (in Unicode points) of the found substring in the string, starting from 1. -Works under the assumption that the string contains a set of bytes representing a UTF-8 encoded text. If this assumption is not met, the function doesn't throw an exception and returns some unexpected result. If character can be represented using two Unicode points, it will use two and so on. +Works under the assumption that the string contains a set of bytes representing a UTF-8 encoded text. If this assumption is not met, the function doesn’t throw an exception and returns some unexpected result. If character can be represented using two Unicode points, it will use two and so on. **Syntax** -```sql +``` sql positionCaseInsensitiveUTF8(haystack, needle) ``` **Parameters** -- `haystack` — string, in which substring will to be searched. [String](../syntax.md#syntax-string-literal). -- `needle` — substring to be searched. [String](../syntax.md#syntax-string-literal). +- `haystack` — string, in which substring will to be searched. [String](../syntax.md#syntax-string-literal). +- `needle` — substring to be searched. [String](../syntax.md#syntax-string-literal). **Returned value** -- Starting position in Unicode points (counting from 1), if substring was found. -- 0, if the substring was not found. +- Starting position in Unicode points (counting from 1), if substring was found. +- 0, if the substring was not found. Type: `Integer`. @@ -206,76 +206,76 @@ Type: `Integer`. Query: -```sql +``` sql SELECT positionCaseInsensitiveUTF8('Привет, мир!', 'Мир') ``` Result: -```text +``` text ┌─positionCaseInsensitiveUTF8('Привет, мир!', 'Мир')─┐ │ 9 │ └────────────────────────────────────────────────────┘ ``` -## multiSearchAllPositions {#multiSearchAllPositions} +## multiSearchAllPositions {#multisearchallpositions} The same as [position](string_search_functions.md#position) but returns `Array` of positions (in bytes) of the found corresponding substrings in the string. Positions are indexed starting from 1. The search is performed on sequences of bytes without respect to string encoding and collation. -- For case-insensitive ASCII search, use the function `multiSearchAllPositionsCaseInsensitive`. 
-- For search in UTF-8, use the function [multiSearchAllPositionsUTF8](#multiSearchAllPositionsUTF8). -- For case-insensitive UTF-8 search, use the function multiSearchAllPositionsCaseInsensitiveUTF8. +- For case-insensitive ASCII search, use the function `multiSearchAllPositionsCaseInsensitive`. +- For search in UTF-8, use the function [multiSearchAllPositionsUTF8](#multiSearchAllPositionsUTF8). +- For case-insensitive UTF-8 search, use the function multiSearchAllPositionsCaseInsensitiveUTF8. -**Syntax** +**Syntax** -```sql +``` sql multiSearchAllPositions(haystack, [needle1, needle2, ..., needlen]) ``` **Parameters** -- `haystack` — string, in which substring will to be searched. [String](../syntax.md#syntax-string-literal). -- `needle` — substring to be searched. [String](../syntax.md#syntax-string-literal). +- `haystack` — string, in which substring will to be searched. [String](../syntax.md#syntax-string-literal). +- `needle` — substring to be searched. [String](../syntax.md#syntax-string-literal). **Returned values** -- Array of starting positions in bytes (counting from 1), if the corresponding substring was found and 0 if not found. +- Array of starting positions in bytes (counting from 1), if the corresponding substring was found and 0 if not found. **Example** Query: -```sql +``` sql SELECT multiSearchAllPositions('Hello, World!', ['hello', '!', 'world']) ``` Result: -```text +``` text ┌─multiSearchAllPositions('Hello, World!', ['hello', '!', 'world'])─┐ │ [0,13,0] │ └───────────────────────────────────────────────────────────────────┘ ``` -## multiSearchAllPositionsUTF8 {#multiSearchAllPositionsUTF8} +## multiSearchAllPositionsUTF8 {#multisearchallpositionsutf8} See `multiSearchAllPositions`. -## multiSearchFirstPosition(haystack, [needle1, needle2, ..., needlen]) {#multiSearchFirstPosition} +## multiSearchFirstPosition(haystack, \[needle1, needle2, …, needlen\]) {#multisearchfirstposition} The same as `position` but returns the leftmost offset of the string `haystack` that is matched to some of the needles. For a case-insensitive search or/and in UTF-8 format use functions `multiSearchFirstPositionCaseInsensitive, multiSearchFirstPositionUTF8, multiSearchFirstPositionCaseInsensitiveUTF8`. -## multiSearchFirstIndex(haystack, [needle1, needle2, ..., needlen]) +## multiSearchFirstIndex(haystack, \[needle1, needle2, …, needlen\]) {#multisearchfirstindexhaystack-needle1-needle2-needlen} Returns the index `i` (starting from 1) of the leftmost found needlei in the string `haystack` and 0 otherwise. For a case-insensitive search or/and in UTF-8 format use functions `multiSearchFirstIndexCaseInsensitive, multiSearchFirstIndexUTF8, multiSearchFirstIndexCaseInsensitiveUTF8`. -## multiSearchAny(haystack, [needle1, needle2, ..., needlen]) {#function-multisearchany} +## multiSearchAny(haystack, \[needle1, needle2, …, needlen\]) {#function-multisearchany} Returns 1, if at least one string needlei matches the string `haystack` and 0 otherwise. @@ -284,41 +284,41 @@ For a case-insensitive search or/and in UTF-8 format use functions `multiSearchA !!! note "Note" In all `multiSearch*` functions the number of needles should be less than 28 because of implementation specification. -## match(haystack, pattern) +## match(haystack, pattern) {#matchhaystack-pattern} Checks whether the string matches the `pattern` regular expression. A `re2` regular expression. 
The [syntax](https://github.com/google/re2/wiki/Syntax) of the `re2` regular expressions is more limited than the syntax of the Perl regular expressions. -Returns 0 if it doesn't match, or 1 if it matches. +Returns 0 if it doesn’t match, or 1 if it matches. Note that the backslash symbol (`\`) is used for escaping in the regular expression. The same symbol is used for escaping in string literals. So in order to escape the symbol in a regular expression, you must write two backslashes (\\) in a string literal. -The regular expression works with the string as if it is a set of bytes. The regular expression can't contain null bytes. -For patterns to search for substrings in a string, it is better to use LIKE or 'position', since they work much faster. +The regular expression works with the string as if it is a set of bytes. The regular expression can’t contain null bytes. +For patterns to search for substrings in a string, it is better to use LIKE or ‘position’, since they work much faster. -## multiMatchAny(haystack, [pattern1, pattern2, ..., patternn]) +## multiMatchAny(haystack, \[pattern1, pattern2, …, patternn\]) {#multimatchanyhaystack-pattern1-pattern2-patternn} The same as `match`, but returns 0 if none of the regular expressions are matched and 1 if any of the patterns matches. It uses [hyperscan](https://github.com/intel/hyperscan) library. For patterns to search substrings in a string, it is better to use `multiSearchAny` since it works much faster. !!! note "Note" The length of any of the `haystack` string must be less than 232 bytes otherwise the exception is thrown. This restriction takes place because of hyperscan API. -## multiMatchAnyIndex(haystack, [pattern1, pattern2, ..., patternn]) +## multiMatchAnyIndex(haystack, \[pattern1, pattern2, …, patternn\]) {#multimatchanyindexhaystack-pattern1-pattern2-patternn} The same as `multiMatchAny`, but returns any index that matches the haystack. -## multiMatchAllIndices(haystack, [pattern1, pattern2, ..., patternn]) +## multiMatchAllIndices(haystack, \[pattern1, pattern2, …, patternn\]) {#multimatchallindiceshaystack-pattern1-pattern2-patternn} The same as `multiMatchAny`, but returns the array of all indicies that match the haystack in any order. -## multiFuzzyMatchAny(haystack, distance, [pattern1, pattern2, ..., patternn]) +## multiFuzzyMatchAny(haystack, distance, \[pattern1, pattern2, …, patternn\]) {#multifuzzymatchanyhaystack-distance-pattern1-pattern2-patternn} The same as `multiMatchAny`, but returns 1 if any pattern matches the haystack within a constant [edit distance](https://en.wikipedia.org/wiki/Edit_distance). This function is also in an experimental mode and can be extremely slow. For more information see [hyperscan documentation](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching). -## multiFuzzyMatchAnyIndex(haystack, distance, [pattern1, pattern2, ..., patternn]) +## multiFuzzyMatchAnyIndex(haystack, distance, \[pattern1, pattern2, …, patternn\]) {#multifuzzymatchanyindexhaystack-distance-pattern1-pattern2-patternn} The same as `multiFuzzyMatchAny`, but returns any index that matches the haystack within a constant edit distance. 
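The multi-pattern functions above all take an array of patterns. A minimal usage sketch, assuming the same `re2` escaping rules as described for `match` (the haystack and patterns here are purely illustrative):

``` sql
SELECT
    multiMatchAny('Hello, World!', ['^Hello', '\\d+']) AS any_matched,      -- 1 if at least one pattern matches
    multiMatchAnyIndex('Hello, World!', ['^Hello', '\\d+']) AS matched_idx  -- index (from 1) of some matching pattern
```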
-## multiFuzzyMatchAllIndices(haystack, distance, [pattern1, pattern2, ..., patternn]) +## multiFuzzyMatchAllIndices(haystack, distance, \[pattern1, pattern2, …, patternn\]) {#multifuzzymatchallindiceshaystack-distance-pattern1-pattern2-patternn} The same as `multiFuzzyMatchAny`, but returns the array of all indices in any order that match the haystack within a constant edit distance. @@ -328,13 +328,13 @@ The same as `multiFuzzyMatchAny`, but returns the array of all indices in any or !!! note "Note" To turn off all functions that use hyperscan, use setting `SET allow_hyperscan = 0;`. -## extract(haystack, pattern) +## extract(haystack, pattern) {#extracthaystack-pattern} -Extracts a fragment of a string using a regular expression. If 'haystack' doesn't match the 'pattern' regex, an empty string is returned. If the regex doesn't contain subpatterns, it takes the fragment that matches the entire regex. Otherwise, it takes the fragment that matches the first subpattern. +Extracts a fragment of a string using a regular expression. If ‘haystack’ doesn’t match the ‘pattern’ regex, an empty string is returned. If the regex doesn’t contain subpatterns, it takes the fragment that matches the entire regex. Otherwise, it takes the fragment that matches the first subpattern. -## extractAll(haystack, pattern) +## extractAll(haystack, pattern) {#extractallhaystack-pattern} -Extracts all the fragments of a string using a regular expression. If 'haystack' doesn't match the 'pattern' regex, an empty string is returned. Returns an array of strings consisting of all matches to the regex. In general, the behavior is the same as the 'extract' function (it takes the first subpattern, or the entire expression if there isn't a subpattern). +Extracts all the fragments of a string using a regular expression. If ‘haystack’ doesn’t match the ‘pattern’ regex, an empty string is returned. Returns an array of strings consisting of all matches to the regex. In general, the behavior is the same as the ‘extract’ function (it takes the first subpattern, or the entire expression if there isn’t a subpattern). ## like(haystack, pattern), haystack LIKE pattern operator {#function-like} @@ -345,28 +345,28 @@ The regular expression can contain the metasymbols `%` and `_`. `_` indicates any one byte. -Use the backslash (`\`) for escaping metasymbols. See the note on escaping in the description of the 'match' function. +Use the backslash (`\`) for escaping metasymbols. See the note on escaping in the description of the ‘match’ function. For regular expressions like `%needle%`, the code is more optimal and works as fast as the `position` function. -For other regular expressions, the code is the same as for the 'match' function. +For other regular expressions, the code is the same as for the ‘match’ function. ## notLike(haystack, pattern), haystack NOT LIKE pattern operator {#function-notlike} -The same thing as 'like', but negative. +The same thing as ‘like’, but negative. -## ngramDistance(haystack, needle) +## ngramDistance(haystack, needle) {#ngramdistancehaystack-needle} -Calculates the 4-gram distance between `haystack` and `needle`: counts the symmetric difference between two multisets of 4-grams and normalizes it by the sum of their cardinalities. Returns float number from 0 to 1 -- the closer to zero, the more strings are similar to each other. If the constant `needle` or `haystack` is more than 32Kb, throws an exception. 
If some of the non-constant `haystack` or `needle` strings are more than 32Kb, the distance is always one. +Calculates the 4-gram distance between `haystack` and `needle`: counts the symmetric difference between two multisets of 4-grams and normalizes it by the sum of their cardinalities. Returns float number from 0 to 1 – the closer to zero, the more strings are similar to each other. If the constant `needle` or `haystack` is more than 32Kb, throws an exception. If some of the non-constant `haystack` or `needle` strings are more than 32Kb, the distance is always one. For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`. -## ngramSearch(haystack, needle) +## ngramSearch(haystack, needle) {#ngramsearchhaystack-needle} -Same as `ngramDistance` but calculates the non-symmetric difference between `needle` and `haystack` -- the number of n-grams from needle minus the common number of n-grams normalized by the number of `needle` n-grams. The closer to one, the more likely `needle` is in the `haystack`. Can be useful for fuzzy string search. +Same as `ngramDistance` but calculates the non-symmetric difference between `needle` and `haystack` – the number of n-grams from needle minus the common number of n-grams normalized by the number of `needle` n-grams. The closer to one, the more likely `needle` is in the `haystack`. Can be useful for fuzzy string search. For case-insensitive search or/and in UTF-8 format use functions `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`. !!! note "Note" - For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the (non-)symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit (starting from zero) of each codepoint byte and first bit of zeroth byte if bytes more than one -- this works for Latin and mostly for all Cyrillic letters. + For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the (non-)symmetric difference between these hash tables – collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function – we zero the 5-th bit (starting from zero) of each codepoint byte and first bit of zeroth byte if bytes more than one – this works for Latin and mostly for all Cyrillic letters. [Original article](https://clickhouse.tech/docs/en/query_language/functions/string_search_functions/) diff --git a/docs/en/query_language/functions/type_conversion_functions.md b/docs/en/query_language/functions/type_conversion_functions.md index ffb757c17a2..b896f5b5b47 100644 --- a/docs/en/query_language/functions/type_conversion_functions.md +++ b/docs/en/query_language/functions/type_conversion_functions.md @@ -1,4 +1,4 @@ -# Type Conversion Functions +# Type Conversion Functions {#type-conversion-functions} ## Common Issues of Numeric Conversions {#numeric-conversion-issues} @@ -6,18 +6,18 @@ When you convert a value from one to another data type, you should remember that ClickHouse has the [same behavior as C++ programs](https://en.cppreference.com/w/cpp/language/implicit_conversion). -## toInt(8|16|32|64) +## toInt(8\|16\|32\|64) {#toint8163264} Converts an input value to the [Int](../../data_types/int_uint.md) data type. 
This function family includes: -* `toInt8(expr)` — Results in the `Int8` data type. -* `toInt16(expr)` — Results in the `Int16` data type. -* `toInt32(expr)` — Results in the `Int32` data type. -* `toInt64(expr)` — Results in the `Int64` data type. +- `toInt8(expr)` — Results in the `Int8` data type. +- `toInt16(expr)` — Results in the `Int16` data type. +- `toInt32(expr)` — Results in the `Int32` data type. +- `toInt64(expr)` — Results in the `Int64` data type. **Parameters** -- `expr` — [Expression](../syntax.md#syntax-expressions) returning a number or a string with the decimal representation of a number. Binary, octal, and hexadecimal representations of numbers are not supported. Leading zeroes are stripped. +- `expr` — [Expression](../syntax.md#syntax-expressions) returning a number or a string with the decimal representation of a number. Binary, octal, and hexadecimal representations of numbers are not supported. Leading zeroes are stripped. **Returned value** @@ -29,59 +29,60 @@ The behavior of functions for the [NaN and Inf](../../data_types/float.md#data_t **Example** -```sql +``` sql SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) ``` -```text + +``` text ┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ │ -9223372036854775808 │ 32 │ 16 │ 8 │ └──────────────────────┴─────────────┴───────────────┴─────────────┘ ``` -## toInt(8|16|32|64)OrZero +## toInt(8\|16\|32\|64)OrZero {#toint8163264orzero} -It takes an argument of type String and tries to parse it into Int (8 | 16 | 32 | 64). If failed, returns 0. +It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 32 \| 64). If failed, returns 0. **Example** -```sql +``` sql select toInt64OrZero('123123'), toInt8OrZero('123qwe123') ``` -```text + +``` text ┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ │ 123123 │ 0 │ └─────────────────────────┴───────────────────────────┘ ``` +## toInt(8\|16\|32\|64)OrNull {#toint8163264ornull} -## toInt(8|16|32|64)OrNull - -It takes an argument of type String and tries to parse it into Int (8 | 16 | 32 | 64). If failed, returns NULL. +It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 32 \| 64). If failed, returns NULL. **Example** -```sql +``` sql select toInt64OrNull('123123'), toInt8OrNull('123qwe123') ``` -```text + +``` text ┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ │ 123123 │ ᴺᵁᴸᴸ │ └─────────────────────────┴───────────────────────────┘ ``` - -## toUInt(8|16|32|64) +## toUInt(8\|16\|32\|64) {#touint8163264} Converts an input value to the [UInt](../../data_types/int_uint.md) data type. This function family includes: -* `toUInt8(expr)` — Results in the `UInt8` data type. -* `toUInt16(expr)` — Results in the `UInt16` data type. -* `toUInt32(expr)` — Results in the `UInt32` data type. -* `toUInt64(expr)` — Results in the `UInt64` data type. +- `toUInt8(expr)` — Results in the `UInt8` data type. +- `toUInt16(expr)` — Results in the `UInt16` data type. +- `toUInt32(expr)` — Results in the `UInt32` data type. +- `toUInt64(expr)` — Results in the `UInt64` data type. **Parameters** -- `expr` — [Expression](../syntax.md#syntax-expressions) returning a number or a string with the decimal representation of a number. Binary, octal, and hexadecimal representations of numbers are not supported. Leading zeroes are stripped. +- `expr` — [Expression](../syntax.md#syntax-expressions) returning a number or a string with the decimal representation of a number. 
Binary, octal, and hexadecimal representations of numbers are not supported. Leading zeroes are stripped. **Returned value** @@ -93,146 +94,152 @@ The behavior of functions for negative agruments and for the [NaN and Inf](../.. **Example** -```sql +``` sql SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) ``` -```text + +``` text ┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ │ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ └─────────────────────┴───────────────┴────────────────┴──────────────┘ ``` -## toUInt(8|16|32|64)OrZero +## toUInt(8\|16\|32\|64)OrZero {#touint8163264orzero} -## toUInt(8|16|32|64)OrNull +## toUInt(8\|16\|32\|64)OrNull {#touint8163264ornull} -## toFloat(32|64) +## toFloat(32\|64) {#tofloat3264} -## toFloat(32|64)OrZero +## toFloat(32\|64)OrZero {#tofloat3264orzero} -## toFloat(32|64)OrNull +## toFloat(32\|64)OrNull {#tofloat3264ornull} -## toDate +## toDate {#todate} -## toDateOrZero +## toDateOrZero {#todateorzero} -## toDateOrNull +## toDateOrNull {#todateornull} -## toDateTime +## toDateTime {#todatetime} -## toDateTimeOrZero +## toDateTimeOrZero {#todatetimeorzero} -## toDateTimeOrNull +## toDateTimeOrNull {#todatetimeornull} -## toDecimal(32|64|128) +## toDecimal(32\|64\|128) {#todecimal3264128} Converts `value` to the [Decimal](../../data_types/decimal.md) data type with precision of `S`. The `value` can be a number or a string. The `S` (scale) parameter specifies the number of decimal places. -- `toDecimal32(value, S)` -- `toDecimal64(value, S)` -- `toDecimal128(value, S)` +- `toDecimal32(value, S)` +- `toDecimal64(value, S)` +- `toDecimal128(value, S)` -## toDecimal(32|64|128)OrNull +## toDecimal(32\|64\|128)OrNull {#todecimal3264128ornull} Converts an input string to a [Nullable(Decimal(P,S))](../../data_types/decimal.md) data type value. This family of functions include: -- `toDecimal32OrNull(expr, S)` — Results in `Nullable(Decimal32(S))` data type. -- `toDecimal64OrNull(expr, S)` — Results in `Nullable(Decimal64(S))` data type. -- `toDecimal128OrNull(expr, S)` — Results in `Nullable(Decimal128(S))` data type. +- `toDecimal32OrNull(expr, S)` — Results in `Nullable(Decimal32(S))` data type. +- `toDecimal64OrNull(expr, S)` — Results in `Nullable(Decimal64(S))` data type. +- `toDecimal128OrNull(expr, S)` — Results in `Nullable(Decimal128(S))` data type. These functions should be used instead of `toDecimal*()` functions, if you prefer to get a `NULL` value instead of an exception in the event of an input value parsing error. **Parameters** -- `expr` — [Expression](../syntax.md#syntax-expressions), returns a value in the [String](../../data_types/string.md) data type. ClickHouse expects the textual representation of the decimal number. For example, `'1.111'`. -- `S` — Scale, the number of decimal places in the resulting value. +- `expr` — [Expression](../syntax.md#syntax-expressions), returns a value in the [String](../../data_types/string.md) data type. ClickHouse expects the textual representation of the decimal number. For example, `'1.111'`. +- `S` — Scale, the number of decimal places in the resulting value. **Returned value** A value in the `Nullable(Decimal(P,S))` data type. The value contains: -- Number with `S` decimal places, if ClickHouse interprets the input string as a number. -- `NULL`, if ClickHouse can't interpret the input string as a number or if the input number contains more than `S` decimal places. +- Number with `S` decimal places, if ClickHouse interprets the input string as a number. 
+- `NULL`, if ClickHouse can’t interpret the input string as a number or if the input number contains more than `S` decimal places. **Examples** -```sql +``` sql SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) ``` -```text + +``` text ┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ │ -1.11100 │ Nullable(Decimal(9, 5)) │ └──────────┴────────────────────────────────────────────────────┘ ``` -```sql + +``` sql SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) ``` -```text + +``` text ┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ │ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ └──────┴────────────────────────────────────────────────────┘ ``` - -## toDecimal(32|64|128)OrZero +## toDecimal(32\|64\|128)OrZero {#todecimal3264128orzero} Converts an input value to the [Decimal(P,S)](../../data_types/decimal.md) data type. This family of functions include: -- `toDecimal32OrZero( expr, S)` — Results in `Decimal32(S)` data type. -- `toDecimal64OrZero( expr, S)` — Results in `Decimal64(S)` data type. -- `toDecimal128OrZero( expr, S)` — Results in `Decimal128(S)` data type. +- `toDecimal32OrZero( expr, S)` — Results in `Decimal32(S)` data type. +- `toDecimal64OrZero( expr, S)` — Results in `Decimal64(S)` data type. +- `toDecimal128OrZero( expr, S)` — Results in `Decimal128(S)` data type. These functions should be used instead of `toDecimal*()` functions, if you prefer to get a `0` value instead of an exception in the event of an input value parsing error. **Parameters** -- `expr` — [Expression](../syntax.md#syntax-expressions), returns a value in the [String](../../data_types/string.md) data type. ClickHouse expects the textual representation of the decimal number. For example, `'1.111'`. -- `S` — Scale, the number of decimal places in the resulting value. +- `expr` — [Expression](../syntax.md#syntax-expressions), returns a value in the [String](../../data_types/string.md) data type. ClickHouse expects the textual representation of the decimal number. For example, `'1.111'`. +- `S` — Scale, the number of decimal places in the resulting value. **Returned value** A value in the `Nullable(Decimal(P,S))` data type. The value contains: -- Number with `S` decimal places, if ClickHouse interprets the input string as a number. -- 0 with `S` decimal places, if ClickHouse can't interpret the input string as a number or if the input number contains more than `S` decimal places. +- Number with `S` decimal places, if ClickHouse interprets the input string as a number. +- 0 with `S` decimal places, if ClickHouse can’t interpret the input string as a number or if the input number contains more than `S` decimal places. **Example** -```sql +``` sql SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) ``` -```text + +``` text ┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ │ -1.11100 │ Decimal(9, 5) │ └──────────┴────────────────────────────────────────────────────┘ ``` -```sql + +``` sql SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val) ``` -```text + +``` text ┌──val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 2))─┐ │ 0.00 │ Decimal(9, 2) │ └──────┴────────────────────────────────────────────────────┘ ``` -## toString +## toString {#tostring} Functions for converting between numbers, strings (but not fixed strings), dates, and dates with times. All these functions accept one argument. 
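For instance, a minimal sketch of the one-argument form (the literal values are illustrative; formatting and parsing follow the rules described below):

``` sql
SELECT
    toString(42) AS int_as_string,           -- '42'
    toDate('2019-01-01') AS string_as_date,  -- 2019-01-01
    toString(toDate('2019-01-01')) AS date_as_string
```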
-When converting to or from a string, the value is formatted or parsed using the same rules as for the TabSeparated format (and almost all other text formats). If the string can't be parsed, an exception is thrown and the request is canceled. +When converting to or from a string, the value is formatted or parsed using the same rules as for the TabSeparated format (and almost all other text formats). If the string can’t be parsed, an exception is thrown and the request is canceled. When converting dates to numbers or vice versa, the date corresponds to the number of days since the beginning of the Unix epoch. When converting dates with times to numbers or vice versa, the date with time corresponds to the number of seconds since the beginning of the Unix epoch. The date and date-with-time formats for the toDate/toDateTime functions are defined as follows: -```text +``` text YYYY-MM-DD YYYY-MM-DD hh:mm:ss ``` -As an exception, if converting from UInt32, Int32, UInt64, or Int64 numeric types to Date, and if the number is greater than or equal to 65536, the number is interpreted as a Unix timestamp (and not as the number of days) and is rounded to the date. This allows support for the common occurrence of writing 'toDate(unix_timestamp)', which otherwise would be an error and would require writing the more cumbersome 'toDate(toDateTime(unix_timestamp))'. +As an exception, if converting from UInt32, Int32, UInt64, or Int64 numeric types to Date, and if the number is greater than or equal to 65536, the number is interpreted as a Unix timestamp (and not as the number of days) and is rounded to the date. This allows support for the common occurrence of writing ‘toDate(unix\_timestamp)’, which otherwise would be an error and would require writing the more cumbersome ‘toDate(toDateTime(unix\_timestamp))’. Conversion between a date and date with time is performed the natural way: by adding a null time or dropping the time. @@ -240,13 +247,13 @@ Conversion between numeric types uses the same rules as assignments between diff Additionally, the toString function of the DateTime argument can take a second String argument containing the name of the time zone. Example: `Asia/Yekaterinburg` In this case, the time is formatted according to the specified time zone. -```sql +``` sql SELECT now() AS now_local, toString(now(), 'Asia/Yekaterinburg') AS now_yekat ``` -```text +``` text ┌───────────now_local─┬─now_yekat───────────┐ │ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ └─────────────────────┴─────────────────────┘ @@ -254,64 +261,64 @@ SELECT Also see the `toUnixTimestamp` function. -## toFixedString(s, N) +## toFixedString(s, N) {#tofixedstrings-n} Converts a String type argument to a FixedString(N) type (a string with fixed length N). N must be a constant. If the string has fewer bytes than N, it is passed with null bytes to the right. If the string has more bytes than N, an exception is thrown. -## toStringCutToZero(s) +## toStringCutToZero(s) {#tostringcuttozeros} Accepts a String or FixedString argument. Returns the String with the content truncated at the first zero byte found. 
Example: -```sql +``` sql SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut ``` -```text +``` text ┌─s─────────────┬─s_cut─┐ │ foo\0\0\0\0\0 │ foo │ └───────────────┴───────┘ ``` -```sql +``` sql SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut ``` -```text +``` text ┌─s──────────┬─s_cut─┐ │ foo\0bar\0 │ foo │ └────────────┴───────┘ ``` -## reinterpretAsUInt(8|16|32|64) +## reinterpretAsUInt(8\|16\|32\|64) {#reinterpretasuint8163264} -## reinterpretAsInt(8|16|32|64) +## reinterpretAsInt(8\|16\|32\|64) {#reinterpretasint8163264} -## reinterpretAsFloat(32|64) +## reinterpretAsFloat(32\|64) {#reinterpretasfloat3264} -## reinterpretAsDate +## reinterpretAsDate {#reinterpretasdate} -## reinterpretAsDateTime +## reinterpretAsDateTime {#reinterpretasdatetime} -These functions accept a string and interpret the bytes placed at the beginning of the string as a number in host order (little endian). If the string isn't long enough, the functions work as if the string is padded with the necessary number of null bytes. If the string is longer than needed, the extra bytes are ignored. A date is interpreted as the number of days since the beginning of the Unix Epoch, and a date with time is interpreted as the number of seconds since the beginning of the Unix Epoch. +These functions accept a string and interpret the bytes placed at the beginning of the string as a number in host order (little endian). If the string isn’t long enough, the functions work as if the string is padded with the necessary number of null bytes. If the string is longer than needed, the extra bytes are ignored. A date is interpreted as the number of days since the beginning of the Unix Epoch, and a date with time is interpreted as the number of seconds since the beginning of the Unix Epoch. -## reinterpretAsString {#type_conversion_functions-reinterpretAsString} +## reinterpretAsString {#type-conversion-functions-reinterpretasstring} This function accepts a number or date or date with time, and returns a string containing bytes representing the corresponding value in host order (little endian). Null bytes are dropped from the end. For example, a UInt32 type value of 255 is a string that is one byte long. -## reinterpretAsFixedString +## reinterpretAsFixedString {#reinterpretasfixedstring} This function accepts a number or date or date with time, and returns a FixedString containing bytes representing the corresponding value in host order (little endian). Null bytes are dropped from the end. For example, a UInt32 type value of 255 is a FixedString that is one byte long. -## CAST(x, t) {#type_conversion_function-cast} +## CAST(x, t) {#type-conversion-function-cast} -Converts 'x' to the 't' data type. The syntax CAST(x AS t) is also supported. +Converts ‘x’ to the ‘t’ data type. The syntax CAST(x AS t) is also supported. 
Example: -```sql +``` sql SELECT '2016-06-15 23:00:00' AS timestamp, CAST(timestamp AS DateTime) AS datetime, @@ -320,7 +327,7 @@ SELECT CAST(timestamp, 'FixedString(22)') AS fixed_string ``` -```text +``` text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ └─────────────────────┴─────────────────────┴────────────┴─────────────────────┴───────────────────────────┘ @@ -330,32 +337,35 @@ Conversion to FixedString(N) only works for arguments of type String or FixedStr Type conversion to [Nullable](../../data_types/nullable.md) and back is supported. Example: -```sql +``` sql SELECT toTypeName(x) FROM t_null ``` -```text + +``` text ┌─toTypeName(x)─┐ │ Int8 │ │ Int8 │ └───────────────┘ ``` -```sql + +``` sql SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null ``` -```text + +``` text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ │ Nullable(UInt16) │ └─────────────────────────────────────────┘ ``` -## toInterval(Year|Quarter|Month|Week|Day|Hour|Minute|Second) {#function-tointerval} +## toInterval(Year\|Quarter\|Month\|Week\|Day\|Hour\|Minute\|Second) {#function-tointerval} Converts a Number type argument to an [Interval](../../data_types/special_data_types/interval.md) data type. **Syntax** -```sql +``` sql toIntervalSecond(number) toIntervalMinute(number) toIntervalHour(number) @@ -368,15 +378,15 @@ toIntervalYear(number) **Parameters** -- `number` — Duration of interval. Positive integer number. +- `number` — Duration of interval. Positive integer number. **Returned values** -- The value in `Interval` data type. +- The value in `Interval` data type. **Example** -```sql +``` sql WITH toDate('2019-01-01') AS date, INTERVAL 1 WEEK AS interval_week, @@ -386,23 +396,23 @@ SELECT date + interval_to_week ``` -```text +``` text ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ │ 2019-01-08 │ 2019-01-08 │ └───────────────────────────┴──────────────────────────────┘ ``` -## parseDateTimeBestEffort {#type_conversion_functions-parsedatetimebesteffort} +## parseDateTimeBestEffort {#type-conversion-functions-parsedatetimebesteffort} Parse a number type argument to a Date or DateTime type. different from toDate and toDateTime, parseDateTimeBestEffort can progress more complex date format. For more information, see the link: [Complex Date Format](https://xkcd.com/1179/) -## parseDateTimeBestEffortOrNull +## parseDateTimeBestEffortOrNull {#parsedatetimebesteffortornull} Same as for [parseDateTimeBestEffort](#type_conversion_functions-parsedatetimebesteffort) except that it returns null when it encounters a date format that cannot be processed. -## parseDateTimeBestEffortOrZero +## parseDateTimeBestEffortOrZero {#parsedatetimebesteffortorzero} Same as for [parseDateTimeBestEffort](#type_conversion_functions-parsedatetimebesteffort) except that it returns zero date or zero date time when it encounters a date format that cannot be processed. diff --git a/docs/en/query_language/functions/url_functions.md b/docs/en/query_language/functions/url_functions.md index 1f5f4d32b03..7e6390acbb5 100644 --- a/docs/en/query_language/functions/url_functions.md +++ b/docs/en/query_language/functions/url_functions.md @@ -1,33 +1,32 @@ -# Functions for working with URLs +# Functions for working with URLs {#functions-for-working-with-urls} -All these functions don't follow the RFC. 
They are maximally simplified for improved performance. +All these functions don’t follow the RFC. They are maximally simplified for improved performance. -## Functions that Extract Parts of a URL +## Functions that Extract Parts of a URL {#functions-that-extract-parts-of-a-url} -If the relevant part isn't present in a URL, an empty string is returned. +If the relevant part isn’t present in a URL, an empty string is returned. -### protocol +### protocol {#protocol} Extracts the protocol from a URL. -Examples of typical returned values: http, https, ftp, mailto, tel, magnet... +Examples of typical returned values: http, https, ftp, mailto, tel, magnet… -### domain +### domain {#domain} Extracts the hostname from a URL. -```sql +``` sql domain(url) ``` **Parameters** -- `url` — URL. Type: [String](../../data_types/string.md). - +- `url` — URL. Type: [String](../../data_types/string.md). The URL can be specified with or without a scheme. Examples: -```text +``` text svn+ssh://some.svn-hosting.com:80/repo/trunk some.svn-hosting.com:80/repo/trunk https://yandex.com/time/ @@ -35,7 +34,7 @@ https://yandex.com/time/ For these examples, the `domain` function returns the following results: -```text +``` text some.svn-hosting.com some.svn-hosting.com yandex.com @@ -43,41 +42,42 @@ yandex.com **Returned values** -- Host name. If ClickHouse can parse the input string as a URL. -- Empty string. If ClickHouse can't parse the input string as a URL. +- Host name. If ClickHouse can parse the input string as a URL. +- Empty string. If ClickHouse can’t parse the input string as a URL. Type: `String`. **Example** -```sql +``` sql SELECT domain('svn+ssh://some.svn-hosting.com:80/repo/trunk') ``` -```text + +``` text ┌─domain('svn+ssh://some.svn-hosting.com:80/repo/trunk')─┐ │ some.svn-hosting.com │ └────────────────────────────────────────────────────────┘ ``` -### domainWithoutWWW +### domainWithoutWWW {#domainwithoutwww} -Returns the domain and removes no more than one 'www.' from the beginning of it, if present. +Returns the domain and removes no more than one ‘www.’ from the beginning of it, if present. -### topLevelDomain +### topLevelDomain {#topleveldomain} Extracts the the top-level domain from a URL. -```sql +``` sql topLevelDomain(url) ``` **Parameters** -- `url` — URL. Type: [String](../../data_types/string.md). +- `url` — URL. Type: [String](../../data_types/string.md). The URL can be specified with or without a scheme. Examples: -```text +``` text svn+ssh://some.svn-hosting.com:80/repo/trunk some.svn-hosting.com:80/repo/trunk https://yandex.com/time/ @@ -85,73 +85,74 @@ https://yandex.com/time/ **Returned values** -- Domain name. If ClickHouse can parse the input string as a URL. -- Empty string. If ClickHouse cannot parse the input string as a URL. +- Domain name. If ClickHouse can parse the input string as a URL. +- Empty string. If ClickHouse cannot parse the input string as a URL. Type: `String`. **Example** -```sql +``` sql SELECT topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk') ``` -```text + +``` text ┌─topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk')─┐ │ com │ └────────────────────────────────────────────────────────────────────┘ ``` -### firstSignificantSubdomain +### firstSignificantSubdomain {#firstsignificantsubdomain} -Returns the "first significant subdomain". This is a non-standard concept specific to Yandex.Metrica. The first significant subdomain is a second-level domain if it is 'com', 'net', 'org', or 'co'. 
Otherwise, it is a third-level domain. For example, firstSignificantSubdomain ('') = 'yandex ', firstSignificantSubdomain ('') = 'yandex '. The list of "insignificant" second-level domains and other implementation details may change in the future. +Returns the “first significant subdomain”. This is a non-standard concept specific to Yandex.Metrica. The first significant subdomain is a second-level domain if it is ‘com’, ‘net’, ‘org’, or ‘co’. Otherwise, it is a third-level domain. For example, firstSignificantSubdomain (‘https://news.yandex.ru/’) = ‘yandex’, firstSignificantSubdomain (‘https://news.yandex.com.tr/’) = ‘yandex’. The list of “insignificant” second-level domains and other implementation details may change in the future. -### cutToFirstSignificantSubdomain +### cutToFirstSignificantSubdomain {#cuttofirstsignificantsubdomain} -Returns the part of the domain that includes top-level subdomains up to the "first significant subdomain" (see the explanation above). +Returns the part of the domain that includes top-level subdomains up to the “first significant subdomain” (see the explanation above). For example, `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`. -### path +### path {#path} Returns the path. Example: `/top/news.html` The path does not include the query string. -### pathFull +### pathFull {#pathfull} The same as above, but including query string and fragment. Example: /top/news.html?page=2\#comments -### queryString +### queryString {#querystring} Returns the query string. Example: page=1&lr=213. query-string does not include the initial question mark, as well as \# and everything after \#. -### fragment +### fragment {#fragment} Returns the fragment identifier. fragment does not include the initial hash symbol. -### queryStringAndFragment +### queryStringAndFragment {#querystringandfragment} Returns the query string and fragment identifier. Example: page=1\#29390. -### extractURLParameter(URL, name) +### extractURLParameter(URL, name) {#extracturlparameterurl-name} -Returns the value of the 'name' parameter in the URL, if present. Otherwise, an empty string. If there are many parameters with this name, it returns the first occurrence. This function works under the assumption that the parameter name is encoded in the URL exactly the same way as in the passed argument. +Returns the value of the ‘name’ parameter in the URL, if present. Otherwise, an empty string. If there are many parameters with this name, it returns the first occurrence. This function works under the assumption that the parameter name is encoded in the URL exactly the same way as in the passed argument. -### extractURLParameters(URL) +### extractURLParameters(URL) {#extracturlparametersurl} Returns an array of name=value strings corresponding to the URL parameters. The values are not decoded in any way. -### extractURLParameterNames(URL) +### extractURLParameterNames(URL) {#extracturlparameternamesurl} Returns an array of name strings corresponding to the names of URL parameters. The values are not decoded in any way. -### URLHierarchy(URL) +### URLHierarchy(URL) {#urlhierarchyurl} Returns an array containing the URL, truncated at the end by the symbols /,? in the path and query-string. Consecutive separator characters are counted as one. The cut is made in the position after all the consecutive separator characters. -### URLPathHierarchy(URL) +### URLPathHierarchy(URL) {#urlpathhierarchyurl} The same as above, but without the protocol and host in the result. 
The / element (root) is not included. Example: the function is used to implement tree reports the URL in Yandex. Metric. -```text +``` text URLPathHierarchy('https://example.com/browse/CONV-6788') = [ '/browse/', @@ -159,44 +160,43 @@ URLPathHierarchy('https://example.com/browse/CONV-6788') = ] ``` -### decodeURLComponent(URL) +### decodeURLComponent(URL) {#decodeurlcomponenturl} Returns the decoded URL. Example: -```sql +``` sql SELECT decodeURLComponent('http://127.0.0.1:8123/?query=SELECT%201%3B') AS DecodedURL; ``` -```text +``` text ┌─DecodedURL─────────────────────────────┐ │ http://127.0.0.1:8123/?query=SELECT 1; │ └────────────────────────────────────────┘ ``` -## Functions that remove part of a URL. +## Functions that remove part of a URL. {#functions-that-remove-part-of-a-url} -If the URL doesn't have anything similar, the URL remains unchanged. +If the URL doesn’t have anything similar, the URL remains unchanged. -### cutWWW +### cutWWW {#cutwww} -Removes no more than one 'www.' from the beginning of the URL's domain, if present. +Removes no more than one ‘www.’ from the beginning of the URL’s domain, if present. -### cutQueryString +### cutQueryString {#cutquerystring} Removes query string. The question mark is also removed. -### cutFragment +### cutFragment {#cutfragment} Removes the fragment identifier. The number sign is also removed. -### cutQueryStringAndFragment +### cutQueryStringAndFragment {#cutquerystringandfragment} Removes the query string and fragment identifier. The question mark and number sign are also removed. -### cutURLParameter(URL, name) - -Removes the 'name' URL parameter, if present. This function works under the assumption that the parameter name is encoded in the URL exactly the same way as in the passed argument. +### cutURLParameter(URL, name) {#cuturlparameterurl-name} +Removes the ‘name’ URL parameter, if present. This function works under the assumption that the parameter name is encoded in the URL exactly the same way as in the passed argument. [Original article](https://clickhouse.tech/docs/en/query_language/functions/url_functions/) diff --git a/docs/en/query_language/functions/uuid_functions.md b/docs/en/query_language/functions/uuid_functions.md index af8ba7f84f2..1a5a3fa79fc 100644 --- a/docs/en/query_language/functions/uuid_functions.md +++ b/docs/en/query_language/functions/uuid_functions.md @@ -1,12 +1,12 @@ -# Functions for working with UUID +# Functions for working with UUID {#functions-for-working-with-uuid} The functions for working with UUID are listed below. -## generateUUIDv4 {#uuid_function-generate} +## generateUUIDv4 {#uuid-function-generate} Generates the [UUID](../../data_types/uuid.md) of [version 4](https://tools.ietf.org/html/rfc4122#section-4.4). -```sql +``` sql generateUUIDv4() ``` @@ -18,24 +18,25 @@ The UUID type value. This example demonstrates creating a table with the UUID type column and inserting a value into the table. -```sql +``` sql CREATE TABLE t_uuid (x UUID) ENGINE=TinyLog INSERT INTO t_uuid SELECT generateUUIDv4() SELECT * FROM t_uuid ``` -```text + +``` text ┌────────────────────────────────────x─┐ │ f4bf890f-f9dc-4332-ad5c-0c18e73f28e9 │ └──────────────────────────────────────┘ ``` -## toUUID (x) +## toUUID (x) {#touuid-x} Converts String type value to UUID type. -```sql +``` sql toUUID(String) ``` @@ -45,20 +46,21 @@ The UUID type value. 
**Usage example** -```sql +``` sql SELECT toUUID('61f0c404-5cb3-11e7-907b-a6006ad3dba0') AS uuid ``` -```text + +``` text ┌─────────────────────────────────uuid─┐ │ 61f0c404-5cb3-11e7-907b-a6006ad3dba0 │ └──────────────────────────────────────┘ ``` -## UUIDStringToNum +## UUIDStringToNum {#uuidstringtonum} Accepts a string containing 36 characters in the format `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`, and returns it as a set of bytes in a [FixedString(16)](../../data_types/fixedstring.md). -```sql +``` sql UUIDStringToNum(String) ``` @@ -68,23 +70,24 @@ FixedString(16) **Usage examples** -```sql +``` sql SELECT '612f3c40-5d3b-217e-707b-6a546a3d7b29' AS uuid, UUIDStringToNum(uuid) AS bytes ``` -```text + +``` text ┌─uuid─────────────────────────────────┬─bytes────────────┐ │ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ a/<@];!~p{jTj={) │ └──────────────────────────────────────┴──────────────────┘ ``` -## UUIDNumToString +## UUIDNumToString {#uuidnumtostring} Accepts a [FixedString(16)](../../data_types/fixedstring.md) value, and returns a string containing 36 characters in text format. -```sql +``` sql UUIDNumToString(FixedString(16)) ``` @@ -94,19 +97,20 @@ String. **Usage example** -```sql +``` sql SELECT 'a/<@];!~p{jTj={)' AS bytes, UUIDNumToString(toFixedString(bytes, 16)) AS uuid ``` -```text + +``` text ┌─bytes────────────┬─uuid─────────────────────────────────┐ │ a/<@];!~p{jTj={) │ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ └──────────────────┴──────────────────────────────────────┘ ``` -## See also +## See also {#see-also} -- [dictGetUUID](ext_dict_functions.md#ext_dict_functions-other) +- [dictGetUUID](ext_dict_functions.md#ext_dict_functions-other) [Original article](https://clickhouse.tech/docs/en/query_language/functions/uuid_function/) diff --git a/docs/en/query_language/functions/ym_dict_functions.md b/docs/en/query_language/functions/ym_dict_functions.md index e83e86d73f9..0d556b0057c 100644 --- a/docs/en/query_language/functions/ym_dict_functions.md +++ b/docs/en/query_language/functions/ym_dict_functions.md @@ -1,46 +1,46 @@ -# Functions for working with Yandex.Metrica dictionaries +# Functions for working with Yandex.Metrica dictionaries {#functions-for-working-with-yandex-metrica-dictionaries} -In order for the functions below to work, the server config must specify the paths and addresses for getting all the Yandex.Metrica dictionaries. The dictionaries are loaded at the first call of any of these functions. If the reference lists can't be loaded, an exception is thrown. +In order for the functions below to work, the server config must specify the paths and addresses for getting all the Yandex.Metrica dictionaries. The dictionaries are loaded at the first call of any of these functions. If the reference lists can’t be loaded, an exception is thrown. -For information about creating reference lists, see the section "Dictionaries". +For information about creating reference lists, see the section “Dictionaries”. -## Multiple geobases +## Multiple geobases {#multiple-geobases} ClickHouse supports working with multiple alternative geobases (regional hierarchies) simultaneously, in order to support various perspectives on which countries certain regions belong to. 
-The 'clickhouse-server' config specifies the file with the regional hierarchy::`/opt/geo/regions_hierarchy.txt` +The ‘clickhouse-server’ config specifies the file with the regional hierarchy::`/opt/geo/regions_hierarchy.txt` -Besides this file, it also searches for files nearby that have the _ symbol and any suffix appended to the name (before the file extension). +Besides this file, it also searches for files nearby that have the \_ symbol and any suffix appended to the name (before the file extension). For example, it will also find the file `/opt/geo/regions_hierarchy_ua.txt`, if present. `ua` is called the dictionary key. For a dictionary without a suffix, the key is an empty string. -All the dictionaries are re-loaded in runtime (once every certain number of seconds, as defined in the builtin_dictionaries_reload_interval config parameter, or once an hour by default). However, the list of available dictionaries is defined one time, when the server starts. +All the dictionaries are re-loaded in runtime (once every certain number of seconds, as defined in the builtin\_dictionaries\_reload\_interval config parameter, or once an hour by default). However, the list of available dictionaries is defined one time, when the server starts. All functions for working with regions have an optional argument at the end – the dictionary key. It is referred to as the geobase. Example: -```sql +``` sql regionToCountry(RegionID) – Uses the default dictionary: /opt/geo/regions_hierarchy.txt regionToCountry(RegionID, '') – Uses the default dictionary: /opt/geo/regions_hierarchy.txt regionToCountry(RegionID, 'ua') – Uses the dictionary for the 'ua' key: /opt/geo/regions_hierarchy_ua.txt ``` -### regionToCity(id[, geobase]) +### regionToCity(id\[, geobase\]) {#regiontocityid-geobase} Accepts a UInt32 number – the region ID from the Yandex geobase. If this region is a city or part of a city, it returns the region ID for the appropriate city. Otherwise, returns 0. -### regionToArea(id\[, geobase\]) +### regionToArea(id\[, geobase\]) {#regiontoareaid-geobase} -Converts a region to an area (type 5 in the geobase). In every other way, this function is the same as 'regionToCity'. +Converts a region to an area (type 5 in the geobase). In every other way, this function is the same as ‘regionToCity’. -```sql +``` sql SELECT DISTINCT regionToName(regionToArea(toUInt32(number), 'ua')) FROM system.numbers LIMIT 15 ``` -```text +``` text ┌─regionToName(regionToArea(toUInt32(number), \'ua\'))─┐ │ │ │ Moscow and Moscow region │ @@ -60,17 +60,17 @@ LIMIT 15 └──────────────────────────────────────────────────────┘ ``` -### regionToDistrict(id[, geobase]) +### regionToDistrict(id\[, geobase\]) {#regiontodistrictid-geobase} -Converts a region to a federal district (type 4 in the geobase). In every other way, this function is the same as 'regionToCity'. +Converts a region to a federal district (type 4 in the geobase). In every other way, this function is the same as ‘regionToCity’. -```sql +``` sql SELECT DISTINCT regionToName(regionToDistrict(toUInt32(number), 'ua')) FROM system.numbers LIMIT 15 ``` -```text +``` text ┌─regionToName(regionToDistrict(toUInt32(number), \'ua\'))─┐ │ │ │ Central federal district │ @@ -90,38 +90,37 @@ LIMIT 15 └──────────────────────────────────────────────────────────┘ ``` -### regionToCountry(id[, geobase]) +### regionToCountry(id\[, geobase\]) {#regiontocountryid-geobase} -Converts a region to a country. In every other way, this function is the same as 'regionToCity'. 
+Converts a region to a country. In every other way, this function is the same as ‘regionToCity’. Example: `regionToCountry(toUInt32(213)) = 225` converts Moscow (213) to Russia (225). -### regionToContinent(id[, geobase]) +### regionToContinent(id\[, geobase\]) {#regiontocontinentid-geobase} -Converts a region to a continent. In every other way, this function is the same as 'regionToCity'. +Converts a region to a continent. In every other way, this function is the same as ‘regionToCity’. Example: `regionToContinent(toUInt32(213)) = 10001` converts Moscow (213) to Eurasia (10001). -### regionToPopulation(id[, geobase]) +### regionToPopulation(id\[, geobase\]) {#regiontopopulationid-geobase} Gets the population for a region. -The population can be recorded in files with the geobase. See the section "External dictionaries". +The population can be recorded in files with the geobase. See the section “External dictionaries”. If the population is not recorded for the region, it returns 0. In the Yandex geobase, the population might be recorded for child regions, but not for parent regions. -### regionIn(lhs, rhs[, geobase]) +### regionIn(lhs, rhs\[, geobase\]) {#regioninlhs-rhs-geobase} -Checks whether a 'lhs' region belongs to a 'rhs' region. Returns a UInt8 number equal to 1 if it belongs, or 0 if it doesn't belong. +Checks whether a ‘lhs’ region belongs to a ‘rhs’ region. Returns a UInt8 number equal to 1 if it belongs, or 0 if it doesn’t belong. The relationship is reflexive – any region also belongs to itself. -### regionHierarchy(id\[, geobase\]) +### regionHierarchy(id\[, geobase\]) {#regionhierarchyid-geobase} Accepts a UInt32 number – the region ID from the Yandex geobase. Returns an array of region IDs consisting of the passed region and all parents along the chain. Example: `regionHierarchy(toUInt32(213)) = [213,1,3,225,10001,10000]`. -### regionToName(id\[, lang\]) +### regionToName(id\[, lang\]) {#regiontonameid-lang} -Accepts a UInt32 number – the region ID from the Yandex geobase. A string with the name of the language can be passed as a second argument. Supported languages are: ru, en, ua, uk, by, kz, tr. If the second argument is omitted, the language 'ru' is used. If the language is not supported, an exception is thrown. Returns a string – the name of the region in the corresponding language. If the region with the specified ID doesn't exist, an empty string is returned. +Accepts a UInt32 number – the region ID from the Yandex geobase. A string with the name of the language can be passed as a second argument. Supported languages are: ru, en, ua, uk, by, kz, tr. If the second argument is omitted, the language ‘ru’ is used. If the language is not supported, an exception is thrown. Returns a string – the name of the region in the corresponding language. If the region with the specified ID doesn’t exist, an empty string is returned. `ua` and `uk` both mean Ukrainian. 
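For example, assuming a default geobase is loaded (region names depend entirely on the geobase files configured on the server), the Moscow region ID 213 used in the examples above can be resolved to its English name:

``` sql
SELECT regionToName(toUInt32(213), 'en') AS region_name
```

With such a geobase this would return `Moscow`; if the ID is not present in the geobase, an empty string is returned.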
- [Original article](https://clickhouse.tech/docs/en/query_language/functions/ym_dict_functions/) diff --git a/docs/en/query_language/index.md b/docs/en/query_language/index.md index d292f2d92db..024281e2ac0 100644 --- a/docs/en/query_language/index.md +++ b/docs/en/query_language/index.md @@ -1,10 +1,9 @@ -# SQL Reference - -* [SELECT](select.md) -* [INSERT INTO](insert_into.md) -* [CREATE](create.md) -* [ALTER](alter.md#query_language_queries_alter) -* [Other types of queries](misc.md) +# SQL Reference {#sql-reference} +- [SELECT](select.md) +- [INSERT INTO](insert_into.md) +- [CREATE](create.md) +- [ALTER](alter.md#query_language_queries_alter) +- [Other types of queries](misc.md) [Original article](https://clickhouse.tech/docs/en/query_language/) diff --git a/docs/en/query_language/insert_into.md b/docs/en/query_language/insert_into.md index e2bf226c298..5ab3c6c6785 100644 --- a/docs/en/query_language/insert_into.md +++ b/docs/en/query_language/insert_into.md @@ -1,30 +1,29 @@ - -## INSERT +## INSERT {#insert} Adding data. Basic query format: -```sql +``` sql INSERT INTO [db.]table [(c1, c2, c3)] VALUES (v11, v12, v13), (v21, v22, v23), ... ``` The query can specify a list of columns to insert `[(c1, c2, c3)]`. In this case, the rest of the columns are filled with: -- The values calculated from the `DEFAULT` expressions specified in the table definition. -- Zeros and empty strings, if `DEFAULT` expressions are not defined. +- The values calculated from the `DEFAULT` expressions specified in the table definition. +- Zeros and empty strings, if `DEFAULT` expressions are not defined. -If [strict_insert_defaults=1](../operations/settings/settings.md), columns that do not have `DEFAULT` defined must be listed in the query. +If [strict\_insert\_defaults=1](../operations/settings/settings.md), columns that do not have `DEFAULT` defined must be listed in the query. Data can be passed to the INSERT in any [format](../interfaces/formats.md#formats) supported by ClickHouse. The format must be specified explicitly in the query: -```sql +``` sql INSERT INTO [db.]table [(c1, c2, c3)] FORMAT format_name data_set ``` -For example, the following query format is identical to the basic version of INSERT ... VALUES: +For example, the following query format is identical to the basic version of INSERT … VALUES: -```sql +``` sql INSERT INTO [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23), ... ``` @@ -32,21 +31,21 @@ ClickHouse removes all spaces and one line feed (if there is one) before the dat Example: -```sql +``` sql INSERT INTO t FORMAT TabSeparated 11 Hello, world! 22 Qwerty ``` -You can insert data separately from the query by using the command-line client or the HTTP interface. For more information, see the section "[Interfaces](../interfaces/index.md#interfaces)". +You can insert data separately from the query by using the command-line client or the HTTP interface. For more information, see the section “[Interfaces](../interfaces/index.md#interfaces)”. -### Constraints +### Constraints {#constraints} If table has [constraints](create.md#constraints), their expressions will be checked for each row of inserted data. If any of those constraints is not satisfied — server will raise an exception containing constraint name and expression, the query will be stopped. -### Inserting The Results of `SELECT` {#insert_query_insert-select} +### Inserting The Results of `SELECT` {#insert-query-insert-select} -```sql +``` sql INSERT INTO [db.]table [(c1, c2, c3)] SELECT ... 
``` @@ -59,17 +58,16 @@ However, you can delete old data using `ALTER TABLE ... DROP PARTITION`. `FORMAT` clause must be specified in the end of query if `SELECT` clause contains table function [input()](table_functions/input.md). -### Performance Considerations +### Performance Considerations {#performance-considerations} `INSERT` sorts the input data by primary key and splits them into partitions by a partition key. If you insert data into several partitions at once, it can significantly reduce the performance of the `INSERT` query. To avoid this: -- Add data in fairly large batches, such as 100,000 rows at a time. -- Group data by a partition key before uploading it to ClickHouse. +- Add data in fairly large batches, such as 100,000 rows at a time. +- Group data by a partition key before uploading it to ClickHouse. Performance will not decrease if: -- Data is added in real time. -- You upload data that is usually sorted by time. - +- Data is added in real time. +- You upload data that is usually sorted by time. [Original article](https://clickhouse.tech/docs/en/query_language/insert_into/) diff --git a/docs/en/query_language/misc.md b/docs/en/query_language/misc.md index cbdbf0318f8..69e0ad79434 100644 --- a/docs/en/query_language/misc.md +++ b/docs/en/query_language/misc.md @@ -1,47 +1,47 @@ -# Miscellaneous Queries +# Miscellaneous Queries {#miscellaneous-queries} -## ATTACH +## ATTACH {#attach} This query is exactly the same as `CREATE`, but -- Instead of the word `CREATE` it uses the word `ATTACH`. -- The query does not create data on the disk, but assumes that data is already in the appropriate places, and just adds information about the table to the server. -After executing an ATTACH query, the server will know about the existence of the table. +- Instead of the word `CREATE` it uses the word `ATTACH`. +- The query does not create data on the disk, but assumes that data is already in the appropriate places, and just adds information about the table to the server. + After executing an ATTACH query, the server will know about the existence of the table. -If the table was previously detached (``DETACH``), meaning that its structure is known, you can use shorthand without defining the structure. +If the table was previously detached (`DETACH`), meaning that its structure is known, you can use shorthand without defining the structure. -```sql +``` sql ATTACH TABLE [IF NOT EXISTS] [db.]name [ON CLUSTER cluster] ``` This query is used when starting the server. The server stores table metadata as files with `ATTACH` queries, which it simply runs at launch (with the exception of system tables, which are explicitly created on the server). -## CHECK TABLE +## CHECK TABLE {#check-table} Checks if the data in the table is corrupted. -```sql +``` sql CHECK TABLE [db.]name ``` The `CHECK TABLE` query compares actual file sizes with the expected values which are stored on the server. If the file sizes do not match the stored values, it means the data is corrupted. This can be caused, for example, by a system crash during query execution. The query response contains the `result` column with a single row. The row has a value of - [Boolean](../data_types/boolean.md) type: +[Boolean](../data_types/boolean.md) type: -- 0 - The data in the table is corrupted. -- 1 - The data maintains integrity. +- 0 - The data in the table is corrupted. +- 1 - The data maintains integrity. 
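For example, for a hypothetical healthy table `test.visits` (the name is illustrative only), the response is a single row:

``` sql
CHECK TABLE test.visits
```

``` text
┌─result─┐
│      1 │
└────────┘
```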
The `CHECK TABLE` query supports the following table engines: -- [Log](../operations/table_engines/log.md) -- [TinyLog](../operations/table_engines/tinylog.md) -- [StripeLog](../operations/table_engines/stripelog.md) -- [MergeTree family](../operations/table_engines/mergetree.md) +- [Log](../operations/table_engines/log.md) +- [TinyLog](../operations/table_engines/tinylog.md) +- [StripeLog](../operations/table_engines/stripelog.md) +- [MergeTree family](../operations/table_engines/mergetree.md) Performed over the tables with another table engines causes an exception. -Engines from the `*Log` family don't provide automatic data recovery on failure. Use the `CHECK TABLE` query to track data loss in a timely manner. +Engines from the `*Log` family don’t provide automatic data recovery on failure. Use the `CHECK TABLE` query to track data loss in a timely manner. For `MergeTree` family engines, the `CHECK TABLE` query shows a check status for every individual data part of a table on the local server. @@ -49,76 +49,74 @@ For `MergeTree` family engines, the `CHECK TABLE` query shows a check status for If the table is corrupted, you can copy the non-corrupted data to another table. To do this: -1. Create a new table with the same structure as damaged table. To do this execute the query `CREATE TABLE AS `. -2. Set the [max_threads](../operations/settings/settings.md#settings-max_threads) value to 1 to process the next query in a single thread. To do this run the query `SET max_threads = 1`. -3. Execute the query `INSERT INTO SELECT * FROM `. This request copies the non-corrupted data from the damaged table to another table. Only the data before the corrupted part will be copied. -4. Restart the `clickhouse-client` to reset the `max_threads` value. +1. Create a new table with the same structure as damaged table. To do this execute the query `CREATE TABLE AS `. +2. Set the [max\_threads](../operations/settings/settings.md#settings-max_threads) value to 1 to process the next query in a single thread. To do this run the query `SET max_threads = 1`. +3. Execute the query `INSERT INTO SELECT * FROM `. This request copies the non-corrupted data from the damaged table to another table. Only the data before the corrupted part will be copied. +4. Restart the `clickhouse-client` to reset the `max_threads` value. ## DESCRIBE TABLE {#misc-describe-table} -```sql +``` sql DESC|DESCRIBE TABLE [db.]table [INTO OUTFILE filename] [FORMAT format] ``` Returns the following `String` type columns: -- `name` — Column name. -- `type`— Column type. -- `default_type` — Clause that is used in [default expression](create.md#create-default-values) (`DEFAULT`, `MATERIALIZED` or `ALIAS`). Column contains an empty string, if the default expression isn't specified. -- `default_expression` — Value specified in the `DEFAULT` clause. -- `comment_expression` — Comment text. +- `name` — Column name. +- `type`— Column type. +- `default_type` — Clause that is used in [default expression](create.md#create-default-values) (`DEFAULT`, `MATERIALIZED` or `ALIAS`). Column contains an empty string, if the default expression isn’t specified. +- `default_expression` — Value specified in the `DEFAULT` clause. +- `comment_expression` — Comment text. -Nested data structures are output in "expanded" format. Each column is shown separately, with the name after a dot. +Nested data structures are output in “expanded” format. Each column is shown separately, with the name after a dot. 
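For example, for a hypothetical table with a column that has a `DEFAULT` expression (the names are illustrative only):

``` sql
CREATE TABLE test.describe_example (id UInt64, created DateTime DEFAULT now()) ENGINE = Log;

DESCRIBE TABLE test.describe_example;
```

For the `created` column, `default_type` would contain `DEFAULT` and `default_expression` would contain `now()`; for `id`, both of these columns would be empty strings.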
-## DETACH +## DETACH {#detach} -Deletes information about the 'name' table from the server. The server stops knowing about the table's existence. +Deletes information about the ‘name’ table from the server. The server stops knowing about the table’s existence. -```sql +``` sql DETACH TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] ``` -This does not delete the table's data or metadata. On the next server launch, the server will read the metadata and find out about the table again. -Similarly, a "detached" table can be re-attached using the `ATTACH` query (with the exception of system tables, which do not have metadata stored for them). +This does not delete the table’s data or metadata. On the next server launch, the server will read the metadata and find out about the table again. +Similarly, a “detached” table can be re-attached using the `ATTACH` query (with the exception of system tables, which do not have metadata stored for them). There is no `DETACH DATABASE` query. -## DROP +## DROP {#drop} This query has two types: `DROP DATABASE` and `DROP TABLE`. -```sql +``` sql DROP DATABASE [IF EXISTS] db [ON CLUSTER cluster] ``` -Deletes all tables inside the 'db' database, then deletes the 'db' database itself. -If `IF EXISTS` is specified, it doesn't return an error if the database doesn't exist. +Deletes all tables inside the ‘db’ database, then deletes the ‘db’ database itself. +If `IF EXISTS` is specified, it doesn’t return an error if the database doesn’t exist. -```sql +``` sql DROP [TEMPORARY] TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] ``` Deletes the table. -If `IF EXISTS` is specified, it doesn't return an error if the table doesn't exist or the database doesn't exist. +If `IF EXISTS` is specified, it doesn’t return an error if the table doesn’t exist or the database doesn’t exist. -``` -DROP DICTIONARY [IF EXISTS] [db.]name -``` + DROP DICTIONARY [IF EXISTS] [db.]name Delets the dictionary. -If `IF EXISTS` is specified, it doesn't return an error if the table doesn't exist or the database doesn't exist. +If `IF EXISTS` is specified, it doesn’t return an error if the table doesn’t exist or the database doesn’t exist. -## EXISTS +## EXISTS {#exists} -```sql +``` sql EXISTS [TEMPORARY] [TABLE|DICTIONARY] [db.]name [INTO OUTFILE filename] [FORMAT format] ``` -Returns a single `UInt8`-type column, which contains the single value `0` if the table or database doesn't exist, or `1` if the table exists in the specified database. +Returns a single `UInt8`-type column, which contains the single value `0` if the table or database doesn’t exist, or `1` if the table exists in the specified database. -## KILL QUERY +## KILL QUERY {#kill-query} -```sql +``` sql KILL QUERY [ON CLUSTER cluster] WHERE [SYNC|ASYNC|TEST] @@ -130,7 +128,7 @@ The queries to terminate are selected from the system.processes table using the Examples: -```sql +``` sql -- Forcibly terminates all queries with the specified query_id: KILL QUERY WHERE query_id='2-857d-4a57-9ee0-327da5d60a90' @@ -140,20 +138,20 @@ KILL QUERY WHERE user='username' SYNC Read-only users can only stop their own queries. -By default, the asynchronous version of queries is used (`ASYNC`), which doesn't wait for confirmation that queries have stopped. +By default, the asynchronous version of queries is used (`ASYNC`), which doesn’t wait for confirmation that queries have stopped. The synchronous version (`SYNC`) waits for all queries to stop and displays information about each process as it stops. 
The response contains the `kill_status` column, which can take the following values: -1. 'finished' – The query was terminated successfully. -2. 'waiting' – Waiting for the query to end after sending it a signal to terminate. -3. The other values ​​explain why the query can't be stopped. +1. ‘finished’ – The query was terminated successfully. +2. ‘waiting’ – Waiting for the query to end after sending it a signal to terminate. +3. The other values ​​explain why the query can’t be stopped. -A test query (`TEST`) only checks the user's rights and displays a list of queries to stop. +A test query (`TEST`) only checks the user’s rights and displays a list of queries to stop. ## KILL MUTATION {#kill-mutation} -```sql +``` sql KILL MUTATION [ON CLUSTER cluster] WHERE [TEST] @@ -162,11 +160,11 @@ KILL MUTATION [ON CLUSTER cluster] Tries to cancel and remove [mutations](alter.md#alter-mutations) that are currently executing. Mutations to cancel are selected from the [`system.mutations`](../operations/system_tables.md#system_tables-mutations) table using the filter specified by the `WHERE` clause of the `KILL` query. -A test query (`TEST`) only checks the user's rights and displays a list of queries to stop. +A test query (`TEST`) only checks the user’s rights and displays a list of queries to stop. Examples: -```sql +``` sql -- Cancel and remove all mutations of the single table: KILL MUTATION WHERE database = 'default' AND table = 'table' @@ -174,35 +172,35 @@ KILL MUTATION WHERE database = 'default' AND table = 'table' KILL MUTATION WHERE database = 'default' AND table = 'table' AND mutation_id = 'mutation_3.txt' ``` -The query is useful when a mutation is stuck and cannot finish (e.g. if some function in the mutation query throws an exception when applied to the data contained in the table). +The query is useful when a mutation is stuck and cannot finish (e.g. if some function in the mutation query throws an exception when applied to the data contained in the table). Changes already made by the mutation are not rolled back. -## OPTIMIZE {#misc_operations-optimize} +## OPTIMIZE {#misc-operations-optimize} -```sql +``` sql OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE] ``` This query tries to initialize an unscheduled merge of data parts for tables with a table engine from the [MergeTree](../operations/table_engines/mergetree.md) family. -The `OPTMIZE` query is also supported for the [MaterializedView](../operations/table_engines/materializedview.md) and the [Buffer](../operations/table_engines/buffer.md) engines. Other table engines aren't supported. +The `OPTMIZE` query is also supported for the [MaterializedView](../operations/table_engines/materializedview.md) and the [Buffer](../operations/table_engines/buffer.md) engines. Other table engines aren’t supported. When `OPTIMIZE` is used with the [ReplicatedMergeTree](../operations/table_engines/replication.md) family of table engines, ClickHouse creates a task for merging and waits for execution on all nodes (if the `replication_alter_partitions_sync` setting is enabled). -- If `OPTIMIZE` doesn't perform a merge for any reason, it doesn't notify the client. To enable notifications, use the [optimize_throw_if_noop](../operations/settings/settings.md#setting-optimize_throw_if_noop) setting. -- If you specify a `PARTITION`, only the specified partition is optimized. [How to set partition expression](alter.md#alter-how-to-specify-part-expr). 
-- If you specify `FINAL`, optimization is performed even when all the data is already in one part. -- If you specify `DEDUPLICATE`, then completely identical rows will be deduplicated (all columns are compared), it makes sense only for the MergeTree engine. +- If `OPTIMIZE` doesn’t perform a merge for any reason, it doesn’t notify the client. To enable notifications, use the [optimize\_throw\_if\_noop](../operations/settings/settings.md#setting-optimize_throw_if_noop) setting. +- If you specify a `PARTITION`, only the specified partition is optimized. [How to set partition expression](alter.md#alter-how-to-specify-part-expr). +- If you specify `FINAL`, optimization is performed even when all the data is already in one part. +- If you specify `DEDUPLICATE`, then completely identical rows will be deduplicated (all columns are compared), it makes sense only for the MergeTree engine. !!! warning "Warning" - `OPTIMIZE` can't fix the "Too many parts" error. + `OPTIMIZE` can’t fix the “Too many parts” error. -## RENAME {#misc_operations-rename} +## RENAME {#misc-operations-rename} Renames one or more tables. -```sql +``` sql RENAME TABLE [db11.]name11 TO [db12.]name12, [db21.]name21 TO [db22.]name22, ... [ON CLUSTER cluster] ``` @@ -210,7 +208,7 @@ All tables are renamed under global locking. Renaming tables is a light operatio ## SET {#query-set} -```sql +``` sql SET param = value ``` @@ -218,15 +216,15 @@ Assigns `value` to the `param` [setting](../operations/settings/index.md) for th You can also set all the values from the specified settings profile in a single query. -```sql +``` sql SET profile = 'profile-name-from-the-settings-file' ``` For more information, see [Settings](../operations/settings/settings.md). -## TRUNCATE +## TRUNCATE {#truncate} -```sql +``` sql TRUNCATE TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] ``` @@ -234,14 +232,14 @@ Removes all data from a table. When the clause `IF EXISTS` is omitted, the query The `TRUNCATE` query is not supported for [View](../operations/table_engines/view.md), [File](../operations/table_engines/file.md), [URL](../operations/table_engines/url.md) and [Null](../operations/table_engines/null.md) table engines. -## USE +## USE {#use} -```sql +``` sql USE db ``` Lets you set the current database for the session. The current database is used for searching for tables if the database is not explicitly defined in the query with a dot before the table name. -This query can't be made when using the HTTP protocol, since there is no concept of a session. +This query can’t be made when using the HTTP protocol, since there is no concept of a session. [Original article](https://clickhouse.tech/docs/en/query_language/misc/) diff --git a/docs/en/query_language/operators.md b/docs/en/query_language/operators.md index 3c280a3a399..757e7f4866a 100644 --- a/docs/en/query_language/operators.md +++ b/docs/en/query_language/operators.md @@ -1,19 +1,19 @@ -# Operators +# Operators {#operators} All operators are transformed to their corresponding functions at the query parsing stage in accordance with their precedence and associativity. Groups of operators are listed in order of priority (the higher it is in the list, the earlier the operator is connected to its arguments). -## Access Operators +## Access Operators {#access-operators} `a[N]` – Access to an element of an array. The `arrayElement(a, N)` function. `a.N` – Access to a tuple element. The `tupleElement(a, N)` function. 
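A minimal sketch of both operators on arbitrary literals (array indexes in ClickHouse are 1-based):

``` sql
SELECT
    [10, 20, 30][2] AS arr_elem,               -- arrayElement: returns 20
    tupleElement((1, 'hello'), 2) AS tup_elem  -- functional form of a.N: returns 'hello'
```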
-## Numeric Negation Operator +## Numeric Negation Operator {#numeric-negation-operator} `-a` – The `negate (a)` function. -## Multiplication and Division Operators +## Multiplication and Division Operators {#multiplication-and-division-operators} `a * b` – The `multiply (a, b)` function. @@ -21,13 +21,13 @@ Groups of operators are listed in order of priority (the higher it is in the lis `a % b` – The `modulo(a, b)` function. -## Addition and Subtraction Operators +## Addition and Subtraction Operators {#addition-and-subtraction-operators} `a + b` – The `plus(a, b)` function. `a - b` – The `minus(a, b)` function. -## Comparison Operators +## Comparison Operators {#comparison-operators} `a = b` – The `equals(a, b)` function. @@ -53,7 +53,7 @@ Groups of operators are listed in order of priority (the higher it is in the lis `a NOT BETWEEN b AND c` – The same as `a < b OR a > c`. -## Operators for Working With Data Sets +## Operators for Working With Data Sets {#operators-for-working-with-data-sets} *See [IN operators](select.md#select-in-operators).* @@ -69,7 +69,7 @@ Groups of operators are listed in order of priority (the higher it is in the lis ### EXTRACT {#operator-extract} -```sql +``` sql EXTRACT(part FROM date); ``` @@ -77,12 +77,12 @@ Extracts a part from a given date. For example, you can retrieve a month from a The `part` parameter specifies which part of the date to retrieve. The following values are available: -- `DAY` — The day of the month. Possible values: 1–31. -- `MONTH` — The number of a month. Possible values: 1–12. -- `YEAR` — The year. -- `SECOND` — The second. Possible values: 0–59. -- `MINUTE` — The minute. Possible values: 0–59. -- `HOUR` — The hour. Possible values: 0–23. +- `DAY` — The day of the month. Possible values: 1–31. +- `MONTH` — The number of a month. Possible values: 1–12. +- `YEAR` — The year. +- `SECOND` — The second. Possible values: 0–59. +- `MINUTE` — The minute. Possible values: 0–59. +- `HOUR` — The hour. Possible values: 0–23. The `part` parameter is case-insensitive. @@ -90,7 +90,7 @@ The `date` parameter specifies the date or the time to process. Either [Date](.. Examples: -```sql +``` sql SELECT EXTRACT(DAY FROM toDate('2017-06-15')); SELECT EXTRACT(MONTH FROM toDate('2017-06-15')); SELECT EXTRACT(YEAR FROM toDate('2017-06-15')); @@ -98,7 +98,7 @@ SELECT EXTRACT(YEAR FROM toDate('2017-06-15')); In the following example we create a table and insert into it a value with the `DateTime` type. -```sql +``` sql CREATE TABLE test.Orders ( OrderId UInt64, @@ -108,10 +108,11 @@ CREATE TABLE test.Orders ENGINE = Log; ``` -```sql +``` sql INSERT INTO test.Orders VALUES (1, 'Jarlsberg Cheese', toDateTime('2008-10-11 13:23:44')); ``` -```sql + +``` sql SELECT toYear(OrderDate) AS OrderYear, toMonth(OrderDate) AS OrderMonth, @@ -121,7 +122,8 @@ SELECT toSecond(OrderDate) AS OrderSecond FROM test.Orders; ``` -```text + +``` text ┌─OrderYear─┬─OrderMonth─┬─OrderDay─┬─OrderHour─┬─OrderMinute─┬─OrderSecond─┐ │ 2008 │ 10 │ 11 │ 13 │ 23 │ 44 │ └───────────┴────────────┴──────────┴───────────┴─────────────┴─────────────┘ @@ -144,14 +146,15 @@ Types of intervals: - `YEAR` !!! warning "Warning" - Intervals with different types can't be combined. You can't use expressions like `INTERVAL 4 DAY 1 HOUR`. Express intervals in units that are smaller or equal the the smallest unit of the interval, for example `INTERVAL 25 HOUR`. You can use consequtive operations like in the example below. + Intervals with different types can’t be combined. 
You can’t use expressions like `INTERVAL 4 DAY 1 HOUR`. Express intervals in units that are smaller or equal the the smallest unit of the interval, for example `INTERVAL 25 HOUR`. You can use consequtive operations like in the example below. Example: -```sql +``` sql SELECT now() AS current_date_time, current_date_time + INTERVAL 4 DAY + INTERVAL 3 HOUR ``` -```text + +``` text ┌───current_date_time─┬─plus(plus(now(), toIntervalDay(4)), toIntervalHour(3))─┐ │ 2019-10-23 11:16:28 │ 2019-10-27 14:16:28 │ └─────────────────────┴────────────────────────────────────────────────────────┘ @@ -159,32 +162,32 @@ SELECT now() AS current_date_time, current_date_time + INTERVAL 4 DAY + INTERVAL **See Also** -- [Interval](../data_types/special_data_types/interval.md) data type -- [toInterval](functions/type_conversion_functions.md#function-tointerval) type convertion functions +- [Interval](../data_types/special_data_types/interval.md) data type +- [toInterval](functions/type_conversion_functions.md#function-tointerval) type convertion functions -## Logical Negation Operator +## Logical Negation Operator {#logical-negation-operator} `NOT a` – The `not(a)` function. -## Logical AND Operator +## Logical AND Operator {#logical-and-operator} `a AND b` – The`and(a, b)` function. -## Logical OR Operator +## Logical OR Operator {#logical-or-operator} `a OR b` – The `or(a, b)` function. -## Conditional Operator +## Conditional Operator {#conditional-operator} `a ? b : c` – The `if(a, b, c)` function. Note: -The conditional operator calculates the values of b and c, then checks whether condition a is met, and then returns the corresponding value. If `b` or `C` is an [arrayJoin()](functions/array_join.md#functions_arrayjoin) function, each row will be replicated regardless of the "a" condition. +The conditional operator calculates the values of b and c, then checks whether condition a is met, and then returns the corresponding value. If `b` or `C` is an [arrayJoin()](functions/array_join.md#functions_arrayjoin) function, each row will be replicated regardless of the “a” condition. -## Conditional Expression {#operator_case} +## Conditional Expression {#operator-case} -```sql +``` sql CASE [x] WHEN a THEN b [WHEN ... THEN ...] @@ -198,63 +201,68 @@ If there is no `ELSE c` clause in the expression, the default value is `NULL`. The `transform` function does not work with `NULL`. -## Concatenation Operator +## Concatenation Operator {#concatenation-operator} `s1 || s2` – The `concat(s1, s2) function.` -## Lambda Creation Operator +## Lambda Creation Operator {#lambda-creation-operator} `x -> expr` – The `lambda(x, expr) function.` The following operators do not have a priority, since they are brackets: -## Array Creation Operator +## Array Creation Operator {#array-creation-operator} `[x1, ...]` – The `array(x1, ...) function.` -## Tuple Creation Operator +## Tuple Creation Operator {#tuple-creation-operator} `(x1, x2, ...)` – The `tuple(x2, x2, ...) function.` -## Associativity +## Associativity {#associativity} All binary operators have left associativity. For example, `1 + 2 + 3` is transformed to `plus(plus(1, 2), 3)`. -Sometimes this doesn't work the way you expect. For example, ` SELECT 4 > 2 > 3` will result in 0. +Sometimes this doesn’t work the way you expect. For example, `SELECT 4 > 2 > 3` will result in 0. For efficiency, the `and` and `or` functions accept any number of arguments. The corresponding chains of `AND` and `OR` operators are transformed to a single call of these functions. 
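A small illustration of this pitfall with arbitrary literals: the chained comparison is evaluated left to right, so `3 > 2` becomes `1` and `1 > 1` gives `0`, while the explicit pairwise form returns the intended result.

``` sql
SELECT 3 > 2 > 1 AS chained, (3 > 2) AND (2 > 1) AS pairwise
```

``` text
┌─chained─┬─pairwise─┐
│       0 │        1 │
└─────────┴──────────┘
```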
-## Checking for `NULL` +## Checking for `NULL` {#checking-for-null} ClickHouse supports the `IS NULL` and `IS NOT NULL` operators. ### IS NULL {#operator-is-null} -- For [Nullable](../data_types/nullable.md) type values, the `IS NULL` operator returns: - - `1`, if the value is `NULL`. - - `0` otherwise. -- For other values, the `IS NULL` operator always returns `0`. +- For [Nullable](../data_types/nullable.md) type values, the `IS NULL` operator returns: + - `1`, if the value is `NULL`. + - `0` otherwise. +- For other values, the `IS NULL` operator always returns `0`. -```sql + + +``` sql SELECT x+100 FROM t_null WHERE y IS NULL ``` -```text + +``` text ┌─plus(x, 100)─┐ │ 101 │ └──────────────┘ ``` +### IS NOT NULL {#is-not-null} -### IS NOT NULL +- For [Nullable](../data_types/nullable.md) type values, the `IS NOT NULL` operator returns: + - `0`, if the value is `NULL`. + - `1` otherwise. +- For other values, the `IS NOT NULL` operator always returns `1`. -- For [Nullable](../data_types/nullable.md) type values, the `IS NOT NULL` operator returns: - - `0`, if the value is `NULL`. - - `1` otherwise. -- For other values, the `IS NOT NULL` operator always returns `1`. + -```sql +``` sql SELECT * FROM t_null WHERE y IS NOT NULL ``` -```text + +``` text ┌─x─┬─y─┐ │ 2 │ 3 │ └───┴───┘ diff --git a/docs/en/query_language/select.md b/docs/en/query_language/select.md index 6a19444f44e..d46c0be1aaf 100644 --- a/docs/en/query_language/select.md +++ b/docs/en/query_language/select.md @@ -1,8 +1,8 @@ -# SELECT Queries Syntax +# SELECT Queries Syntax {#select-queries-syntax} `SELECT` performs data retrieval. -```sql +``` sql [WITH expr_list|(subquery)] SELECT [DISTINCT] expr_list [FROM [db.]table | (subquery) | table_function] [FINAL] @@ -25,17 +25,19 @@ All the clauses are optional, except for the required list of expressions immedi The clauses below are described in almost the same order as in the query execution conveyor. If the query omits the `DISTINCT`, `GROUP BY` and `ORDER BY` clauses and the `IN` and `JOIN` subqueries, the query will be completely stream processed, using O(1) amount of RAM. -Otherwise, the query might consume a lot of RAM if the appropriate restrictions are not specified: `max_memory_usage`, `max_rows_to_group_by`, `max_rows_to_sort`, `max_rows_in_distinct`, `max_bytes_in_distinct`, `max_rows_in_set`, `max_bytes_in_set`, `max_rows_in_join`, `max_bytes_in_join`, `max_bytes_before_external_sort`, `max_bytes_before_external_group_by`. For more information, see the section "Settings". It is possible to use external sorting (saving temporary tables to a disk) and external aggregation. `The system does not have "merge join"`. +Otherwise, the query might consume a lot of RAM if the appropriate restrictions are not specified: `max_memory_usage`, `max_rows_to_group_by`, `max_rows_to_sort`, `max_rows_in_distinct`, `max_bytes_in_distinct`, `max_rows_in_set`, `max_bytes_in_set`, `max_rows_in_join`, `max_bytes_in_join`, `max_bytes_before_external_sort`, `max_bytes_before_external_group_by`. For more information, see the section “Settings”. It is possible to use external sorting (saving temporary tables to a disk) and external aggregation. `The system does not have "merge join"`. + +### WITH Clause {#with-clause} -### WITH Clause This section provides support for Common Table Expressions ([CTE](https://en.wikipedia.org/wiki/Hierarchical_and_recursive_queries_in_SQL)), with some limitations: 1. Recursive queries are not supported -2. 
When subquery is used inside WITH section, it's result should be scalar with exactly one row -3. Expression's results are not available in subqueries +2. When subquery is used inside WITH section, it’s result should be scalar with exactly one row +3. Expression’s results are not available in subqueries Results of WITH clause expressions can be used inside SELECT clause. -Example 1: Using constant expression as "variable" -```sql +Example 1: Using constant expression as “variable” + +``` sql WITH '2019-08-01 15:23:00' as ts_upper_bound SELECT * FROM hits @@ -45,7 +47,8 @@ WHERE ``` Example 2: Evicting sum(bytes) expression result from SELECT clause column list -```sql + +``` sql WITH sum(bytes) as s SELECT formatReadableSize(s), @@ -56,7 +59,8 @@ ORDER BY s ``` Example 3: Using results of scalar subquery -```sql + +``` sql /* this example would return TOP 10 of most huge tables */ WITH ( @@ -75,7 +79,8 @@ LIMIT 10 Example 4: Re-using expression in subquery As a workaround for current limitation for expression usage in subqueries, you may duplicate it. -```sql + +``` sql WITH ['hello'] AS hello SELECT hello, @@ -86,13 +91,13 @@ FROM SELECT hello ) ``` -```text + +``` text ┌─hello─────┬─hello─────┐ │ ['hello'] │ ['hello'] │ └───────────┴───────────┘ ``` - ### FROM Clause {#select-from} If the FROM clause is omitted, data will be read from the `system.one` table. @@ -100,9 +105,9 @@ The `system.one` table contains exactly one row (this table fulfills the same pu The `FROM` clause specifies the source to read data from: -- Table -- Subquery -- [Table function](table_functions/index.md) +- Table +- Subquery +- [Table function](table_functions/index.md) `ARRAY JOIN` and the regular `JOIN` may also be included (see below). @@ -120,10 +125,10 @@ Also supported for: - [Replicated](../operations/table_engines/replication.md) versions of `MergeTree` engines. - [View](../operations/table_engines/view.md), [Buffer](../operations/table_engines/buffer.md), [Distributed](../operations/table_engines/distributed.md), and [MaterializedView](../operations/table_engines/materializedview.md) engines that operate over other engines, provided they were created over `MergeTree`-engine tables. -Queries that use `FINAL` are executed not as fast as similar queries that don't, because: +Queries that use `FINAL` are executed not as fast as similar queries that don’t, because: -- Query is executed in a single thread and data is merged during query execution. -- Queries with `FINAL` read primary key columns in addition to the columns specified in the query. +- Query is executed in a single thread and data is merged during query execution. +- Queries with `FINAL` read primary key columns in addition to the columns specified in the query. In most cases, avoid using `FINAL`. @@ -135,27 +140,26 @@ When data sampling is enabled, the query is not performed on all the data, but o Approximated query processing can be useful in the following cases: -- When you have strict timing requirements (like <100ms) but you can't justify the cost of additional hardware resources to meet them. -- When your raw data is not accurate, so approximation doesn't noticeably degrade the quality. -- Business requirements target approximate results (for cost-effectiveness, or in order to market exact results to premium users). +- When you have strict timing requirements (like \<100ms) but you can’t justify the cost of additional hardware resources to meet them. 
+- When your raw data is not accurate, so approximation doesn’t noticeably degrade the quality. +- Business requirements target approximate results (for cost-effectiveness, or in order to market exact results to premium users). -!!! note +!!! note "Note" You can only use sampling with the tables in the [MergeTree](../operations/table_engines/mergetree.md) family, and only if the sampling expression was specified during table creation (see [MergeTree engine](../operations/table_engines/mergetree.md#table_engine-mergetree-creating-a-table)). The features of data sampling are listed below: -- Data sampling is a deterministic mechanism. The result of the same `SELECT .. SAMPLE` query is always the same. -- Sampling works consistently for different tables. For tables with a single sampling key, a sample with the same coefficient always selects the same subset of possible data. For example, a sample of user IDs takes rows with the same subset of all the possible user IDs from different tables. This means that you can use the sample in subqueries in the [IN](#select-in-operators) clause. Also, you can join samples using the [JOIN](#select-join) clause. -- Sampling allows reading less data from a disk. Note that you must specify the sampling key correctly. For more information, see [Creating a MergeTree Table](../operations/table_engines/mergetree.md#table_engine-mergetree-creating-a-table). +- Data sampling is a deterministic mechanism. The result of the same `SELECT .. SAMPLE` query is always the same. +- Sampling works consistently for different tables. For tables with a single sampling key, a sample with the same coefficient always selects the same subset of possible data. For example, a sample of user IDs takes rows with the same subset of all the possible user IDs from different tables. This means that you can use the sample in subqueries in the [IN](#select-in-operators) clause. Also, you can join samples using the [JOIN](#select-join) clause. +- Sampling allows reading less data from a disk. Note that you must specify the sampling key correctly. For more information, see [Creating a MergeTree Table](../operations/table_engines/mergetree.md#table_engine-mergetree-creating-a-table). For the `SAMPLE` clause the following syntax is supported: -| SAMPLE Clause Syntax | Description | -| ---------------- | --------- | -| `SAMPLE k` | Here `k` is the number from 0 to 1.
The query is executed on `k` fraction of data. For example, `SAMPLE 0.1` runs the query on 10% of data. [Read more](#select-sample-k)| -| `SAMPLE n` | Here `n` is a sufficiently large integer.
The query is executed on a sample of at least `n` rows (but not significantly more than this). For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows. [Read more](#select-sample-n) | -| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1.
The query is executed on a sample of `k` fraction of the data. The data used for the sample is offset by `m` fraction. [Read more](#select-sample-offset) | - +| SAMPLE Clause Syntax | Description | +|----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `SAMPLE k` | Here `k` is the number from 0 to 1.
The query is executed on `k` fraction of data. For example, `SAMPLE 0.1` runs the query on 10% of data. [Read more](#select-sample-k) | +| `SAMPLE n` | Here `n` is a sufficiently large integer.
The query is executed on a sample of at least `n` rows (but not significantly more than this). For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows. [Read more](#select-sample-n) | +| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1.
The query is executed on a sample of `k` fraction of the data. The data used for the sample is offset by `m` fraction. [Read more](#select-sample-offset) | #### SAMPLE k {#select-sample-k} @@ -163,7 +167,7 @@ Here `k` is the number from 0 to 1 (both fractional and decimal notations are su In a `SAMPLE k` clause, the sample is taken from the `k` fraction of data. The example is shown below: -```sql +``` sql SELECT Title, count() * 10 AS PageViews @@ -185,33 +189,33 @@ In this case, the query is executed on a sample of at least `n` rows (but not si Since the minimum unit for data reading is one granule (its size is set by the `index_granularity` setting), it makes sense to set a sample that is much larger than the size of the granule. -When using the `SAMPLE n` clause, you don't know which relative percent of data was processed. So you don't know the coefficient the aggregate functions should be multiplied by. Use the `_sample_factor` virtual column to get the approximate result. +When using the `SAMPLE n` clause, you don’t know which relative percent of data was processed. So you don’t know the coefficient the aggregate functions should be multiplied by. Use the `_sample_factor` virtual column to get the approximate result. The `_sample_factor` column contains relative coefficients that are calculated dynamically. This column is created automatically when you [create](../operations/table_engines/mergetree.md#table_engine-mergetree-creating-a-table) a table with the specified sampling key. The usage examples of the `_sample_factor` column are shown below. -Let's consider the table `visits`, which contains the statistics about site visits. The first example shows how to calculate the number of page views: +Let’s consider the table `visits`, which contains the statistics about site visits. The first example shows how to calculate the number of page views: -```sql +``` sql SELECT sum(PageViews * _sample_factor) FROM visits SAMPLE 10000000 -``` +``` The next example shows how to calculate the total number of visits: -```sql +``` sql SELECT sum(_sample_factor) FROM visits SAMPLE 10000000 -``` +``` -The example below shows how to calculate the average session duration. Note that you don't need to use the relative coefficient to calculate the average values. +The example below shows how to calculate the average session duration. Note that you don’t need to use the relative coefficient to calculate the average values. -```sql +``` sql SELECT avg(Duration) FROM visits SAMPLE 10000000 -``` +``` #### SAMPLE k OFFSET m {#select-sample-offset} @@ -219,7 +223,7 @@ Here `k` and `m` are numbers from 0 to 1. Examples are shown below. **Example 1** -```sql +``` sql SAMPLE 1/10 ``` @@ -229,7 +233,7 @@ In this example, the sample is 1/10th of all data: **Example 2** -```sql +``` sql SAMPLE 1/10 OFFSET 1/2 ``` @@ -241,7 +245,7 @@ Here, a sample of 10% is taken from the second half of the data. Allows executing `JOIN` with an array or nested data structure. The intent is similar to the [arrayJoin](functions/array_join.md#functions_arrayjoin) function, but its functionality is broader. -```sql +``` sql SELECT FROM [LEFT] ARRAY JOIN @@ -255,12 +259,12 @@ The query execution order is optimized when running `ARRAY JOIN`. Although `ARRA Supported types of `ARRAY JOIN` are listed below: -- `ARRAY JOIN` - In this case, empty arrays are not included in the result of `JOIN`. -- `LEFT ARRAY JOIN` - The result of `JOIN` contains rows with empty arrays. 
The value for an empty array is set to the default value for the array element type (usually 0, empty string or NULL). +- `ARRAY JOIN` - In this case, empty arrays are not included in the result of `JOIN`. +- `LEFT ARRAY JOIN` - The result of `JOIN` contains rows with empty arrays. The value for an empty array is set to the default value for the array element type (usually 0, empty string or NULL). -The examples below demonstrate the usage of the `ARRAY JOIN` and `LEFT ARRAY JOIN` clauses. Let's create a table with an [Array](../data_types/array.md) type column and insert values into it: +The examples below demonstrate the usage of the `ARRAY JOIN` and `LEFT ARRAY JOIN` clauses. Let’s create a table with an [Array](../data_types/array.md) type column and insert values into it: -```sql +``` sql CREATE TABLE arrays_test ( s String, @@ -270,7 +274,8 @@ CREATE TABLE arrays_test INSERT INTO arrays_test VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []); ``` -```text + +``` text ┌─s───────────┬─arr─────┐ │ Hello │ [1,2] │ │ World │ [3,4,5] │ @@ -280,12 +285,13 @@ VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []); The example below uses the `ARRAY JOIN` clause: -```sql +``` sql SELECT s, arr FROM arrays_test ARRAY JOIN arr; ``` -```text + +``` text ┌─s─────┬─arr─┐ │ Hello │ 1 │ │ Hello │ 2 │ @@ -297,12 +303,13 @@ ARRAY JOIN arr; The next example uses the `LEFT ARRAY JOIN` clause: -```sql +``` sql SELECT s, arr FROM arrays_test LEFT ARRAY JOIN arr; ``` -```text + +``` text ┌─s───────────┬─arr─┐ │ Hello │ 1 │ │ Hello │ 2 │ @@ -313,17 +320,17 @@ LEFT ARRAY JOIN arr; └─────────────┴─────┘ ``` -#### Using Aliases +#### Using Aliases {#using-aliases} An alias can be specified for an array in the `ARRAY JOIN` clause. In this case, an array item can be accessed by this alias, but the array itself is accessed by the original name. Example: -```sql +``` sql SELECT s, arr, a FROM arrays_test ARRAY JOIN arr AS a; ``` -```text +``` text ┌─s─────┬─arr─────┬─a─┐ │ Hello │ [1,2] │ 1 │ │ Hello │ [1,2] │ 2 │ @@ -335,13 +342,13 @@ ARRAY JOIN arr AS a; Using aliases, you can perform `ARRAY JOIN` with an external array. For example: -```sql +``` sql SELECT s, arr_external FROM arrays_test ARRAY JOIN [1, 2, 3] AS arr_external; ``` -```text +``` text ┌─s───────────┬─arr_external─┐ │ Hello │ 1 │ │ Hello │ 2 │ @@ -357,13 +364,13 @@ ARRAY JOIN [1, 2, 3] AS arr_external; Multiple arrays can be comma-separated in the `ARRAY JOIN` clause. In this case, `JOIN` is performed with them simultaneously (the direct sum, not the cartesian product). Note that all the arrays must have the same size. 
Example: -```sql +``` sql SELECT s, arr, a, num, mapped FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS mapped; ``` -```text +``` text ┌─s─────┬─arr─────┬─a─┬─num─┬─mapped─┐ │ Hello │ [1,2] │ 1 │ 1 │ 2 │ │ Hello │ [1,2] │ 2 │ 2 │ 3 │ @@ -375,13 +382,13 @@ ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS ma The example below uses the [arrayEnumerate](functions/array_functions.md#array_functions-arrayenumerate) function: -```sql +``` sql SELECT s, arr, a, num, arrayEnumerate(arr) FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num; ``` -```text +``` text ┌─s─────┬─arr─────┬─a─┬─num─┬─arrayEnumerate(arr)─┐ │ Hello │ [1,2] │ 1 │ 1 │ [1,2] │ │ Hello │ [1,2] │ 2 │ 2 │ [1,2] │ @@ -391,11 +398,11 @@ ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num; └───────┴─────────┴───┴─────┴─────────────────────┘ ``` -#### ARRAY JOIN With Nested Data Structure +#### ARRAY JOIN With Nested Data Structure {#array-join-with-nested-data-structure} -`ARRAY `JOIN`` also works with [nested data structures](../data_types/nested_data_structures/nested.md). Example: +`ARRAY`JOIN\`\` also works with [nested data structures](../data_types/nested_data_structures/nested.md). Example: -```sql +``` sql CREATE TABLE nested_test ( s String, @@ -408,7 +415,7 @@ INSERT INTO nested_test VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [], []); ``` -```text +``` text ┌─s───────┬─nest.x──┬─nest.y─────┐ │ Hello │ [1,2] │ [10,20] │ │ World │ [3,4,5] │ [30,40,50] │ @@ -416,13 +423,13 @@ VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [] └─────────┴─────────┴────────────┘ ``` -```sql +``` sql SELECT s, `nest.x`, `nest.y` FROM nested_test ARRAY JOIN nest; ``` -```text +``` text ┌─s─────┬─nest.x─┬─nest.y─┐ │ Hello │ 1 │ 10 │ │ Hello │ 2 │ 20 │ @@ -434,13 +441,13 @@ ARRAY JOIN nest; When specifying names of nested data structures in `ARRAY JOIN`, the meaning is the same as `ARRAY JOIN` with all the array elements that it consists of. Examples are listed below: -```sql +``` sql SELECT s, `nest.x`, `nest.y` FROM nested_test ARRAY JOIN `nest.x`, `nest.y`; ``` -```text +``` text ┌─s─────┬─nest.x─┬─nest.y─┐ │ Hello │ 1 │ 10 │ │ Hello │ 2 │ 20 │ @@ -452,13 +459,13 @@ ARRAY JOIN `nest.x`, `nest.y`; This variation also makes sense: -```sql +``` sql SELECT s, `nest.x`, `nest.y` FROM nested_test ARRAY JOIN `nest.x`; ``` -```text +``` text ┌─s─────┬─nest.x─┬─nest.y─────┐ │ Hello │ 1 │ [10,20] │ │ Hello │ 2 │ [10,20] │ @@ -470,13 +477,13 @@ ARRAY JOIN `nest.x`; An alias may be used for a nested data structure, in order to select either the `JOIN` result or the source array. Example: -```sql +``` sql SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y` FROM nested_test ARRAY JOIN nest AS n; ``` -```text +``` text ┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┐ │ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ │ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ @@ -488,13 +495,13 @@ ARRAY JOIN nest AS n; Example of using the [arrayEnumerate](functions/array_functions.md#array_functions-arrayenumerate) function: -```sql +``` sql SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`, num FROM nested_test ARRAY JOIN nest AS n, arrayEnumerate(`nest.x`) AS num; ``` -```text +``` text ┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┬─num─┐ │ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ 1 │ │ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ 2 │ @@ -511,7 +518,7 @@ Joins the data in the normal [SQL JOIN](https://en.wikipedia.org/wiki/Join_(SQL) !!! 
info "Note" Not related to [ARRAY JOIN](#select-array-join-clause). -```sql +``` sql SELECT FROM [GLOBAL] [ANY|ALL] [INNER|LEFT|RIGHT|FULL|CROSS] [OUTER] JOIN @@ -522,15 +529,15 @@ The table names can be specified instead of `` and ``, `>=`, `<`, `<=`. Syntax `ASOF JOIN ... USING`: -```sql +``` sql SELECT expressions_list FROM table_1 ASOF JOIN table_2 @@ -590,11 +597,11 @@ USING (equi_column1, ... equi_columnN, asof_column) For example, consider the following tables: -```text +``` text table_1 table_2 event | ev_time | user_id event | ev_time | user_id -----------|---------|---------- ----------|---------|---------- +----------|---------|---------- ----------|---------|---------- ... ... event_1_1 | 12:00 | 42 event_2_1 | 11:59 | 42 ... event_2_2 | 12:30 | 42 @@ -602,15 +609,14 @@ event_1_2 | 13:00 | 42 event_2_3 | 13:00 | 42 ... ... ``` -`ASOF JOIN` can take the timestamp of a user event from `table_1` and find an event in `table_2` where the timestamp is closest to the timestamp of the event from `table_1` corresponding to the closest match condition. Equal timestamp values are the closest if available. Here, the `user_id` column can be used for joining on equality and the `ev_time` column can be used for joining on the closest match. In our example, `event_1_1` can be joined with `event_2_1` and `event_1_2` can be joined with `event_2_3`, but `event_2_2` can't be joined. - +`ASOF JOIN` can take the timestamp of a user event from `table_1` and find an event in `table_2` where the timestamp is closest to the timestamp of the event from `table_1` corresponding to the closest match condition. Equal timestamp values are the closest if available. Here, the `user_id` column can be used for joining on equality and the `ev_time` column can be used for joining on the closest match. In our example, `event_1_1` can be joined with `event_2_1` and `event_1_2` can be joined with `event_2_3`, but `event_2_2` can’t be joined. !!! note "Note" `ASOF` join is **not** supported in the [Join](../operations/table_engines/join.md) table engine. -To set the default strictness value, use the session configuration parameter [join_default_strictness](../operations/settings/settings.md#settings-join_default_strictness). +To set the default strictness value, use the session configuration parameter [join\_default\_strictness](../operations/settings/settings.md#settings-join_default_strictness). -#### GLOBAL JOIN +#### GLOBAL JOIN {#global-join} When using a normal `JOIN`, the query is sent to remote servers. Subqueries are run on each of them in order to make the right table, and the join is performed with this table. In other words, the right table is formed on each server separately. @@ -618,13 +624,13 @@ When using `GLOBAL ... JOIN`, first the requestor server runs a subquery to calc Be careful when using `GLOBAL`. For more information, see the section [Distributed subqueries](#select-distributed-subqueries). -#### Usage Recommendations +#### Usage Recommendations {#usage-recommendations} When running a `JOIN`, there is no optimization of the order of execution in relation to other stages of the query. The join (a search in the right table) is run before filtering in `WHERE` and before aggregation. In order to explicitly set the processing order, we recommend running a `JOIN` subquery with a subquery. 
Example: -```sql +``` sql SELECT CounterID, hits, @@ -648,7 +654,7 @@ ORDER BY hits DESC LIMIT 10 ``` -```text +``` text ┌─CounterID─┬───hits─┬─visits─┐ │ 1143050 │ 523264 │ 13665 │ │ 731962 │ 475698 │ 102716 │ @@ -663,46 +669,45 @@ LIMIT 10 └───────────┴────────┴────────┘ ``` -Subqueries don't allow you to set names or use them for referencing a column from a specific subquery. +Subqueries don’t allow you to set names or use them for referencing a column from a specific subquery. The columns specified in `USING` must have the same names in both subqueries, and the other columns must be named differently. You can use aliases to change the names of columns in subqueries (the example uses the aliases `hits` and `visits`). The `USING` clause specifies one or more columns to join, which establishes the equality of these columns. The list of columns is set without brackets. More complex join conditions are not supported. -The right table (the subquery result) resides in RAM. If there isn't enough memory, you can't run a `JOIN`. +The right table (the subquery result) resides in RAM. If there isn’t enough memory, you can’t run a `JOIN`. Each time a query is run with the same `JOIN`, the subquery is run again because the result is not cached. To avoid this, use the special [Join](../operations/table_engines/join.md) table engine, which is a prepared array for joining that is always in RAM. In some cases, it is more efficient to use `IN` instead of `JOIN`. Among the various types of `JOIN`, the most efficient is `ANY LEFT JOIN`, then `ANY INNER JOIN`. The least efficient are `ALL LEFT JOIN` and `ALL INNER JOIN`. -If you need a `JOIN` for joining with dimension tables (these are relatively small tables that contain dimension properties, such as names for advertising campaigns), a `JOIN` might not be very convenient due to the fact that the right table is re-accessed for every query. For such cases, there is an "external dictionaries" feature that you should use instead of `JOIN`. For more information, see the section [External dictionaries](dicts/external_dicts.md). +If you need a `JOIN` for joining with dimension tables (these are relatively small tables that contain dimension properties, such as names for advertising campaigns), a `JOIN` might not be very convenient due to the fact that the right table is re-accessed for every query. For such cases, there is an “external dictionaries” feature that you should use instead of `JOIN`. For more information, see the section [External dictionaries](dicts/external_dicts.md). **Memory Limitations** ClickHouse uses the [hash join](https://en.wikipedia.org/wiki/Hash_join) algorithm. ClickHouse takes the `` and creates a hash table for it in RAM. If you need to restrict join operation memory consumption use the following settings: -- [max_rows_in_join](../operations/settings/query_complexity.md#settings-max_rows_in_join) — Limits number of rows in the hash table. -- [max_bytes_in_join](../operations/settings/query_complexity.md#settings-max_bytes_in_join) — Limits size of the hash table. +- [max\_rows\_in\_join](../operations/settings/query_complexity.md#settings-max_rows_in_join) — Limits number of rows in the hash table. +- [max\_bytes\_in\_join](../operations/settings/query_complexity.md#settings-max_bytes_in_join) — Limits size of the hash table. -When any of these limits is reached, ClickHouse acts as the [join_overflow_mode](../operations/settings/query_complexity.md#settings-join_overflow_mode) setting instructs. 
+When any of these limits is reached, ClickHouse acts as the [join\_overflow\_mode](../operations/settings/query_complexity.md#settings-join_overflow_mode) setting instructs. -#### Processing of Empty or NULL Cells +#### Processing of Empty or NULL Cells {#processing-of-empty-or-null-cells} -While joining tables, the empty cells may appear. The setting [join_use_nulls](../operations/settings/settings.md#join_use_nulls) define how ClickHouse fills these cells. +While joining tables, the empty cells may appear. The setting [join\_use\_nulls](../operations/settings/settings.md#join_use_nulls) define how ClickHouse fills these cells. If the `JOIN` keys are [Nullable](../data_types/nullable.md) fields, the rows where at least one of the keys has the value [NULL](syntax.md#null-literal) are not joined. -#### Syntax Limitations +#### Syntax Limitations {#syntax-limitations} For multiple `JOIN` clauses in a single `SELECT` query: -- Taking all the columns via `*` is available only if tables are joined, not subqueries. -- The `PREWHERE` clause is not available. +- Taking all the columns via `*` is available only if tables are joined, not subqueries. +- The `PREWHERE` clause is not available. For `ON`, `WHERE`, and `GROUP BY` clauses: -- Arbitrary expressions cannot be used in `ON`, `WHERE`, and `GROUP BY` clauses, but you can define an expression in a `SELECT` clause and then use it in these clauses via an alias. - +- Arbitrary expressions cannot be used in `ON`, `WHERE`, and `GROUP BY` clauses, but you can define an expression in a `SELECT` clause and then use it in these clauses via an alias. ### WHERE Clause {#select-where} @@ -711,8 +716,7 @@ This expression will be used for filtering data before all other transformations If indexes are supported by the database table engine, the expression is evaluated on the ability to use indexes. - -### PREWHERE Clause +### PREWHERE Clause {#prewhere-clause} This clause has the same meaning as the WHERE clause. The difference is in which data is read from the table. When using PREWHERE, first only the columns necessary for executing PREWHERE are read. Then the other columns are read that are needed for running the query, but only those blocks where the PREWHERE expression is true. @@ -725,20 +729,20 @@ PREWHERE is only supported by tables from the `*MergeTree` family. A query may simultaneously specify PREWHERE and WHERE. In this case, PREWHERE precedes WHERE. -If the 'optimize_move_to_prewhere' setting is set to 1 and PREWHERE is omitted, the system uses heuristics to automatically move parts of expressions from WHERE to PREWHERE. +If the ‘optimize\_move\_to\_prewhere’ setting is set to 1 and PREWHERE is omitted, the system uses heuristics to automatically move parts of expressions from WHERE to PREWHERE. ### GROUP BY Clause {#select-group-by-clause} This is one of the most important parts of a column-oriented DBMS. -If there is a GROUP BY clause, it must contain a list of expressions. Each expression will be referred to here as a "key". +If there is a GROUP BY clause, it must contain a list of expressions. Each expression will be referred to here as a “key”. All the expressions in the SELECT, HAVING, and ORDER BY clauses must be calculated from keys or from aggregate functions. In other words, each column selected from the table must be used either in keys or inside aggregate functions. If a query contains only table columns inside aggregate functions, the GROUP BY clause can be omitted, and aggregation by an empty set of keys is assumed. 
Example: -```sql +``` sql SELECT count(), median(FetchTiming > 60 ? 60 : FetchTiming), @@ -746,13 +750,13 @@ SELECT FROM hits ``` -However, in contrast to standard SQL, if the table doesn't have any rows (either there aren't any at all, or there aren't any after using WHERE to filter), an empty result is returned, and not the result from one of the rows containing the initial values of aggregate functions. +However, in contrast to standard SQL, if the table doesn’t have any rows (either there aren’t any at all, or there aren’t any after using WHERE to filter), an empty result is returned, and not the result from one of the rows containing the initial values of aggregate functions. -As opposed to MySQL (and conforming to standard SQL), you can't get some value of some column that is not in a key or aggregate function (except constant expressions). To work around this, you can use the 'any' aggregate function (get the first encountered value) or 'min/max'. +As opposed to MySQL (and conforming to standard SQL), you can’t get some value of some column that is not in a key or aggregate function (except constant expressions). To work around this, you can use the ‘any’ aggregate function (get the first encountered value) or ‘min/max’. Example: -```sql +``` sql SELECT domainWithoutWWW(URL) AS domain, count(), @@ -765,17 +769,17 @@ For every different key value encountered, GROUP BY calculates a set of aggregat GROUP BY is not supported for array columns. -A constant can't be specified as arguments for aggregate functions. Example: sum(1). Instead of this, you can get rid of the constant. Example: `count()`. +A constant can’t be specified as arguments for aggregate functions. Example: sum(1). Instead of this, you can get rid of the constant. Example: `count()`. -#### NULL processing +#### NULL processing {#null-processing} For grouping, ClickHouse interprets [NULL](syntax.md) as a value, and `NULL=NULL`. -Here's an example to show what this means. +Here’s an example to show what this means. Assume you have this table: -```text +``` text ┌─x─┬────y─┐ │ 1 │ 2 │ │ 2 │ ᴺᵁᴸᴸ │ @@ -787,7 +791,7 @@ Assume you have this table: The query `SELECT sum(x), y FROM t_null_big GROUP BY y` results in: -```text +``` text ┌─sum(x)─┬────y─┐ │ 4 │ 2 │ │ 3 │ 3 │ @@ -799,24 +803,24 @@ You can see that `GROUP BY` for `y = NULL` summed up `x`, as if `NULL` is this v If you pass several keys to `GROUP BY`, the result will give you all the combinations of the selection, as if `NULL` were a specific value. -#### WITH TOTALS Modifier +#### WITH TOTALS Modifier {#with-totals-modifier} -If the WITH TOTALS modifier is specified, another row will be calculated. This row will have key columns containing default values (zeros or empty lines), and columns of aggregate functions with the values calculated across all the rows (the "total" values). +If the WITH TOTALS modifier is specified, another row will be calculated. This row will have key columns containing default values (zeros or empty lines), and columns of aggregate functions with the values calculated across all the rows (the “total” values). This extra row is output in JSON\*, TabSeparated\*, and Pretty\* formats, separately from the other rows. In the other formats, this row is not output. -In JSON\* formats, this row is output as a separate 'totals' field. In TabSeparated\* formats, the row comes after the main result, preceded by an empty row (after the other data). In Pretty\* formats, the row is output as a separate table after the main result. 
+In JSON\* formats, this row is output as a separate ‘totals’ field. In TabSeparated\* formats, the row comes after the main result, preceded by an empty row (after the other data). In Pretty\* formats, the row is output as a separate table after the main result. -`WITH TOTALS` can be run in different ways when HAVING is present. The behavior depends on the 'totals_mode' setting. -By default, `totals_mode = 'before_having'`. In this case, 'totals' is calculated across all rows, including the ones that don't pass through HAVING and 'max_rows_to_group_by'. +`WITH TOTALS` can be run in different ways when HAVING is present. The behavior depends on the ‘totals\_mode’ setting. +By default, `totals_mode = 'before_having'`. In this case, ‘totals’ is calculated across all rows, including the ones that don’t pass through HAVING and ‘max\_rows\_to\_group\_by’. -The other alternatives include only the rows that pass through HAVING in 'totals', and behave differently with the setting `max_rows_to_group_by` and `group_by_overflow_mode = 'any'`. +The other alternatives include only the rows that pass through HAVING in ‘totals’, and behave differently with the setting `max_rows_to_group_by` and `group_by_overflow_mode = 'any'`. -`after_having_exclusive` – Don't include rows that didn't pass through `max_rows_to_group_by`. In other words, 'totals' will have less than or the same number of rows as it would if `max_rows_to_group_by` were omitted. +`after_having_exclusive` – Don’t include rows that didn’t pass through `max_rows_to_group_by`. In other words, ‘totals’ will have less than or the same number of rows as it would if `max_rows_to_group_by` were omitted. -`after_having_inclusive` – Include all the rows that didn't pass through 'max_rows_to_group_by' in 'totals'. In other words, 'totals' will have more than or the same number of rows as it would if `max_rows_to_group_by` were omitted. +`after_having_inclusive` – Include all the rows that didn’t pass through ‘max\_rows\_to\_group\_by’ in ‘totals’. In other words, ‘totals’ will have more than or the same number of rows as it would if `max_rows_to_group_by` were omitted. -`after_having_auto` – Count the number of rows that passed through HAVING. If it is more than a certain amount (by default, 50%), include all the rows that didn't pass through 'max_rows_to_group_by' in 'totals'. Otherwise, do not include them. +`after_having_auto` – Count the number of rows that passed through HAVING. If it is more than a certain amount (by default, 50%), include all the rows that didn’t pass through ‘max\_rows\_to\_group\_by’ in ‘totals’. Otherwise, do not include them. `totals_auto_threshold` – By default, 0.5. The coefficient for `after_having_auto`. @@ -827,28 +831,28 @@ You can use WITH TOTALS in subqueries, including subqueries in the JOIN clause ( #### GROUP BY in External Memory {#select-group-by-in-external-memory} You can enable dumping temporary data to the disk to restrict memory usage during `GROUP BY`. -The [max_bytes_before_external_group_by](../operations/settings/settings.md#settings-max_bytes_before_external_group_by) setting determines the threshold RAM consumption for dumping `GROUP BY` temporary data to the file system. If set to 0 (the default), it is disabled. +The [max\_bytes\_before\_external\_group\_by](../operations/settings/settings.md#settings-max_bytes_before_external_group_by) setting determines the threshold RAM consumption for dumping `GROUP BY` temporary data to the file system. If set to 0 (the default), it is disabled. 
-When using `max_bytes_before_external_group_by`, we recommend that you set `max_memory_usage` about twice as high. This is necessary because there are two stages to aggregation: reading the date and forming intermediate data (1) and merging the intermediate data (2). Dumping data to the file system can only occur during stage 1. If the temporary data wasn't dumped, then stage 2 might require up to the same amount of memory as in stage 1.
+When using `max_bytes_before_external_group_by`, we recommend that you set `max_memory_usage` about twice as high. This is necessary because there are two stages to aggregation: reading the data and forming intermediate data (1) and merging the intermediate data (2). Dumping data to the file system can only occur during stage 1. If the temporary data wasn’t dumped, then stage 2 might require up to the same amount of memory as in stage 1.
-For example, if [max_memory_usage](../operations/settings/settings.md#settings_max_memory_usage) was set to 10000000000 and you want to use external aggregation, it makes sense to set `max_bytes_before_external_group_by` to 10000000000, and max_memory_usage to 20000000000. When external aggregation is triggered (if there was at least one dump of temporary data), maximum consumption of RAM is only slightly more than `max_bytes_before_external_group_by`.
+For example, if [max\_memory\_usage](../operations/settings/settings.md#settings_max_memory_usage) was set to 10000000000 and you want to use external aggregation, it makes sense to set `max_bytes_before_external_group_by` to 10000000000, and max\_memory\_usage to 20000000000. When external aggregation is triggered (if there was at least one dump of temporary data), maximum consumption of RAM is only slightly more than `max_bytes_before_external_group_by`.
With distributed query processing, external aggregation is performed on remote servers. In order for the requester server to use only a small amount of RAM, set `distributed_aggregation_memory_efficient` to 1.
When merging data flushed to the disk, as well as when merging results from remote servers when the `distributed_aggregation_memory_efficient` setting is enabled, consumes up to `1/256 * the_number_of_threads` from the total amount of RAM.
-When external aggregation is enabled, if there was less than `max_bytes_before_external_group_by` of data (i.e. data was not flushed), the query runs just as fast as without external aggregation. If any temporary data was flushed, the run time will be several times longer (approximately three times).
+When external aggregation is enabled, if there was less than `max_bytes_before_external_group_by` of data (i.e. data was not flushed), the query runs just as fast as without external aggregation. If any temporary data was flushed, the run time will be several times longer (approximately three times).
-If you have an `ORDER BY` with a `LIMIT` after `GROUP BY`, then the amount of used RAM depends on the amount of data in `LIMIT`, not in the whole table. But if the `ORDER BY` doesn't have `LIMIT`, don't forget to enable external sorting (`max_bytes_before_external_sort`).
+If you have an `ORDER BY` with a `LIMIT` after `GROUP BY`, then the amount of used RAM depends on the amount of data in `LIMIT`, not in the whole table. But if the `ORDER BY` doesn’t have `LIMIT`, don’t forget to enable external sorting (`max_bytes_before_external_sort`).
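+To recap the settings discussed in this section, a minimal sketch of enabling external aggregation for one query (the values mirror the illustration above, and the `hits` table name is reused from earlier examples; adjust both to your environment):
+
+``` sql
+SET max_bytes_before_external_group_by = 10000000000; -- spill GROUP BY state to disk after ~10 GB
+SET max_memory_usage = 20000000000;                   -- roughly twice the spill threshold, as recommended above
+
+SELECT UserID, count() FROM hits GROUP BY UserID;
+```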
-### LIMIT BY Clause +### LIMIT BY Clause {#limit-by-clause} A query with the `LIMIT n BY expressions` clause selects the first `n` rows for each distinct value of `expressions`. The key for `LIMIT BY` can contain any number of [expressions](syntax.md#syntax-expressions). ClickHouse supports the following syntax: -- `LIMIT [offset_value, ]n BY expressions` -- `LIMIT n OFFSET offset_value BY expressions` +- `LIMIT [offset_value, ]n BY expressions` +- `LIMIT n OFFSET offset_value BY expressions` During query processing, ClickHouse selects data ordered by sorting key. The sorting key is set explicitly using an [ORDER BY](#select-order-by) clause or implicitly as a property of the table engine. Then ClickHouse applies `LIMIT n BY expressions` and returns the first `n` rows for each distinct combination of `expressions`. If `OFFSET` is specified, then for each data block that belongs to a distinct combination of `expressions`, ClickHouse skips `offset_value` number of rows from the beginning of the block and returns a maximum of `n` rows as a result. If `offset_value` is bigger than the number of rows in the data block, ClickHouse returns zero rows from the block. @@ -858,17 +862,18 @@ During query processing, ClickHouse selects data ordered by sorting key. The sor Sample table: -```sql +``` sql CREATE TABLE limit_by(id Int, val Int) ENGINE = Memory; INSERT INTO limit_by values(1, 10), (1, 11), (1, 12), (2, 20), (2, 21); ``` Queries: -```sql +``` sql SELECT * FROM limit_by ORDER BY id, val LIMIT 2 BY id ``` -```text + +``` text ┌─id─┬─val─┐ │ 1 │ 10 │ │ 1 │ 11 │ @@ -876,10 +881,12 @@ SELECT * FROM limit_by ORDER BY id, val LIMIT 2 BY id │ 2 │ 21 │ └────┴─────┘ ``` -```sql + +``` sql SELECT * FROM limit_by ORDER BY id, val LIMIT 1, 2 BY id ``` -```text + +``` text ┌─id─┬─val─┐ │ 1 │ 11 │ │ 1 │ 12 │ @@ -891,7 +898,7 @@ The `SELECT * FROM limit_by ORDER BY id, val LIMIT 2 OFFSET 1 BY id` query retur The following query returns the top 5 referrers for each `domain, device_type` pair with a maximum of 100 rows in total (`LIMIT n BY + LIMIT`). -```sql +``` sql SELECT domainWithoutWWW(URL) AS domain, domainWithoutWWW(REFERRER_URL) AS referrer, @@ -904,12 +911,11 @@ LIMIT 5 BY domain, device_type LIMIT 100 ``` -### HAVING Clause +### HAVING Clause {#having-clause} Allows filtering the result received after GROUP BY, similar to the WHERE clause. WHERE and HAVING differ in that WHERE is performed before aggregation (GROUP BY), while HAVING is performed after it. -If aggregation is not performed, HAVING can't be used. - +If aggregation is not performed, HAVING can’t be used. ### ORDER BY Clause {#select-order-by} @@ -924,15 +930,15 @@ If the ORDER BY clause is omitted, the order of the rows is also undefined, and `NaN` and `NULL` sorting order: -- With the modifier `NULLS FIRST` — First `NULL`, then `NaN`, then other values. -- With the modifier `NULLS LAST` — First the values, then `NaN`, then `NULL`. -- Default — The same as with the `NULLS LAST` modifier. +- With the modifier `NULLS FIRST` — First `NULL`, then `NaN`, then other values. +- With the modifier `NULLS LAST` — First the values, then `NaN`, then `NULL`. +- Default — The same as with the `NULLS LAST` modifier. 
Example: For the table -```text +``` text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 2 │ @@ -949,7 +955,7 @@ For the table Run the query `SELECT * FROM t_null_nan ORDER BY y NULLS FIRST` to get: -```text +``` text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 7 │ ᴺᵁᴸᴸ │ @@ -968,9 +974,9 @@ When floating point numbers are sorted, NaNs are separate from the other values. Less RAM is used if a small enough LIMIT is specified in addition to ORDER BY. Otherwise, the amount of memory spent is proportional to the volume of data for sorting. For distributed query processing, if GROUP BY is omitted, sorting is partially done on remote servers, and the results are merged on the requestor server. This means that for distributed sorting, the volume of data to sort can be greater than the amount of memory on a single server. -If there is not enough RAM, it is possible to perform sorting in external memory (creating temporary files on a disk). Use the setting `max_bytes_before_external_sort` for this purpose. If it is set to 0 (the default), external sorting is disabled. If it is enabled, when the volume of data to sort reaches the specified number of bytes, the collected data is sorted and dumped into a temporary file. After all data is read, all the sorted files are merged and the results are output. Files are written to the /var/lib/clickhouse/tmp/ directory in the config (by default, but you can use the 'tmp_path' parameter to change this setting). +If there is not enough RAM, it is possible to perform sorting in external memory (creating temporary files on a disk). Use the setting `max_bytes_before_external_sort` for this purpose. If it is set to 0 (the default), external sorting is disabled. If it is enabled, when the volume of data to sort reaches the specified number of bytes, the collected data is sorted and dumped into a temporary file. After all data is read, all the sorted files are merged and the results are output. Files are written to the /var/lib/clickhouse/tmp/ directory in the config (by default, but you can use the ‘tmp\_path’ parameter to change this setting). -Running a query may use more memory than 'max_bytes_before_external_sort'. For this reason, this setting must have a value significantly smaller than 'max_memory_usage'. As an example, if your server has 128 GB of RAM and you need to run a single query, set 'max_memory_usage' to 100 GB, and 'max_bytes_before_external_sort' to 80 GB. +Running a query may use more memory than ‘max\_bytes\_before\_external\_sort’. For this reason, this setting must have a value significantly smaller than ‘max\_memory\_usage’. As an example, if your server has 128 GB of RAM and you need to run a single query, set ‘max\_memory\_usage’ to 100 GB, and ‘max\_bytes\_before\_external\_sort’ to 80 GB. External sorting works much less effectively than sorting in RAM. @@ -982,22 +988,23 @@ If you want to include all columns in the result, use the asterisk (`*`) symbol. To match some columns in the result with a [re2](https://en.wikipedia.org/wiki/RE2_(software)) regular expression, you can use the `COLUMNS` expression. -```sql +``` sql COLUMNS('regexp') ``` For example, consider the table: -```sql +``` sql CREATE TABLE default.col_names (aa Int8, ab Int8, bc Int8) ENGINE = TinyLog ``` The following query selects data from all the columns containing the `a` symbol in their name. 
-```sql +``` sql SELECT COLUMNS('a') FROM col_names ``` -```text + +``` text ┌─aa─┬─ab─┐ │ 1 │ 1 │ └────┴────┘ @@ -1009,40 +1016,41 @@ You can use multiple `COLUMNS` expressions in a query and apply functions to the For example: -```sql +``` sql SELECT COLUMNS('a'), COLUMNS('c'), toTypeName(COLUMNS('c')) FROM col_names ``` -```text + +``` text ┌─aa─┬─ab─┬─bc─┬─toTypeName(bc)─┐ │ 1 │ 1 │ 1 │ Int8 │ └────┴────┴────┴────────────────┘ ``` -Each column returned by the `COLUMNS` expression is passed to the function as a separate argument. Also you can pass other arguments to the function if it supports them. Be careful when using functions. If a function doesn't support the number of arguments you have passed to it, ClickHouse throws an exception. +Each column returned by the `COLUMNS` expression is passed to the function as a separate argument. Also you can pass other arguments to the function if it supports them. Be careful when using functions. If a function doesn’t support the number of arguments you have passed to it, ClickHouse throws an exception. For example: -```sql +``` sql SELECT COLUMNS('a') + COLUMNS('c') FROM col_names ``` -```text + +``` text Received exception from server (version 19.14.1): -Code: 42. DB::Exception: Received from localhost:9000. DB::Exception: Number of arguments for function plus doesn't match: passed 3, should be 2. +Code: 42. DB::Exception: Received from localhost:9000. DB::Exception: Number of arguments for function plus doesn't match: passed 3, should be 2. ``` -In this example, `COLUMNS('a')` returns two columns: `aa` and `ab`. `COLUMNS('c')` returns the `bc` column. The `+` operator can't apply to 3 arguments, so ClickHouse throws an exception with the relevant message. - -Columns that matched the `COLUMNS` expression can have different data types. If `COLUMNS` doesn't match any columns and is the only expression in `SELECT`, ClickHouse throws an exception. +In this example, `COLUMNS('a')` returns two columns: `aa` and `ab`. `COLUMNS('c')` returns the `bc` column. The `+` operator can’t apply to 3 arguments, so ClickHouse throws an exception with the relevant message. +Columns that matched the `COLUMNS` expression can have different data types. If `COLUMNS` doesn’t match any columns and is the only expression in `SELECT`, ClickHouse throws an exception. ### DISTINCT Clause {#select-distinct} If DISTINCT is specified, only a single row will remain out of all the sets of fully matching rows in the result. The result will be the same as if GROUP BY were specified across all the fields specified in SELECT without aggregate functions. But there are several differences from GROUP BY: -- DISTINCT can be applied together with GROUP BY. -- When ORDER BY is omitted and LIMIT is defined, the query stops running immediately after the required number of different rows has been read. -- Data blocks are output as they are processed, without waiting for the entire query to finish running. +- DISTINCT can be applied together with GROUP BY. +- When ORDER BY is omitted and LIMIT is defined, the query stops running immediately after the required number of different rows has been read. +- Data blocks are output as they are processed, without waiting for the entire query to finish running. DISTINCT is not supported if SELECT has at least one array column. 
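+As an illustration of the `GROUP BY` equivalence described above (the `hits` table and `CounterID` column are reused from earlier examples in this document), the two queries below are expected to return the same set of rows:
+
+``` sql
+SELECT DISTINCT CounterID FROM hits;
+
+SELECT CounterID FROM hits GROUP BY CounterID;
+```
+
+Without an `ORDER BY`, the row order of the two results is not guaranteed to match.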
@@ -1052,7 +1060,7 @@ ClickHouse supports using the `DISTINCT` and `ORDER BY` clauses for different co Example table: -```text +``` text ┌─a─┬─b─┐ │ 2 │ 1 │ │ 1 │ 2 │ @@ -1063,7 +1071,7 @@ Example table: When selecting data with the `SELECT DISTINCT a FROM t1 ORDER BY b ASC` query, we get the following result: -```text +``` text ┌─a─┐ │ 2 │ │ 1 │ @@ -1073,7 +1081,7 @@ When selecting data with the `SELECT DISTINCT a FROM t1 ORDER BY b ASC` query, w If we change the sorting direction `SELECT DISTINCT a FROM t1 ORDER BY b DESC`, we get the following result: -```text +``` text ┌─a─┐ │ 3 │ │ 1 │ @@ -1085,7 +1093,7 @@ Row `2, 4` was cut before sorting. Take this implementation specificity into account when programming queries. -### LIMIT Clause +### LIMIT Clause {#limit-clause} `LIMIT m` allows you to select the first `m` rows from the result. @@ -1093,13 +1101,13 @@ Take this implementation specificity into account when programming queries. `n` and `m` must be non-negative integers. -If there isn't an `ORDER BY` clause that explicitly sorts results, the result may be arbitrary and nondeterministic. +If there isn’t an `ORDER BY` clause that explicitly sorts results, the result may be arbitrary and nondeterministic. -### UNION ALL Clause +### UNION ALL Clause {#union-all-clause} You can use UNION ALL to combine any number of queries. Example: -```sql +``` sql SELECT CounterID, 1 AS table, toInt64(count()) AS c FROM test.hits GROUP BY CounterID @@ -1118,7 +1126,7 @@ Queries that are parts of UNION ALL can be run simultaneously, and their results The structure of results (the number and type of columns) must match for the queries. But the column names can differ. In this case, the column names for the final result will be taken from the first query. Type casting is performed for unions. For example, if two queries being combined have the same field with non-`Nullable` and `Nullable` types from a compatible type, the resulting `UNION ALL` has a `Nullable` type field. -Queries that are parts of UNION ALL can't be enclosed in brackets. ORDER BY and LIMIT are applied to separate queries, not to the final result. If you need to apply a conversion to the final result, you can put all the queries with UNION ALL in a subquery in the FROM clause. +Queries that are parts of UNION ALL can’t be enclosed in brackets. ORDER BY and LIMIT are applied to separate queries, not to the final result. If you need to apply a conversion to the final result, you can put all the queries with UNION ALL in a subquery in the FROM clause. ### INTO OUTFILE Clause {#into-outfile-clause} @@ -1130,14 +1138,13 @@ The default output format is TabSeparated (the same as in the command-line clien ### FORMAT Clause {#format-clause} -Specify 'FORMAT format' to get data in any specified format. +Specify ‘FORMAT format’ to get data in any specified format. You can use this for convenience, or for creating dumps. -For more information, see the section "Formats". +For more information, see the section “Formats”. If the FORMAT clause is omitted, the default format is used, which depends on both the settings and the interface used for accessing the DB. For the HTTP interface and the command-line client in batch mode, the default format is TabSeparated. For the command-line client in interactive mode, the default format is PrettyCompact (it has attractive and compact tables). When using the command-line client, data is passed to the client in an internal efficient format. 
The client independently interprets the FORMAT clause of the query and formats the data itself (thus relieving the network and the server from the load). - ### IN Operators {#select-in-operators} The `IN`, `NOT IN`, `GLOBAL IN`, and `GLOBAL NOT IN` operators are covered separately, since their functionality is quite rich. @@ -1146,25 +1153,25 @@ The left side of the operator is either a single column or a tuple. Examples: -```sql +``` sql SELECT UserID IN (123, 456) FROM ... SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... ``` If the left side is a single column that is in the index, and the right side is a set of constants, the system uses the index for processing the query. -Don't list too many values explicitly (i.e. millions). If a data set is large, put it in a temporary table (for example, see the section "External data for query processing"), then use a subquery. +Don’t list too many values explicitly (i.e. millions). If a data set is large, put it in a temporary table (for example, see the section “External data for query processing”), then use a subquery. The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (shown in the examples above), or the name of a database table or SELECT subquery in brackets. -If the right side of the operator is the name of a table (for example, `UserID IN users`), this is equivalent to the subquery `UserID IN (SELECT * FROM users)`. Use this when working with external data that is sent along with the query. For example, the query can be sent together with a set of user IDs loaded to the 'users' temporary table, which should be filtered. +If the right side of the operator is the name of a table (for example, `UserID IN users`), this is equivalent to the subquery `UserID IN (SELECT * FROM users)`. Use this when working with external data that is sent along with the query. For example, the query can be sent together with a set of user IDs loaded to the ‘users’ temporary table, which should be filtered. If the right side of the operator is a table name that has the Set engine (a prepared data set that is always in RAM), the data set will not be created over again for each query. The subquery may specify more than one column for filtering tuples. Example: -```sql +``` sql SELECT (CounterID, UserID) IN (SELECT CounterID, UserID FROM ...) FROM ... ``` @@ -1173,7 +1180,7 @@ The columns to the left and right of the IN operator should have the same type. The IN operator and subquery may occur in any part of the query, including in aggregate functions and lambda functions. Example: -```sql +``` sql SELECT EventDate, avg(UserID IN @@ -1187,7 +1194,7 @@ GROUP BY EventDate ORDER BY EventDate ASC ``` -```text +``` text ┌──EventDate─┬────ratio─┐ │ 2014-03-17 │ 1 │ │ 2014-03-18 │ 0.807696 │ @@ -1202,13 +1209,13 @@ ORDER BY EventDate ASC For each day after March 17th, count the percentage of pageviews made by users who visited the site on March 17th. A subquery in the IN clause is always run just one time on a single server. There are no dependent subqueries. -#### NULL processing +#### NULL processing {#null-processing-1} During request processing, the IN operator assumes that the result of an operation with [NULL](syntax.md) is always equal to `0`, regardless of whether `NULL` is on the right or left side of the operator. `NULL` values are not included in any dataset, do not correspond to each other and cannot be compared. 
Here is an example with the `t_null` table: -```text +``` text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -1217,31 +1224,31 @@ Here is an example with the `t_null` table: Running the query `SELECT x FROM t_null WHERE y IN (NULL,3)` gives you the following result: -```text +``` text ┌─x─┐ │ 2 │ └───┘ ``` -You can see that the row in which `y = NULL` is thrown out of the query results. This is because ClickHouse can't decide whether `NULL` is included in the `(NULL,3)` set, returns `0` as the result of the operation, and `SELECT` excludes this row from the final output. +You can see that the row in which `y = NULL` is thrown out of the query results. This is because ClickHouse can’t decide whether `NULL` is included in the `(NULL,3)` set, returns `0` as the result of the operation, and `SELECT` excludes this row from the final output. -```sql +``` sql SELECT y IN (NULL, 3) FROM t_null ``` -```text + +``` text ┌─in(y, tuple(NULL, 3))─┐ │ 0 │ │ 1 │ └───────────────────────┘ ``` - #### Distributed Subqueries {#select-distributed-subqueries} There are two options for IN-s with subqueries (similar to JOINs): normal `IN` / `JOIN` and `GLOBAL IN` / `GLOBAL JOIN`. They differ in how they are run for distributed query processing. -!!! attention +!!! attention "Attention" Remember that the algorithms described below may work differently depending on the [settings](../operations/settings/settings.md) `distributed_product_mode` setting. When using the regular IN, the query is sent to remote servers, and each of them runs the subqueries in the `IN` or `JOIN` clause. @@ -1252,77 +1259,77 @@ For a non-distributed query, use the regular `IN` / `JOIN`. Be careful when using subqueries in the `IN` / `JOIN` clauses for distributed query processing. -Let's look at some examples. Assume that each server in the cluster has a normal **local_table**. Each server also has a **distributed_table** table with the **Distributed** type, which looks at all the servers in the cluster. +Let’s look at some examples. Assume that each server in the cluster has a normal **local\_table**. Each server also has a **distributed\_table** table with the **Distributed** type, which looks at all the servers in the cluster. -For a query to the **distributed_table**, the query will be sent to all the remote servers and run on them using the **local_table**. +For a query to the **distributed\_table**, the query will be sent to all the remote servers and run on them using the **local\_table**. For example, the query -```sql +``` sql SELECT uniq(UserID) FROM distributed_table ``` will be sent to all remote servers as -```sql +``` sql SELECT uniq(UserID) FROM local_table ``` and run on each of them in parallel, until it reaches the stage where intermediate results can be combined. Then the intermediate results will be returned to the requestor server and merged on it, and the final result will be sent to the client. -Now let's examine a query with IN: +Now let’s examine a query with IN: -```sql +``` sql SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34) ``` -- Calculation of the intersection of audiences of two sites. +- Calculation of the intersection of audiences of two sites. 
This query will be sent to all remote servers as -```sql +``` sql SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34) ``` In other words, the data set in the IN clause will be collected on each server independently, only across the data that is stored locally on each of the servers. -This will work correctly and optimally if you are prepared for this case and have spread data across the cluster servers such that the data for a single UserID resides entirely on a single server. In this case, all the necessary data will be available locally on each server. Otherwise, the result will be inaccurate. We refer to this variation of the query as "local IN". +This will work correctly and optimally if you are prepared for this case and have spread data across the cluster servers such that the data for a single UserID resides entirely on a single server. In this case, all the necessary data will be available locally on each server. Otherwise, the result will be inaccurate. We refer to this variation of the query as “local IN”. -To correct how the query works when data is spread randomly across the cluster servers, you could specify **distributed_table** inside a subquery. The query would look like this: +To correct how the query works when data is spread randomly across the cluster servers, you could specify **distributed\_table** inside a subquery. The query would look like this: -```sql +``` sql SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) ``` This query will be sent to all remote servers as -```sql +``` sql SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) ``` The subquery will begin running on each remote server. Since the subquery uses a distributed table, the subquery that is on each remote server will be resent to every remote server as -```sql +``` sql SELECT UserID FROM local_table WHERE CounterID = 34 ``` For example, if you have a cluster of 100 servers, executing the entire query will require 10,000 elementary requests, which is generally considered unacceptable. -In such cases, you should always use GLOBAL IN instead of IN. Let's look at how it works for the query +In such cases, you should always use GLOBAL IN instead of IN. Let’s look at how it works for the query -```sql +``` sql SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID GLOBAL IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) ``` The requestor server will run the subquery -```sql +``` sql SELECT UserID FROM distributed_table WHERE CounterID = 34 ``` and the result will be put in a temporary table in RAM. Then the request will be sent to each remote server as -```sql +``` sql SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID GLOBAL IN _data1 ``` @@ -1330,39 +1337,39 @@ and the temporary table `_data1` will be sent to every remote server with the qu This is more optimal than using the normal IN. However, keep the following points in mind: -1. When creating a temporary table, data is not made unique. To reduce the volume of data transmitted over the network, specify DISTINCT in the subquery. (You don't need to do this for a normal IN.) -2. The temporary table will be sent to all the remote servers. Transmission does not account for network topology. 
For example, if 10 remote servers reside in a datacenter that is very remote in relation to the requestor server, the data will be sent 10 times over the channel to the remote datacenter. Try to avoid large data sets when using GLOBAL IN. -3. When transmitting data to remote servers, restrictions on network bandwidth are not configurable. You might overload the network. -4. Try to distribute data across servers so that you don't need to use GLOBAL IN on a regular basis. -5. If you need to use GLOBAL IN often, plan the location of the ClickHouse cluster so that a single group of replicas resides in no more than one data center with a fast network between them, so that a query can be processed entirely within a single data center. +1. When creating a temporary table, data is not made unique. To reduce the volume of data transmitted over the network, specify DISTINCT in the subquery. (You don’t need to do this for a normal IN.) +2. The temporary table will be sent to all the remote servers. Transmission does not account for network topology. For example, if 10 remote servers reside in a datacenter that is very remote in relation to the requestor server, the data will be sent 10 times over the channel to the remote datacenter. Try to avoid large data sets when using GLOBAL IN. +3. When transmitting data to remote servers, restrictions on network bandwidth are not configurable. You might overload the network. +4. Try to distribute data across servers so that you don’t need to use GLOBAL IN on a regular basis. +5. If you need to use GLOBAL IN often, plan the location of the ClickHouse cluster so that a single group of replicas resides in no more than one data center with a fast network between them, so that a query can be processed entirely within a single data center. It also makes sense to specify a local table in the `GLOBAL IN` clause, in case this local table is only available on the requestor server and you want to use data from it on remote servers. -### Extreme Values +### Extreme Values {#extreme-values} In addition to results, you can also get minimum and maximum values for the results columns. To do this, set the **extremes** setting to 1. Minimums and maximums are calculated for numeric types, dates, and dates with times. For other columns, the default values are output. An extra two rows are calculated – the minimums and maximums, respectively. These extra two rows are output in `JSON*`, `TabSeparated*`, and `Pretty*` [formats](../interfaces/formats.md), separate from the other rows. They are not output for other formats. -In `JSON*` formats, the extreme values are output in a separate 'extremes' field. In `TabSeparated*` formats, the row comes after the main result, and after 'totals' if present. It is preceded by an empty row (after the other data). In `Pretty*` formats, the row is output as a separate table after the main result, and after `totals` if present. +In `JSON*` formats, the extreme values are output in a separate ‘extremes’ field. In `TabSeparated*` formats, the row comes after the main result, and after ‘totals’ if present. It is preceded by an empty row (after the other data). In `Pretty*` formats, the row is output as a separate table after the main result, and after `totals` if present. Extreme values are calculated for rows before `LIMIT`, but after `LIMIT BY`. However, when using `LIMIT offset, size`, the rows before `offset` are included in `extremes`. In stream requests, the result may also include a small number of rows that passed through `LIMIT`. 
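+A minimal sketch of requesting extreme values (the `test.hits` table and `EventDate` column are reused from earlier examples; `JSON` is chosen only so that the separate `extremes` field is easy to see):
+
+``` sql
+SET extremes = 1;
+
+SELECT EventDate, count() FROM test.hits GROUP BY EventDate ORDER BY EventDate LIMIT 10 FORMAT JSON;
+```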
-### Notes +### Notes {#notes} The `GROUP BY` and `ORDER BY` clauses do not support positional arguments. This contradicts MySQL, but conforms to standard SQL. -For example, `GROUP BY 1, 2` will be interpreted as grouping by constants (i.e. aggregation of all rows into one). +For example, `GROUP BY 1, 2` will be interpreted as grouping by constants (i.e. aggregation of all rows into one). You can use synonyms (`AS` aliases) in any part of a query. You can put an asterisk in any part of a query instead of an expression. When the query is analyzed, the asterisk is expanded to a list of all table columns (excluding the `MATERIALIZED` and `ALIAS` columns). There are only a few cases when using an asterisk is justified: -- When creating a table dump. -- For tables containing just a few columns, such as system tables. -- For getting information about what columns are in a table. In this case, set `LIMIT 1`. But it is better to use the `DESC TABLE` query. -- When there is strong filtration on a small number of columns using `PREWHERE`. -- In subqueries (since columns that aren't needed for the external query are excluded from subqueries). +- When creating a table dump. +- For tables containing just a few columns, such as system tables. +- For getting information about what columns are in a table. In this case, set `LIMIT 1`. But it is better to use the `DESC TABLE` query. +- When there is strong filtration on a small number of columns using `PREWHERE`. +- In subqueries (since columns that aren’t needed for the external query are excluded from subqueries). -In all other cases, we don't recommend using the asterisk, since it only gives you the drawbacks of a columnar DBMS instead of the advantages. In other words using the asterisk is not recommended. +In all other cases, we don’t recommend using the asterisk, since it only gives you the drawbacks of a columnar DBMS instead of the advantages. In other words using the asterisk is not recommended. [Original article](https://clickhouse.tech/docs/en/query_language/select/) diff --git a/docs/en/query_language/show.md b/docs/en/query_language/show.md index f6a9cc6865b..ca3366de80d 100644 --- a/docs/en/query_language/show.md +++ b/docs/en/query_language/show.md @@ -1,25 +1,25 @@ -# SHOW Queries +# SHOW Queries {#show-queries} -## SHOW CREATE TABLE +## SHOW CREATE TABLE {#show-create-table} -```sql +``` sql SHOW CREATE [TEMPORARY] [TABLE|DICTIONARY] [db.]table [INTO OUTFILE filename] [FORMAT format] ``` -Returns a single `String`-type 'statement' column, which contains a single value – the `CREATE` query used for creating the specified object. +Returns a single `String`-type ‘statement’ column, which contains a single value – the `CREATE` query used for creating the specified object. ## SHOW DATABASES {#show-databases} -```sql +``` sql SHOW DATABASES [INTO OUTFILE filename] [FORMAT format] ``` Prints a list of all databases. This query is identical to `SELECT name FROM system.databases [INTO OUTFILE filename] [FORMAT format]`. -## SHOW PROCESSLIST +## SHOW PROCESSLIST {#show-processlist} -```sql +``` sql SHOW PROCESSLIST [INTO OUTFILE filename] [FORMAT format] ``` @@ -29,15 +29,15 @@ The `SELECT * FROM system.processes` query returns data about all the current qu Tip (execute in the console): -```bash +``` bash $ watch -n1 "clickhouse-client --query='SHOW PROCESSLIST'" ``` -## SHOW TABLES +## SHOW TABLES {#show-tables} Displays a list of tables. 
-```sql +``` sql SHOW [TEMPORARY] TABLES [{FROM | IN} ] [LIKE '' | WHERE expr] [LIMIT ] [INTO OUTFILE ] [FORMAT ] ``` @@ -45,7 +45,7 @@ If the `FROM` clause is not specified, the query returns the list of tables from You can get the same results as the `SHOW TABLES` query in the following way: -```sql +``` sql SELECT name FROM system.tables WHERE database = [AND name LIKE ] [LIMIT ] [INTO OUTFILE ] [FORMAT ] ``` @@ -53,21 +53,22 @@ SELECT name FROM system.tables WHERE database = [AND name LIKE ] [ The following query selects the first two rows from the list of tables in the `system` database, whose names contain `co`. -```sql +``` sql SHOW TABLES FROM system LIKE '%co%' LIMIT 2 ``` -```text + +``` text ┌─name───────────────────────────┐ │ aggregate_function_combinators │ │ collations │ └────────────────────────────────┘ ``` -## SHOW DICTIONARIES +## SHOW DICTIONARIES {#show-dictionaries} Displays a list of [external dictionaries](dicts/external_dicts.md). -```sql +``` sql SHOW DICTIONARIES [FROM ] [LIKE ''] [LIMIT ] [INTO OUTFILE ] [FORMAT ] ``` @@ -75,7 +76,7 @@ If the `FROM` clause is not specified, the query returns the list of dictionarie You can get the same results as the `SHOW DICTIONARIES` query in the following way: -```sql +``` sql SELECT name FROM system.dictionaries WHERE database = [AND name LIKE ] [LIMIT ] [INTO OUTFILE ] [FORMAT ] ``` @@ -83,10 +84,11 @@ SELECT name FROM system.dictionaries WHERE database = [AND name LIKE diff --git a/docs/en/query_language/system.md b/docs/en/query_language/system.md index 6bff8381f0e..cc403bb6606 100644 --- a/docs/en/query_language/system.md +++ b/docs/en/query_language/system.md @@ -1,109 +1,105 @@ -# SYSTEM Queries {#query_language-system} +# SYSTEM Queries {#query-language-system} -- [RELOAD DICTIONARIES](#query_language-system-reload-dictionaries) -- [RELOAD DICTIONARY](#query_language-system-reload-dictionary) -- [DROP DNS CACHE](#query_language-system-drop-dns-cache) -- [DROP MARK CACHE](#query_language-system-drop-mark-cache) -- [FLUSH LOGS](#query_language-system-flush_logs) -- [RELOAD CONFIG](#query_language-system-reload-config) -- [SHUTDOWN](#query_language-system-shutdown) -- [KILL](#query_language-system-kill) -- [STOP DISTRIBUTED SENDS](#query_language-system-stop-distributed-sends) -- [FLUSH DISTRIBUTED](#query_language-system-flush-distributed) -- [START DISTRIBUTED SENDS](#query_language-system-start-distributed-sends) -- [STOP MERGES](#query_language-system-stop-merges) -- [START MERGES](#query_language-system-start-merges) +- [RELOAD DICTIONARIES](#query_language-system-reload-dictionaries) +- [RELOAD DICTIONARY](#query_language-system-reload-dictionary) +- [DROP DNS CACHE](#query_language-system-drop-dns-cache) +- [DROP MARK CACHE](#query_language-system-drop-mark-cache) +- [FLUSH LOGS](#query_language-system-flush_logs) +- [RELOAD CONFIG](#query_language-system-reload-config) +- [SHUTDOWN](#query_language-system-shutdown) +- [KILL](#query_language-system-kill) +- [STOP DISTRIBUTED SENDS](#query_language-system-stop-distributed-sends) +- [FLUSH DISTRIBUTED](#query_language-system-flush-distributed) +- [START DISTRIBUTED SENDS](#query_language-system-start-distributed-sends) +- [STOP MERGES](#query_language-system-stop-merges) +- [START MERGES](#query_language-system-start-merges) -## RELOAD DICTIONARIES {#query_language-system-reload-dictionaries} +## RELOAD DICTIONARIES {#query-language-system-reload-dictionaries} Reloads all dictionaries that have been successfully loaded before. 
-By default, dictionaries are loaded lazily (see [dictionaries_lazy_load](../operations/server_settings/settings.md#server_settings-dictionaries_lazy_load)), so instead of being loaded automatically at startup, they are initialized on first access through dictGet function or SELECT from tables with ENGINE = Dictionary. The `SYSTEM RELOAD DICTIONARIES` query reloads such dictionaries (LOADED). +By default, dictionaries are loaded lazily (see [dictionaries\_lazy\_load](../operations/server_settings/settings.md#server_settings-dictionaries_lazy_load)), so instead of being loaded automatically at startup, they are initialized on first access through dictGet function or SELECT from tables with ENGINE = Dictionary. The `SYSTEM RELOAD DICTIONARIES` query reloads such dictionaries (LOADED). Always returns `Ok.` regardless of the result of the dictionary update. -## RELOAD DICTIONARY dictionary_name {#query_language-system-reload-dictionary} +## RELOAD DICTIONARY dictionary\_name {#query-language-system-reload-dictionary} -Completely reloads a dictionary `dictionary_name`, regardless of the state of the dictionary (LOADED / NOT_LOADED / FAILED). +Completely reloads a dictionary `dictionary_name`, regardless of the state of the dictionary (LOADED / NOT\_LOADED / FAILED). Always returns `Ok.` regardless of the result of updating the dictionary. The status of the dictionary can be checked by querying the `system.dictionaries` table. -```sql +``` sql SELECT name, status FROM system.dictionaries; ``` -## DROP DNS CACHE {#query_language-system-drop-dns-cache} +## DROP DNS CACHE {#query-language-system-drop-dns-cache} -Resets ClickHouse's internal DNS cache. Sometimes (for old ClickHouse versions) it is necessary to use this command when changing the infrastructure (changing the IP address of another ClickHouse server or the server used by dictionaries). +Resets ClickHouse’s internal DNS cache. Sometimes (for old ClickHouse versions) it is necessary to use this command when changing the infrastructure (changing the IP address of another ClickHouse server or the server used by dictionaries). -For more convenient (automatic) cache management, see disable_internal_dns_cache, dns_cache_update_period parameters. +For more convenient (automatic) cache management, see disable\_internal\_dns\_cache, dns\_cache\_update\_period parameters. -## DROP MARK CACHE {#query_language-system-drop-mark-cache} +## DROP MARK CACHE {#query-language-system-drop-mark-cache} Resets the mark cache. Used in development of ClickHouse and performance tests. -## FLUSH LOGS {#query_language-system-flush_logs} +## FLUSH LOGS {#query-language-system-flush-logs} -Flushes buffers of log messages to system tables (e.g. system.query_log). Allows you to not wait 7.5 seconds when debugging. +Flushes buffers of log messages to system tables (e.g. system.query\_log). Allows you to not wait 7.5 seconds when debugging. -## RELOAD CONFIG {#query_language-system-reload-config} +## RELOAD CONFIG {#query-language-system-reload-config} Reloads ClickHouse configuration. Used when configuration is stored in ZooKeeeper. 
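+For reference, a short sketch that strings together the maintenance queries described above (the `my_dictionary` name is only a placeholder for one of your dictionaries):
+
+``` sql
+SYSTEM RELOAD DICTIONARIES;
+SYSTEM RELOAD DICTIONARY my_dictionary; -- placeholder dictionary name
+SYSTEM DROP DNS CACHE;
+SYSTEM DROP MARK CACHE;
+SYSTEM FLUSH LOGS;
+SYSTEM RELOAD CONFIG;
+```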
-## SHUTDOWN {#query_language-system-shutdown} +## SHUTDOWN {#query-language-system-shutdown} Normally shuts down ClickHouse (like `service clickhouse-server stop` / `kill {$pid_clickhouse-server}`) -## KILL {#query_language-system-kill} +## KILL {#query-language-system-kill} Aborts ClickHouse process (like `kill -9 {$ pid_clickhouse-server}`) -## Managing Distributed Tables {#query_language-system-distributed} +## Managing Distributed Tables {#query-language-system-distributed} ClickHouse can manage [distributed](../operations/table_engines/distributed.md) tables. When a user inserts data into these tables, ClickHouse first creates a queue of the data that should be sent to cluster nodes, then asynchronously sends it. You can manage queue processing with the [STOP DISTRIBUTED SENDS](#query_language-system-stop-distributed-sends), [FLUSH DISTRIBUTED](#query_language-system-flush-distributed), and [START DISTRIBUTED SENDS](#query_language-system-start-distributed-sends) queries. You can also synchronously insert distributed data with the `insert_distributed_sync` setting. - -### STOP DISTRIBUTED SENDS {#query_language-system-stop-distributed-sends} +### STOP DISTRIBUTED SENDS {#query-language-system-stop-distributed-sends} Disables background data distribution when inserting data into distributed tables. -```sql +``` sql SYSTEM STOP DISTRIBUTED SENDS [db.] ``` - -### FLUSH DISTRIBUTED {#query_language-system-flush-distributed} +### FLUSH DISTRIBUTED {#query-language-system-flush-distributed} Forces ClickHouse to send data to cluster nodes synchronously. If any nodes are unavailable, ClickHouse throws an exception and stops query execution. You can retry the query until it succeeds, which will happen when all nodes are back online. -```sql +``` sql SYSTEM FLUSH DISTRIBUTED [db.] ``` - -### START DISTRIBUTED SENDS {#query_language-system-start-distributed-sends} +### START DISTRIBUTED SENDS {#query-language-system-start-distributed-sends} Enables background data distribution when inserting data into distributed tables. -```sql +``` sql SYSTEM START DISTRIBUTED SENDS [db.] ``` - -### STOP MERGES {#query_language-system-stop-merges} +### STOP MERGES {#query-language-system-stop-merges} Provides possibility to stop background merges for tables in the MergeTree family: -```sql +``` sql SYSTEM STOP MERGES [[db.]merge_tree_family_table_name] ``` + !!! note "Note" `DETACH / ATTACH` table will start background merges for the table even in case when merges have been stopped for all MergeTree tables before. - -### START MERGES {#query_language-system-start-merges} +### START MERGES {#query-language-system-start-merges} Provides possibility to start background merges for tables in the MergeTree family: -```sql +``` sql SYSTEM START MERGES [[db.]merge_tree_family_table_name] ``` diff --git a/docs/en/query_language/table_functions/file.md b/docs/en/query_language/table_functions/file.md index 48a7907ac80..12d30bdbdef 100644 --- a/docs/en/query_language/table_functions/file.md +++ b/docs/en/query_language/table_functions/file.md @@ -1,17 +1,16 @@ - -# file +# file {#file} Creates a table from a file. This table function is similar to [url](url.md) and [hdfs](hdfs.md) ones. -```sql +``` sql file(path, format, structure) ``` **Input parameters** -- `path` — The relative path to the file from [user_files_path](../../operations/server_settings/settings.md#server_settings-user_files_path). 
Path to file support following globs in readonly mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, ``'abc', 'def'` — strings. -- `format` — The [format](../../interfaces/formats.md#formats) of the file. -- `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. +- `path` — The relative path to the file from [user\_files\_path](../../operations/server_settings/settings.md#server_settings-user_files_path). Path to file support following globs in readonly mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, \``'abc', 'def'` — strings. +- `format` — The [format](../../interfaces/formats.md#formats) of the file. +- `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. **Returned value** @@ -21,7 +20,7 @@ A table with the specified structure for reading or writing data in the specifie Setting `user_files_path` and the contents of the file `test.csv`: -```bash +``` bash $ grep user_files_path /etc/clickhouse-server/config.xml /var/lib/clickhouse/user_files/ @@ -33,20 +32,20 @@ $ cat /var/lib/clickhouse/user_files/test.csv Table from`test.csv` and selection of the first two rows from it: -```sql +``` sql SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 2 ``` -```text +``` text ┌─column1─┬─column2─┬─column3─┐ │ 1 │ 2 │ 3 │ │ 3 │ 2 │ 1 │ └─────────┴─────────┴─────────┘ ``` -```sql +``` sql -- getting the first 10 lines of a table that contains 3 columns of UInt32 type from a CSV file SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 10 ``` @@ -55,56 +54,61 @@ SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 U Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern (not only suffix or prefix). -- `*` — Substitutes any number of any characters except `/` including empty string. -- `?` — Substitutes any single character. -- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. -- `{N..M}` — Substitutes any number in range from N to M including both borders. +- `*` — Substitutes any number of any characters except `/` including empty string. +- `?` — Substitutes any single character. +- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Substitutes any number in range from N to M including both borders. - Constructions with `{}` are similar to the [remote table function](../../query_language/table_functions/remote.md)). +Constructions with `{}` are similar to the [remote table function](../../query_language/table_functions/remote.md)). **Example** -1. Suppose we have several files with the following relative paths: +1. Suppose we have several files with the following relative paths: -- 'some_dir/some_file_1' -- 'some_dir/some_file_2' -- 'some_dir/some_file_3' -- 'another_dir/some_file_1' -- 'another_dir/some_file_2' -- 'another_dir/some_file_3' +- ‘some\_dir/some\_file\_1’ +- ‘some\_dir/some\_file\_2’ +- ‘some\_dir/some\_file\_3’ +- ‘another\_dir/some\_file\_1’ +- ‘another\_dir/some\_file\_2’ +- ‘another\_dir/some\_file\_3’ -2. Query the amount of rows in these files: +1. 
Query the amount of rows in these files: -```sql + + +``` sql SELECT count(*) FROM file('{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32') ``` -3. Query the amount of rows in all files of these two directories: +1. Query the amount of rows in all files of these two directories: -```sql + + +``` sql SELECT count(*) FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32') ``` -!!! warning + +!!! warning "Warning" If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. **Example** -Query the data from files named `file000`, `file001`, ... , `file999`: +Query the data from files named `file000`, `file001`, … , `file999`: -```sql +``` sql SELECT count(*) FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32') ``` -## Virtual Columns +## Virtual Columns {#virtual-columns} -- `_path` — Path to the file. -- `_file` — Name of the file. +- `_path` — Path to the file. +- `_file` — Name of the file. **See Also** -- [Virtual columns](https://clickhouse.tech/docs/en/operations/table_engines/#table_engines-virtual_columns) +- [Virtual columns](https://clickhouse.tech/docs/en/operations/table_engines/#table_engines-virtual_columns) [Original article](https://clickhouse.tech/docs/en/query_language/table_functions/file/) diff --git a/docs/en/query_language/table_functions/generate.md b/docs/en/query_language/table_functions/generate.md index cca435557ac..aef08495353 100644 --- a/docs/en/query_language/table_functions/generate.md +++ b/docs/en/query_language/table_functions/generate.md @@ -1,33 +1,33 @@ -# generateRandom +# generateRandom {#generaterandom} Generates random data with given schema. Allows to populate test tables with data. Supports all data types that can be stored in table except `LowCardinality` and `AggregateFunction`. -```sql +``` sql generateRandom('name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_string_length'[, 'max_array_length']]]); ``` **Parameters** -- `name` — Name of corresponding column. -- `TypeName` — Type of corresponding column. -- `limit` — Number of rows to generate. -- `max_array_length` — Maximum array length for all generated arrays. Defaults to `10`. -- `max_string_length` — Maximum string length for all generated strings. Defaults to `10`. -- `random_seed` — Specify random seed manually to produce stable results. If NULL — seed is randomly generated. +- `name` — Name of corresponding column. +- `TypeName` — Type of corresponding column. +- `limit` — Number of rows to generate. +- `max_array_length` — Maximum array length for all generated arrays. Defaults to `10`. +- `max_string_length` — Maximum string length for all generated strings. Defaults to `10`. +- `random_seed` — Specify random seed manually to produce stable results. If NULL — seed is randomly generated. **Returned Value** A table object with requested schema. 
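Because the stated purpose of `generateRandom` is populating test tables, a common pattern is to insert the generated rows into an existing table. A minimal sketch, assuming a hypothetical target table `test_data` whose columns match the requested schema:

``` sql
-- Hypothetical target table with a schema matching the generator call below.
CREATE TABLE test_data (a Array(Int8), d Decimal32(4)) ENGINE = MergeTree() ORDER BY tuple();

-- Insert 100 rows of random data (seed 1, max string length 10, max array length 2).
INSERT INTO test_data
SELECT * FROM generateRandom('a Array(Int8), d Decimal32(4)', 1, 10, 2)
LIMIT 100;
```

The `LIMIT` bounds how many rows are read from the generator.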
-## Usage Example +## Usage Example {#usage-example} - -```sql +``` sql SELECT * FROM generateRandom('a Array(Int8), d Decimal32(4), c Tuple(DateTime64(3), UUID)', 1, 10, 2); ``` -```text + +``` text ┌─a────────┬────────────d─┬─c──────────────────────────────────────────────────────────────────┐ │ [77] │ -124167.6723 │ ('2061-04-17 21:59:44.573','3f72f405-ec3e-13c8-44ca-66ef335f7835') │ │ [32,110] │ -141397.7312 │ ('1979-02-09 03:43:48.526','982486d1-5a5d-a308-e525-7bd8b80ffa73') │ diff --git a/docs/en/query_language/table_functions/hdfs.md b/docs/en/query_language/table_functions/hdfs.md index 6acaf9842ac..ca80d0e759c 100644 --- a/docs/en/query_language/table_functions/hdfs.md +++ b/docs/en/query_language/table_functions/hdfs.md @@ -1,17 +1,16 @@ - -# hdfs +# hdfs {#hdfs} Creates a table from files in HDFS. This table function is similar to [url](url.md) and [file](file.md) ones. -```sql +``` sql hdfs(URI, format, structure) ``` **Input parameters** -- `URI` — The relative URI to the file in HDFS. Path to file support following globs in readonly mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, ``'abc', 'def'` — strings. -- `format` — The [format](../../interfaces/formats.md#formats) of the file. -- `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. +- `URI` — The relative URI to the file in HDFS. Path to file support following globs in readonly mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, \``'abc', 'def'` — strings. +- `format` — The [format](../../interfaces/formats.md#formats) of the file. +- `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. **Returned value** @@ -21,13 +20,13 @@ A table with the specified structure for reading or writing data in the specifie Table from `hdfs://hdfs1:9000/test` and selection of the first two rows from it: -```sql +``` sql SELECT * FROM hdfs('hdfs://hdfs1:9000/test', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 2 ``` -```text +``` text ┌─column1─┬─column2─┬─column3─┐ │ 1 │ 2 │ 3 │ │ 3 │ 2 │ 1 │ @@ -38,58 +37,61 @@ LIMIT 2 Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern (not only suffix or prefix). -- `*` — Substitutes any number of any characters except `/` including empty string. -- `?` — Substitutes any single character. -- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. -- `{N..M}` — Substitutes any number in range from N to M including both borders. +- `*` — Substitutes any number of any characters except `/` including empty string. +- `?` — Substitutes any single character. +- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Substitutes any number in range from N to M including both borders. - - Constructions with `{}` are similar to the [remote table function](../../query_language/table_functions/remote.md)). +Constructions with `{}` are similar to the [remote table function](../../query_language/table_functions/remote.md)). **Example** -1. Suppose that we have several files with following URIs on HDFS: +1. 
Suppose that we have several files with following URIs on HDFS: -- 'hdfs://hdfs1:9000/some_dir/some_file_1' -- 'hdfs://hdfs1:9000/some_dir/some_file_2' -- 'hdfs://hdfs1:9000/some_dir/some_file_3' -- 'hdfs://hdfs1:9000/another_dir/some_file_1' -- 'hdfs://hdfs1:9000/another_dir/some_file_2' -- 'hdfs://hdfs1:9000/another_dir/some_file_3' +- ‘hdfs://hdfs1:9000/some\_dir/some\_file\_1’ +- ‘hdfs://hdfs1:9000/some\_dir/some\_file\_2’ +- ‘hdfs://hdfs1:9000/some\_dir/some\_file\_3’ +- ‘hdfs://hdfs1:9000/another\_dir/some\_file\_1’ +- ‘hdfs://hdfs1:9000/another\_dir/some\_file\_2’ +- ‘hdfs://hdfs1:9000/another\_dir/some\_file\_3’ -2. Query the amount of rows in these files: +1. Query the amount of rows in these files: -```sql + + +``` sql SELECT count(*) FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32') ``` -3. Query the amount of rows in all files of these two directories: +1. Query the amount of rows in all files of these two directories: -```sql + + +``` sql SELECT count(*) FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV', 'name String, value UInt32') ``` -!!! warning +!!! warning "Warning" If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. **Example** -Query the data from files named `file000`, `file001`, ... , `file999`: +Query the data from files named `file000`, `file001`, … , `file999`: -```sql +``` sql SELECT count(*) FROM hdfs('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32') ``` -## Virtual Columns +## Virtual Columns {#virtual-columns} -- `_path` — Path to the file. -- `_file` — Name of the file. +- `_path` — Path to the file. +- `_file` — Name of the file. **See Also** -- [Virtual columns](https://clickhouse.tech/docs/en/operations/table_engines/#table_engines-virtual_columns) +- [Virtual columns](https://clickhouse.tech/docs/en/operations/table_engines/#table_engines-virtual_columns) [Original article](https://clickhouse.tech/docs/en/query_language/table_functions/hdfs/) diff --git a/docs/en/query_language/table_functions/index.md b/docs/en/query_language/table_functions/index.md index 936bb977721..ea07d5a5b00 100644 --- a/docs/en/query_language/table_functions/index.md +++ b/docs/en/query_language/table_functions/index.md @@ -1,31 +1,30 @@ -# Table Functions +# Table Functions {#table-functions} Table functions are methods for constructing tables. You can use table functions in: -* [FROM](../select.md#select-from) clause of the `SELECT` query. +- [FROM](../select.md#select-from) clause of the `SELECT` query. - The method for creating a temporary table that is available only in the current query. The table is deleted when the query finishes. + The method for creating a temporary table that is available only in the current query. The table is deleted when the query finishes. -* [CREATE TABLE AS ](../create.md#create-table-query) query. - - It's one of the methods of creating a table. +- [CREATE TABLE AS \](../create.md#create-table-query) query. + It's one of the methods of creating a table. !!! warning "Warning" - You can't use table functions if the [allow_ddl](../../operations/settings/permissions_for_queries.md#settings_allow_ddl) setting is disabled. + You can’t use table functions if the [allow\_ddl](../../operations/settings/permissions_for_queries.md#settings_allow_ddl) setting is disabled. 
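As a quick illustration of the first usage above (a table function in the `FROM` clause of `SELECT`), the query below builds a throwaway table of integers with the `numbers` function listed in the table that follows:

``` sql
-- A table function used directly in the FROM clause; the result exists only for this query.
SELECT number * 2 AS even FROM numbers(5);
```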
-Function | Description ----------|------------ -[file](file.md) | Creates a [File](../../operations/table_engines/file.md)-engine table. -[merge](merge.md) | Creates a [Merge](../../operations/table_engines/merge.md)-engine table. -[numbers](numbers.md) | Creates a table with a single column filled with integer numbers. -[remote](remote.md) | Allows you to access remote servers without creating a [Distributed](../../operations/table_engines/distributed.md)-engine table. -[url](url.md) | Creates a [Url](../../operations/table_engines/url.md)-engine table. -[mysql](mysql.md) | Creates a [MySQL](../../operations/table_engines/mysql.md)-engine table. -[jdbc](jdbc.md) | Creates a [JDBC](../../operations/table_engines/jdbc.md)-engine table. -[odbc](odbc.md) | Creates a [ODBC](../../operations/table_engines/odbc.md)-engine table. -[hdfs](hdfs.md) | Creates a [HDFS](../../operations/table_engines/hdfs.md)-engine table. +| Function | Description | +|-----------------------|-----------------------------------------------------------------------------------------------------------------------------------| +| [file](file.md) | Creates a [File](../../operations/table_engines/file.md)-engine table. | +| [merge](merge.md) | Creates a [Merge](../../operations/table_engines/merge.md)-engine table. | +| [numbers](numbers.md) | Creates a table with a single column filled with integer numbers. | +| [remote](remote.md) | Allows you to access remote servers without creating a [Distributed](../../operations/table_engines/distributed.md)-engine table. | +| [url](url.md) | Creates a [Url](../../operations/table_engines/url.md)-engine table. | +| [mysql](mysql.md) | Creates a [MySQL](../../operations/table_engines/mysql.md)-engine table. | +| [jdbc](jdbc.md) | Creates a [JDBC](../../operations/table_engines/jdbc.md)-engine table. | +| [odbc](odbc.md) | Creates a [ODBC](../../operations/table_engines/odbc.md)-engine table. | +| [hdfs](hdfs.md) | Creates a [HDFS](../../operations/table_engines/hdfs.md)-engine table. | [Original article](https://clickhouse.tech/docs/en/query_language/table_functions/) diff --git a/docs/en/query_language/table_functions/input.md b/docs/en/query_language/table_functions/input.md index f1cca3bbc50..f79a6f96d76 100644 --- a/docs/en/query_language/table_functions/input.md +++ b/docs/en/query_language/table_functions/input.md @@ -1,4 +1,4 @@ -# input +# input {#input} `input(structure)` - table function that allows effectively convert and insert data sent to the server with given structure to the table with another structure. @@ -18,16 +18,21 @@ with all transferred data is not created. **Examples** -- Let the `test` table has the following structure `(a String, b String)` -and data in `data.csv` has a different structure `(col1 String, col2 Date, col3 Int32)`. Query for insert -data from the `data.csv` into the `test` table with simultaneous conversion looks like this: +- Let the `test` table has the following structure `(a String, b String)` + and data in `data.csv` has a different structure `(col1 String, col2 Date, col3 Int32)`. 
Query for insert + data from the `data.csv` into the `test` table with simultaneous conversion looks like this: -```bash + + +``` bash $ cat data.csv | clickhouse-client --query="INSERT INTO test SELECT lower(col1), col3 * col3 FROM input('col1 String, col2 Date, col3 Int32') FORMAT CSV"; ``` -- If `data.csv` contains data of the same structure `test_structure` as the table `test` then these two queries are equal: -```bash +- If `data.csv` contains data of the same structure `test_structure` as the table `test` then these two queries are equal: + + + +``` bash $ cat data.csv | clickhouse-client --query="INSERT INTO test FORMAT CSV" $ cat data.csv | clickhouse-client --query="INSERT INTO test SELECT * FROM input('test_structure') FORMAT CSV" ``` diff --git a/docs/en/query_language/table_functions/jdbc.md b/docs/en/query_language/table_functions/jdbc.md index 97f3b5af666..a98fb1825e7 100644 --- a/docs/en/query_language/table_functions/jdbc.md +++ b/docs/en/query_language/table_functions/jdbc.md @@ -1,22 +1,21 @@ -# jdbc {#table_function-jdbc} +# jdbc {#table-function-jdbc} `jdbc(jdbc_connection_uri, schema, table)` - returns table that is connected via JDBC driver. This table function requires separate `clickhouse-jdbc-bridge` program to be running. It supports Nullable types (based on DDL of remote table that is queried). - **Examples** -```sql +``` sql SELECT * FROM jdbc('jdbc:mysql://localhost:3306/?user=root&password=root', 'schema', 'table') ``` -```sql +``` sql SELECT * FROM jdbc('mysql://localhost:3306/?user=root&password=root', 'schema', 'table') ``` -```sql +``` sql SELECT * FROM jdbc('datasource://mysql-local', 'schema', 'table') ``` diff --git a/docs/en/query_language/table_functions/merge.md b/docs/en/query_language/table_functions/merge.md index beb12737e53..2cf87c40da0 100644 --- a/docs/en/query_language/table_functions/merge.md +++ b/docs/en/query_language/table_functions/merge.md @@ -1,8 +1,7 @@ -# merge +# merge {#merge} -`merge(db_name, 'tables_regexp')` – Creates a temporary Merge table. For more information, see the section "Table engines, Merge". +`merge(db_name, 'tables_regexp')` – Creates a temporary Merge table. For more information, see the section “Table engines, Merge”. The table structure is taken from the first table encountered that matches the regular expression. - [Original article](https://clickhouse.tech/docs/en/query_language/table_functions/merge/) diff --git a/docs/en/query_language/table_functions/mysql.md b/docs/en/query_language/table_functions/mysql.md index 8a80c1167f1..ae2590e9813 100644 --- a/docs/en/query_language/table_functions/mysql.md +++ b/docs/en/query_language/table_functions/mysql.md @@ -1,26 +1,32 @@ -# mysql +# mysql {#mysql} Allows `SELECT` queries to be performed on data that is stored on a remote MySQL server. -```sql +``` sql mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']); ``` **Parameters** -- `host:port` — MySQL server address. -- `database` — Remote database name. -- `table` — Remote table name. -- `user` — MySQL user. -- `password` — User password. -- `replace_query` — Flag that converts `INSERT INTO` queries to `REPLACE INTO`. If `replace_query=1`, the query is replaced. -- `on_duplicate_clause` — The `ON DUPLICATE KEY on_duplicate_clause` expression that is added to the `INSERT` query. +- `host:port` — MySQL server address. - Example: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, where `on_duplicate_clause` is `UPDATE c2 = c2 + 1`. 
See the MySQL documentation to find which `on_duplicate_clause` you can use with the `ON DUPLICATE KEY` clause. +- `database` — Remote database name. - To specify `on_duplicate_clause` you need to pass `0` to the `replace_query` parameter. If you simultaneously pass `replace_query = 1` and `on_duplicate_clause`, ClickHouse generates an exception. +- `table` — Remote table name. -Simple `WHERE` clauses such as ` =, !=, >, >=, <, <=` are currently executed on the MySQL server. +- `user` — MySQL user. + +- `password` — User password. + +- `replace_query` — Flag that converts `INSERT INTO` queries to `REPLACE INTO`. If `replace_query=1`, the query is replaced. + +- `on_duplicate_clause` — The `ON DUPLICATE KEY on_duplicate_clause` expression that is added to the `INSERT` query. + + Example: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, where `on_duplicate_clause` is `UPDATE c2 = c2 + 1`. See the MySQL documentation to find which `on_duplicate_clause` you can use with the `ON DUPLICATE KEY` clause. + + To specify `on_duplicate_clause` you need to pass `0` to the `replace_query` parameter. If you simultaneously pass `replace_query = 1` and `on_duplicate_clause`, ClickHouse generates an exception. + +Simple `WHERE` clauses such as `=, !=, >, >=, <, <=` are currently executed on the MySQL server. The rest of the conditions and the `LIMIT` sampling constraint are executed in ClickHouse only after the query to MySQL finishes. @@ -28,11 +34,11 @@ The rest of the conditions and the `LIMIT` sampling constraint are executed in C A table object with the same columns as the original MySQL table. -## Usage Example +## Usage Example {#usage-example} Table in MySQL: -```text +``` text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, @@ -55,18 +61,19 @@ mysql> select * from test; Selecting data from ClickHouse: -```sql +``` sql SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123') ``` -```text + +``` text ┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ │ 1 │ ᴺᵁᴸᴸ │ 2 │ ᴺᵁᴸᴸ │ └────────┴──────────────┴───────┴────────────────┘ ``` -## See Also +## See Also {#see-also} -- [The 'MySQL' table engine](../../operations/table_engines/mysql.md) -- [Using MySQL as a source of external dictionary](../dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-mysql) +- [The ‘MySQL’ table engine](../../operations/table_engines/mysql.md) +- [Using MySQL as a source of external dictionary](../dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-mysql) [Original article](https://clickhouse.tech/docs/en/query_language/table_functions/mysql/) diff --git a/docs/en/query_language/table_functions/numbers.md b/docs/en/query_language/table_functions/numbers.md index 112b4bf474a..4f3fe1218a5 100644 --- a/docs/en/query_language/table_functions/numbers.md +++ b/docs/en/query_language/table_functions/numbers.md @@ -1,13 +1,13 @@ -# numbers +# numbers {#numbers} -`numbers(N)` – Returns a table with the single 'number' column (UInt64) that contains integers from 0 to N-1. -`numbers(N, M)` - Returns a table with the single 'number' column (UInt64) that contains integers from N to (N + M - 1). +`numbers(N)` – Returns a table with the single ‘number’ column (UInt64) that contains integers from 0 to N-1. +`numbers(N, M)` - Returns a table with the single ‘number’ column (UInt64) that contains integers from N to (N + M - 1). 
Similar to the `system.numbers` table, it can be used for testing and generating successive values, `numbers(N, M)` more efficient than `system.numbers`. The following queries are equivalent: -```sql +``` sql SELECT * FROM numbers(10); SELECT * FROM numbers(0, 10); SELECT * FROM system.numbers LIMIT 10; @@ -15,10 +15,9 @@ SELECT * FROM system.numbers LIMIT 10; Examples: -```sql +``` sql -- Generate a sequence of dates from 2010-01-01 to 2010-12-31 select toDate('2010-01-01') + number as d FROM numbers(365); ``` - [Original article](https://clickhouse.tech/docs/en/query_language/table_functions/numbers/) diff --git a/docs/en/query_language/table_functions/odbc.md b/docs/en/query_language/table_functions/odbc.md index d9115557f1e..26112814fd3 100644 --- a/docs/en/query_language/table_functions/odbc.md +++ b/docs/en/query_language/table_functions/odbc.md @@ -1,22 +1,22 @@ -# odbc {#table_functions-odbc} +# odbc {#table-functions-odbc} Returns table that is connected via [ODBC](https://en.wikipedia.org/wiki/Open_Database_Connectivity). -```sql +``` sql odbc(connection_settings, external_database, external_table) ``` Parameters: -- `connection_settings` — Name of the section with connection settings in the `odbc.ini` file. -- `external_database` — Name of a database in an external DBMS. -- `external_table` — Name of a table in the `external_database`. +- `connection_settings` — Name of the section with connection settings in the `odbc.ini` file. +- `external_database` — Name of a database in an external DBMS. +- `external_table` — Name of a table in the `external_database`. To safely implement ODBC connections, ClickHouse uses a separate program `clickhouse-odbc-bridge`. If the ODBC driver is loaded directly from `clickhouse-server`, driver problems can crash the ClickHouse server. ClickHouse automatically starts `clickhouse-odbc-bridge` when it is required. The ODBC bridge program is installed from the same package as the `clickhouse-server`. The fields with the `NULL` values from the external table are converted into the default values for the base data type. For example, if a remote MySQL table field has the `INT NULL` type it is converted to 0 (the default value for ClickHouse `Int32` data type). -## Usage example +## Usage example {#usage-example} **Getting data from the local MySQL installation via ODBC** @@ -26,17 +26,18 @@ Ensure that unixODBC and MySQL Connector are installed. By default (if installed from packages), ClickHouse starts as user `clickhouse`. Thus you need to create and configure this user in the MySQL server. -```bash +``` bash $ sudo mysql ``` -```sql + +``` sql mysql> CREATE USER 'clickhouse'@'localhost' IDENTIFIED BY 'clickhouse'; mysql> GRANT ALL PRIVILEGES ON *.* TO 'clickhouse'@'clickhouse' WITH GRANT OPTION; ``` Then configure the connection in `/etc/odbc.ini`. -```bash +``` bash $ cat /etc/odbc.ini [mysqlconn] DRIVER = /usr/local/lib/libmyodbc5w.so @@ -49,7 +50,7 @@ PASSWORD = clickhouse You can check the connection using the `isql` utility from the unixODBC installation. -```bash +``` bash $ isql -v mysqlconn +---------------------------------------+ | Connected! 
| @@ -59,7 +60,7 @@ $ isql -v mysqlconn Table in MySQL: -```text +``` text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, @@ -82,18 +83,19 @@ mysql> select * from test; Retrieving data from the MySQL table in ClickHouse: -```sql +``` sql SELECT * FROM odbc('DSN=mysqlconn', 'test', 'test') ``` -```text + +``` text ┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ │ 1 │ 0 │ 2 │ 0 │ └────────┴──────────────┴───────┴────────────────┘ ``` -## See Also +## See Also {#see-also} -- [ODBC external dictionaries](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-odbc) -- [ODBC table engine](../../operations/table_engines/odbc.md). +- [ODBC external dictionaries](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-odbc) +- [ODBC table engine](../../operations/table_engines/odbc.md). [Original article](https://clickhouse.tech/docs/en/query_language/table_functions/jdbc/) diff --git a/docs/en/query_language/table_functions/remote.md b/docs/en/query_language/table_functions/remote.md index 79b5a93eb68..3d24229bca8 100644 --- a/docs/en/query_language/table_functions/remote.md +++ b/docs/en/query_language/table_functions/remote.md @@ -1,23 +1,22 @@ - -# remote, remoteSecure +# remote, remoteSecure {#remote-remotesecure} Allows you to access remote servers without creating a `Distributed` table. Signatures: -```sql +``` sql remote('addresses_expr', db, table[, 'user'[, 'password']]) remote('addresses_expr', db.table[, 'user'[, 'password']]) ``` -`addresses_expr` – An expression that generates addresses of remote servers. This may be just one server address. The server address is `host:port`, or just `host`. The host can be specified as the server name, or as the IPv4 or IPv6 address. An IPv6 address is specified in square brackets. The port is the TCP port on the remote server. If the port is omitted, it uses `tcp_port` from the server's config file (by default, 9000). +`addresses_expr` – An expression that generates addresses of remote servers. This may be just one server address. The server address is `host:port`, or just `host`. The host can be specified as the server name, or as the IPv4 or IPv6 address. An IPv6 address is specified in square brackets. The port is the TCP port on the remote server. If the port is omitted, it uses `tcp_port` from the server’s config file (by default, 9000). -!!! important +!!! important "Important" The port is required for an IPv6 address. Examples: -```text +``` text example01-01-1 example01-01-1:9000 localhost @@ -30,29 +29,29 @@ Multiple addresses can be comma-separated. In this case, ClickHouse will use dis Example: -```text +``` text example01-01-1,example01-02-1 ``` Part of the expression can be specified in curly brackets. The previous example can be written as follows: -```text +``` text example01-0{1,2}-1 ``` Curly brackets can contain a range of numbers separated by two dots (non-negative integers). In this case, the range is expanded to a set of values that generate shard addresses. If the first number starts with zero, the values are formed with the same zero alignment. The previous example can be written as follows: -```text +``` text example01-{01..02}-1 ``` If you have multiple pairs of curly brackets, it generates the direct product of the corresponding sets. -Addresses and parts of addresses in curly brackets can be separated by the pipe symbol (|). 
In this case, the corresponding sets of addresses are interpreted as replicas, and the query will be sent to the first healthy replica. However, the replicas are iterated in the order currently set in the [load_balancing](../../operations/settings/settings.md) setting. +Addresses and parts of addresses in curly brackets can be separated by the pipe symbol (\|). In this case, the corresponding sets of addresses are interpreted as replicas, and the query will be sent to the first healthy replica. However, the replicas are iterated in the order currently set in the [load\_balancing](../../operations/settings/settings.md) setting. Example: -```text +``` text example01-{01..02}-{1|2} ``` @@ -60,18 +59,18 @@ This example specifies two shards that each have two replicas. The number of addresses generated is limited by a constant. Right now this is 1000 addresses. -Using the `remote` table function is less optimal than creating a `Distributed` table, because in this case, the server connection is re-established for every request. In addition, if host names are set, the names are resolved, and errors are not counted when working with various replicas. When processing a large number of queries, always create the `Distributed` table ahead of time, and don't use the `remote` table function. +Using the `remote` table function is less optimal than creating a `Distributed` table, because in this case, the server connection is re-established for every request. In addition, if host names are set, the names are resolved, and errors are not counted when working with various replicas. When processing a large number of queries, always create the `Distributed` table ahead of time, and don’t use the `remote` table function. The `remote` table function can be useful in the following cases: -- Accessing a specific server for data comparison, debugging, and testing. -- Queries between various ClickHouse clusters for research purposes. -- Infrequent distributed requests that are made manually. -- Distributed requests where the set of servers is re-defined each time. +- Accessing a specific server for data comparison, debugging, and testing. +- Queries between various ClickHouse clusters for research purposes. +- Infrequent distributed requests that are made manually. +- Distributed requests where the set of servers is re-defined each time. If the user is not specified, `default` is used. If the password is not specified, an empty password is used. -`remoteSecure` - same as `remote` but with secured connection. Default port — [tcp_port_secure](../../operations/server_settings/settings.md#server_settings-tcp_port_secure) from config or 9440. +`remoteSecure` - same as `remote` but with secured connection. Default port — [tcp\_port\_secure](../../operations/server_settings/settings.md#server_settings-tcp_port_secure) from config or 9440. [Original article](https://clickhouse.tech/docs/en/query_language/table_functions/remote/) diff --git a/docs/en/query_language/table_functions/url.md b/docs/en/query_language/table_functions/url.md index 59ccdc1c41d..0d4f85d649d 100644 --- a/docs/en/query_language/table_functions/url.md +++ b/docs/en/query_language/table_functions/url.md @@ -1,5 +1,4 @@ - -# url +# url {#url} `url(URL, format, structure)` - returns a table created from the `URL` with given `format` and `structure`. @@ -12,7 +11,7 @@ structure - table structure in `'UserID UInt64, Name String'` format. 
Determines **Example** -```sql +``` sql -- getting the first 3 lines of a table that contains columns of String and UInt32 type from HTTP-server which answers in CSV format. SELECT * FROM url('http://127.0.0.1:12345/', CSV, 'column1 String, column2 UInt32') LIMIT 3 ``` diff --git a/docs/en/roadmap.md b/docs/en/roadmap.md index 8fc91286dfe..434058d8311 100644 --- a/docs/en/roadmap.md +++ b/docs/en/roadmap.md @@ -1,9 +1,12 @@ -# Roadmap +# Roadmap {#roadmap} -## Q1 2020 +## Q1 2020 {#q1-2020} -- Role-based access control -- Integration with external authentication services -- Resource pools for more precise distribution of cluster capacity between users +- Role-based access control -[Original article](https://clickhouse.tech/docs/en/roadmap/) +## Q2 2020 {#q2-2020} + +- Integration with external authentication services +- Resource pools for more precise distribution of cluster capacity between users + +{## [Original article](https://clickhouse.tech/docs/en/roadmap/) ##} diff --git a/docs/en/security_changelog.md b/docs/en/security_changelog.md index dbd5690499a..9d33e1083d5 100644 --- a/docs/en/security_changelog.md +++ b/docs/en/security_changelog.md @@ -1,69 +1,69 @@ -## Fixed in ClickHouse Release 19.14.3.3, 2019-09-10 +## Fixed in ClickHouse Release 19.14.3.3, 2019-09-10 {#fixed-in-clickhouse-release-19-14-3-3-2019-09-10} -### CVE-2019-15024 +### CVE-2019-15024 {#cve-2019-15024} -Аn attacker having write access to ZooKeeper and who is able to run a custom server available from the network where ClickHouse runs, can create a custom-built malicious server that will act as a ClickHouse replica and register it in ZooKeeper. When another replica will fetch data part from the malicious replica, it can force clickhouse-server to write to arbitrary path on filesystem. +An attacker that has write access to ZooKeeper and who can run a custom server available from the network where ClickHouse runs, can create a custom-built malicious server that will act as a ClickHouse replica and register it in ZooKeeper. When another replica will fetch data part from the malicious replica, it can force clickhouse-server to write to arbitrary path on filesystem. Credits: Eldar Zaitov of Yandex Information Security Team -### CVE-2019-16535 +### CVE-2019-16535 {#cve-2019-16535} Аn OOB read, OOB write and integer underflow in decompression algorithms can be used to achieve RCE or DoS via native protocol. Credits: Eldar Zaitov of Yandex Information Security Team -### CVE-2019-16536 +### CVE-2019-16536 {#cve-2019-16536} -Stack overflow leading to DoS can be triggered by malicious authenticated client. +Stack overflow leading to DoS can be triggered by a malicious authenticated client. Credits: Eldar Zaitov of Yandex Information Security Team -## Fixed in ClickHouse Release 19.13.6.1, 2019-09-20 +## Fixed in ClickHouse Release 19.13.6.1, 2019-09-20 {#fixed-in-clickhouse-release-19-13-6-1-2019-09-20} -### CVE-2019-18657 +### CVE-2019-18657 {#cve-2019-18657} Table function `url` had the vulnerability allowed the attacker to inject arbitrary HTTP headers in the request. Credits: [Nikita Tikhomirov](https://github.com/NSTikhomirov) -## Fixed in ClickHouse Release 18.12.13, 2018-09-10 +## Fixed in ClickHouse Release 18.12.13, 2018-09-10 {#fixed-in-clickhouse-release-18-12-13-2018-09-10} -### CVE-2018-14672 +### CVE-2018-14672 {#cve-2018-14672} Functions for loading CatBoost models allowed path traversal and reading arbitrary files through error messages. 
Credits: Andrey Krasichkov of Yandex Information Security Team -## Fixed in ClickHouse Release 18.10.3, 2018-08-13 +## Fixed in ClickHouse Release 18.10.3, 2018-08-13 {#fixed-in-clickhouse-release-18-10-3-2018-08-13} -### CVE-2018-14671 +### CVE-2018-14671 {#cve-2018-14671} unixODBC allowed loading arbitrary shared objects from the file system which led to a Remote Code Execution vulnerability. Credits: Andrey Krasichkov and Evgeny Sidorov of Yandex Information Security Team -## Fixed in ClickHouse Release 1.1.54388, 2018-06-28 +## Fixed in ClickHouse Release 1.1.54388, 2018-06-28 {#fixed-in-clickhouse-release-1-1-54388-2018-06-28} -### CVE-2018-14668 +### CVE-2018-14668 {#cve-2018-14668} -"remote" table function allowed arbitrary symbols in "user", "password" and "default_database" fields which led to Cross Protocol Request Forgery Attacks. +“remote” table function allowed arbitrary symbols in “user”, “password” and “default\_database” fields which led to Cross Protocol Request Forgery Attacks. Credits: Andrey Krasichkov of Yandex Information Security Team -## Fixed in ClickHouse Release 1.1.54390, 2018-07-06 +## Fixed in ClickHouse Release 1.1.54390, 2018-07-06 {#fixed-in-clickhouse-release-1-1-54390-2018-07-06} -### CVE-2018-14669 +### CVE-2018-14669 {#cve-2018-14669} -ClickHouse MySQL client had "LOAD DATA LOCAL INFILE" functionality enabled that allowed a malicious MySQL database read arbitrary files from the connected ClickHouse server. +ClickHouse MySQL client had “LOAD DATA LOCAL INFILE” functionality enabled that allowed a malicious MySQL database read arbitrary files from the connected ClickHouse server. Credits: Andrey Krasichkov and Evgeny Sidorov of Yandex Information Security Team -## Fixed in ClickHouse Release 1.1.54131, 2017-01-10 +## Fixed in ClickHouse Release 1.1.54131, 2017-01-10 {#fixed-in-clickhouse-release-1-1-54131-2017-01-10} -### CVE-2018-14670 +### CVE-2018-14670 {#cve-2018-14670} -Incorrect configuration in deb package could lead to unauthorized use of the database. +Incorrect configuration in deb package could lead to the unauthorized use of the database. -Credits: the UK's National Cyber Security Centre (NCSC) +Credits: the UK’s National Cyber Security Centre (NCSC) -[Original article](https://clickhouse.tech/docs/en/security_changelog/) +{## [Original article](https://clickhouse.tech/docs/en/security_changelog/) ##} diff --git a/docs/es/changelog/2017.md b/docs/es/changelog/2017.md new file mode 100644 index 00000000000..1f443076151 --- /dev/null +++ b/docs/es/changelog/2017.md @@ -0,0 +1,261 @@ +### Lanzamiento de ClickHouse 1.1.54327, 2017-12-21 {#clickhouse-release-1-1-54327-2017-12-21} + +Esta versión contiene correcciones de errores para la versión anterior 1.1.54318: + +- Se corrigió un error con una posible condición de carrera en la replicación que podría conducir a la pérdida de datos. Este problema afecta a las versiones 1.1.54310 y 1.1.54318. Si utiliza una de estas versiones con tablas replicadas, se recomienda encarecidamente la actualización. Este problema se muestra en los registros en mensajes de advertencia como `Part ... from own log doesn't exist.` El problema es relevante incluso si no ve estos mensajes en los registros. 
+ +### Lanzamiento de ClickHouse 1.1.54318, 2017-11-30 {#clickhouse-release-1-1-54318-2017-11-30} + +Esta versión contiene correcciones de errores para la versión anterior 1.1.54310: + +- Se corrigieron las eliminaciones de filas incorrectas durante las fusiones en el motor SummingMergeTree +- Se corrigió una pérdida de memoria en los motores MergeTree no duplicados +- Se corrigió la degradación del rendimiento con inserciones frecuentes en los motores MergeTree +- Se corrigió un problema que provocaba que la cola de replicación dejara de ejecutarse +- Rotación y archivado fijos de los registros del servidor + +### Lanzamiento de ClickHouse 1.1.54310, 2017-11-01 {#clickhouse-release-1-1-54310-2017-11-01} + +#### Novedad: {#new-features} + +- Clave de partición personalizada para la familia MergeTree de motores de tabla. +- [Kafka](https://clickhouse.yandex/docs/es/operations/table_engines/kafka/) motor de mesa. +- Se agregó soporte para cargar [CatBoost](https://catboost.yandex/) modelos y aplicarlos a los datos almacenados en ClickHouse. +- Se agregó soporte para zonas horarias con desplazamientos no enteros de UTC. +- Se agregó soporte para operaciones aritméticas con intervalos de tiempo. +- El intervalo de valores para los tipos Date y DateTime se extiende hasta el año 2105. +- Se agregó el `CREATE MATERIALIZED VIEW x TO y` Consulta (especifica una tabla existente para almacenar los datos de una vista materializada). +- Se agregó el `ATTACH TABLE` Consulta sin argumentos. +- La lógica de procesamiento para columnas anidadas con nombres que terminan en -Map en una tabla SummingMergeTree se extrajo a la función de agregado sumMap. Ahora puede especificar dichas columnas explícitamente. +- El tamaño máximo del diccionario IP trie se incrementa a 128M entradas. +- Se agregó la función getSizeOfEnumType. +- Se agregó la función agregada sumWithOverflow. +- Se agregó soporte para el formato de entrada Cap'n Proto. +- Ahora puede personalizar el nivel de compresión al usar el algoritmo zstd. + +#### Cambios incompatibles hacia atrás: {#backward-incompatible-changes} + +- No se permite la creación de tablas temporales con un motor que no sea Memoria. +- No se permite la creación explícita de tablas con el motor View o MaterializedView. +- Durante la creación de la tabla, una nueva comprobación comprueba que la expresión de clave de muestreo está incluida en la clave principal. + +#### Corrección de errores: {#bug-fixes} + +- Se corrigieron bloqueos al insertar sincrónicamente en una tabla distribuida. +- Se corrigió la adición y eliminación no atómica de partes en tablas replicadas. +- Los datos insertados en una vista materializada no están sujetos a una deduplicación innecesaria. +- La ejecución de una consulta en una tabla distribuida para la que la réplica local está rezagada y las réplicas remotas no están disponibles ya no genera ningún error. +- Los usuarios no necesitan permisos de acceso a la `default` base de datos para crear tablas temporales más. +- Se corrigió el bloqueo al especificar el tipo de matriz sin argumentos. +- Se corrigieron bloqueos cuando el volumen del disco que contiene los registros del servidor está lleno. +- Se corrigió un desbordamiento en la función toRelativeWeekNum para la primera semana de la época de Unix. + +#### Mejoras de construcción: {#build-improvements} + +- Varias bibliotecas de terceros (especialmente Poco) se actualizaron y convirtieron a submódulos de git. 
+ +### Lanzamiento de ClickHouse 1.1.54304, 2017-10-19 {#clickhouse-release-1-1-54304-2017-10-19} + +#### Novedad: {#new-features-1} + +- Soporte TLS en el protocolo nativo (para habilitar, establecer `tcp_ssl_port` es `config.xml` ). + +#### Corrección de errores: {#bug-fixes-1} + +- `ALTER` para tablas replicadas ahora intenta comenzar a ejecutarse tan pronto como sea posible. +- Se corrigió el bloqueo al leer datos con la configuración `preferred_block_size_bytes=0.` +- Se corrigieron los bloqueos de `clickhouse-client` al presionar `Page Down` +- Interpretación correcta de ciertas consultas complejas con `GLOBAL IN` y `UNION ALL` +- `FREEZE PARTITION` siempre funciona atómicamente ahora. +- Las solicitudes POST vacías ahora devuelven una respuesta con el código 411. +- Se corrigieron errores de interpretación para expresiones como `CAST(1 AS Nullable(UInt8)).` +- Se ha corregido un error al leer `Array(Nullable(String))` las columnas de `MergeTree` tabla. +- Se corrigió el bloqueo al analizar consultas como `SELECT dummy AS dummy, dummy AS b` +- Los usuarios se actualizan correctamente `users.xml` +- Control correcto cuando un diccionario ejecutable devuelve un código de respuesta distinto de cero. + +### Lanzamiento de ClickHouse 1.1.54292, 2017-09-20 {#clickhouse-release-1-1-54292-2017-09-20} + +#### Novedad: {#new-features-2} + +- Se agregó el `pointInPolygon` función para trabajar con coordenadas en un plano de coordenadas. +- Se agregó el `sumMap` función de agregado para calcular la suma de matrices, similar a `SummingMergeTree`. +- Se agregó el `trunc` función. Rendimiento mejorado de las funciones de redondeo (`round`, `floor`, `ceil`, `roundToExp2`) y corrigió la lógica de cómo funcionan. Cambió la lógica del `roundToExp2` función para fracciones y números negativos. +- El archivo ejecutable ClickHouse ahora depende menos de la versión de libc. El mismo archivo ejecutable ClickHouse se puede ejecutar en una amplia variedad de sistemas Linux. Todavía hay una dependencia al usar consultas compiladas (con la configuración `compile = 1` , que no se usa por defecto). +- Se ha reducido el tiempo necesario para la compilación dinámica de consultas. + +#### Corrección de errores: {#bug-fixes-2} + +- Se corrigió un error que a veces producía `part ... intersects previous part` mensajes y la coherencia de réplica debilitada. +- Se corrigió un error que causaba que el servidor se bloqueara si ZooKeeper no estaba disponible durante el apagado. +- Se eliminó el registro excesivo al restaurar réplicas. +- Se corrigió un error en la implementación de UNION ALL. +- Se ha corregido un error en la función concat que se producía si la primera columna de un bloque tiene el tipo Array. +- El progreso ahora se muestra correctamente en el sistema.fusiones tabla. + +### Lanzamiento de ClickHouse 1.1.54289, 2017-09-13 {#clickhouse-release-1-1-54289-2017-09-13} + +#### Novedad: {#new-features-3} + +- `SYSTEM` consultas para la administración del servidor: `SYSTEM RELOAD DICTIONARY`, `SYSTEM RELOAD DICTIONARIES`, `SYSTEM DROP DNS CACHE`, `SYSTEM SHUTDOWN`, `SYSTEM KILL`. +- Funciones añadidas para trabajar con matrices: `concat`, `arraySlice`, `arrayPushBack`, `arrayPushFront`, `arrayPopBack`, `arrayPopFront`. +- Añadir `root` y `identity` parámetros para la configuración de ZooKeeper. Esto le permite aislar usuarios individuales en el mismo clúster de ZooKeeper. 
+- Funciones agregadas añadidas `groupBitAnd`, `groupBitOr`, y `groupBitXor` (por compatibilidad, también están disponibles bajo los nombres `BIT_AND`, `BIT_OR`, y `BIT_XOR`). +- Los diccionarios externos se pueden cargar desde MySQL especificando un socket en el sistema de archivos. +- Los diccionarios externos se pueden cargar desde MySQL a través de SSL (`ssl_cert`, `ssl_key`, `ssl_ca` parámetros). +- Se agregó el `max_network_bandwidth_for_user` para restringir el uso general del ancho de banda para consultas por usuario. +- Soporte para `DROP TABLE` para tablas temporales. +- Soporte para lectura `DateTime` valores en formato de marca de tiempo Unix desde el `CSV` y `JSONEachRow` formato. +- Las réplicas rezagadas en las consultas distribuidas ahora se excluyen de forma predeterminada (el umbral predeterminado es de 5 minutos). +- El bloqueo FIFO se usa durante ALTER: una consulta ALTER no se bloquea indefinidamente para consultas que se ejecutan continuamente. +- Opción para establecer `umask` en el archivo de configuración. +- Rendimiento mejorado para consultas con `DISTINCT` . + +#### Corrección de errores: {#bug-fixes-3} + +- Se ha mejorado el proceso de eliminación de nodos antiguos en ZooKeeper. Anteriormente, los nodos antiguos a veces no se eliminaban si había inserciones muy frecuentes, lo que hacía que el servidor tardara en apagarse, entre otras cosas. +- Se corrigió la aleatorización al elegir hosts para la conexión a ZooKeeper. +- Se corrigió la exclusión de réplicas rezagadas en consultas distribuidas si la réplica es localhost. +- Se corrigió un error por el que una parte de datos en un `ReplicatedMergeTree` la tabla podría estar rota después de correr `ALTER MODIFY` es un elemento en un `Nested` estructura. +- Se corrigió un error que podía causar que las consultas SELECT “hang”. +- Mejoras en las consultas DDL distribuidas. +- Se corrigió la consulta `CREATE TABLE ... AS `. +- Se resolvió el punto muerto en el `ALTER ... CLEAR COLUMN IN PARTITION` Consulta para `Buffer` tabla. +- Se corrigió el valor predeterminado no válido para `Enum` s (0 en lugar del mínimo) cuando se utiliza el `JSONEachRow` y `TSKV` formato. +- Se resolvió la aparición de procesos zombis al usar un diccionario con un `executable` fuente. +- Se corrigió el segfault para la consulta HEAD. + +#### Flujo de trabajo mejorado para desarrollar y ensamblar ClickHouse: {#improved-workflow-for-developing-and-assembling-clickhouse} + +- Usted puede utilizar `pbuilder` para construir ClickHouse. +- Usted puede utilizar `libc++` es lugar de `libstdc++` para construir en Linux. +- Se agregaron instrucciones para usar herramientas de análisis de código estático: `Coverage`, `clang-tidy`, `cppcheck`. + +#### Tenga en cuenta al actualizar: {#please-note-when-upgrading} + +- Ahora hay un valor predeterminado más alto para la configuración MergeTree `max_bytes_to_merge_at_max_space_in_pool` (el tamaño total máximo de las partes de datos a fusionar, en bytes): ha aumentado de 100 GiB a 150 GiB. Esto podría dar lugar a grandes fusiones que se ejecutan después de la actualización del servidor, lo que podría provocar una mayor carga en el subsistema de disco. Si el espacio libre disponible en el servidor es inferior al doble de la cantidad total de las fusiones que se están ejecutando, esto hará que todas las demás fusiones dejen de ejecutarse, incluidas las fusiones de pequeñas partes de datos. 
Como resultado, las consultas INSERT fallarán con el mensaje “Merges are processing significantly slower than inserts.” Descripción `SELECT * FROM system.merges` consulta para supervisar la situación. También puede comprobar el `DiskSpaceReservedForMerge` métrica en el `system.metrics` mesa, o en Grafito. No necesita hacer nada para solucionar esto, ya que el problema se resolverá una vez que finalicen las fusiones grandes. Si encuentra esto inaceptable, puede restaurar el valor anterior para el `max_bytes_to_merge_at_max_space_in_pool` configuración. Para hacer esto, vaya al sección en config.xml, establecer ``` ``107374182400 ``` y reinicie el servidor. + +### Lanzamiento de ClickHouse 1.1.54284, 2017-08-29 {#clickhouse-release-1-1-54284-2017-08-29} + +- Esta es una versión de corrección de errores para la versión anterior 1.1.54282. Corrige fugas en el directorio de piezas en ZooKeeper. + +### Lanzamiento de ClickHouse 1.1.54282, 2017-08-23 {#clickhouse-release-1-1-54282-2017-08-23} + +Esta versión contiene correcciones de errores para la versión anterior 1.1.54276: + +- Fijo `DB::Exception: Assertion violation: !_path.empty()` cuando se inserta en una tabla distribuida. +- Se corrigió el análisis al insertar en formato RowBinary si los datos de entrada comienzan con ‘;’. +- Errores durante la compilación en tiempo de ejecución de ciertas funciones agregadas (por ejemplo, `groupArray()`). + +### Clickhouse Lanzamiento 1.1.54276, 2017-08-16 {#clickhouse-release-1-1-54276-2017-08-16} + +#### Novedad: {#new-features-4} + +- Se agregó una sección WITH opcional para una consulta SELECT. Consulta de ejemplo: `WITH 1+1 AS a SELECT a, a*a` +- INSERT se puede realizar de forma sincrónica en una tabla distribuida: se devuelve OK solo después de guardar todos los datos en todos los fragmentos. Esto se activa mediante la configuración insert\_distributed\_sync=1. +- Se agregó el tipo de datos UUID para trabajar con identificadores de 16 bytes. +- Se agregaron alias de CHAR, FLOAT y otros tipos para compatibilidad con Tableau. +- Se agregaron las funciones aYYYYMM, aYYYYMMDD y aYYYYMMDDhhmmss para convertir el tiempo en números. +- Puede utilizar direcciones IP (junto con el nombre de host) para identificar servidores para consultas DDL en clúster. +- Se agregó soporte para argumentos no constantes y compensaciones negativas en la función `substring(str, pos, len).` +- Se agregó el parámetro max\_size para el `groupArray(max_size)(column)` función agregada, y optimizó su funcionamiento. + +#### Principales cambios: {#main-changes} + +- Mejoras de seguridad: todos los archivos del servidor se crean con permisos 0640 (se pueden cambiar a través de parámetro de configuración). +- Mensajes de error mejorados para consultas con sintaxis no válida. +- Se ha reducido significativamente el consumo de memoria y se ha mejorado el rendimiento al combinar grandes secciones de datos de MergeTree. +- Aumentó significativamente el rendimiento de las fusiones de datos para el motor ReplacingMergeTree. +- Mejora del rendimiento de las inserciones asincrónicas de una tabla distribuida mediante la combinación de varias inserciones de origen. Para habilitar esta funcionalidad, utilice la configuración de directorio\_distribuido\_monitor\_batch\_inserts=1. + +#### Cambios incompatibles hacia atrás: {#backward-incompatible-changes-1} + +- Se ha cambiado el formato binario de los estados agregados de `groupArray(array_column)` funciones para matrices. 
+ +#### Lista completa de cambios: {#complete-list-of-changes} + +- Se agregó el `output_format_json_quote_denormals` configuración, que permite generar valores nan e inf en formato JSON. +- Asignación de secuencias optimizada al leer desde una tabla distribuida. +- Los ajustes se pueden configurar en modo de solo lectura si el valor no cambia. +- Se agregó la capacidad de recuperar gránulos no enteros del motor MergeTree para cumplir con las restricciones en el tamaño de bloque especificado en la configuración preferred\_block\_size\_bytes. El propósito es reducir el consumo de RAM y aumentar la localidad de caché al procesar consultas desde tablas con columnas grandes. +- Uso eficiente de índices que contienen expresiones como `toStartOfHour(x)` para condiciones como `toStartOfHour(x) op сonstexpr.` +- Se agregaron nuevas configuraciones para los motores MergeTree (la sección merge\_tree en config.XML): + - replicated\_deduplication\_window\_seconds establece el número de segundos permitidos para la deduplicación de inserciones en tablas replicadas. + - cleanup\_delay\_period establece con qué frecuencia iniciar la limpieza para eliminar datos obsoletos. + - replicated\_can\_become\_leader puede evitar que una réplica se convierta en el líder (y asigne fusiones). +- Limpieza acelerada para eliminar datos obsoletos de ZooKeeper. +- Múltiples mejoras y correcciones para consultas DDL en clúster. De particular interés es la nueva configuración distribut\_ddl\_task\_timeout, que limita el tiempo de espera para una respuesta de los servidores en el clúster. Si no se ha realizado una solicitud ddl en todos los hosts, una respuesta contendrá un error de tiempo de espera y una solicitud se ejecutará en modo asíncrono. +- Mejora de la visualización de los rastros de pila en los registros del servidor. +- Se agregó el “none” valor para el método de compresión. +- Puede usar varias secciones dictionaries\_config en config.XML. +- Es posible conectarse a MySQL a través de un socket en el sistema de archivos. +- Sistema.partes tiene una nueva columna con información sobre el tamaño de las marcas, en bytes. + +#### Corrección de errores: {#bug-fixes-4} + +- Las tablas distribuidas que utilizan una tabla Merge ahora funcionan correctamente para una consulta SELECT con una condición `_table` campo. +- Se corrigió una rara condición de carrera en ReplicatedMergeTree al verificar partes de datos. +- Se corrigió la posible congelación en “leader election” al iniciar un servidor. +- El valor max\_replica\_delay\_for\_distributed\_queries se omitió al usar una réplica local del origen de datos. Esto ha sido solucionado. +- Comportamiento incorrecto fijo de `ALTER TABLE CLEAR COLUMN IN PARTITION` al intentar limpiar una columna no existente. +- Se ha corregido una excepción en la función multiIf al usar matrices o cadenas vacías. +- Se corrigieron asignaciones de memoria excesivas al deserializar el formato nativo. +- Se corrigió la actualización automática incorrecta de los diccionarios Trie. +- Se ha corregido una excepción al ejecutar consultas con una cláusula GROUP BY desde una tabla Merge cuando se usa SAMPLE. +- Se corrigió un bloqueo de GROUP BY cuando se usabaributed\_aggregation\_memory\_efficient = 1. +- Ahora puede especificar la base de datos.es el lado derecho de IN y JOIN. +- Se usaron demasiados subprocesos para la agregación paralela. Esto ha sido solucionado. +- Corregido cómo el “if” funciona con argumentos FixedString. 
+- SELECT funcionó incorrectamente desde una tabla distribuida para fragmentos con un peso de 0. Esto ha sido solucionado. +- Ejecutar `CREATE VIEW IF EXISTS no longer causes crashes.` +- Se corrigió el comportamiento incorrecto cuando input\_format\_skip\_unknown\_fields = 1 se establece y hay números negativos. +- Se corrigió un bucle infinito en el `dictGetHierarchy()` función si hay algunos datos no válidos en el diccionario. +- Fijo `Syntax error: unexpected (...)` errores al ejecutar consultas distribuidas con subconsultas en una cláusula IN o JOIN y tablas Merge. +- Se corrigió una interpretación incorrecta de una consulta SELECT de tablas de diccionario. +- Se corrigió el “Cannot mremap” error al usar matrices en las cláusulas IN y JOIN con más de 2 mil millones de elementos. +- Se corrigió la conmutación por error para los diccionarios con MySQL como fuente. + +#### Flujo de trabajo mejorado para desarrollar y ensamblar ClickHouse: {#improved-workflow-for-developing-and-assembling-clickhouse-1} + +- Las construcciones se pueden ensamblar en Arcadia. +- Puedes usar gcc 7 para compilar ClickHouse. +- Las compilaciones paralelas que usan ccache + distcc son más rápidas ahora. + +### Lanzamiento de ClickHouse 1.1.54245, 2017-07-04 {#clickhouse-release-1-1-54245-2017-07-04} + +#### Novedad: {#new-features-5} + +- DDL distribuido (por ejemplo, `CREATE TABLE ON CLUSTER`) +- La consulta replicada `ALTER TABLE CLEAR COLUMN IN PARTITION.` +- El motor para las tablas del diccionario (acceso a los datos del diccionario en forma de tabla). +- Motor de base de datos de diccionario (este tipo de base de datos tiene automáticamente tablas de diccionario disponibles para todos los diccionarios externos conectados). +- Puede comprobar si hay actualizaciones en el diccionario enviando una solicitud al origen. +- Nombres de columna cualificados +- Citando identificadores usando comillas dobles. +- Sesiones en la interfaz HTTP. +- La consulta OPTIMIZE para una tabla replicada puede ejecutarse no solo en el líder. + +#### Cambios incompatibles hacia atrás: {#backward-incompatible-changes-2} + +- Eliminado SET GLOBAL. + +#### Cambios menores: {#minor-changes} + +- Ahora, después de activar una alerta, el registro imprime el seguimiento completo de la pila. +- Se relajó la verificación del número de partes de datos dañadas / adicionales al inicio (hubo demasiados falsos positivos). + +#### Corrección de errores: {#bug-fixes-5} + +- Se corrigió una mala conexión “sticking” cuando se inserta en una tabla distribuida. +- GLOBAL IN ahora funciona para una consulta de una tabla de combinación que mira a una tabla distribuida. +- Se detectó el número incorrecto de núcleos en una máquina virtual de Google Compute Engine. Esto ha sido solucionado. +- Cambios en el funcionamiento de una fuente ejecutable de diccionarios externos en caché. +- Se corrigió la comparación de cadenas que contenían caracteres nulos. +- Se corrigió la comparación de los campos de clave primaria Float32 con constantes. +- Anteriormente, una estimación incorrecta del tamaño de un campo podía conducir a asignaciones demasiado grandes. +- Se corrigió un bloqueo al consultar una columna Nullable agregada a una tabla usando ALTER. +- Se corrigió un bloqueo al ordenar por una columna Nullable, si el número de filas es menor que LIMIT. +- Se corrigió una subconsulta ORDER BY que consistía solo en valores constantes. +- Anteriormente, una tabla replicada podía permanecer en el estado no válido después de un error DROP TABLE. 
+- Los alias para subconsultas escalares con resultados vacíos ya no se pierden. +- Ahora una consulta que usó compilación no falla con un error si el archivo .so se daña. diff --git a/docs/es/commercial/cloud.md b/docs/es/commercial/cloud.md new file mode 100644 index 00000000000..ee6a45fc851 --- /dev/null +++ b/docs/es/commercial/cloud.md @@ -0,0 +1,14 @@ +# Proveedores de servicios en la nube de ClickHouse {#clickhouse-cloud-service-providers} + +!!! info "INFO" + Si ha lanzado una nube pública con el servicio ClickHouse administrado, no dude en [abrir una solicitud de extracción](https://github.com/ClickHouse/ClickHouse/edit/master/docs/es/commercial/cloud.md) añadiéndolo a la siguiente lista. + +## Nube de Yandex {#yandex-cloud} + +[Servicio administrado de Yandex para ClickHouse](https://cloud.yandex.com/services/managed-clickhouse?utm_source=referrals&utm_medium=clickhouseofficialsite&utm_campaign=link3) proporciona las siguientes características clave: + +- Servicio ZooKeeper totalmente gestionado para [Replicación de ClickHouse](../operations/table_engines/replication.md) +- Múltiples opciones de tipo de almacenamiento +- Réplicas en diferentes zonas de disponibilidad +- Cifrado y aislamiento +- Mantenimiento automatizado diff --git a/docs/es/data_types/array.md b/docs/es/data_types/array.md new file mode 100644 index 00000000000..1650ab461b1 --- /dev/null +++ b/docs/es/data_types/array.md @@ -0,0 +1,72 @@ +# Matriz (T) {#data-type-array} + +Matriz de `T`-tipo de artículos. + +`T` puede ser cualquier cosa, incluida una matriz. + +## Creación de una matriz {#creating-an-array} + +Puede usar una función para crear una matriz: + +``` sql +array(T) +``` + +También puede usar corchetes. + +``` sql +[] +``` + +Ejemplo de creación de una matriz: + +``` sql +SELECT array(1, 2) AS x, toTypeName(x) +``` + +``` text +┌─x─────┬─toTypeName(array(1, 2))─┐ +│ [1,2] │ Array(UInt8) │ +└───────┴─────────────────────────┘ +``` + +``` sql +SELECT [1, 2] AS x, toTypeName(x) +``` + +``` text +┌─x─────┬─toTypeName([1, 2])─┐ +│ [1,2] │ Array(UInt8) │ +└───────┴────────────────────┘ +``` + +## Trabajar con tipos de datos {#working-with-data-types} + +Al crear una matriz sobre la marcha, ClickHouse define automáticamente el tipo de argumento como el tipo de datos más estrecho que puede almacenar todos los argumentos enumerados. Si hay alguna [NULO](../query_language/syntax.md#null-literal) o [NULO](nullable.md#data_type-nullable) los argumentos de tipo, el tipo de elementos de la matriz es [NULO](nullable.md). + +Si ClickHouse no pudo determinar el tipo de datos, generará una excepción. Por ejemplo, esto sucederá al intentar crear una matriz con cadenas y números simultáneamente (`SELECT array(1, 'a')`). + +Ejemplos de detección automática de tipos de datos: + +``` sql +SELECT array(1, 2, NULL) AS x, toTypeName(x) +``` + +``` text +┌─x──────────┬─toTypeName(array(1, 2, NULL))─┐ +│ [1,2,NULL] │ Array(Nullable(UInt8)) │ +└────────────┴───────────────────────────────┘ +``` + +Si intenta crear una matriz de tipos de datos incompatibles, ClickHouse produce una excepción: + +``` sql +SELECT array(1, 'a') +``` + +``` text +Received exception from server (version 1.1.54388): +Code: 386. DB::Exception: Received from localhost:9000, 127.0.0.1. DB::Exception: There is no supertype for types UInt8, String because some of them are String/FixedString and some of them are not. 
+``` + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/array/) diff --git a/docs/es/data_types/boolean.md b/docs/es/data_types/boolean.md new file mode 100644 index 00000000000..3a49cbd7658 --- /dev/null +++ b/docs/es/data_types/boolean.md @@ -0,0 +1,5 @@ +# Valores booleanos {#boolean-values} + +No hay un tipo separado para los valores booleanos. Utilice el tipo UInt8, restringido a los valores 0 o 1. + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/boolean/) diff --git a/docs/es/data_types/date.md b/docs/es/data_types/date.md new file mode 100644 index 00000000000..78d265ab604 --- /dev/null +++ b/docs/es/data_types/date.md @@ -0,0 +1,8 @@ +# Fecha {#date} + +Fecha. Almacenado en dos bytes como el número de días desde 1970-01-01 (sin signo). Permite almacenar valores desde justo después del comienzo de la Época Unix hasta el umbral superior definido por una constante en la etapa de compilación (actualmente, esto es hasta el año 2106, pero el último año totalmente soportado es 2105). +El valor mínimo se emite como 0000-00-00. + +El valor de fecha se almacena sin la zona horaria. + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/date/) diff --git a/docs/es/data_types/datetime.md b/docs/es/data_types/datetime.md new file mode 100644 index 00000000000..23f07674353 --- /dev/null +++ b/docs/es/data_types/datetime.md @@ -0,0 +1,122 @@ +# FechaHora {#data-type-datetime} + +Permite almacenar un instante en el tiempo, que se puede expresar como una fecha del calendario y una hora de un día. + +Sintaxis: + +``` sql +DateTime([timezone]) +``` + +Rango de valores: \[1970-01-01 00:00:00, 2105-12-31 23:59:59\]. + +Resolución: 1 segundo. + +## Observaciones de uso {#usage-remarks} + +El punto en el tiempo se guarda como un [Timestamp de Unix](https://en.wikipedia.org/wiki/Unix_time), independientemente de la zona horaria o el horario de verano. Además, el `DateTime` tipo puede almacenar zona horaria que es la misma para toda la columna, que afecta a cómo los valores de la `DateTime` Los valores de tipo se muestran en formato de texto y cómo se analizan los valores especificados como cadenas (‘2020-01-01 05:00:01’). La zona horaria no se almacena en las filas de la tabla (o en el conjunto de resultados), sino que se almacena en los metadatos de la columna. +Se puede encontrar una lista de zonas horarias compatibles en el [Base de datos de zonas horarias de IANA](https://www.iana.org/time-zones). +El `tzdata` paquete, que contiene [Base de datos de zonas horarias de IANA](https://www.iana.org/time-zones), debe instalarse en el sistema. Descripción `timedatectl list-timezones` comando para listar zonas horarias conocidas por un sistema local. + +Puede establecer explícitamente una zona horaria para `DateTime`-type columnas al crear una tabla. Si la zona horaria no está establecida, ClickHouse usa el valor [Zona horaria](../operations/server_settings/settings.md#server_settings-timezone) parámetro en la configuración del servidor o la configuración del sistema operativo en el momento del inicio del servidor ClickHouse. + +El [Casa de clics-cliente](../interfaces/cli.md) aplica la zona horaria del servidor de forma predeterminada si una zona horaria no se establece explícitamente al inicializar el tipo de datos. Para utilizar la zona horaria del cliente, ejecute `clickhouse-client` con el `--use_client_time_zone` parámetro. + +ClickHouse genera valores en `YYYY-MM-DD hh:mm:ss` formato de texto por defecto. 
Puede cambiar la salida con el [formatDateTime](../query_language/functions/date_time_functions.md#formatdatetime) función. + +Al insertar datos en ClickHouse, puede usar diferentes formatos de cadenas de fecha y hora, dependiendo del valor de la [Date\_time\_input\_format](../operations/settings/settings.md#settings-date_time_input_format) configuración. + +## Ejemplos {#examples} + +**1.** Creación de una tabla con un `DateTime`-tipo de columna e insertar datos en ella: + +``` sql +CREATE TABLE dt +( + `timestamp` DateTime('Europe/Moscow'), + `event_id` UInt8 +) +ENGINE = TinyLog; +``` + +``` sql +INSERT INTO dt Values (1546300800, 1), ('2019-01-01 00:00:00', 2); +``` + +``` sql +SELECT * FROM dt; +``` + +``` text +┌───────────timestamp─┬─event_id─┐ +│ 2019-01-01 03:00:00 │ 1 │ +│ 2019-01-01 00:00:00 │ 2 │ +└─────────────────────┴──────────┘ +``` + +- Al insertar datetime como un entero, se trata como Unix Timestamp (UTC). `1546300800` representante `'2019-01-01 00:00:00'` UTC. Sin embargo, como `timestamp` tetas grandes `Europe/Moscow` (UTC + 3) zona horaria especificada, al emitir como cadena, el valor se mostrará como `'2019-01-01 03:00:00'` +- Al insertar el valor de cadena como fecha y hora, se trata como si estuviera en la zona horaria de la columna. `'2019-01-01 00:00:00'` será tratado como estar en `Europe/Moscow` zona horaria y guardado como `1546290000`. + +**2.** Filtrado en `DateTime` valor + +``` sql +SELECT * FROM dt WHERE timestamp = toDateTime('2019-01-01 00:00:00', 'Europe/Moscow') +``` + +``` text +┌───────────timestamp─┬─event_id─┐ +│ 2019-01-01 00:00:00 │ 2 │ +└─────────────────────┴──────────┘ +``` + +`DateTime` se pueden filtrar usando un valor de cadena en `WHERE` predicado. Se convertirá a `DateTime` automática: + +``` sql +SELECT * FROM dt WHERE timestamp = '2019-01-01 00:00:00' +``` + +``` text +┌───────────timestamp─┬─event_id─┐ +│ 2019-01-01 03:00:00 │ 1 │ +└─────────────────────┴──────────┘ +``` + +**3.** Obtener una zona horaria para un `DateTime`-tipo columna: + +``` sql +SELECT toDateTime(now(), 'Europe/Moscow') AS column, toTypeName(column) AS x +``` + +``` text +┌──────────────column─┬─x─────────────────────────┐ +│ 2019-10-16 04:12:04 │ DateTime('Europe/Moscow') │ +└─────────────────────┴───────────────────────────┘ +``` + +**4.** Conversión de zona horaria + +``` sql +SELECT +toDateTime(timestamp, 'Europe/London') as lon_time, +toDateTime(timestamp, 'Europe/Moscow') as mos_time +FROM dt +``` + +``` text +┌───────────lon_time──┬────────────mos_time─┐ +│ 2019-01-01 00:00:00 │ 2019-01-01 03:00:00 │ +│ 2018-12-31 21:00:00 │ 2019-01-01 00:00:00 │ +└─────────────────────┴─────────────────────┘ +``` + +## Ver también {#see-also} + +- [Funciones de conversión de tipos](../query_language/functions/type_conversion_functions.md) +- [Funciones para trabajar con fechas y horas](../query_language/functions/date_time_functions.md) +- [Funciones para trabajar con matrices](../query_language/functions/array_functions.md) +- [El `date_time_input_format` configuración](../operations/settings/settings.md#settings-date_time_input_format) +- [El `timezone` parámetro de configuración del servidor](../operations/server_settings/settings.md#server_settings-timezone) +- [Operadores para trabajar con fechas y horas](../query_language/operators.md#operators-datetime) +- [El `Date` Tipo de datos](date.md) + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/datetime/) diff --git a/docs/es/data_types/datetime64.md b/docs/es/data_types/datetime64.md new file mode 100644 
index 00000000000..fed3ba65dba --- /dev/null +++ b/docs/es/data_types/datetime64.md @@ -0,0 +1,97 @@ +# DateTime64 {#data-type-datetime64} + +Permite almacenar un instante en el tiempo, que se puede expresar como una fecha de calendario y una hora de un día, con una precisión de subsegundo definida. + +Tamaño de un tick (precisión): 10^(-precision) segundos. + +Sintaxis: + +``` sql +DateTime64(precision, [timezone]) +``` + +Internamente, almacena los datos como un número de ‘ticks’ desde el inicio de la época (1970-01-01 00:00:00 UTC) como Int64. La resolución de tick está determinada por el parámetro precision. Además, el tipo `DateTime64` puede almacenar una zona horaria que es la misma para toda la columna y que afecta a cómo se muestran los valores `DateTime64` en formato de texto y a cómo se analizan los valores especificados como cadenas (‘2020-01-01 05:00:01.000’). La zona horaria no se almacena en las filas de la tabla (ni en el conjunto de resultados), sino en los metadatos de la columna. Ver detalles en [FechaHora](datetime.md). + +## Ejemplos {#examples} + +**1.** Creación de una tabla con una columna de tipo `DateTime64` e inserción de datos en ella: + +``` sql +CREATE TABLE dt +( + `timestamp` DateTime64(3, 'Europe/Moscow'), + `event_id` UInt8 +) +ENGINE = TinyLog +``` + +``` sql +INSERT INTO dt Values (1546300800000, 1), ('2019-01-01 00:00:00', 2) +``` + +``` sql +SELECT * FROM dt +``` + +``` text +┌───────────────timestamp─┬─event_id─┐ +│ 2019-01-01 03:00:00.000 │ 1 │ +│ 2019-01-01 00:00:00.000 │ 2 │ +└─────────────────────────┴──────────┘ +``` + +- Al insertar la fecha y hora como un entero, se trata como una marca de tiempo Unix (UTC) escalada según la precisión. `1546300800000` (con precisión 3) representa `'2019-01-01 00:00:00'` UTC. Sin embargo, como la columna `timestamp` tiene especificada la zona horaria `Europe/Moscow` (UTC + 3), al emitir el valor como cadena se mostrará como `'2019-01-01 03:00:00'` +- Al insertar el valor como cadena de fecha y hora, se trata como si estuviera en la zona horaria de la columna. `'2019-01-01 00:00:00'` se tratará como perteneciente a la zona horaria `Europe/Moscow` y se almacenará como `1546290000000`.
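+ +A modo de ilustración (un esbozo añadido, no forma parte del ejemplo numerado), la siguiente consulta muestra cómo el parámetro `precision` determina cuántos dígitos de subsegundo se conservan y se muestran; usa la función `toDateTime64`, que aparece también en los ejemplos siguientes: + +``` sql +-- Con precisión 3 cabría esperar 2019-01-01 00:00:00.123; +-- con precisión 6, 2019-01-01 00:00:00.123000. +SELECT toDateTime64('2019-01-01 00:00:00.123', 3, 'Europe/Moscow') AS prec3, toDateTime64('2019-01-01 00:00:00.123', 6, 'Europe/Moscow') AS prec6 +```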
+ +**2.** Filtrado en `DateTime64` valor + +``` sql +SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europe/Moscow') +``` + +``` text +┌───────────────timestamp─┬─event_id─┐ +│ 2019-01-01 00:00:00.000 │ 2 │ +└─────────────────────────┴──────────┘ +``` + +A diferencia de `DateTime`, `DateTime64` los valores no se convierten desde `String` automática + +**3.** Obtener una zona horaria para un `DateTime64`-tipo de valor: + +``` sql +SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS x +``` + +``` text +┌──────────────────column─┬─x──────────────────────────────┐ +│ 2019-10-16 04:12:04.000 │ DateTime64(3, 'Europe/Moscow') │ +└─────────────────────────┴────────────────────────────────┘ +``` + +**4.** Conversión de zona horaria + +``` sql +SELECT +toDateTime64(timestamp, 3, 'Europe/London') as lon_time, +toDateTime64(timestamp, 3, 'Europe/Moscow') as mos_time +FROM dt +``` + +``` text +┌───────────────lon_time──┬────────────────mos_time─┐ +│ 2019-01-01 00:00:00.000 │ 2019-01-01 03:00:00.000 │ +│ 2018-12-31 21:00:00.000 │ 2019-01-01 00:00:00.000 │ +└─────────────────────────┴─────────────────────────┘ +``` + +## Ver también {#see-also} + +- [Funciones de conversión de tipos](../query_language/functions/type_conversion_functions.md) +- [Funciones para trabajar con fechas y horas](../query_language/functions/date_time_functions.md) +- [Funciones para trabajar con matrices](../query_language/functions/array_functions.md) +- [El `date_time_input_format` configuración](../operations/settings/settings.md#settings-date_time_input_format) +- [El `timezone` parámetro de configuración del servidor](../operations/server_settings/settings.md#server_settings-timezone) +- [Operadores para trabajar con fechas y horas](../query_language/operators.md#operators-datetime) +- [`Date` Tipo de datos](date.md) +- [`DateTime` Tipo de datos](datetime.md) diff --git a/docs/es/data_types/decimal.md b/docs/es/data_types/decimal.md new file mode 100644 index 00000000000..2a355ed79a9 --- /dev/null +++ b/docs/es/data_types/decimal.md @@ -0,0 +1,102 @@ +# Decimal(P, S), Decimal32(S), Decimal64(S), Decimal128(S) {#decimalp-s-decimal32s-decimal64s-decimal128s} + +Números de punto fijo firmados que mantienen la precisión durante las operaciones de suma, resta y multiplicación. Para la división se descartan los dígitos menos significativos (no redondeados). + +## Parámetros {#parameters} + +- P - precisión. Rango válido: \[ 1 : 38 \]. Determina cuántos dígitos decimales puede tener el número (incluida la fracción). +- S - escala. Rango válido: \[ 0 : P \]. Determina cuántos dígitos decimales puede tener la fracción. + +Dependiendo del valor del parámetro P Decimal(P, S) es un sinónimo de: +- P de \[ 1 : 9 \] - para Decimal32(S) +- P de \[ 10 : 18 \] - para Decimal64(S) +- P de \[ 19 : 38 \] - para Decimal128(S) + +## Rangos de valores decimales {#decimal-value-ranges} + +- Decimal32(S) - ( -1 \* 10^(9 - S), 1 \* 10^(9 - S) ) +- Decimal64(S) - ( -1 \* 10^(18 - S), 1 \* 10^(18 - S) ) +- Decimal128(S) - ( -1 \* 10^(38 - S), 1 \* 10^(38 - S) ) + +Por ejemplo, Decimal32(4) puede contener números de -99999.9999 a 99999.9999 con el paso 0.0001. + +## Representación interna {#internal-representation} + +Internamente, los datos se representan como enteros con signo normal con el ancho de bits respectivo. Los rangos de valores reales que se pueden almacenar en la memoria son un poco más grandes que los especificados anteriormente, que se verifican solo en la conversión de una cadena. 
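+ +Dado que internamente son enteros del ancho correspondiente, `Decimal(P, S)` se resuelve al tipo de 32, 64 o 128 bits según `P`, como se describió más arriba. Un esbozo rápido (añadido como ilustración) para comprobarlo con las funciones de conversión `toDecimal32`, `toDecimal64` y `toDecimal128` mencionadas más abajo; cabría esperar como resultado `Decimal(9, 4)`, `Decimal(18, 4)` y `Decimal(38, 4)`: + +``` sql +-- Comprobar a qué sinónimo Decimal(P, S) corresponde cada ancho. +SELECT toTypeName(toDecimal32(1, 4)) AS d32, toTypeName(toDecimal64(1, 4)) AS d64, toTypeName(toDecimal128(1, 4)) AS d128 +```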
+ +Debido a que las CPU modernas no admiten enteros de 128 bits de forma nativa, las operaciones en Decimal128 se emulan. Debido a esto, Decimal128 funciona significativamente más lento que Decimal32 / Decimal64. + +## Operaciones y tipo de resultado {#operations-and-result-type} + +Las operaciones binarias en Decimal dan como resultado un tipo de resultado más amplio (con cualquier orden de argumentos). + +- Decimal64(S1) Decimal32(S2) -\> Decimal64(S) +- Decimal128(S1) Decimal32(S2) -\> Decimal128(S) +- Decimal128(S1) ¿Cómo puedo hacerlo?) + +Reglas para la escala: + +- Sumar, restar: S = max (S1, S2). +- multuply: S = S1 + S2. +- división: S = S1. + +Para operaciones similares entre Decimal y enteros, el resultado es Decimal del mismo tamaño que un argumento. + +Las operaciones entre Decimal y Float32 / Float64 no están definidas. Si los necesita, puede convertir explícitamente uno de los argumentos utilizando toDecimal32, toDecimal64, toDecimal128 o toFloat32, toFloat64 builtins. Tenga en cuenta que el resultado perderá precisión y la conversión de tipo es una operación computacionalmente costosa. + +Algunas funciones en Decimal devuelven el resultado como Float64 (por ejemplo, var o stddev). Los cálculos intermedios aún se pueden realizar en Decimal, lo que podría dar lugar a resultados diferentes entre las entradas Float64 y Decimal con los mismos valores. + +## Comprobaciones de desbordamiento {#overflow-checks} + +Durante los cálculos en Decimal, pueden producirse desbordamientos de enteros. Los dígitos excesivos en una fracción se descartan (no se redondean). Los dígitos excesivos en la parte entera conducirán a una excepción. + +``` sql +SELECT toDecimal32(2, 4) AS x, x / 3 +``` + +``` text +┌──────x─┬─divide(toDecimal32(2, 4), 3)─┐ +│ 2.0000 │ 0.6666 │ +└────────┴──────────────────────────────┘ +``` + +``` sql +SELECT toDecimal32(4.2, 8) AS x, x * x +``` + +``` text +DB::Exception: Scale is out of bounds. +``` + +``` sql +SELECT toDecimal32(4.2, 8) AS x, 6 * x +``` + +``` text +DB::Exception: Decimal math overflow. +``` + +Las comprobaciones de desbordamiento conducen a la desaceleración de las operaciones. Si se sabe que los desbordamientos no son posibles, tiene sentido deshabilitar las verificaciones usando `decimal_check_overflow` configuración. Cuando las comprobaciones están deshabilitadas y se produce el desbordamiento, el resultado será incorrecto: + +``` sql +SET decimal_check_overflow = 0; +SELECT toDecimal32(4.2, 8) AS x, 6 * x +``` + +``` text +┌──────────x─┬─multiply(6, toDecimal32(4.2, 8))─┐ +│ 4.20000000 │ -17.74967296 │ +└────────────┴──────────────────────────────────┘ +``` + +Las comprobaciones de desbordamiento ocurren no solo en operaciones aritméticas sino también en la comparación de valores: + +``` sql +SELECT toDecimal32(1, 8) < 100 +``` + +``` text +DB::Exception: Can't compare. +``` + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/decimal/) diff --git a/docs/es/data_types/domains/ipv4.md b/docs/es/data_types/domains/ipv4.md new file mode 100644 index 00000000000..b2f56b15659 --- /dev/null +++ b/docs/es/data_types/domains/ipv4.md @@ -0,0 +1,77 @@ +## IPv4 {#ipv4} + +`IPv4` es un dominio basado en `UInt32` tipo y sirve como reemplazo tipo para almacenar valores IPv4. Proporciona un almacenamiento compacto con un formato de entrada-salida amigable para los humanos e información de tipo de columna sobre la inspección. 
+ +### Uso básico {#basic-usage} + +``` sql +CREATE TABLE hits (url String, from IPv4) ENGINE = MergeTree() ORDER BY url; + +DESCRIBE TABLE hits; +``` + +``` text +┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐ +│ url │ String │ │ │ │ │ +│ from │ IPv4 │ │ │ │ │ +└──────┴────────┴──────────────┴────────────────────┴─────────┴──────────────────┘ +``` + +O puede usar el dominio IPv4 como clave: + +``` sql +CREATE TABLE hits (url String, from IPv4) ENGINE = MergeTree() ORDER BY from; +``` + +`IPv4` Todos los derechos reservados: + +``` sql +INSERT INTO hits (url, from) VALUES ('https://wikipedia.org', '116.253.40.133')('https://clickhouse.tech', '183.247.232.58')('https://clickhouse.yandex/docs/es/', '116.106.34.242'); + +SELECT * FROM hits; +``` + +``` text +┌─url────────────────────────────────┬───────────from─┐ +│ https://clickhouse.tech/docs/es/ │ 116.106.34.242 │ +│ https://wikipedia.org │ 116.253.40.133 │ +│ https://clickhouse.tech │ 183.247.232.58 │ +└────────────────────────────────────┴────────────────┘ +``` + +Los valores se almacenan en forma binaria compacta: + +``` sql +SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; +``` + +``` text +┌─toTypeName(from)─┬─hex(from)─┐ +│ IPv4 │ B7F7E83A │ +└──────────────────┴───────────┘ +``` + +Los valores de dominio no se pueden convertir implícitamente en tipos distintos de `UInt32`. +Si desea convertir `IPv4` valor a una cadena, tienes que hacer eso explícitamente con `IPv4NumToString()` función: + +``` sql +SELECT toTypeName(s), IPv4NumToString(from) as s FROM hits LIMIT 1; +``` + + ┌─toTypeName(IPv4NumToString(from))─┬─s──────────────┐ + │ String │ 183.247.232.58 │ + └───────────────────────────────────┴────────────────┘ + +O echar una ONU `UInt32` valor: + +``` sql +SELECT toTypeName(i), CAST(from as UInt32) as i FROM hits LIMIT 1; +``` + +``` text +┌─toTypeName(CAST(from, 'UInt32'))─┬──────────i─┐ +│ UInt32 │ 3086477370 │ +└──────────────────────────────────┴────────────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/domains/ipv4) diff --git a/docs/es/data_types/domains/ipv6.md b/docs/es/data_types/domains/ipv6.md new file mode 100644 index 00000000000..f385774ee04 --- /dev/null +++ b/docs/es/data_types/domains/ipv6.md @@ -0,0 +1,79 @@ +## IPv6 {#ipv6} + +`IPv6` es un dominio basado en `FixedString(16)` tipo y sirve como reemplazo tipo para almacenar valores IPv6. Proporciona un almacenamiento compacto con un formato de entrada-salida amigable para los humanos e información de tipo de columna sobre la inspección. 
+ +### Uso básico {#basic-usage} + +``` sql +CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY url; + +DESCRIBE TABLE hits; +``` + +``` text +┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐ +│ url │ String │ │ │ │ │ +│ from │ IPv6 │ │ │ │ │ +└──────┴────────┴──────────────┴────────────────────┴─────────┴──────────────────┘ +``` + +O puedes usar `IPv6` dominio como clave: + +``` sql +CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY from; +``` + +`IPv6` dominio admite entradas personalizadas como cadenas IPv6: + +``` sql +INSERT INTO hits (url, from) VALUES ('https://wikipedia.org', '2a02:aa08:e000:3100::2')('https://clickhouse.tech', '2001:44c8:129:2632:33:0:252:2')('https://clickhouse.yandex/docs/es/', '2a02:e980:1e::1'); + +SELECT * FROM hits; +``` + +``` text +┌─url────────────────────────────────┬─from──────────────────────────┐ +│ https://clickhouse.tech │ 2001:44c8:129:2632:33:0:252:2 │ +│ https://clickhouse.tech/docs/es/ │ 2a02:e980:1e::1 │ +│ https://wikipedia.org │ 2a02:aa08:e000:3100::2 │ +└────────────────────────────────────┴───────────────────────────────┘ +``` + +Los valores se almacenan en forma binaria compacta: + +``` sql +SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; +``` + +``` text +┌─toTypeName(from)─┬─hex(from)────────────────────────┐ +│ IPv6 │ 200144C8012926320033000002520002 │ +└──────────────────┴──────────────────────────────────┘ +``` + +Los valores de dominio no se pueden convertir implícitamente en tipos distintos de `FixedString(16)`. +Si desea convertir `IPv6` valor a una cadena, tienes que hacer eso explícitamente con `IPv6NumToString()` función: + +``` sql +SELECT toTypeName(s), IPv6NumToString(from) as s FROM hits LIMIT 1; +``` + +``` text +┌─toTypeName(IPv6NumToString(from))─┬─s─────────────────────────────┐ +│ String │ 2001:44c8:129:2632:33:0:252:2 │ +└───────────────────────────────────┴───────────────────────────────┘ +``` + +O echar una ONU `FixedString(16)` valor: + +``` sql +SELECT toTypeName(i), CAST(from as FixedString(16)) as i FROM hits LIMIT 1; +``` + +``` text +┌─toTypeName(CAST(from, 'FixedString(16)'))─┬─i───────┐ +│ FixedString(16) │ ��� │ +└───────────────────────────────────────────┴─────────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/domains/ipv6) diff --git a/docs/es/data_types/domains/overview.md b/docs/es/data_types/domains/overview.md new file mode 100644 index 00000000000..378b7b02997 --- /dev/null +++ b/docs/es/data_types/domains/overview.md @@ -0,0 +1,26 @@ +# Dominio {#domains} + +Los dominios son tipos de propósito especial, que agregan algunas características adicionales encima del tipo base existente, dejando intacto el formato en cable y en disco de la tabla subyacente. Por el momento, ClickHouse no admite dominios definidos por el usuario. + +Puede usar dominios en cualquier lugar que se pueda usar el tipo base correspondiente: + +- Crear una columna de tipo de dominio +- Leer/escribir valores desde/a la columna de dominio +- Úselo como índice si el tipo base se puede usar como índice +- Funciones de llamada con valores de la columna de dominio +- sucesivamente. 
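+ +Como ilustración de los puntos anteriores, el siguiente esbozo usa el dominio `IPv4` documentado más arriba; el nombre de la tabla `ips` y el de la columna `addr` son hipotéticos, elegidos solo para el ejemplo: + +``` sql +-- Columna de tipo dominio usada también como clave de ordenación. +CREATE TABLE ips (addr IPv4) ENGINE = MergeTree() ORDER BY addr; + +-- Entrada en formato amigable para los humanos. +INSERT INTO ips VALUES ('127.0.0.1'), ('8.8.8.8'); + +-- Salida amigable y llamada a una función sobre la columna de dominio. +SELECT addr, IPv4NumToString(addr) FROM ips ORDER BY addr; +```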
+ +### Características adicionales de los dominios {#extra-features-of-domains} + +- Nombre de tipo de columna explícito en `SHOW CREATE TABLE` o `DESCRIBE TABLE` +- Entrada del formato humano-amistoso con `INSERT INTO domain_table(domain_column) VALUES(...)` +- Salida al formato humano-amistoso para `SELECT domain_column FROM domain_table` +- Carga de datos desde una fuente externa en un formato amigable para los humanos: `INSERT INTO domain_table FORMAT CSV ...` + +### Limitación {#limitations} + +- No se puede convertir la columna de índice del tipo base al tipo de dominio a través de `ALTER TABLE`. +- No se pueden convertir implícitamente valores de cadena en valores de dominio al insertar datos de otra columna o tabla. +- Domain no agrega restricciones en los valores almacenados. + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/domains/overview) diff --git a/docs/es/data_types/enum.md b/docs/es/data_types/enum.md new file mode 100644 index 00000000000..2c66fa20d31 --- /dev/null +++ b/docs/es/data_types/enum.md @@ -0,0 +1,125 @@ +# Enum {#enum} + +Tipo enumerado que consta de valores con nombre. + +Los valores con nombre deben declararse como `'string' = integer` par. ClickHouse almacena solo números, pero admite operaciones con los valores a través de sus nombres. + +Soporta ClickHouse: + +- de 8 bits `Enum`. Puede contener hasta 256 valores enumerados en el `[-128, 127]` gama. +- de 16 bits `Enum`. Puede contener hasta 65536 valores enumerados en el `[-32768, 32767]` gama. + +ClickHouse elige automáticamente el tipo de `Enum` cuando se insertan datos. También puede utilizar `Enum8` o `Enum16` para estar seguro en el tamaño de almacenamiento. + +## Ejemplos de uso {#usage-examples} + +Aquí creamos una tabla con un `Enum8('hello' = 1, 'world' = 2)` tipo columna: + +``` sql +CREATE TABLE t_enum +( + x Enum('hello' = 1, 'world' = 2) +) +ENGINE = TinyLog +``` + +Columna `x` sólo puede almacenar valores que se enumeran en la definición de tipo: `'hello'` o `'world'`. Si intenta guardar cualquier otro valor, ClickHouse generará una excepción. Tamaño de 8 bits para esto `Enum` se elige automáticamente. + +``` sql +INSERT INTO t_enum VALUES ('hello'), ('world'), ('hello') +``` + +``` text +Ok. +``` + +``` sql +INSERT INTO t_enum values('a') +``` + +``` text +Exception on client: +Code: 49. DB::Exception: Unknown element 'a' for type Enum('hello' = 1, 'world' = 2) +``` + +Al consultar datos de la tabla, ClickHouse genera los valores de cadena de `Enum`. + +``` sql +SELECT * FROM t_enum +``` + +``` text +┌─x─────┐ +│ hello │ +│ world │ +│ hello │ +└───────┘ +``` + +Si necesita ver los equivalentes numéricos de las filas, debe `Enum` valor a tipo entero. + +``` sql +SELECT CAST(x, 'Int8') FROM t_enum +``` + +``` text +┌─CAST(x, 'Int8')─┐ +│ 1 │ +│ 2 │ +│ 1 │ +└─────────────────┘ +``` + +Para crear un valor Enum en una consulta, también debe usar `CAST`. + +``` sql +SELECT toTypeName(CAST('a', 'Enum(\'a\' = 1, \'b\' = 2)')) +``` + +``` text +┌─toTypeName(CAST('a', 'Enum(\'a\' = 1, \'b\' = 2)'))─┐ +│ Enum8('a' = 1, 'b' = 2) │ +└─────────────────────────────────────────────────────┘ +``` + +## Reglas generales y uso {#general-rules-and-usage} + +A cada uno de los valores se le asigna un número en el rango `-128 ... 127` para `Enum8` o en el rango `-32768 ... 32767` para `Enum16`. Todas las cadenas y números deben ser diferentes. Se permite una cadena vacía. Si se especifica este tipo (en una definición de tabla), los números pueden estar en un orden arbitrario. 
Sin embargo, el orden no importa. + +Ni la cadena ni el valor numérico en un `Enum` puede ser [NULO](../query_language/syntax.md). + +Un `Enum` puede estar contenido en [NULL](nullable.md) tipo. Entonces, si crea una tabla usando la consulta + +``` sql +CREATE TABLE t_enum_nullable +( + x Nullable( Enum8('hello' = 1, 'world' = 2) ) +) +ENGINE = TinyLog +``` + +puede almacenar no sólo `'hello'` y `'world'`, pero `NULL`, también. + +``` sql +INSERT INTO t_enum_nullable Values('hello'),('world'),(NULL) +``` + +En RAM, un `Enum` columna se almacena de la misma manera que `Int8` o `Int16` de los valores numéricos correspondientes. + +Al leer en forma de texto, ClickHouse analiza el valor como una cadena y busca la cadena correspondiente del conjunto de valores Enum. Si no se encuentra, se lanza una excepción. Al leer en formato de texto, se lee la cadena y se busca el valor numérico correspondiente. Se lanzará una excepción si no se encuentra. +Al escribir en forma de texto, escribe el valor como la cadena correspondiente. Si los datos de columna contienen elementos no utilizados (números que no son del conjunto válido), se produce una excepción. Al leer y escribir en forma binaria, funciona de la misma manera que para los tipos de datos Int8 e Int16. +El valor predeterminado es el valor con el número más bajo. + +Durante `ORDER BY`, `GROUP BY`, `IN`, `DISTINCT` y así sucesivamente, las enumeraciones se comportan de la misma manera que los números correspondientes. Por ejemplo, ORDER BY los ordena numéricamente. Los operadores de igualdad y comparación funcionan de la misma manera en enumeraciones que en los valores numéricos subyacentes. + +Los valores de Enum no se pueden comparar con los números. Las enumeraciones se pueden comparar con una cadena constante. Si la cadena en comparación con no es un valor válido para el Enum, se lanzará una excepción. El operador IN es compatible con el Enum en el lado izquierdo y un conjunto de cadenas en el lado derecho. Las cadenas son los valores del Enum correspondiente. + +Most numeric and string operations are not defined for Enum values, e.g. adding a number to an Enum or concatenating a string to an Enum. +Sin embargo, el Enum tiene un `toString` función que devuelve su valor de cadena. + +Los valores de Enum también se pueden convertir a tipos numéricos utilizando el `toT` función, donde T es un tipo numérico. Cuando T corresponde al tipo numérico subyacente de la enumeración, esta conversión es de costo cero. +El tipo Enum se puede cambiar sin costo usando ALTER, si solo se cambia el conjunto de valores. Es posible agregar y eliminar miembros del Enum usando ALTER (eliminar es seguro solo si el valor eliminado nunca se ha usado en la tabla). Como salvaguardia, al cambiar el valor numérico de un miembro Enum definido previamente se producirá una excepción. + +Usando ALTER, es posible cambiar un Enum8 a un Enum16 o viceversa, al igual que cambiar un Int8 a Int16. + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/enum/) diff --git a/docs/es/data_types/fixedstring.md b/docs/es/data_types/fixedstring.md new file mode 100644 index 00000000000..7e02cf6eb48 --- /dev/null +++ b/docs/es/data_types/fixedstring.md @@ -0,0 +1,56 @@ +# Cadena fija {#fixedstring} + +Una cadena de longitud fija de `N` bytes (ni caracteres ni puntos de código). + +Para declarar una columna de `FixedString` tipo, utilice la siguiente sintaxis: + +``` sql + FixedString(N) +``` + +Donde `N` es un número natural. 
+ +El `FixedString` tipo es eficiente cuando los datos tienen la longitud de `N` byte. En todos los demás casos, es probable que reduzca la eficiencia. + +Ejemplos de los valores que se pueden almacenar eficientemente en `FixedString`Escrito columnas: + +- La representación binaria de direcciones IP (`FixedString(16)` para IPv6). +- Language codes (ru\_RU, en\_US … ). +- Currency codes (USD, RUB … ). +- Representación binaria de hashes (`FixedString(16)` para MD5, `FixedString(32)` para SHA256). + +Para almacenar valores UUID, utilice el [UUID](uuid.md) tipo de datos. + +Al insertar los datos, ClickHouse: + +- Complementa una cadena con bytes nulos si la cadena contiene menos de `N` byte. +- Lanza el `Too large value for FixedString(N)` excepción si la cadena contiene más de `N` byte. + +Al seleccionar los datos, ClickHouse no elimina los bytes nulos al final de la cadena. Si utiliza el `WHERE` cláusula, debe agregar bytes nulos manualmente para que coincida con el `FixedString` valor. En el ejemplo siguiente se muestra cómo utilizar el `WHERE` cláusula con `FixedString`. + +Consideremos la siguiente tabla con el único `FixedString(2)` columna: + +``` text +┌─name──┐ +│ b │ +└───────┘ +``` + +Consulta `SELECT * FROM FixedStringTable WHERE a = 'b'` no devuelve ningún dato como resultado. Debemos complementar el patrón de filtro con bytes nulos. + +``` sql +SELECT * FROM FixedStringTable +WHERE a = 'b\0' +``` + +``` text +┌─a─┐ +│ b │ +└───┘ +``` + +Este comportamiento difiere de MySQL para el `CHAR` tipo (donde las cadenas se rellenan con espacios y los espacios se eliminan para la salida). + +Tenga en cuenta que la longitud del `FixedString(N)` el valor es constante. El [longitud](../query_language/functions/array_functions.md#array_functions-length) función devuelve `N` incluso si el `FixedString(N)` sólo se rellena con bytes nulos, pero el valor [Vaciar](../query_language/functions/string_functions.md#empty) función devuelve `1` en este caso. + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/fixedstring/) diff --git a/docs/es/data_types/float.md b/docs/es/data_types/float.md new file mode 100644 index 00000000000..b3883b83950 --- /dev/null +++ b/docs/es/data_types/float.md @@ -0,0 +1,80 @@ +# Descripción del producto {#float32-float64} + +[Números de punto flotante](https://en.wikipedia.org/wiki/IEEE_754). + +Los tipos son equivalentes a los tipos de C: + +- `Float32` - `float` +- `Float64` - `double` + +Le recomendamos que almacene los datos en formato entero siempre que sea posible. Por ejemplo, convierta números de precisión fija en valores enteros, como importes monetarios o tiempos de carga de página en milisegundos. + +## Uso de números de punto flotante {#using-floating-point-numbers} + +- Los cálculos con números de punto flotante pueden producir un error de redondeo. + + + +``` sql +SELECT 1 - 0.9 +``` + +``` text +┌───────minus(1, 0.9)─┐ +│ 0.09999999999999998 │ +└─────────────────────┘ +``` + +- El resultado del cálculo depende del método de cálculo (el tipo de procesador y la arquitectura del sistema informático). +- Los cálculos de puntos flotantes pueden dar como resultado números como el infinito (`Inf`) y “not-a-number” (`NaN`). Esto debe tenerse en cuenta al procesar los resultados de los cálculos. +- Al analizar números de punto flotante a partir de texto, el resultado puede no ser el número representable por máquina más cercano. 
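+ +Un pequeño esbozo (añadido como ilustración) del último punto: el mismo texto analizado como `Float32` y como `Float64` no produce exactamente el mismo número de máquina, por lo que una comparación directa puede dar un resultado inesperado: + +``` sql +-- Cabría esperar que `iguales` sea 0: toFloat32('0.1') y toFloat64('0.1') +-- son aproximaciones distintas del mismo texto. +SELECT toFloat32('0.1') AS f32, toFloat64('0.1') AS f64, f32 = f64 AS iguales +```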
+ +## NaN y Inf {#data-type-float-nan-inf} + +A diferencia de SQL estándar, ClickHouse admite las siguientes categorías de números de punto flotante: + +- `Inf` – Infinito. + + + +``` sql +SELECT 0.5 / 0 +``` + +``` text +┌─divide(0.5, 0)─┐ +│ inf │ +└────────────────┘ +``` + +- `-Inf` – Infinito negativo. + + + +``` sql +SELECT -0.5 / 0 +``` + +``` text +┌─divide(-0.5, 0)─┐ +│ -inf │ +└─────────────────┘ +``` + +- `NaN` – No es un número. + + + +``` sql +SELECT 0 / 0 +``` + +``` text +┌─divide(0, 0)─┐ +│ nan │ +└──────────────┘ +``` + + See the rules for `NaN` sorting in the section [ORDER BY clause](../query_language/select.md). + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/float/) diff --git a/docs/es/data_types/index.md b/docs/es/data_types/index.md new file mode 100644 index 00000000000..bbedf53bf54 --- /dev/null +++ b/docs/es/data_types/index.md @@ -0,0 +1,7 @@ +# Tipos de datos {#data-types} + +ClickHouse puede almacenar varios tipos de datos en celdas de tabla. + +En esta sección se describen los tipos de datos admitidos y las consideraciones especiales para usarlos o implementarlos, si los hubiere. + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/) diff --git a/docs/es/data_types/int_uint.md b/docs/es/data_types/int_uint.md new file mode 100644 index 00000000000..949b490fc99 --- /dev/null +++ b/docs/es/data_types/int_uint.md @@ -0,0 +1,19 @@ +# UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64 {#uint8-uint16-uint32-uint64-int8-int16-int32-int64} + +Enteros de longitud fija, con o sin signo. + +## Rangos Int {#int-ranges} + +- Int8 - \[-128 : 127\] +- Int16 - \[-32768 : 32767\] +- Int32 - \[-2147483648 : 2147483647\] +- Int64 - \[-9223372036854775808 : 9223372036854775807\] + +## Rangos de Uint {#uint-ranges} + +- UInt8 - \[0 : 255\] +- UInt16 - \[0 : 65535\] +- UInt32 - \[0 : 4294967295\] +- UInt64 - \[0 : 18446744073709551615\] + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/int_uint/) diff --git a/docs/es/data_types/nested_data_structures/aggregatefunction.md b/docs/es/data_types/nested_data_structures/aggregatefunction.md new file mode 100644 index 00000000000..9c496b8ec1e --- /dev/null +++ b/docs/es/data_types/nested_data_structures/aggregatefunction.md @@ -0,0 +1,63 @@ +# AggregateFunction(name, types\_of\_arguments…) {#data-type-aggregatefunction} + +El estado intermedio de una función agregada. Para obtenerlo, use funciones agregadas con el `-State` sufijo. Para obtener datos agregados en el futuro, debe utilizar las mismas funciones agregadas con el `-Merge`sufijo. + +`AggregateFunction` — parametric data type. + +**Parámetros** + +- Nombre de la función de agregado. + + If the function is parametric specify its parameters too. + +- Tipos de los argumentos de la función agregada. + +**Ejemplo** + +``` sql +CREATE TABLE t +( + column1 AggregateFunction(uniq, UInt64), + column2 AggregateFunction(anyIf, String, UInt8), + column3 AggregateFunction(quantiles(0.5, 0.9), UInt64) +) ENGINE = ... +``` + +[uniq](../../query_language/agg_functions/reference.md#agg_function-uniq), anyIf ([cualquier](../../query_language/agg_functions/reference.md#agg_function-any)+[Si](../../query_language/agg_functions/combinators.md#agg-functions-combinator-if)) y [cantiles](../../query_language/agg_functions/reference.md) son las funciones agregadas admitidas en ClickHouse. + +## Uso {#usage} + +### Inserción de datos {#data-insertion} + +Para insertar datos, utilice `INSERT SELECT` con agregado `-State`- función. 
+ +**Ejemplos de funciones** + +``` sql +uniqState(UserID) +quantilesState(0.5, 0.9)(SendTiming) +``` + +En contraste con las funciones correspondientes `uniq` y `quantiles`, `-State`- funciones devuelven el estado, en lugar del valor final. En otras palabras, devuelven un valor de `AggregateFunction` tipo. + +En los resultados de `SELECT` consulta, los valores de `AggregateFunction` tipo tiene representación binaria específica de la implementación para todos los formatos de salida de ClickHouse. Si volcar datos en, por ejemplo, `TabSeparated` Formato con `SELECT` Consulta entonces este volcado se puede cargar de nuevo usando `INSERT` consulta. + +### Selección de datos {#data-selection} + +Al seleccionar datos de `AggregatingMergeTree` mesa, uso `GROUP BY` cláusula y las mismas funciones agregadas que al insertar datos, pero usando `-Merge`sufijo. + +Una función agregada con `-Merge` sufijo toma un conjunto de estados, los combina y devuelve el resultado de la agregación de datos completa. + +Por ejemplo, las siguientes dos consultas devuelven el mismo resultado: + +``` sql +SELECT uniq(UserID) FROM table + +SELECT uniqMerge(state) FROM (SELECT uniqState(UserID) AS state FROM table GROUP BY RegionID) +``` + +## Ejemplo de uso {#usage-example} + +Ver [AgregaciónMergeTree](../../operations/table_engines/aggregatingmergetree.md) Descripción del motor. + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/nested_data_structures/aggregatefunction/) diff --git a/docs/es/data_types/nested_data_structures/index.md b/docs/es/data_types/nested_data_structures/index.md new file mode 100644 index 00000000000..dac0e7417c4 --- /dev/null +++ b/docs/es/data_types/nested_data_structures/index.md @@ -0,0 +1,3 @@ +# Estructuras de datos anidados {#nested-data-structures} + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/nested_data_structures/) diff --git a/docs/es/data_types/nested_data_structures/nested.md b/docs/es/data_types/nested_data_structures/nested.md new file mode 100644 index 00000000000..1ea1e777ade --- /dev/null +++ b/docs/es/data_types/nested_data_structures/nested.md @@ -0,0 +1,99 @@ +# Nested(Name1 Type1, Name2 Type2, …) {#nestedname1-type1-name2-type2} + +A nested data structure is like a nested table. The parameters of a nested data structure – the column names and types – are specified the same way as in a CREATE query. Each table row can correspond to any number of rows in a nested data structure. + +Ejemplo: + +``` sql +CREATE TABLE test.visits +( + CounterID UInt32, + StartDate Date, + Sign Int8, + IsNew UInt8, + VisitID UInt64, + UserID UInt64, + ... + Goals Nested + ( + ID UInt32, + Serial UInt32, + EventTime DateTime, + Price Int64, + OrderID String, + CurrencyID UInt32 + ), + ... +) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign) +``` + +Este ejemplo declara la `Goals` estructura de datos anidada, que contiene datos sobre conversiones (objetivos alcanzados). Cada fila en el ‘visits’ la tabla puede corresponder a cero o cualquier número de conversiones. + +Solo se admite un único nivel de anidamiento. Las columnas de estructuras anidadas que contienen matrices son equivalentes a matrices multidimensionales, por lo que tienen un soporte limitado (no hay soporte para almacenar estas columnas en tablas con el motor MergeTree). + +En la mayoría de los casos, cuando se trabaja con una estructura de datos anidada, se especifican sus columnas individuales. 
Para hacer esto, los nombres de columna están separados por un punto. Estas columnas forman una matriz de tipos coincidentes. Todas las matrices de columnas de una sola estructura de datos anidados tienen la misma longitud. + +Ejemplo: + +``` sql +SELECT + Goals.ID, + Goals.EventTime +FROM test.visits +WHERE CounterID = 101500 AND length(Goals.ID) < 5 +LIMIT 10 +``` + +``` text +┌─Goals.ID───────────────────────┬─Goals.EventTime───────────────────────────────────────────────────────────────────────────┐ +│ [1073752,591325,591325] │ ['2014-03-17 16:38:10','2014-03-17 16:38:48','2014-03-17 16:42:27'] │ +│ [1073752] │ ['2014-03-17 00:28:25'] │ +│ [1073752] │ ['2014-03-17 10:46:20'] │ +│ [1073752,591325,591325,591325] │ ['2014-03-17 13:59:20','2014-03-17 22:17:55','2014-03-17 22:18:07','2014-03-17 22:18:51'] │ +│ [] │ [] │ +│ [1073752,591325,591325] │ ['2014-03-17 11:37:06','2014-03-17 14:07:47','2014-03-17 14:36:21'] │ +│ [] │ [] │ +│ [] │ [] │ +│ [591325,1073752] │ ['2014-03-17 00:46:05','2014-03-17 00:46:05'] │ +│ [1073752,591325,591325,591325] │ ['2014-03-17 13:28:33','2014-03-17 13:30:26','2014-03-17 18:51:21','2014-03-17 18:51:45'] │ +└────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +Es más fácil pensar en una estructura de datos anidados como un conjunto de múltiples matrices de columnas de la misma longitud. + +El único lugar donde una consulta SELECT puede especificar el nombre de una estructura de datos anidada completa en lugar de columnas individuales es la cláusula ARRAY JOIN. Para obtener más información, consulte “ARRAY JOIN clause”. Ejemplo: + +``` sql +SELECT + Goal.ID, + Goal.EventTime +FROM test.visits +ARRAY JOIN Goals AS Goal +WHERE CounterID = 101500 AND length(Goals.ID) < 5 +LIMIT 10 +``` + +``` text +┌─Goal.ID─┬──────Goal.EventTime─┐ +│ 1073752 │ 2014-03-17 16:38:10 │ +│ 591325 │ 2014-03-17 16:38:48 │ +│ 591325 │ 2014-03-17 16:42:27 │ +│ 1073752 │ 2014-03-17 00:28:25 │ +│ 1073752 │ 2014-03-17 10:46:20 │ +│ 1073752 │ 2014-03-17 13:59:20 │ +│ 591325 │ 2014-03-17 22:17:55 │ +│ 591325 │ 2014-03-17 22:18:07 │ +│ 591325 │ 2014-03-17 22:18:51 │ +│ 1073752 │ 2014-03-17 11:37:06 │ +└─────────┴─────────────────────┘ +``` + +No puede realizar SELECT para toda una estructura de datos anidados. Solo puede enumerar explícitamente columnas individuales que forman parte de él. + +Para una consulta INSERT, debe pasar todas las matrices de columnas de componentes de una estructura de datos anidada por separado (como si fueran matrices de columnas individuales). Durante la inserción, el sistema comprueba que tienen la misma longitud. + +Para una consulta DESCRIBE, las columnas de una estructura de datos anidada se enumeran por separado de la misma manera. + +La consulta ALTER es muy limitada para los elementos de una estructura de datos anidada. + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/nested_data_structures/nested/) diff --git a/docs/es/data_types/nullable.md b/docs/es/data_types/nullable.md new file mode 100644 index 00000000000..56cccafd05d --- /dev/null +++ b/docs/es/data_types/nullable.md @@ -0,0 +1,39 @@ +# Nombre de tipo) {#data-type-nullable} + +Permite almacenar marcador especial ([NULO](../query_language/syntax.md)) que denota “missing value” con los valores normales permitidos por `TypeName`. Por ejemplo, un `Nullable(Int8)` Tipo columna puede almacenar `Int8` valores de tipo, y las filas que no tienen un valor almacenarán `NULL`. 
+ +Como `TypeName`, no se pueden usar los tipos de datos compuestos [Matriz](array.md) y [Tupla](tuple.md). Los tipos de datos compuestos sí pueden contener valores de tipo `Nullable`, como `Array(Nullable(Int8))`. + +Los campos de tipo `Nullable` no se pueden incluir en los índices de tabla. + +`NULL` es el valor predeterminado para cualquier tipo `Nullable`, a menos que se especifique lo contrario en la configuración del servidor ClickHouse. + +## Características de almacenamiento {#storage-features} + +Para almacenar valores de tipo `Nullable` en una columna de tabla, ClickHouse usa un archivo separado con máscaras de `NULL`, además del archivo normal con los valores. Las entradas en el archivo de máscaras permiten a ClickHouse distinguir entre `NULL` y el valor predeterminado del tipo de datos correspondiente para cada fila de la tabla. Debido al archivo adicional, una columna `Nullable` consume espacio de almacenamiento adicional en comparación con una columna normal similar. + +!!! info "Nota" + Utilizar `Nullable` casi siempre afecta negativamente al rendimiento; tenga esto en cuenta al diseñar sus bases de datos. + +## Ejemplo de uso {#usage-example} + +``` sql +CREATE TABLE t_null(x Int8, y Nullable(Int8)) ENGINE TinyLog +``` + +``` sql +INSERT INTO t_null VALUES (1, NULL), (2, 3) +``` + +``` sql +SELECT x + y FROM t_null +``` + +``` text +┌─plus(x, y)─┐ +│ ᴺᵁᴸᴸ │ +│ 5 │ +└────────────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/nullable/) diff --git a/docs/es/data_types/special_data_types/expression.md b/docs/es/data_types/special_data_types/expression.md new file mode 100644 index 00000000000..c1da3d79455 --- /dev/null +++ b/docs/es/data_types/special_data_types/expression.md @@ -0,0 +1,5 @@ +# Expresión {#expression} + +Se utiliza para representar expresiones lambda en funciones de orden superior. + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/special_data_types/expression/) diff --git a/docs/es/data_types/special_data_types/index.md b/docs/es/data_types/special_data_types/index.md new file mode 100644 index 00000000000..8c02cb2b23f --- /dev/null +++ b/docs/es/data_types/special_data_types/index.md @@ -0,0 +1,5 @@ +# Tipos de datos especiales {#special-data-types} + +Los valores de los tipos de datos especiales no se pueden guardar en una tabla ni aparecer en los resultados de salida, pero pueden usarse como resultado intermedio al ejecutar una consulta. + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/special_data_types/) diff --git a/docs/es/data_types/special_data_types/interval.md b/docs/es/data_types/special_data_types/interval.md new file mode 100644 index 00000000000..a95bbacbd8b --- /dev/null +++ b/docs/es/data_types/special_data_types/interval.md @@ -0,0 +1,78 @@ +# Intervalo {#data-type-interval} + +La familia de tipos de datos que representan intervalos de fecha y hora. Son los tipos que resultan del operador [INTERVALO](../../query_language/operators.md#operator-interval). + +!!! warning "Advertencia" + Los valores del tipo de datos `Interval` no se pueden almacenar en tablas. + +Estructura: + +- Intervalo de tiempo como un valor entero sin signo. +- Tipo de intervalo. + +Tipos de intervalo admitidos: + +- `SECOND` +- `MINUTE` +- `HOUR` +- `DAY` +- `WEEK` +- `MONTH` +- `QUARTER` +- `YEAR` + +Para cada tipo de intervalo, hay un tipo de datos independiente.
Por ejemplo, el `DAY` se expresa como el intervalo `IntervalDay` Tipo de datos: + +``` sql +SELECT toTypeName(INTERVAL 4 DAY) +``` + +``` text +┌─toTypeName(toIntervalDay(4))─┐ +│ IntervalDay │ +└──────────────────────────────┘ +``` + +## Observaciones de uso {#data-type-interval-usage-remarks} + +Usted puede utilizar `Interval`-type valores en operaciones aritméticas con [Fecha](../../data_types/date.md) y [FechaHora](../../data_types/datetime.md)-type valores. Por ejemplo, puede agregar 4 días a la hora actual: + +``` sql +SELECT now() as current_date_time, current_date_time + INTERVAL 4 DAY +``` + +``` text +┌───current_date_time─┬─plus(now(), toIntervalDay(4))─┐ +│ 2019-10-23 10:58:45 │ 2019-10-27 10:58:45 │ +└─────────────────────┴───────────────────────────────┘ +``` + +Los intervalos con diferentes tipos no se pueden combinar. No puedes usar intervalos como `4 DAY 1 HOUR`. Exprese los intervalos en unidades que son más pequeñas o iguales que la unidad más pequeña del intervalo, por ejemplo, el intervalo `1 day and an hour` se puede expresar como `25 HOUR` o `90000 SECOND`. + +No puede realizar operaciones aritméticas con `Interval`-type valores, pero puede agregar intervalos de diferentes tipos en consecuencia a los valores en `Date` o `DateTime` tipos de datos. Por ejemplo: + +``` sql +SELECT now() AS current_date_time, current_date_time + INTERVAL 4 DAY + INTERVAL 3 HOUR +``` + +``` text +┌───current_date_time─┬─plus(plus(now(), toIntervalDay(4)), toIntervalHour(3))─┐ +│ 2019-10-23 11:16:28 │ 2019-10-27 14:16:28 │ +└─────────────────────┴────────────────────────────────────────────────────────┘ +``` + +La siguiente consulta provoca una excepción: + +``` sql +select now() AS current_date_time, current_date_time + (INTERVAL 4 DAY + INTERVAL 3 HOUR) +``` + +``` text +Received exception from server (version 19.14.1): +Code: 43. DB::Exception: Received from localhost:9000. DB::Exception: Wrong argument types for function plus: if one argument is Interval, then another must be Date or DateTime.. +``` + +## Ver también {#see-also} + +- [INTERVALO](../../query_language/operators.md#operator-interval) operador +- [ToInterval](../../query_language/functions/type_conversion_functions.md#function-tointerval) funciones de conversión de tipo diff --git a/docs/es/data_types/special_data_types/nothing.md b/docs/es/data_types/special_data_types/nothing.md new file mode 100644 index 00000000000..6f6c8e5f310 --- /dev/null +++ b/docs/es/data_types/special_data_types/nothing.md @@ -0,0 +1,19 @@ +# Nada {#nothing} + +El único propósito de este tipo de datos es representar casos en los que no se espera un valor. Entonces no puedes crear un `Nothing` valor de tipo. + +Por ejemplo, literal [NULO](../../query_language/syntax.md#null-literal) tiene tipo de `Nullable(Nothing)`. Ver más sobre [NULO](../../data_types/nullable.md). + +El `Nothing` tipo puede también se utiliza para denotar matrices vacías: + +``` sql +SELECT toTypeName(array()) +``` + +``` text +┌─toTypeName(array())─┐ +│ Array(Nothing) │ +└─────────────────────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/special_data_types/nothing/) diff --git a/docs/es/data_types/special_data_types/set.md b/docs/es/data_types/special_data_types/set.md new file mode 100644 index 00000000000..2e3112813f6 --- /dev/null +++ b/docs/es/data_types/special_data_types/set.md @@ -0,0 +1,5 @@ +# Establecer {#set} + +Utilizado para la mitad derecha de un [ES](../../query_language/select.md#select-in-operators) expresión. 
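+ +Un esbozo mínimo a modo de ilustración: la parte derecha del operador IN es la que se representa con este tipo, aunque nunca aparece de forma explícita en la consulta: + +``` sql +-- 1 IN (1, 2, 3) devuelve 1; 5 IN (1, 2, 3) devuelve 0. +SELECT 1 IN (1, 2, 3) AS dentro, 5 IN (1, 2, 3) AS fuera +```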
+ +[Artículo Original](https://clickhouse.tech/docs/es/data_types/special_data_types/set/) diff --git a/docs/es/data_types/string.md b/docs/es/data_types/string.md new file mode 100644 index 00000000000..e86e8fbdab7 --- /dev/null +++ b/docs/es/data_types/string.md @@ -0,0 +1,13 @@ +# Cadena {#string} + +Cuerdas de una longitud arbitraria. La longitud no está limitada. El valor puede contener un conjunto arbitrario de bytes, incluidos los bytes nulos. +El tipo String reemplaza los tipos VARCHAR, BLOB, CLOB y otros de otros DBMS. + +## Codificación {#encodings} + +ClickHouse no tiene el concepto de codificaciones. Las cadenas pueden contener un conjunto arbitrario de bytes, que se almacenan y salen tal cual. +Si necesita almacenar textos, le recomendamos que utilice la codificación UTF-8. Como mínimo, si su terminal usa UTF-8 (según lo recomendado), puede leer y escribir sus valores sin realizar conversiones. +Del mismo modo, ciertas funciones para trabajar con cadenas tienen variaciones separadas que funcionan bajo el supuesto de que la cadena contiene un conjunto de bytes que representan un texto codificado en UTF-8. +Por ejemplo, el ‘length’ función calcula la longitud de cadena en bytes, mientras que la ‘lengthUTF8’ función calcula la longitud de la cadena en puntos de código Unicode, suponiendo que el valor está codificado UTF-8. + +[Artículo Original](https://clickhouse.tech/docs/es/data_types/string/) diff --git a/docs/es/data_types/tuple.md b/docs/es/data_types/tuple.md new file mode 100644 index 00000000000..41121c4481c --- /dev/null +++ b/docs/es/data_types/tuple.md @@ -0,0 +1,45 @@ +# Tuple (T1, T2, …) {#tuplet1-t2} + +Una tupla de elementos, cada uno con un individuo [Tipo](index.md#data_types). + +Las tuplas se utilizan para la agrupación temporal de columnas. Las columnas se pueden agrupar cuando se usa una expresión IN en una consulta y para especificar ciertos parámetros formales de las funciones lambda. Para obtener más información, consulte las secciones [IN operadores](../query_language/select.md) y [Funciones de orden superior](../query_language/functions/higher_order_functions.md). + +Las tuplas pueden ser el resultado de una consulta. En este caso, para formatos de texto distintos de JSON, los valores están separados por comas entre corchetes. En formatos JSON, las tuplas se generan como matrices (entre corchetes). + +## Creando una tupla {#creating-a-tuple} + +Puedes usar una función para crear una tupla: + +``` sql +tuple(T1, T2, ...) +``` + +Ejemplo de creación de una tupla: + +``` sql +SELECT tuple(1,'a') AS x, toTypeName(x) +``` + +``` text +┌─x───────┬─toTypeName(tuple(1, 'a'))─┐ +│ (1,'a') │ Tuple(UInt8, String) │ +└─────────┴───────────────────────────┘ +``` + +## Trabajar con tipos de datos {#working-with-data-types} + +Al crear una tupla sobre la marcha, ClickHouse detecta automáticamente el tipo de cada argumento como el mínimo de los tipos que pueden almacenar el valor del argumento. Si el argumento es [NULO](../query_language/syntax.md#null-literal), el tipo del elemento de tupla es [NULO](nullable.md). 
+
+Ejemplo de detección automática de tipos de datos:
+
+``` sql
+SELECT tuple(1, NULL) AS x, toTypeName(x)
+```
+
+``` text
+┌─x────────┬─toTypeName(tuple(1, NULL))──────┐
+│ (1,NULL) │ Tuple(UInt8, Nullable(Nothing)) │
+└──────────┴─────────────────────────────────┘
+```
+
+[Artículo Original](https://clickhouse.tech/docs/es/data_types/tuple/)
diff --git a/docs/es/data_types/uuid.md b/docs/es/data_types/uuid.md new file mode 100644 index 00000000000..a847bb82eb6 --- /dev/null +++ b/docs/es/data_types/uuid.md @@ -0,0 +1,70 @@
+# UUID {#uuid-data-type}
+
+Un identificador único universal (UUID) es un número de 16 bytes utilizado para identificar registros. Para obtener información detallada sobre los UUID, consulte [Wikipedia](https://en.wikipedia.org/wiki/Universally_unique_identifier).
+
+A continuación se muestra un ejemplo de un valor de tipo UUID:
+
+``` text
+61f0c404-5cb3-11e7-907b-a6006ad3dba0
+```
+
+Si no especifica el valor de la columna UUID al insertar un nuevo registro, el valor UUID se rellena con ceros:
+
+``` text
+00000000-0000-0000-0000-000000000000
+```
+
+## Cómo generar {#how-to-generate}
+
+Para generar valores UUID, ClickHouse proporciona la función [generateUUIDv4](../query_language/functions/uuid_functions.md).
+
+## Ejemplo de uso {#usage-example}
+
+**Ejemplo 1**
+
+Este ejemplo muestra cómo crear una tabla con una columna de tipo UUID e insertar un valor en ella.
+
+``` sql
+CREATE TABLE t_uuid (x UUID, y String) ENGINE=TinyLog
+```
+
+``` sql
+INSERT INTO t_uuid SELECT generateUUIDv4(), 'Example 1'
+```
+
+``` sql
+SELECT * FROM t_uuid
+```
+
+``` text
+┌────────────────────────────────────x─┬─y─────────┐
+│ 417ddc5d-e556-4d27-95dd-a34d84e46a50 │ Example 1 │
+└──────────────────────────────────────┴───────────┘
+```
+
+**Ejemplo 2**
+
+En este ejemplo, el valor de la columna UUID no se especifica al insertar un nuevo registro.
+
+``` sql
+INSERT INTO t_uuid (y) VALUES ('Example 2')
+```
+
+``` sql
+SELECT * FROM t_uuid
+```
+
+``` text
+┌────────────────────────────────────x─┬─y─────────┐
+│ 417ddc5d-e556-4d27-95dd-a34d84e46a50 │ Example 1 │
+│ 00000000-0000-0000-0000-000000000000 │ Example 2 │
+└──────────────────────────────────────┴───────────┘
+```
+
+## Restricciones {#restrictions}
+
+El tipo de datos UUID solo admite las funciones que también admite el tipo de datos [String](string.md) (por ejemplo, [min](../query_language/agg_functions/reference.md#agg_function-min), [max](../query_language/agg_functions/reference.md#agg_function-max) y [count](../query_language/agg_functions/reference.md#agg_function-count)).
+
+El tipo de datos UUID no admite operaciones aritméticas (por ejemplo, [abs](../query_language/functions/arithmetic_functions.md#arithm_func-abs)) ni funciones agregadas como [sum](../query_language/agg_functions/reference.md#agg_function-sum) y [avg](../query_language/agg_functions/reference.md#agg_function-avg).
+
+[Artículo Original](https://clickhouse.tech/docs/es/data_types/uuid/)
diff --git a/docs/es/database_engines/index.md b/docs/es/database_engines/index.md new file mode 100644 index 00000000000..635f19d7d66 --- /dev/null +++ b/docs/es/database_engines/index.md @@ -0,0 +1,13 @@
+# Motores de base de datos {#database-engines}
+
+Los motores de bases de datos le permiten trabajar con tablas.
+
+De forma predeterminada, ClickHouse utiliza su motor de base de datos nativo, que proporciona [motores de tablas](../operations/table_engines/index.md) y un [dialecto SQL](../query_language/syntax.md).
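+
+Un boceto mínimo, solo como ilustración (el nombre `mi_base` es hipotético): si no se indica la cláusula `ENGINE`, se utiliza el motor de base de datos nativo mencionado arriba:
+
+``` sql
+-- Sin cláusula ENGINE se usa el motor de base de datos predeterminado.
+CREATE DATABASE IF NOT EXISTS mi_base
+```
+
+Los motores alternativos se indican con `ENGINE = ...`, como se muestra en las páginas siguientes.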
+
+También puede utilizar los siguientes motores de base de datos:
+
+- [MySQL](mysql.md)
+
+- [Lazy](lazy.md)
+
+[Artículo Original](https://clickhouse.tech/docs/es/database_engines/)
diff --git a/docs/es/database_engines/lazy.md b/docs/es/database_engines/lazy.md new file mode 100644 index 00000000000..4a39ad2cc8c --- /dev/null +++ b/docs/es/database_engines/lazy.md @@ -0,0 +1,11 @@
+# Lazy {#lazy}
+
+Mantiene las tablas en RAM solo durante los `expiration_time_in_seconds` segundos posteriores al último acceso. Solo se puede usar con tablas \*Log.
+
+Está optimizado para almacenar muchas tablas \*Log pequeñas, para las cuales hay un intervalo de tiempo largo entre accesos.
+
+## Creación de una base de datos {#creating-a-database}
+
+    CREATE DATABASE testlazy ENGINE = Lazy(expiration_time_in_seconds);
+
+[Artículo Original](https://clickhouse.tech/docs/es/database_engines/lazy/)
diff --git a/docs/es/database_engines/mysql.md b/docs/es/database_engines/mysql.md new file mode 100644 index 00000000000..56aba9899bc --- /dev/null +++ b/docs/es/database_engines/mysql.md @@ -0,0 +1,128 @@
+# MySQL {#mysql}
+
+Permite conectarse a bases de datos en un servidor MySQL remoto y realizar consultas `INSERT` y `SELECT` para intercambiar datos entre ClickHouse y MySQL.
+
+El motor de base de datos `MySQL` traduce las consultas al servidor MySQL, por lo que puede realizar operaciones tales como `SHOW TABLES` o `SHOW CREATE TABLE`.
+
+No puede realizar las siguientes consultas:
+
+- `RENAME`
+- `CREATE TABLE`
+- `ALTER`
+
+## Creación de una base de datos {#creating-a-database}
+
+``` sql
+CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster]
+ENGINE = MySQL('host:port', 'database', 'user', 'password')
+```
+
+**Parámetros del motor**
+
+- `host:port` — Dirección del servidor MySQL.
+- `database` — Nombre de la base de datos remota.
+- `user` — Usuario de MySQL.
+- `password` — Contraseña del usuario.
+
+## Soporte de tipos de datos {#data-types-support}
+
+| MySQL | ClickHouse |
+|----------------------------------|---------------------------------------------|
+| TINYINT UNSIGNED | [UInt8](../data_types/int_uint.md) |
+| TINYINT | [Int8](../data_types/int_uint.md) |
+| SMALLINT UNSIGNED | [UInt16](../data_types/int_uint.md) |
+| SMALLINT | [Int16](../data_types/int_uint.md) |
+| INT UNSIGNED, MEDIUMINT UNSIGNED | [UInt32](../data_types/int_uint.md) |
+| INT, MEDIUMINT | [Int32](../data_types/int_uint.md) |
+| BIGINT UNSIGNED | [UInt64](../data_types/int_uint.md) |
+| BIGINT | [Int64](../data_types/int_uint.md) |
+| FLOAT | [Float32](../data_types/float.md) |
+| DOUBLE | [Float64](../data_types/float.md) |
+| DATE | [Fecha](../data_types/date.md) |
+| DATETIME, TIMESTAMP | [FechaHora](../data_types/datetime.md) |
+| BINARY | [Cadena fija](../data_types/fixedstring.md) |
+
+Todos los demás tipos de datos de MySQL se convierten en [Cadena](../data_types/string.md).
+
+Se admite [Nullable](../data_types/nullable.md).
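+
+Como comprobación rápida (un boceto que reutiliza los nombres `mysql_db` y `mysql_table` del ejemplo de la sección siguiente), la función `toTypeName` permite ver a qué tipo de ClickHouse se asigna cada columna:
+
+``` sql
+-- Según la tabla anterior, una columna INT NOT NULL de MySQL se muestra como Int32.
+SELECT toTypeName(int_id) FROM mysql_db.mysql_table LIMIT 1
+```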
+ +## Ejemplos de uso {#examples-of-use} + +Tabla en MySQL: + +``` text +mysql> USE test; +Database changed + +mysql> CREATE TABLE `mysql_table` ( + -> `int_id` INT NOT NULL AUTO_INCREMENT, + -> `float` FLOAT NOT NULL, + -> PRIMARY KEY (`int_id`)); +Query OK, 0 rows affected (0,09 sec) + +mysql> insert into mysql_table (`int_id`, `float`) VALUES (1,2); +Query OK, 1 row affected (0,00 sec) + +mysql> select * from mysql_table; ++--------+-------+ +| int_id | value | ++--------+-------+ +| 1 | 2 | ++--------+-------+ +1 row in set (0,00 sec) +``` + +Base de datos en ClickHouse, intercambiando datos con el servidor MySQL: + +``` sql +CREATE DATABASE mysql_db ENGINE = MySQL('localhost:3306', 'test', 'my_user', 'user_password') +``` + +``` sql +SHOW DATABASES +``` + +``` text +┌─name─────┐ +│ default │ +│ mysql_db │ +│ system │ +└──────────┘ +``` + +``` sql +SHOW TABLES FROM mysql_db +``` + +``` text +┌─name─────────┐ +│ mysql_table │ +└──────────────┘ +``` + +``` sql +SELECT * FROM mysql_db.mysql_table +``` + +``` text +┌─int_id─┬─value─┐ +│ 1 │ 2 │ +└────────┴───────┘ +``` + +``` sql +INSERT INTO mysql_db.mysql_table VALUES (3,4) +``` + +``` sql +SELECT * FROM mysql_db.mysql_table +``` + +``` text +┌─int_id─┬─value─┐ +│ 1 │ 2 │ +│ 3 │ 4 │ +└────────┴───────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/database_engines/mysql/) diff --git a/docs/es/development/architecture.md b/docs/es/development/architecture.md new file mode 100644 index 00000000000..b5ab9c55e70 --- /dev/null +++ b/docs/es/development/architecture.md @@ -0,0 +1,194 @@ +# Descripción general de la arquitectura ClickHouse {#overview-of-clickhouse-architecture} + +ClickHouse es un verdadero DBMS orientado a columnas. Los datos se almacenan por columnas y durante la ejecución de matrices (vectores o fragmentos de columnas). Siempre que sea posible, las operaciones se envían en matrices, en lugar de en valores individuales. Esto se llama “vectorized query execution,” y ayuda a reducir el costo del procesamiento de datos real. + +> Esta idea no es nada nuevo. Se remonta a la `APL` lenguaje de programación y sus descendientes: `A +`, `J`, `K`, y `Q`. La programación de matrices se utiliza en el procesamiento de datos científicos. Tampoco es esta idea algo nuevo en las bases de datos relacionales: por ejemplo, se usa en el `Vectorwise` sistema. + +Existen dos enfoques diferentes para acelerar el procesamiento de consultas: la ejecución de consultas vectorizadas y la generación de código en tiempo de ejecución. En este último, el código se genera para cada tipo de consulta sobre la marcha, eliminando toda la indirección y el despacho dinámico. Ninguno de estos enfoques es estrictamente mejor que el otro. La generación de código de tiempo de ejecución puede ser mejor cuando fusiona muchas operaciones, utilizando así las unidades de ejecución de la CPU y la canalización. La ejecución de consultas vectorizadas puede ser menos práctica porque implica vectores temporales que deben escribirse en la memoria caché y leerse. Si los datos temporales no caben en la memoria caché L2, esto se convierte en un problema. Pero la ejecución de consultas vectorizadas utiliza más fácilmente las capacidades SIMD de la CPU. Un [documento de investigación](http://15721.courses.cs.cmu.edu/spring2016/papers/p5-sompolski.pdf) escrito por nuestros amigos muestra que es mejor combinar ambos enfoques. ClickHouse utiliza la ejecución de consultas vectorizadas y tiene un soporte inicial limitado para la generación de código en tiempo de ejecución. 
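+
+Este procesamiento por bloques puede observarse desde SQL (solo a modo de ilustración; el tamaño de bloque concreto depende de ajustes como `max_block_size`, por lo que el resultado puede variar):
+
+``` sql
+-- blockSize() devuelve el número de filas del bloque que se está procesando.
+SELECT blockSize() AS filas_por_bloque, count() AS filas
+FROM numbers(1000000)
+GROUP BY filas_por_bloque
+```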
+ +## Columna {#columns} + +Para representar columnas en la memoria (en realidad, fragmentos de columnas), el `IColumn` se utiliza la interfaz. Esta interfaz proporciona métodos auxiliares para la implementación de varios operadores relacionales. Casi todas las operaciones son inmutables: no modifican la columna original, sino que crean una nueva modificada. Por ejemplo, el `IColumn :: filter` método acepta una máscara de bytes de filtro. Se utiliza para el `WHERE` y `HAVING` operadores relacionales. Ejemplos adicionales: el `IColumn :: permute` para apoyar `ORDER BY` el `IColumn :: cut` para apoyar `LIMIT` y así sucesivamente. + +Diversos `IColumn` aplicación (`ColumnUInt8`, `ColumnString` y así sucesivamente) son responsables del diseño de memoria de las columnas. El diseño de memoria suele ser una matriz contigua. Para el tipo entero de columnas, es solo una matriz contigua, como `std :: vector`. Para `String` y `Array` columnas, son dos vectores: uno para todos los elementos de la matriz, colocados contiguamente, y un segundo para los desplazamientos al comienzo de cada matriz. También hay `ColumnConst` que almacena solo un valor en la memoria, pero parece una columna. + +## Campo {#field} + +Sin embargo, también es posible trabajar con valores individuales. Para representar un valor individual, el `Field` se utiliza. `Field` es sólo una unión discriminada de `UInt64`, `Int64`, `Float64`, `String` y `Array`. `IColumn` tiene el `operator[]` para obtener el valor n-ésimo como un `Field` y el `insert` método para agregar un `Field` al final de una columna. Estos métodos no son muy eficientes, ya que requieren tratar con temporal `Field` objetos que representan un valor individual. Hay métodos más eficientes, tales como `insertFrom`, `insertRangeFrom` y así sucesivamente. + +`Field` no tiene suficiente información sobre un tipo de datos específico para una tabla. Por ejemplo, `UInt8`, `UInt16`, `UInt32`, y `UInt64` todos están representados como `UInt64` es una `Field`. + +## Abstracciones con fugas {#leaky-abstractions} + +`IColumn` tiene métodos para transformaciones relacionales comunes de datos, pero no satisfacen todas las necesidades. Por ejemplo, `ColumnUInt64` no tiene un método para calcular la suma de dos columnas, y `ColumnString` no tiene un método para ejecutar una búsqueda de subcadena. Estas innumerables rutinas se implementan fuera de `IColumn`. + +Varias funciones en columnas se pueden implementar de una manera genérica, no eficiente utilizando `IColumn` para extraer `Field` valores, o de una manera especializada utilizando el conocimiento del diseño de la memoria interna de los datos en un `IColumn` aplicación. Para hacer esto, las funciones se convierten en un `IColumn` escriba y trate con la representación interna directamente. Por ejemplo, `ColumnUInt64` tiene el `getData` método que devuelve una referencia a una matriz interna, luego una rutina separada lee o llena esa matriz directamente. De hecho, tenemos “leaky abstractions” para permitir especializaciones eficientes de varias rutinas. + +## Tipos de datos {#data-types} + +`IDataType` es responsable de la serialización y deserialización: para leer y escribir fragmentos de columnas o valores individuales en formato binario o de texto. `IDataType` corresponde directamente a los tipos de datos en las tablas. Por ejemplo, heno `DataTypeUInt32`, `DataTypeDateTime`, `DataTypeString` y así sucesivamente. + +`IDataType` y `IColumn` están vagamente relacionados entre sí. 
Diferentes tipos de datos se pueden representar en la memoria por el mismo `IColumn` aplicación. Por ejemplo, `DataTypeUInt32` y `DataTypeDateTime` están representados por `ColumnUInt32` o `ColumnConstUInt32`. Además, el mismo tipo de datos se puede representar mediante `IColumn` aplicación. Por ejemplo, `DataTypeUInt8` puede ser representado por `ColumnUInt8` o `ColumnConstUInt8`. + +`IDataType` sólo almacena metadatos. Por ejemplo, `DataTypeUInt8` no almacena nada en absoluto (excepto vptr) y `DataTypeFixedString` tiendas en solitario `N` (el tamaño de las cadenas de tamaño fijo). + +`IDataType` tiene métodos auxiliares para varios formatos de datos. Los ejemplos son métodos para serializar un valor con posibles citas, para serializar un valor para JSON y para serializar un valor como parte del formato XML. No hay correspondencia directa con los formatos de datos. Por ejemplo, los diferentes formatos de datos `Pretty` y `TabSeparated` Puede utilizar el mismo `serializeTextEscaped` método de ayuda de la `IDataType` interfaz. + +## Bloque {#block} + +Naciones `Block` es un contenedor que representa un subconjunto (porción) de una tabla en la memoria. Es sólo un conjunto de triples: `(IColumn, IDataType, column name)`. Durante la ejecución de la consulta, los datos son procesados por `Block`s. Si tenemos un `Block`, tenemos datos (en el `IColumn` objeto), tenemos información sobre su tipo (en `IDataType`) que nos dice cómo lidiar con esa columna, y tenemos el nombre de la columna (ya sea el nombre de la columna original de la tabla o algún nombre artificial asignado para obtener resultados temporales de los cálculos). + +Cuando calculamos alguna función sobre columnas en un bloque, agregamos otra columna con su resultado al bloque, y no tocamos columnas para argumentos de la función porque las operaciones son inmutables. Más tarde, las columnas innecesarias se pueden eliminar del bloque, pero no se pueden modificar. Esto es conveniente para la eliminación de subexpresiones comunes. + +Se crean bloques para cada fragmento de datos procesado. Tenga en cuenta que para el mismo tipo de cálculo, los nombres y tipos de columna siguen siendo los mismos para diferentes bloques y solo los cambios de datos de columna. Es mejor dividir los datos del bloque desde el encabezado del bloque porque los tamaños de bloque pequeños tendrán una gran sobrecarga de cadenas temporales para copiar shared\_ptrs y nombres de columna. + +## Bloquear flujos {#block-streams} + +Los flujos de bloques son para procesar datos. Usamos flujos de bloques para leer datos de algún lugar, realizar transformaciones de datos o escribir datos en algún lugar. `IBlockInputStream` tiene el `read` método para buscar el siguiente bloque mientras esté disponible. `IBlockOutputStream` tiene el `write` método para empujar el bloque en alguna parte. + +Los flujos son responsables de: + +1. Leer o escribir en una mesa. La tabla solo devuelve una secuencia para leer o escribir bloques. +2. Implementación de formatos de datos. Por ejemplo, si desea enviar datos a un terminal en `Pretty` formato, crea un flujo de salida de bloque donde presiona bloques y los formatea. +3. Realización de transformaciones de datos. Digamos que tienes `IBlockInputStream` y desea crear una secuencia filtrada. Bienvenidos `FilterBlockInputStream` e inicializarlo con su transmisión. Luego, cuando tiras de un bloque de `FilterBlockInputStream`, extrae un bloque de su flujo, lo filtra y le devuelve el bloque filtrado. 
Las canalizaciones de ejecución de consultas se representan de esta manera. + +Hay transformaciones más sofisticadas. Por ejemplo, cuando tiras de `AggregatingBlockInputStream`, lee todos los datos de su origen, los agrega y, a continuación, devuelve un flujo de datos agregados para usted. Otro ejemplo: `UnionBlockInputStream` acepta muchas fuentes de entrada en el constructor y también una serie de subprocesos. Lanza múltiples hilos y lee de múltiples fuentes en paralelo. + +> Las secuencias de bloques usan el “pull” enfoque para controlar el flujo: cuando extrae un bloque de la primera secuencia, en consecuencia extrae los bloques requeridos de las secuencias anidadas, y toda la tubería de ejecución funcionará. Ni “pull” ni “push” es la mejor solución, porque el flujo de control está implícito y eso limita la implementación de varias características, como la ejecución simultánea de múltiples consultas (fusionando muchas tuberías). Esta limitación podría superarse con coroutines o simplemente ejecutando hilos adicionales que se esperan el uno al otro. Podemos tener más posibilidades si hacemos explícito el flujo de control: si localizamos la lógica para pasar datos de una unidad de cálculo a otra fuera de esas unidades de cálculo. Lea esto [artículo](http://journal.stuffwithstuff.com/2013/01/13/iteration-inside-and-out/) para más pensamientos. + +Debemos tener en cuenta que la canalización de ejecución de consultas crea datos temporales en cada paso. Tratamos de mantener el tamaño del bloque lo suficientemente pequeño para que los datos temporales se ajusten a la memoria caché de la CPU. Con esa suposición, escribir y leer datos temporales es casi gratis en comparación con otros cálculos. Podríamos considerar una alternativa, que es fusionar muchas operaciones en la tubería, para hacer que la tubería sea lo más corta posible y eliminar gran parte de los datos temporales. Esto podría ser una ventaja, pero también tiene inconvenientes. Por ejemplo, una canalización dividida facilita la implementación de almacenamiento en caché de datos intermedios, el robo de datos intermedios de consultas similares que se ejecutan al mismo tiempo y la fusión de canalizaciones para consultas similares. + +## Formato {#formats} + +Los formatos de datos se implementan con flujos de bloques. Heno “presentational” sólo es adecuado para la salida de datos al cliente, tales como `Pretty` formato, que proporciona sólo `IBlockOutputStream`. Y hay formatos de entrada / salida, como `TabSeparated` o `JSONEachRow`. + +También hay secuencias de filas: `IRowInputStream` y `IRowOutputStream`. Permiten pull/push datos por filas individuales, no por bloques. Y solo son necesarios para simplificar la implementación de formatos orientados a filas. Envoltura `BlockInputStreamFromRowInputStream` y `BlockOutputStreamFromRowOutputStream` seguros de que el usuario puede acceder a la información sobre cualquier tipo de información. + +## E/S {#io} + +Para la entrada / salida orientada a bytes, heno `ReadBuffer` y `WriteBuffer` clases abstractas. Se usan en lugar de C ++ `iostream`s. No se preocupe: cada proyecto maduro de C ++ está usando algo más que `iostream`s por buenas razones. + +`ReadBuffer` y `WriteBuffer` son solo un búfer contiguo y un cursor apuntando a la posición en ese búfer. Las implementaciones pueden poseer o no la memoria del búfer. Hay un método virtual para llenar el búfer con los siguientes datos (para `ReadBuffer`) o para vaciar el búfer en algún lugar (para `WriteBuffer`). 
Los métodos virtuales rara vez se llaman. + +Implementaciones de `ReadBuffer`/`WriteBuffer` se utilizan para trabajar con archivos y descriptores de archivos y sockets de red, para implementar la compresión (`CompressedWriteBuffer` se inicializa con otro WriteBuffer y realiza la compresión antes de escribir datos en él), y para otros fines: los nombres `ConcatReadBuffer`, `LimitReadBuffer`, y `HashingWriteBuffer` hablar por sí mismos. + +Read / WriteBuffers solo se ocupan de bytes. Para ayudar con la entrada / salida formateada (por ejemplo, para escribir un número en formato decimal), hay funciones de `ReadHelpers` y `WriteHelpers` archivos de encabezado. + +Veamos qué sucede cuando quieres escribir un conjunto de resultados en `JSON` Formato una salida estándar. Tiene un conjunto de resultados listo para ser recuperado de `IBlockInputStream`. Bienvenidos `WriteBufferFromFileDescriptor(STDOUT_FILENO)` para escribir bytes en stdout. Bienvenidos `JSONRowOutputStream`, inicializado con eso `WriteBuffer` para escribir filas en `JSON` una salida estándar. Bienvenidos `BlockOutputStreamFromRowOutputStream` encima de él, para representarlo como `IBlockOutputStream`. Entonces usted llama `copyData` para transferir datos desde `IBlockInputStream` Naciones `IBlockOutputStream` y todo funciona. Internamente, `JSONRowOutputStream` escribirá varios delimitadores JSON y llamará al `IDataType::serializeTextJSON` con una referencia a `IColumn` y el número de fila como argumentos. Consecuentemente, `IDataType::serializeTextJSON` llamará a un método de `WriteHelpers.h`: por ejemplo, `writeText` para tipos numéricos y `writeJSONString` para `DataTypeString`. + +## Tabla {#tables} + +Las tablas están representadas por el `IStorage` interfaz. Las diferentes implementaciones de esa interfaz son diferentes motores de tabla. Los ejemplos hijo `StorageMergeTree`, `StorageMemory` y así sucesivamente. Las instancias de estas clases son solo tablas. + +El más importante `IStorage` hijo `read` y `write`. También hay `alter`, `rename`, `drop` y así sucesivamente. El `read` método acepta los siguientes argumentos: el conjunto de columnas para leer de una tabla, el `AST` consulta a considerar, y el número deseado de flujos para devolver. Devuelve uno o varios `IBlockInputStream` objetos e información sobre la etapa de procesamiento de datos que se completó dentro de un motor de tablas durante la ejecución de la consulta. + +En la mayoría de los casos, el método de lectura solo es responsable de leer las columnas especificadas de una tabla, no de ningún procesamiento de datos adicional. Todo el procesamiento de datos adicional es realizado por el intérprete de consultas y está fuera de la responsabilidad de `IStorage`. + +Pero hay excepciones notables: + +- La consulta AST se pasa al `read` y el motor de tablas puede usarlo para derivar el uso del índice y leer menos datos de una tabla. +- A veces, el motor de tablas puede procesar los datos a una etapa específica. Por ejemplo, `StorageDistributed` puede enviar una consulta a servidores remotos, pedirles que procesen datos a una etapa donde se puedan fusionar datos de diferentes servidores remotos y devolver esos datos preprocesados. El intérprete de consultas termina de procesar los datos. + +La mesa de `read` método puede devolver múltiples `IBlockInputStream` objetos para permitir el procesamiento de datos en paralelo. Estos flujos de entrada de bloques múltiples pueden leer de una tabla en paralelo. 
A continuación, puede ajustar estas secuencias con varias transformaciones (como la evaluación de expresiones o el filtrado) que se pueden calcular de forma independiente y crear un `UnionBlockInputStream` encima de ellos, para leer desde múltiples flujos en paralelo. + +También hay `TableFunction`Estas son funciones que devuelven un `IStorage` objeto a utilizar en el `FROM` cláusula de una consulta. + +Para tener una idea rápida de cómo implementar su propio motor de tablas, vea algo simple, como `StorageMemory` o `StorageTinyLog`. + +> Como resultado de la `read` método, `IStorage` devoluciones `QueryProcessingStage` – información sobre qué partes de la consulta ya se han calculado dentro del almacenamiento. Actualmente, solo tenemos una granularidad muy gruesa para esa información. No hay forma de que el almacenamiento diga “I have already processed this part of the expression in WHERE, for this range of data”. Tenemos que trabajar en eso. + +## Analizador {#parsers} + +Una consulta es analizada por un analizador de descenso recursivo escrito a mano. Por ejemplo, `ParserSelectQuery` simplemente llama recursivamente a los analizadores subyacentes para varias partes de la consulta. Los analizadores crean un `AST`. El `AST` está representado por nodos, que son instancias de `IAST`. + +> Los generadores de analizadores no se utilizan por razones históricas. + +## Interprete {#interpreters} + +Los intérpretes son responsables de crear la canalización de ejecución de consultas `AST`. Hay intérpretes simples, como `InterpreterExistsQuery` y `InterpreterDropQuery` o el más sofisticado `InterpreterSelectQuery`. La canalización de ejecución de consultas es una combinación de flujos de entrada o salida de bloques. Por ejemplo, el resultado de interpretar el `SELECT` la consulta es la `IBlockInputStream` para leer el conjunto de resultados; el resultado de la consulta INSERT es el `IBlockOutputStream` para escribir datos para su inserción, y el resultado de interpretar el `INSERT SELECT` la consulta es la `IBlockInputStream` que devuelve un conjunto de resultados vacío en la primera lectura, pero que copia datos de `SELECT` Naciones `INSERT` al mismo tiempo. + +`InterpreterSelectQuery` utilizar `ExpressionAnalyzer` y `ExpressionActions` maquinaria para el análisis de consultas y transformaciones. Aquí es donde se realizan la mayoría de las optimizaciones de consultas basadas en reglas. `ExpressionAnalyzer` Se deben extraer varias transformaciones de consultas y optimizaciones para separar clases para permitir transformaciones modulares o consultas. + +## Función {#functions} + +Hay funciones ordinarias y funciones agregadas. Para las funciones agregadas, consulte la siguiente sección. + +Las funciones ordinarias no cambian el número de filas; funcionan como si estuvieran procesando cada fila de forma independiente. De hecho, las funciones no se llaman para filas individuales, sino para `Block`de datos para implementar la ejecución de consultas vectorizadas. + +Hay algunas funciones diversas, como [BlockSize](../query_language/functions/other_functions.md#function-blocksize), [rowNumberInBlock](../query_language/functions/other_functions.md#function-rownumberinblock), y [runningAccumulate](../query_language/functions/other_functions.md#function-runningaccumulate), que explotan el procesamiento de bloques y violan la independencia de las filas. + +ClickHouse tiene una tipificación fuerte, por lo que no se produce la conversión de tipo implícita. 
Si una función no admite una combinación específica de tipos, se lanzará una excepción. Pero las funciones pueden funcionar (estar sobrecargadas) para muchas combinaciones diferentes de tipos. Por ejemplo, el `plus` función (para implementar el `+` operador) funciona para cualquier combinación de tipos numéricos: `UInt8` + `Float32`, `UInt16` + `Int8` y así sucesivamente. Además, algunas funciones variadas pueden aceptar cualquier número de argumentos, como el `concat` función. + +Implementar una función puede ser un poco inconveniente porque una función distribuye explícitamente tipos de datos compatibles y `IColumns`. Por ejemplo, el `plus` función tiene código generado por la creación de instancias de una plantilla de C ++ para cada combinación de tipos numéricos, y para argumentos izquierdo y derecho constantes o no constantes. + +> Este es un buen lugar para implementar la generación de código en tiempo de ejecución para evitar la hinchazón del código de la plantilla. Además, permitirá agregar funciones fusionadas como multiplicar-agregar fusionado o hacer comparaciones múltiples en una iteración de bucle. + +Debido a la ejecución de consultas vectorizadas, las funciones no se cortocircuitan. Por ejemplo, si escribe `WHERE f(x) AND g(y)`, ambos lados se calcularán, incluso para las filas, cuando `f(x)` es cero (excepto cuando `f(x)` es una expresión constante cero). Pero si la selectividad del `f(x)` la condición es alta, y el cálculo de `f(x)` es mucho más barato que `g(y)`, es mejor implementar el cálculo de paso múltiple: primero calcular `f(x)`, a continuación, filtrar columnas por el resultado, y luego calcular `g(y)` solo para trozos de datos más pequeños y filtrados. + +## Funciones agregadas {#aggregate-functions} + +Las funciones agregadas son funciones con estado. Acumulan valores pasados en algún estado y le permiten obtener resultados de ese estado. Se gestionan con el `IAggregateFunction` interfaz. Los estados pueden ser bastante simples (el estado para `AggregateFunctionCount` es sólo una sola `UInt64` valor) o bastante complejo (el estado de `AggregateFunctionUniqCombined` es una combinación de una matriz lineal, una tabla hash y un `HyperLogLog` estructura de datos probabilística). + +Para tratar con múltiples estados mientras se ejecuta una alta cardinalidad `GROUP BY` consulta, los estados se asignan en `Arena` (un grupo de memoria), o podrían asignarse en cualquier pieza de memoria adecuada. Los estados pueden tener un constructor y destructor no triviales: por ejemplo, los estados de agregación complejos pueden asignar memoria adicional ellos mismos. Esto requiere cierta atención a la creación y destrucción de estados y pasar adecuadamente su propiedad, para realizar un seguimiento de quién y cuándo destruirá los estados. + +Los estados de agregación se pueden serializar y deserializar para pasar a través de la red durante la ejecución de consultas distribuidas o para escribirlos en el disco donde no hay suficiente RAM. Incluso se pueden almacenar en una tabla con el `DataTypeAggregateFunction` para permitir la agregación incremental de datos. + +> El formato de datos serializados para los estados de función agregados no tiene versiones en este momento. Esto está bien si los estados agregados solo se almacenan temporalmente. Pero tenemos el `AggregatingMergeTree` motor de tabla para la agregación incremental, y la gente ya lo está utilizando en producción. 
Esta es la razón por la que deberíamos agregar compatibilidad con versiones anteriores al cambiar el formato serializado para cualquier función agregada en el futuro. + +## Servidor {#server} + +El servidor implementa varias interfaces diferentes: + +- Una interfaz HTTP para cualquier cliente extranjero. +- Una interfaz TCP para el cliente nativo de ClickHouse y para la comunicación entre servidores durante la ejecución de consultas distribuidas. +- Una interfaz para transferir datos para la replicación. + +Internamente, es solo un servidor multiproceso básico sin corutinas, fibras, etc. Dado que el servidor no está diseñado para procesar una alta tasa de consultas simples, sino que está destinado a procesar una tasa relativamente baja de consultas complejas, cada uno de ellos puede procesar una gran cantidad de datos para análisis. + +El servidor inicializa el `Context` clase con el entorno necesario para la ejecución de consultas: la lista de bases de datos disponibles, usuarios y derechos de acceso, configuración, clústeres, la lista de procesos, el registro de consultas, etc. Este entorno es utilizado por intérpretes. + +Mantenemos una compatibilidad completa hacia atrás y hacia adelante para el protocolo TCP del servidor: los clientes antiguos pueden hablar con servidores nuevos y los nuevos clientes pueden hablar con servidores antiguos. Pero no queremos mantenerlo eternamente, y estamos eliminando el soporte para versiones antiguas después de aproximadamente un año. + +> Para todas las aplicaciones externas, recomendamos usar la interfaz HTTP porque es simple y fácil de usar. El protocolo TCP está más estrechamente vinculado a las estructuras de datos internas: utiliza un formato interno para pasar bloques de datos y utiliza marcos personalizados para datos comprimidos. No hemos lanzado una biblioteca C para ese protocolo porque requiere vincular la mayor parte de la base de código ClickHouse, lo cual no es práctico. + +## Ejecución de consultas distribuidas {#distributed-query-execution} + +Los servidores de una configuración de clúster son en su mayoría independientes. Puede crear un `Distributed` en uno o todos los servidores de un clúster. El `Distributed` la tabla no almacena datos en sí misma; solo proporciona un “view” a todas las tablas locales en varios nodos de un clúster. Cuando se SELECCIONA desde un `Distributed` tabla, reescribe esa consulta, elige nodos remotos de acuerdo con la configuración de equilibrio de carga y les envía la consulta. El `Distributed` table solicita a los servidores remotos que procesen una consulta hasta una etapa en la que se pueden fusionar resultados intermedios de diferentes servidores. Luego recibe los resultados intermedios y los fusiona. La tabla distribuida intenta distribuir tanto trabajo como sea posible a servidores remotos y no envía muchos datos intermedios a través de la red. + +> Las cosas se vuelven más complicadas cuando tiene subconsultas en cláusulas IN o JOIN y cada una de ellas usa un `Distributed` tabla. Tenemos diferentes estrategias para la ejecución de estas consultas. + +No existe un plan de consulta global para la ejecución de consultas distribuidas. Cada nodo tiene su propio plan de consulta local para su parte del trabajo. Solo tenemos una ejecución simple de consultas distribuidas de un solo paso: enviamos consultas para nodos remotos y luego fusionamos los resultados. 
Pero esto no es factible para consultas difíciles con alta cardinalidad GROUP BY o con una gran cantidad de datos temporales para JOIN: en tales casos, necesitamos “reshuffle” datos entre servidores, lo que requiere una coordinación adicional. ClickHouse no admite ese tipo de ejecución de consultas, y tenemos que trabajar en ello. + +## Árbol de fusión {#merge-tree} + +`MergeTree` es una familia de motores de almacenamiento que admite la indexación por clave principal. La clave principal puede ser una tupla arbitraria de columnas o expresiones. Datos en un `MergeTree` se almacena en “parts”. Cada parte almacena datos en el orden de clave primaria (los datos son ordenados lexicográficamente por la tupla de clave primaria). Todas las columnas de la tabla se almacenan en `column.bin` archivos en estas partes. Los archivos consisten en bloques comprimidos. Cada bloque suele ser de 64 KB a 1 MB de datos sin comprimir, dependiendo del tamaño del valor promedio. Los bloques constan de valores de columna colocados contiguamente uno tras otro. Los valores de columna están en el mismo orden para cada columna (el orden está definido por la clave principal), por lo que cuando itera por muchas columnas, obtiene valores para las filas correspondientes. + +La clave principal en sí es “sparse”. No aborda todas las filas, sino solo algunos rangos de datos. Separado `primary.idx` file tiene el valor de la clave principal para cada fila N-ésima, donde se llama N `index_granularity` (generalmente, N = 8192). Además, para cada columna, tenemos `column.mrk` Archivos con “marks,” que son desplazamientos a cada fila N-ésima en el archivo de datos. Cada marca es un par: el desplazamiento en el archivo al comienzo del bloque comprimido y el desplazamiento en el bloque descomprimido al comienzo de los datos. Por lo general, los bloques comprimidos están alineados por marcas, y el desplazamiento en el bloque descomprimido es cero. Datos para `primary.idx` siempre reside en la memoria y los datos para `column.mrk` archivos se almacena en caché. + +Cuando vamos a leer algo de una parte en `MergeTree` miramos `primary.idx` datos y localice rangos que posiblemente podrían contener datos solicitados, luego mire `column.mrk` datos y calcular compensaciones para dónde comenzar a leer esos rangos. Debido a la escasez, el exceso de datos puede ser leído. ClickHouse no es adecuado para una gran carga de consultas de puntos simples, porque todo el rango con `index_granularity` se deben leer filas para cada clave, y todo el bloque comprimido debe descomprimirse para cada columna. Hicimos que el índice sea disperso porque debemos poder mantener billones de filas por único servidor sin un consumo de memoria notable para el índice. Además, debido a que la clave principal es escasa, no es única: no puede verificar la existencia de la clave en la tabla en el momento de INSERTAR. Podría tener muchas filas con la misma clave en una tabla. + +Cuando `INSERT` un montón de datos en `MergeTree`, ese grupo está ordenado por orden de clave primaria y forma una nueva parte. Para mantener el número de partes relativamente bajo, hay subprocesos de fondo que seleccionan periódicamente algunas partes y las fusionan en una sola parte ordenada. Es por eso que se llama `MergeTree`. Por supuesto, la fusión conduce a “write amplification”. Todas las partes son inmutables: solo se crean y eliminan, pero no se modifican. Cuando se ejecuta SELECT, contiene una instantánea de la tabla (un conjunto de partes). 
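+
+Como ilustración (un boceto mínimo con una tabla hipotética `t`), las partes creadas por las inserciones pueden observarse en la tabla de sistema `system.parts`:
+
+``` sql
+CREATE TABLE t (fecha Date, id UInt64, valor String) ENGINE = MergeTree() ORDER BY id;
+
+-- Cada INSERT por lotes crea una parte nueva; las fusiones en segundo plano las compactan después.
+INSERT INTO t SELECT today(), number, toString(number) FROM numbers(1000);
+
+-- Partes activas e inactivas de la tabla.
+SELECT name, active, rows FROM system.parts WHERE table = 't';
+```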
Después de la fusión, también mantenemos las piezas viejas durante algún tiempo para facilitar la recuperación después de la falla, por lo que si vemos que alguna parte fusionada probablemente esté rota, podemos reemplazarla con sus partes fuente. + +`MergeTree` no es un árbol de LSM porque no contiene “memtable” y “log”: los datos insertados se escriben directamente en el sistema de archivos. Esto lo hace adecuado solo para INSERTAR datos en lotes, no por fila individual y no con mucha frecuencia: aproximadamente una vez por segundo está bien, pero mil veces por segundo no lo está. Lo hicimos de esta manera por simplicidad, y porque ya estamos insertando datos en lotes en nuestras aplicaciones. + +> Las tablas MergeTree solo pueden tener un índice (primario): no hay índices secundarios. Sería bueno permitir múltiples representaciones físicas bajo una tabla lógica, por ejemplo, para almacenar datos en más de un orden físico o incluso para permitir representaciones con datos preagregados junto con datos originales. + +Hay motores MergeTree que están haciendo un trabajo adicional durante las fusiones en segundo plano. Los ejemplos son `CollapsingMergeTree` y `AggregatingMergeTree`. Esto podría tratarse como un soporte especial para actualizaciones. Tenga en cuenta que estas no son actualizaciones reales porque los usuarios generalmente no tienen control sobre el tiempo en que se ejecutarán las fusiones en segundo plano y los datos en un `MergeTree` casi siempre se almacena en más de una parte, no en forma completamente fusionada. + +## Replicación {#replication} + +La replicación en ClickHouse se implementa por tabla. Podría tener algunas tablas replicadas y otras no replicadas en el mismo servidor. También puede tener tablas replicadas de diferentes maneras, como una tabla con replicación de dos factores y otra con replicación de tres factores. + +La replicación se implementa en el `ReplicatedMergeTree` motor de almacenamiento. El camino en `ZooKeeper` se especifica como un parámetro para el motor de almacenamiento. Todas las tablas con la misma ruta en `ZooKeeper` se convierten en réplicas entre sí: sincronizan sus datos y mantienen la coherencia. Las réplicas se pueden agregar y eliminar dinámicamente simplemente creando o soltando una tabla. + +La replicación utiliza un esquema multi-maestro asíncrono. Puede insertar datos en cualquier réplica que tenga una sesión con `ZooKeeper`, y los datos se replican en todas las demás réplicas de forma asíncrona. Debido a que ClickHouse no admite UPDATE, la replicación está libre de conflictos. Como no hay reconocimiento de quórum de inserciones, los datos recién insertados pueden perderse si un nodo falla. + +Los metadatos para la replicación se almacenan en ZooKeeper. Hay un registro de replicación que enumera las acciones que se deben realizar. Las acciones son: obtener parte; fusionar partes; soltar una partición, etc. Cada réplica copia el registro de replicación en su cola y, a continuación, ejecuta las acciones desde la cola. Por ejemplo, en la inserción, el “get the part” la acción se crea en el registro y cada réplica descarga esa parte. Las fusiones se coordinan entre réplicas para obtener resultados idénticos en bytes. Todas las piezas se combinan de la misma manera en todas las réplicas. Para lograr esto, se elige una réplica como líder y esa réplica inicia fusiones y escrituras “merge parts” Acciones al registro. + +La replicación es física: solo las partes comprimidas se transfieren entre nodos, no consultas. 
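+
+Como referencia (un boceto con ruta y nombre de réplica hipotéticos), la ruta en ZooKeeper se pasa como parámetro del motor, y todas las tablas con la misma ruta se convierten en réplicas entre sí:
+
+``` sql
+-- Las macros {shard} y {replica} son ilustrativas; deben estar definidas en la configuración del servidor.
+CREATE TABLE t_replicada (fecha Date, id UInt64)
+ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/t_replicada', '{replica}')
+ORDER BY id;
+```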
Para reducir el costo de la red (para evitar la amplificación de la red), las fusiones se procesan en cada réplica de forma independiente en la mayoría de los casos. Las piezas combinadas grandes se envían a través de la red solo en casos de retraso de replicación significativo. + +Además, cada réplica almacena su estado en ZooKeeper como el conjunto de piezas y sus sumas de comprobación. Cuando el estado en el sistema de archivos local difiere del estado de referencia en ZooKeeper, la réplica restaura su coherencia descargando partes faltantes y rotas de otras réplicas. Cuando hay algunos datos inesperados o rotos en el sistema de archivos local, ClickHouse no los elimina, sino que los mueve a un directorio separado y los olvida. + +> El clúster ClickHouse consta de fragmentos independientes y cada fragmento consta de réplicas. El clúster no es elástico, por lo que después de agregar un nuevo fragmento, los datos no se reequilibran automáticamente entre fragmentos. En su lugar, la carga del clúster será desigual. Esta implementación le da más control, y está bien para clústeres relativamente pequeños, como decenas de nodos. Pero para los clústeres con cientos de nodos que estamos utilizando en producción, este enfoque se convierte en un inconveniente significativo. Debemos implementar un motor de tablas que abarque sus datos en todo el clúster con regiones replicadas dinámicamente que podrían dividirse y equilibrarse entre clústeres automáticamente. + +[Artículo Original](https://clickhouse.tech/docs/es/development/architecture/) diff --git a/docs/es/development/browse_code.md b/docs/es/development/browse_code.md new file mode 100644 index 00000000000..f292f0edf06 --- /dev/null +++ b/docs/es/development/browse_code.md @@ -0,0 +1,7 @@ +# Examinar el código fuente de ClickHouse {#browse-clickhouse-source-code} + +Usted puede utilizar **Woboq** navegador de código en línea disponible [aqui](https://clickhouse-test-reports.s3.yandex.net/codebrowser/html_report///ClickHouse/dbms/src/index.html). Proporciona navegación de código y resaltado semántico, búsqueda e indexación. La instantánea de código se actualiza diariamente. + +Además, puede navegar por las fuentes en [GitHub](https://github.com/ClickHouse/ClickHouse) como de costumbre. + +Si está interesado en qué IDE usar, recomendamos CLion, QT Creator, VS Code y KDevelop (con advertencias). Puedes usar cualquier IDE favorito. Vim y Emacs también cuentan. diff --git a/docs/es/development/build.md b/docs/es/development/build.md new file mode 100644 index 00000000000..8ae379338a9 --- /dev/null +++ b/docs/es/development/build.md @@ -0,0 +1,134 @@ +# Cómo construir ClickHouse para el desarrollo {#how-to-build-clickhouse-for-development} + +El siguiente tutorial se basa en el sistema Ubuntu Linux. +Con los cambios apropiados, también debería funcionar en cualquier otra distribución de Linux. +Plataformas compatibles: x86\_64 y AArch64. El soporte para Power9 es experimental. + +## Instalar Git, CMake, Python y Ninja {#install-git-cmake-python-and-ninja} + +``` bash +$ sudo apt-get install git cmake python ninja-build +``` + +O cmake3 en lugar de cmake en sistemas más antiguos. + +## Instalar GCC 9 {#install-gcc-9} + +Hay varias formas de hacer esto. 
+ +### Instalar desde un paquete PPA {#install-from-a-ppa-package} + +``` bash +$ sudo apt-get install software-properties-common +$ sudo apt-add-repository ppa:ubuntu-toolchain-r/test +$ sudo apt-get update +$ sudo apt-get install gcc-9 g++-9 +``` + +### Instalar desde fuentes {#install-from-sources} + +Mira [Sistema abierto.](https://github.com/ClickHouse/ClickHouse/blob/master/utils/ci/build-gcc-from-sources.sh) + +## Usar GCC 9 para compilaciones {#use-gcc-9-for-builds} + +``` bash +$ export CC=gcc-9 +$ export CXX=g++-9 +``` + +## Fuentes de ClickHouse de pago {#checkout-clickhouse-sources} + +``` bash +$ git clone --recursive git@github.com:ClickHouse/ClickHouse.git +``` + +o + +``` bash +$ git clone --recursive https://github.com/ClickHouse/ClickHouse.git +``` + +## Construir ClickHouse {#build-clickhouse} + +``` bash +$ cd ClickHouse +$ mkdir build +$ cd build +$ cmake .. +$ ninja +$ cd .. +``` + +Para crear un ejecutable, ejecute `ninja clickhouse`. +Esto creará el `dbms/programs/clickhouse` ejecutable, que se puede usar con `client` o `server` argumento. + +# Cómo construir ClickHouse en cualquier Linux {#how-to-build-clickhouse-on-any-linux} + +La compilación requiere los siguientes componentes: + +- Git (se usa solo para pagar las fuentes, no es necesario para la compilación) +- CMake 3.10 o más reciente +- Ninja (recomendado) o Hacer +- Compilador de C ++: gcc 9 o clang 8 o más reciente +- Enlazador: lld u oro (el clásico GNU ld no funcionará) +- Python (solo se usa dentro de la compilación LLVM y es opcional) + +Si todos los componentes están instalados, puede compilar de la misma manera que los pasos anteriores. + +Ejemplo para Ubuntu Eoan: + + sudo apt update + sudo apt install git cmake ninja-build g++ python + git clone --recursive https://github.com/ClickHouse/ClickHouse.git + mkdir build && cd build + cmake ../ClickHouse + ninja + +Ejemplo de OpenSUSE Tumbleweed: + + sudo zypper install git cmake ninja gcc-c++ python lld + git clone --recursive https://github.com/ClickHouse/ClickHouse.git + mkdir build && cd build + cmake ../ClickHouse + ninja + +Ejemplo de Fedora Rawhide: + + sudo yum update + yum --nogpg install git cmake make gcc-c++ python2 + git clone --recursive https://github.com/ClickHouse/ClickHouse.git + mkdir build && cd build + cmake ../ClickHouse + make -j $(nproc) + +# Usted no tiene que construir ClickHouse {#you-dont-have-to-build-clickhouse} + +ClickHouse está disponible en binarios y paquetes preconstruidos. Los binarios son portátiles y se pueden ejecutar en cualquier tipo de Linux. + +Están diseñados para lanzamientos estables, preestablecidos y de prueba, siempre que para cada compromiso con el maestro y para cada solicitud de extracción. + +Para encontrar la construcción más fresca de `master`, ir a [se compromete página](https://github.com/ClickHouse/ClickHouse/commits/master), haga clic en la primera marca de verificación verde o cruz roja cerca de confirmar, y haga clic en “Details” enlace justo después “ClickHouse Build Check”. 
+ +# Cómo construir el paquete Debian ClickHouse {#how-to-build-clickhouse-debian-package} + +## Instalar Git y Pbuilder {#install-git-and-pbuilder} + +``` bash +$ sudo apt-get update +$ sudo apt-get install git python pbuilder debhelper lsb-release fakeroot sudo debian-archive-keyring debian-keyring +``` + +## Fuentes de ClickHouse de pago {#checkout-clickhouse-sources-1} + +``` bash +$ git clone --recursive --branch master https://github.com/ClickHouse/ClickHouse.git +$ cd ClickHouse +``` + +## Ejecutar secuencia de comandos de lanzamiento {#run-release-script} + +``` bash +$ ./release +``` + +[Artículo Original](https://clickhouse.tech/docs/es/development/build/) diff --git a/docs/es/development/build_cross_arm.md b/docs/es/development/build_cross_arm.md new file mode 100644 index 00000000000..c09fd0bc151 --- /dev/null +++ b/docs/es/development/build_cross_arm.md @@ -0,0 +1,36 @@ +# Cómo construir ClickHouse en Linux para la arquitectura AARCH64 (ARM64 {#how-to-build-clickhouse-on-linux-for-aarch64-arm64-architecture} + +Esto es para el caso cuando tiene una máquina Linux y desea usarla para compilar `clickhouse` binario que se ejecutará en otra máquina Linux con arquitectura de CPU AARCH64. Esto está destinado a las comprobaciones de integración continua que se ejecutan en servidores Linux. + +La compilación cruzada para AARCH64 se basa en el [Instrucciones de construcción](build.md), seguirlos primero. + +# Instalar Clang-8 {#install-clang-8} + +Siga las instrucciones de https://apt.llvm.org/ para la configuración de Ubuntu o Debian. +Por ejemplo, en Ubuntu Bionic puede usar los siguientes comandos: + +``` bash +echo "deb [trusted=yes] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-8 main" | sudo tee /etc/apt/sources.list.d/llvm.list +sudo apt-get update +sudo apt-get install clang-8 +``` + +# Instalar conjunto de herramientas de compilación cruzada {#install-cross-compilation-toolset} + +``` bash +cd ClickHouse +mkdir -p build-aarch64/cmake/toolchain/linux-aarch64 +wget 'https://developer.arm.com/-/media/Files/downloads/gnu-a/8.3-2019.03/binrel/gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz?revision=2e88a73f-d233-4f96-b1f4-d8b36e9bb0b9&la=en' -O gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz +tar xJf gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz -C build-aarch64/cmake/toolchain/linux-aarch64 --strip-components=1 +``` + +# Construir ClickHouse {#build-clickhouse} + +``` bash +cd ClickHouse +mkdir build-arm64 +CC=clang-8 CXX=clang++-8 cmake . -Bbuild-arm64 -DCMAKE_TOOLCHAIN_FILE=cmake/linux/toolchain-aarch64.cmake +ninja -C build-arm64 +``` + +El binario resultante se ejecutará solo en Linux con la arquitectura de CPU AARCH64. diff --git a/docs/es/development/build_cross_osx.md b/docs/es/development/build_cross_osx.md new file mode 100644 index 00000000000..062a049bd53 --- /dev/null +++ b/docs/es/development/build_cross_osx.md @@ -0,0 +1,57 @@ +# Cómo construir ClickHouse en Linux para Mac OS X {#how-to-build-clickhouse-on-linux-for-mac-os-x} + +Esto es para el caso cuando tiene una máquina Linux y desea usarla para compilar `clickhouse` Esto está destinado a las comprobaciones de integración continuas que se ejecutan en servidores Linux. Si desea crear ClickHouse directamente en Mac OS X, continúe con [otra instrucción](build_osx.md). + +La compilación cruzada para Mac OS X se basa en el [Instrucciones de construcción](build.md), seguirlos primero. 
+ +# Instalar Clang-8 {#install-clang-8} + +Siga las instrucciones de https://apt.llvm.org/ para la configuración de Ubuntu o Debian. +Por ejemplo, los comandos para Bionic son como: + +``` bash +sudo echo "deb [trusted=yes] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-8 main" >> /etc/apt/sources.list +sudo apt-get install clang-8 +``` + +# Instalar conjunto de herramientas de compilación cruzada {#install-cross-compilation-toolset} + +Recordemos la ruta donde instalamos `cctools` Como ${CCTOOLS} + +``` bash +mkdir ${CCTOOLS} + +git clone https://github.com/tpoechtrager/apple-libtapi.git +cd apple-libtapi +INSTALLPREFIX=${CCTOOLS} ./build.sh +./install.sh +cd .. + +git clone https://github.com/tpoechtrager/cctools-port.git +cd cctools-port/cctools +./configure --prefix=${CCTOOLS} --with-libtapi=${CCTOOLS} --target=x86_64-apple-darwin +make install +``` + +Además, necesitamos descargar macOS X SDK en el árbol de trabajo. + +``` bash +cd ClickHouse +wget 'https://github.com/phracker/MacOSX-SDKs/releases/download/10.14-beta4/MacOSX10.14.sdk.tar.xz' +mkdir -p build-darwin/cmake/toolchain/darwin-x86_64 +tar xJf MacOSX10.14.sdk.tar.xz -C build-darwin/cmake/toolchain/darwin-x86_64 --strip-components=1 +``` + +# Construir ClickHouse {#build-clickhouse} + +``` bash +cd ClickHouse +mkdir build-osx +CC=clang-8 CXX=clang++-8 cmake . -Bbuild-osx -DCMAKE_TOOLCHAIN_FILE=cmake/darwin/toolchain-x86_64.cmake \ + -DCMAKE_AR:FILEPATH=${CCTOOLS}/bin/x86_64-apple-darwin-ar \ + -DCMAKE_RANLIB:FILEPATH=${CCTOOLS}/bin/x86_64-apple-darwin-ranlib \ + -DLINKER_NAME=${CCTOOLS}/bin/x86_64-apple-darwin-ld +ninja -C build-osx +``` + +El binario resultante tendrá un formato ejecutable Mach-O y no se puede ejecutar en Linux. diff --git a/docs/es/development/build_osx.md b/docs/es/development/build_osx.md new file mode 100644 index 00000000000..653046fc221 --- /dev/null +++ b/docs/es/development/build_osx.md @@ -0,0 +1,86 @@ +# Cómo crear ClickHouse en Mac OS X {#how-to-build-clickhouse-on-mac-os-x} + +Build debería funcionar en Mac OS X 10.15 (Catalina) + +## Instalar Homebrew {#install-homebrew} + +``` bash +$ /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" +``` + +## Instalar compiladores, herramientas y bibliotecas necesarios {#install-required-compilers-tools-and-libraries} + +``` bash +$ brew install cmake ninja libtool gettext +``` + +## Fuentes de ClickHouse de pago {#checkout-clickhouse-sources} + +``` bash +$ git clone --recursive git@github.com:ClickHouse/ClickHouse.git +``` + +o + +``` bash +$ git clone --recursive https://github.com/ClickHouse/ClickHouse.git + +$ cd ClickHouse +``` + +## Construir ClickHouse {#build-clickhouse} + +``` bash +$ mkdir build +$ cd build +$ cmake .. -DCMAKE_CXX_COMPILER=`which clang++` -DCMAKE_C_COMPILER=`which clang` +$ ninja +$ cd .. +``` + +## Advertencia {#caveats} + +Si tiene la intención de ejecutar clickhouse-server, asegúrese de aumentar la variable maxfiles del sistema. + +!!! info "Nota" + Tendrá que utilizar sudo. + +Para ello, cree el siguiente archivo: + +/Library/LaunchDaemons/limit.maxfiles.lista: + +``` xml + + + + + Label + limit.maxfiles + ProgramArguments + + launchctl + limit + maxfiles + 524288 + 524288 + + RunAtLoad + + ServiceIPC + + + +``` + +Ejecute el siguiente comando: + +``` bash +$ sudo chown root:wheel /Library/LaunchDaemons/limit.maxfiles.plist +``` + +Reiniciar. + +Para comprobar si está funcionando, puede usar `ulimit -n` comando. 
+ +[Artículo Original](https://clickhouse.tech/docs/es/development/build_osx/) diff --git a/docs/es/development/contrib.md b/docs/es/development/contrib.md new file mode 100644 index 00000000000..456a38fc1ca --- /dev/null +++ b/docs/es/development/contrib.md @@ -0,0 +1,35 @@ +# Bibliotecas de terceros utilizadas {#third-party-libraries-used} + +| Biblioteca | Licencia | +|--------------------|--------------------------------------------------------------------------------------------------------------------------------------------------| +| base64 | [Licencia BSD de 2 cláusulas](https://github.com/aklomp/base64/blob/a27c565d1b6c676beaf297fe503c4518185666f7/LICENSE) | +| Impulsar | [Licencia de software Boost 1.0](https://github.com/ClickHouse-Extras/boost-extra/blob/6883b40449f378019aec792f9983ce3afc7ff16e/LICENSE_1_0.txt) | +| Bienvenido | [MIT](https://github.com/google/brotli/blob/master/LICENSE) | +| capnproto | [MIT](https://github.com/capnproto/capnproto/blob/master/LICENSE) | +| Cctz | [Licencia Apache 2.0](https://github.com/google/cctz/blob/4f9776a310f4952454636363def82c2bf6641d5f/LICENSE.txt) | +| doble conversión | [Licencia de 3 cláusulas BSD](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) | +| FastMemcpy | [MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) | +| Más información | [Licencia de 3 cláusulas BSD](https://github.com/google/googletest/blob/master/LICENSE) | +| Hombre | [Licencia Apache 2.0](https://github.com/uber/h3/blob/master/LICENSE) | +| hyperscan | [Licencia de 3 cláusulas BSD](https://github.com/intel/hyperscan/blob/master/LICENSE) | +| libbtrie | [Licencia BSD de 2 cláusulas](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) | +| libcxxabi | [Sistema abierto.](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | +| libdivide | [Licencia Zlib](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | +| libgsasl | [Información adicional](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) | +| libhdfs3 | [Licencia Apache 2.0](https://github.com/ClickHouse-Extras/libhdfs3/blob/bd6505cbb0c130b0db695305b9a38546fa880e5a/LICENSE.txt) | +| libmetrohash | [Licencia Apache 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libmetrohash/LICENSE) | +| Libpcg-al azar | [Licencia Apache 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libpcg-random/LICENSE-APACHE.txt) | +| Libressl | [Licencia OpenSSL](https://github.com/ClickHouse-Extras/ssl/blob/master/COPYING) | +| Librdkafka | [Licencia BSD de 2 cláusulas](https://github.com/edenhill/librdkafka/blob/363dcad5a23dc29381cc626620e68ae418b3af19/LICENSE) | +| libwidechar\_width | [CC0 1.0 Universal](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libwidechar_width/LICENSE) | +| llvm | [Licencia de 3 cláusulas BSD](https://github.com/ClickHouse-Extras/llvm/blob/163def217817c90fb982a6daf384744d8472b92b/llvm/LICENSE.TXT) | +| lz4 | [Licencia BSD de 2 cláusulas](https://github.com/lz4/lz4/blob/c10863b98e1503af90616ae99725ecd120265dfb/LICENSE) | +| Mariadb-conector-c | [Información adicional](https://github.com/ClickHouse-Extras/mariadb-connector-c/blob/3.1/COPYING.LIB) | +| murmurhash | [Dominio público](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/murmurhash/LICENSE) | +| pdqsort | [Licencia 
Zlib](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/pdqsort/license.txt) | +| Poco | [Boost Software License - Versión 1.0](https://github.com/ClickHouse-Extras/poco/blob/fe5505e56c27b6ecb0dcbc40c49dc2caf4e9637f/LICENSE) | +| protobuf | [Licencia de 3 cláusulas BSD](https://github.com/ClickHouse-Extras/protobuf/blob/12735370922a35f03999afff478e1c6d7aa917a4/LICENSE) | +| Re2 | [Licencia de 3 cláusulas BSD](https://github.com/google/re2/blob/7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0/LICENSE) | +| UnixODBC | [Información adicional](https://github.com/ClickHouse-Extras/UnixODBC/tree/b0ad30f7f6289c12b76f04bfb9d466374bb32168) | +| Sistema abierto. | [Licencia Zlib](https://github.com/ClickHouse-Extras/zlib-ng/blob/develop/LICENSE.md) | +| zstd | [Licencia de 3 cláusulas BSD](https://github.com/facebook/zstd/blob/dev/LICENSE) | diff --git a/docs/es/development/developer_instruction.md b/docs/es/development/developer_instruction.md new file mode 100644 index 00000000000..17fc8bb7d90 --- /dev/null +++ b/docs/es/development/developer_instruction.md @@ -0,0 +1,278 @@ +La construcción de ClickHouse es compatible con Linux, FreeBSD y Mac OS X. + +# Si utiliza Windows {#if-you-use-windows} + +Si usa Windows, necesita crear una máquina virtual con Ubuntu. Para comenzar a trabajar con una máquina virtual, instale VirtualBox. Puede descargar Ubuntu desde el sitio web: https://www.ubuntu.com/\#download. Por favor, cree una máquina virtual a partir de la imagen descargada (debe reservar al menos 4 GB de RAM para ello). Para ejecutar un terminal de línea de comandos en Ubuntu, busque un programa que contenga la palabra “terminal” es su nombre (gnome-terminal, konsole, etc.) o simplemente presione Ctrl + Alt + T. + +# Si utiliza un sistema de 32 bits {#if-you-use-a-32-bit-system} + +ClickHouse no puede funcionar ni construir en un sistema de 32 bits. Debe adquirir acceso a un sistema de 64 bits y puede continuar leyendo. + +# Crear un repositorio en GitHub {#creating-a-repository-on-github} + +Para comenzar a trabajar con el repositorio de ClickHouse, necesitará una cuenta de GitHub. + +Probablemente ya tenga uno, pero si no lo hace, regístrese en https://github.com. En caso de que no tenga claves SSH, debe generarlas y luego cargarlas en GitHub. Es necesario para enviar a través de sus parches. También es posible usar las mismas claves SSH que usa con cualquier otro servidor SSH, probablemente ya las tenga. + +Cree una bifurcación del repositorio ClickHouse. Para hacerlo por favor haga clic en el “fork” botón en la esquina superior derecha en https://github.com/ClickHouse/ClickHouse . Se bifurcará su propia copia de ClickHouse/ClickHouse a su cuenta. + +El proceso de desarrollo consiste en comprometer primero los cambios previstos en su bifurcación de ClickHouse y luego crear un “pull request” Para que estos cambios sean aceptados en el repositorio principal (ClickHouse / ClickHouse). + +Para trabajar con repositorios git, instale `git`. + +Para hacer eso en Ubuntu, ejecutaría en la terminal de línea de comandos: + + sudo apt update + sudo apt install git + +Puede encontrar un breve manual sobre el uso de Git aquí: https://services.github.com/on-demand/downloads/github-git-cheat-sheet.pdf . +Para obtener un manual detallado sobre Git, consulte https://git-scm.com/book/ru/v2 . + +# Clonación de un repositorio en su máquina de desarrollo {#cloning-a-repository-to-your-development-machine} + +A continuación, debe descargar los archivos fuente en su máquina de trabajo. 
Esto se llama “to clone a repository” porque crea una copia local del repositorio en su máquina de trabajo.

En el terminal de línea de comandos, ejecute:

    git clone --recursive git@github.com:your_github_username/ClickHouse.git
    cd ClickHouse

Nota: por favor, sustituya *your\_github\_username* por su nombre de usuario de GitHub.

Este comando creará un directorio `ClickHouse` que contiene la copia de trabajo del proyecto.

Es importante que la ruta al directorio de trabajo no contenga espacios en blanco, ya que puede ocasionar problemas con la ejecución del sistema de compilación.

Tenga en cuenta que el repositorio ClickHouse utiliza `submodules`. Así es como se llaman las referencias a repositorios adicionales (es decir, bibliotecas externas de las que depende el proyecto). Esto significa que, al clonar el repositorio, debe especificar la opción `--recursive`, como en el ejemplo anterior. Si el repositorio se ha clonado sin submódulos, para descargarlos debe ejecutar lo siguiente:

    git submodule init
    git submodule update

Puede verificar el estado con el comando: `git submodule status`.

Si recibe el siguiente mensaje de error:

    Permission denied (publickey).
    fatal: Could not read from remote repository.

    Please make sure you have the correct access rights
    and the repository exists.

Por lo general, significa que faltan las claves SSH para conectarse a GitHub. Estas claves se encuentran normalmente en `~/.ssh`. Para que las claves SSH sean aceptadas, debe cargarlas en la sección de configuración de la interfaz de usuario de GitHub.

También puede clonar el repositorio a través del protocolo https:

    git clone https://github.com/ClickHouse/ClickHouse.git

Sin embargo, esto no le permitirá enviar sus cambios al servidor. Aún puede usarlo temporalmente y agregar las claves SSH más tarde, reemplazando la dirección remota del repositorio con el comando `git remote`.

También puede agregar la dirección original del repositorio de ClickHouse a su repositorio local para extraer actualizaciones desde allí:

    git remote add upstream git@github.com:ClickHouse/ClickHouse.git

Después de ejecutar con éxito este comando, podrá extraer actualizaciones del repositorio principal de ClickHouse ejecutando `git pull upstream master`.

## Trabajar con submódulos {#working-with-submodules}

Trabajar con submódulos en git puede ser doloroso. Los siguientes comandos ayudarán a administrarlos:

    # ! each command accepts --recursive
    # Update remote URLs for submodules. Barely rare case
    git submodule sync
    # Add new submodules
    git submodule init
    # Update existing submodules to the current state
    git submodule update
    # Two last commands could be merged together
    git submodule update --init

Los siguientes comandos le ayudarán a restablecer todos los submódulos al estado inicial (¡ADVERTENCIA!
- cualquier cambio en el interior será eliminado): + + # Synchronizes submodules' remote URL with .gitmodules + git submodule sync --recursive + # Update the registered submodules with initialize not yet initialized + git submodule update --init --recursive + # Reset all changes done after HEAD + git submodule foreach git reset --hard + # Clean files from .gitignore + git submodule foreach git clean -xfd + # Repeat last 4 commands for all submodule + git submodule foreach git submodule sync --recursive + git submodule foreach git submodule update --init --recursive + git submodule foreach git submodule foreach git reset --hard + git submodule foreach git submodule foreach git clean -xfd + +# Sistema de construcción {#build-system} + +ClickHouse utiliza CMake y Ninja para la construcción. + +CMake - un sistema de meta-construcción que puede generar archivos Ninja (tareas de construcción). +Ninja: un sistema de compilación más pequeño con un enfoque en la velocidad utilizada para ejecutar esas tareas generadas por cmake. + +Para instalar en Ubuntu, Debian o Mint run `sudo apt install cmake ninja-build`. + +En CentOS, RedHat se ejecuta `sudo yum install cmake ninja-build`. + +Si usa Arch o Gentoo, probablemente lo sepa usted mismo cómo instalar CMake. + +Para instalar CMake y Ninja en Mac OS X, primero instale Homebrew y luego instale todo lo demás a través de brew: + + /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" + brew install cmake ninja + +A continuación, verifique la versión de CMake: `cmake --version`. Si está por debajo de 3.3, debe instalar una versión más reciente desde el sitio web: https://cmake.org/download/. + +# Bibliotecas externas opcionales {#optional-external-libraries} + +ClickHouse utiliza varias bibliotecas externas para la construcción. Todos ellos no necesitan ser instalados por separado, ya que se construyen junto con ClickHouse a partir de las fuentes ubicadas en los submódulos. Puede consultar la lista en `contrib`. + +# Compilador de C ++ {#c-compiler} + +Los compiladores GCC a partir de la versión 9 y Clang versión 8 o superior son compatibles para construir ClickHouse. + +Las compilaciones oficiales de Yandex actualmente usan GCC porque genera código de máquina de un rendimiento ligeramente mejor (con una diferencia de hasta varios por ciento según nuestros puntos de referencia). Y Clang es más conveniente para el desarrollo generalmente. Sin embargo, nuestra plataforma de integración continua (CI) ejecuta verificaciones de aproximadamente una docena de combinaciones de compilación. + +Para instalar GCC en Ubuntu, ejecute: `sudo apt install gcc g++` + +Compruebe la versión de gcc: `gcc --version`. Si está por debajo de 9, siga las instrucciones aquí: https://clickhouse .tech/docs/es/development/build/\#install-gcc-9. + +La compilación de Mac OS X solo es compatible con Clang. Sólo tiene que ejecutar `brew install llvm` + +Si decide utilizar Clang, también puede instalar `libc++` y `lld` sabe lo que es. Utilizar `ccache` también se recomienda. + +# El proceso de construcción {#the-building-process} + +Ahora que está listo para construir ClickHouse, le recomendamos que cree un directorio separado `build` Dentro `ClickHouse` que contendrá todos los de la generación de artefactos: + + mkdir build + cd build + +Puede tener varios directorios diferentes (build\_release, build\_debug, etc.) para diferentes tipos de construcción. 
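A modo de ejemplo, y suponiendo que las variables del compilador ya están definidas como se explica a continuación, un esquema posible con dos directorios de compilación separados podría ser el siguiente (los nombres de los directorios son solo ilustrativos):

``` bash
# Desde la raíz de ClickHouse, un directorio por tipo de compilación
mkdir build_release build_debug

# Configuración de lanzamiento (Release es un tipo de compilación estándar de CMake)
cd build_release && cmake -D CMAKE_BUILD_TYPE=Release .. && cd ..

# Configuración de depuración, sin optimizaciones
cd build_debug && cmake -D CMAKE_BUILD_TYPE=Debug .. && cd ..

# Cada directorio conserva su propia configuración y sus propios artefactos
ninja -C build_debug clickhouse-server clickhouse-client
```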
+ +Mientras que dentro de la `build` directorio, configure su compilación ejecutando CMake. Antes de la primera ejecución, debe definir variables de entorno que especifiquen el compilador (compilador gcc versión 9 en este ejemplo). + +Linux: + + export CC=gcc-9 CXX=g++-9 + cmake .. + +Mac OS X: + + export CC=clang CXX=clang++ + cmake .. + +El `CC` variable especifica el compilador para C (abreviatura de C Compiler), y `CXX` variable indica qué compilador de C ++ se usará para compilar. + +Para una construcción más rápida, puede recurrir al `debug` tipo de compilación: una compilación sin optimizaciones. Para ese suministro el siguiente parámetro `-D CMAKE_BUILD_TYPE=Debug`: + + cmake -D CMAKE_BUILD_TYPE=Debug .. + +Puede cambiar el tipo de compilación ejecutando este comando en el `build` Directorio. + +Ejecutar ninja para construir: + + ninja clickhouse-server clickhouse-client + +Solo los binarios requeridos se van a construir en este ejemplo. + +Si necesita construir todos los binarios (utilidades y pruebas), debe ejecutar ninja sin parámetros: + + ninja + +La compilación completa requiere aproximadamente 30 GB de espacio libre en disco o 15 GB para construir los binarios principales. + +Cuando hay una gran cantidad de RAM disponible en la máquina de compilación, debe limitar el número de tareas de compilación que se ejecutan en paralelo con `-j` parámetro: + + ninja -j 1 clickhouse-server clickhouse-client + +En máquinas con 4GB de RAM, se recomienda especificar 1, para 8GB de RAM `-j 2` se recomienda. + +Si recibe el mensaje: `ninja: error: loading 'build.ninja': No such file or directory`, significa que la generación de una configuración de compilación ha fallado y necesita inspeccionar el mensaje anterior. + +Cuando se inicie correctamente el proceso de construcción, verá el progreso de construcción: el número de tareas procesadas y el número total de tareas. + +Al crear mensajes sobre archivos protobuf en la biblioteca libhdfs2, como `libprotobuf WARNING` puede aparecer. Afectan a nada y son seguros para ser ignorado. + +Tras la compilación exitosa, obtienes un archivo ejecutable `ClickHouse//dbms/programs/clickhouse`: + + ls -l dbms/programs/clickhouse + +# Ejecutando el ejecutable construido de ClickHouse {#running-the-built-executable-of-clickhouse} + +Para ejecutar el servidor bajo el usuario actual, debe navegar hasta `ClickHouse/dbms/programs/server/` (situado fuera de `build`) y ejecutar: + + ../../../build/dbms/programs/clickhouse server + +En este caso, ClickHouse usará archivos de configuración ubicados en el directorio actual. Puede ejecutar `clickhouse server` desde cualquier directorio que especifique la ruta a un archivo de configuración como un parámetro de línea de comandos `--config-file`. + +Para conectarse a ClickHouse con clickhouse-client en otro terminal, vaya a `ClickHouse/build/dbms/programs/` y ejecutar `clickhouse client`. + +Si usted consigue `Connection refused` mensaje en Mac OS X o FreeBSD, intente especificar la dirección de host 127.0.0.1: + + clickhouse client --host 127.0.0.1 + +Puede reemplazar la versión de producción del binario ClickHouse instalado en su sistema con su binario ClickHouse personalizado. Para ello, instale ClickHouse en su máquina siguiendo las instrucciones del sitio web oficial. 
A continuación, ejecute lo siguiente: + + sudo service clickhouse-server stop + sudo cp ClickHouse/build/dbms/programs/clickhouse /usr/bin/ + sudo service clickhouse-server start + +Tenga en cuenta que `clickhouse-client`, `clickhouse-server` y otros son enlaces simbólicos a los comúnmente compartidos `clickhouse` Binario. + +También puede ejecutar su binario ClickHouse personalizado con el archivo de configuración del paquete ClickHouse instalado en su sistema: + + sudo service clickhouse-server stop + sudo -u clickhouse ClickHouse/build/dbms/programs/clickhouse server --config-file /etc/clickhouse-server/config.xml + +# IDE (entorno de desarrollo integrado) {#ide-integrated-development-environment} + +Si no sabe qué IDE usar, le recomendamos que use CLion. CLion es un software comercial, pero ofrece un período de prueba gratuito de 30 días. También es gratuito para los estudiantes. CLion se puede usar tanto en Linux como en Mac OS X. + +KDevelop y QTCreator son otras excelentes alternativas de un IDE para desarrollar ClickHouse. KDevelop viene como un IDE muy útil aunque inestable. Si KDevelop se bloquea después de un tiempo al abrir el proyecto, debe hacer clic “Stop All” botón tan pronto como se ha abierto la lista de archivos del proyecto. Después de hacerlo, KDevelop debería estar bien para trabajar. + +Como editores de código simples, puede usar Sublime Text o Visual Studio Code, o Kate (todos los cuales están disponibles en Linux). + +Por si acaso, vale la pena mencionar que CLion crea `build` por sí mismo, también por sí mismo selecciona `debug` para el tipo de compilación, para la configuración usa una versión de CMake que está definida en CLion y no la instalada por usted, y finalmente, CLion usará `make` para ejecutar tareas de compilación en lugar de `ninja`. Este es un comportamiento normal, solo tenlo en cuenta para evitar confusiones. + +# Código de escritura {#writing-code} + +La descripción de la arquitectura ClickHouse se puede encontrar aquí: https://clickhouse.tech/docs/es/desarrollo/arquitectura/ + +La Guía de estilo de código: https://clickhouse.tech/docs/es/development/style/ + +Pruebas de escritura: https://clickhouse.tech/docs/es/development/tests/ + +Lista de tareas: https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/instructions/easy\_tasks\_sorted\_en.md + +# Datos de prueba {#test-data} + +El desarrollo de ClickHouse a menudo requiere cargar conjuntos de datos realistas. Es particularmente importante para las pruebas de rendimiento. Tenemos un conjunto especialmente preparado de datos anónimos de Yandex.Métrica. Se requiere, además, unos 3 GB de espacio libre en disco. Tenga en cuenta que estos datos no son necesarios para realizar la mayoría de las tareas de desarrollo. 
+ + sudo apt install wget xz-utils + + wget https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz + wget https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz + + xz -v -d hits_v1.tsv.xz + xz -v -d visits_v1.tsv.xz + + clickhouse-client + + CREATE TABLE test.hits ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree PARTITION BY toYYYYMM(EventDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID), EventTime); + + CREATE TABLE test.visits ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, 
StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), `Goals.ID` Array(UInt32), `Goals.Serial` Array(UInt32), `Goals.EventTime` Array(DateTime), `Goals.Price` Array(Int64), `Goals.OrderID` Array(String), `Goals.CurrencyID` Array(UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, `TraficSource.ID` Array(Int8), `TraficSource.SearchEngineID` Array(UInt16), `TraficSource.AdvEngineID` Array(UInt8), `TraficSource.PlaceID` Array(UInt16), `TraficSource.SocialSourceNetworkID` Array(UInt8), `TraficSource.Domain` Array(String), `TraficSource.SearchPhrase` Array(String), `TraficSource.SocialSourcePage` Array(String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, `ParsedParams.Key1` Array(String), 
`ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), `Market.Type` Array(UInt8), `Market.GoalID` Array(UInt32), `Market.OrderID` Array(String), `Market.OrderPrice` Array(Int64), `Market.PP` Array(UInt32), `Market.DirectPlaceID` Array(UInt32), `Market.DirectOrderID` Array(UInt32), `Market.DirectBannerID` Array(UInt32), `Market.GoodID` Array(String), `Market.GoodName` Array(String), `Market.GoodQuantity` Array(Int32), `Market.GoodPrice` Array(Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID); + + clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.hits FORMAT TSV" < hits_v1.tsv + clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.visits FORMAT TSV" < visits_v1.tsv + +# Creación de solicitud de extracción {#creating-pull-request} + +Navega a tu repositorio de fork en la interfaz de usuario de GitHub. Si ha estado desarrollando en una sucursal, debe seleccionar esa sucursal. Habrá un “Pull request” botón situado en la pantalla. En esencia, esto significa “create a request for accepting my changes into the main repository”. + +Se puede crear una solicitud de extracción incluso si el trabajo aún no se ha completado. En este caso, por favor ponga la palabra “WIP” (trabajo en curso) al comienzo del título, se puede cambiar más tarde. Esto es útil para la revisión cooperativa y la discusión de los cambios, así como para ejecutar todas las pruebas disponibles. Es importante que proporcione una breve descripción de sus cambios, que más tarde se utilizará para generar registros de cambios de lanzamiento. + +Las pruebas comenzarán tan pronto como los empleados de Yandex etiqueten su PR con una etiqueta “can be tested”. Los resultados de algunas primeras comprobaciones (por ejemplo, el estilo de código) llegarán en varios minutos. Los resultados de la comprobación de compilación llegarán dentro de media hora. Y el conjunto principal de pruebas se informará dentro de una hora. + +El sistema preparará compilaciones binarias ClickHouse para su solicitud de extracción individualmente. Para recuperar estas compilaciones, haga clic en “Details” junto al enlace “ClickHouse build check” en la lista de cheques. Allí encontrará enlaces directos a la construcción.deb paquetes de ClickHouse que puede implementar incluso en sus servidores de producción (si no tiene miedo). + +Lo más probable es que algunas de las compilaciones fallen las primeras veces. Esto se debe al hecho de que verificamos las compilaciones tanto con gcc como con clang, con casi todas las advertencias existentes (siempre con el `-Werror` bandera) habilitado para sonido. En esa misma página, puede encontrar todos los registros de compilación para que no tenga que compilar ClickHouse de todas las formas posibles. 
diff --git a/docs/es/development/index.md b/docs/es/development/index.md new file mode 100644 index 00000000000..c7d13caf3cd --- /dev/null +++ b/docs/es/development/index.md @@ -0,0 +1,3 @@ +# Desarrollo de ClickHouse {#clickhouse-development} + +[Artículo Original](https://clickhouse.tech/docs/es/development/) diff --git a/docs/es/development/style.md b/docs/es/development/style.md new file mode 100644 index 00000000000..207ebe65f13 --- /dev/null +++ b/docs/es/development/style.md @@ -0,0 +1,834 @@ +# Cómo escribir código C ++ {#how-to-write-c-code} + +## Recomendaciones generales {#general-recommendations} + +**1.** Las siguientes son recomendaciones, no requisitos. + +**2.** Si está editando código, tiene sentido seguir el formato del código existente. + +**3.** El estilo de código es necesario para la coherencia. La consistencia facilita la lectura del código y también facilita la búsqueda del código. + +**4.** Muchas de las reglas no tienen razones lógicas; están dictadas por prácticas establecidas. + +## Formatear {#formatting} + +**1.** La mayor parte del formato se realizará automáticamente por `clang-format`. + +**2.** Las sangrías son 4 espacios. Configure el entorno de desarrollo para que una pestaña agregue cuatro espacios. + +**3.** Abrir y cerrar llaves deben estar en una línea separada. + +``` cpp +inline void readBoolText(bool & x, ReadBuffer & buf) +{ + char tmp = '0'; + readChar(tmp, buf); + x = tmp != '0'; +} +``` + +**4.** Si todo el cuerpo de la función es `statement`, se puede colocar en una sola línea. Coloque espacios alrededor de llaves (además del espacio al final de la línea). + +``` cpp +inline size_t mask() const { return buf_size() - 1; } +inline size_t place(HashValue x) const { return x & mask(); } +``` + +**5.** Para funciones. No coloque espacios alrededor de los corchetes. + +``` cpp +void reinsert(const Value & x) +``` + +``` cpp +memcpy(&buf[place_value], &x, sizeof(x)); +``` + +**6.** En `if`, `for`, `while` y otras expresiones, se inserta un espacio delante del corchete de apertura (a diferencia de las llamadas a funciones). + +``` cpp +for (size_t i = 0; i < rows; i += storage.index_granularity) +``` + +**7.** Agregar espacios alrededor de los operadores binarios (`+`, `-`, `*`, `/`, `%`, …) y el operador ternario `?:`. + +``` cpp +UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); +UInt8 month = (s[5] - '0') * 10 + (s[6] - '0'); +UInt8 day = (s[8] - '0') * 10 + (s[9] - '0'); +``` + +**8.** Si se introduce un avance de línea, coloque al operador en una nueva línea y aumente la sangría antes de ella. + +``` cpp +if (elapsed_ns) + message << " (" + << rows_read_on_server * 1000000000 / elapsed_ns << " rows/s., " + << bytes_read_on_server * 1000.0 / elapsed_ns << " MB/s.) "; +``` + +**9.** Puede utilizar espacios para la alineación dentro de una línea, si lo desea. + +``` cpp +dst.ClickLogID = click.LogID; +dst.ClickEventID = click.EventID; +dst.ClickGoodEvent = click.GoodEvent; +``` + +**10.** No utilice espacios alrededor de los operadores `.`, `->`. + +Si es necesario, el operador se puede envolver a la siguiente línea. En este caso, el desplazamiento frente a él aumenta. + +**11.** No utilice un espacio para separar los operadores unarios (`--`, `++`, `*`, `&`, …) del argumento. + +**12.** Pon un espacio después de una coma, pero no antes. La misma regla se aplica a un punto y coma dentro de un `for` expresión. + +**13.** No utilice espacios para separar el `[]` operador. 
+ +**14.** En un `template <...>` expresión, use un espacio entre `template` y `<`; sin espacios después de `<` o antes `>`. + +``` cpp +template +struct AggregatedStatElement +{} +``` + +**15.** En clases y estructuras, escribe `public`, `private`, y `protected` es el mismo nivel que `class/struct`, y sangrar el resto del código. + +``` cpp +template +class MultiVersion +{ +public: + /// Version of object for usage. shared_ptr manage lifetime of version. + using Version = std::shared_ptr; + ... +} +``` + +**16.** Si el mismo `namespace` se utiliza para todo el archivo, y no hay nada más significativo, un desplazamiento no es necesario dentro `namespace`. + +**17.** Si el bloque para un `if`, `for`, `while`, u otra expresión consiste en una sola `statement`, las llaves son opcionales. Coloque el `statement` en una línea separada, en su lugar. Esta regla también es válida para `if`, `for`, `while`, … + +Pero si el interior `statement` contiene llaves o `else`, el bloque externo debe escribirse entre llaves. + +``` cpp +/// Finish write. +for (auto & stream : streams) + stream.second->finalize(); +``` + +**18.** No debería haber espacios al final de las líneas. + +**19.** Los archivos de origen están codificados en UTF-8. + +**20.** Los caracteres no ASCII se pueden usar en literales de cadena. + +``` cpp +<< ", " << (timer.elapsed() / chunks_stats.hits) << " μsec/hit."; +``` + +**21.** No escriba varias expresiones en una sola línea. + +**22.** Agrupe secciones de código dentro de las funciones y sepárelas con no más de una línea vacía. + +**23.** Separe funciones, clases, etc. con una o dos líneas vacías. + +**24.** `A const` (relacionado con un valor) debe escribirse antes del nombre del tipo. + +``` cpp +//correct +const char * pos +const std::string & s +//incorrect +char const * pos +``` + +**25.** Al declarar un puntero o referencia, el `*` y `&` Los símbolos deben estar separados por espacios en ambos lados. + +``` cpp +//correct +const char * pos +//incorrect +const char* pos +const char *pos +``` + +**26.** Cuando utilice tipos de plantilla, alias con el `using` palabra clave (excepto en los casos más simples). + +En otras palabras, los parámetros de la plantilla se especifican solo en `using` y no se repiten en el código. + +`using` se puede declarar localmente, como dentro de una función. + +``` cpp +//correct +using FileStreams = std::map>; +FileStreams streams; +//incorrect +std::map> streams; +``` + +**27.** No declare varias variables de diferentes tipos en una instrucción. + +``` cpp +//incorrect +int x, *y; +``` + +**28.** No utilice moldes de estilo C. + +``` cpp +//incorrect +std::cerr << (int)c <<; std::endl; +//correct +std::cerr << static_cast(c) << std::endl; +``` + +**29.** En clases y estructuras, los miembros del grupo y las funciones por separado dentro de cada ámbito de visibilidad. + +**30.** Para clases y estructuras pequeñas, no es necesario separar la declaración del método de la implementación. + +Lo mismo es cierto para los métodos pequeños en cualquier clase o estructura. + +Para las clases y estructuras con plantillas, no separe las declaraciones de métodos de la implementación (porque de lo contrario deben definirse en la misma unidad de traducción). + +**31.** Puede ajustar líneas en 140 caracteres, en lugar de 80. + +**32.** Utilice siempre los operadores de incremento / decremento de prefijo si no se requiere postfix. 
+ +``` cpp +for (Names::const_iterator it = column_names.begin(); it != column_names.end(); ++it) +``` + +## Comentario {#comments} + +**1.** Asegúrese de agregar comentarios para todas las partes no triviales del código. + +Esto es muy importante. Escribir el comentario puede ayudarte a darte cuenta de que el código no es necesario o que está diseñado incorrectamente. + +``` cpp +/** Part of piece of memory, that can be used. + * For example, if internal_buffer is 1MB, and there was only 10 bytes loaded to buffer from file for reading, + * then working_buffer will have size of only 10 bytes + * (working_buffer.end() will point to position right after those 10 bytes available for read). + */ +``` + +**2.** Los comentarios pueden ser tan detallados como sea necesario. + +**3.** Coloque comentarios antes del código que describen. En casos raros, los comentarios pueden aparecer después del código, en la misma línea. + +``` cpp +/** Parses and executes the query. +*/ +void executeQuery( + ReadBuffer & istr, /// Where to read the query from (and data for INSERT, if applicable) + WriteBuffer & ostr, /// Where to write the result + Context & context, /// DB, tables, data types, engines, functions, aggregate functions... + BlockInputStreamPtr & query_plan, /// Here could be written the description on how query was executed + QueryProcessingStage::Enum stage = QueryProcessingStage::Complete /// Up to which stage process the SELECT query + ) +``` + +**4.** Los comentarios deben escribirse en inglés solamente. + +**5.** Si está escribiendo una biblioteca, incluya comentarios detallados que la expliquen en el archivo de encabezado principal. + +**6.** No agregue comentarios que no proporcionen información adicional. En particular, no deje comentarios vacíos como este: + +``` cpp +/* +* Procedure Name: +* Original procedure name: +* Author: +* Date of creation: +* Dates of modification: +* Modification authors: +* Original file name: +* Purpose: +* Intent: +* Designation: +* Classes used: +* Constants: +* Local variables: +* Parameters: +* Date of creation: +* Purpose: +*/ +``` + +El ejemplo se toma prestado del recurso http://home.tamk.fi/~jaalto/course/coding-style/doc/unmaintainable-code/. + +**7.** No escriba comentarios de basura (autor, fecha de creación ..) al principio de cada archivo. + +**8.** Los comentarios de una sola línea comienzan con tres barras: `///` y los comentarios de varias líneas comienzan con `/**`. Estos comentarios son considerados “documentation”. + +Nota: Puede usar Doxygen para generar documentación a partir de estos comentarios. Pero Doxygen no se usa generalmente porque es más conveniente navegar por el código en el IDE. + +**9.** Los comentarios de varias líneas no deben tener líneas vacías al principio y al final (excepto la línea que cierra un comentario de varias líneas). + +**10.** Para comentar el código, use comentarios básicos, no “documenting” comentario. + +**11.** Elimine las partes comentadas del código antes de confirmar. + +**12.** No use blasfemias en comentarios o código. + +**13.** No use letras mayúsculas. No use puntuación excesiva. + +``` cpp +/// WHAT THE FAIL??? +``` + +**14.** No use comentarios para hacer delímetros. + +``` cpp +///****************************************************** +``` + +**15.** No comiencen las discusiones en los comentarios. + +``` cpp +/// Why did you do this stuff? +``` + +**16.** No hay necesidad de escribir un comentario al final de un bloque describiendo de qué se trataba. 
+ +``` cpp +/// for +``` + +## Nombre {#names} + +**1.** Use letras minúsculas con guiones bajos en los nombres de variables y miembros de clase. + +``` cpp +size_t max_block_size; +``` + +**2.** Para los nombres de las funciones (métodos), use camelCase comenzando con una letra minúscula. + +``` cpp +std::string getName() const override { return "Memory"; } +``` + +**3.** Para los nombres de las clases (estructuras), use CamelCase comenzando con una letra mayúscula. Los prefijos distintos de I no se usan para interfaces. + +``` cpp +class StorageMemory : public IStorage +``` + +**4.** `using` se nombran de la misma manera que las clases, o con `_t` al final. + +**5.** Nombres de argumentos de tipo de plantilla: en casos simples, use `T`; `T`, `U`; `T1`, `T2`. + +Para casos más complejos, siga las reglas para los nombres de clase o agregue el prefijo `T`. + +``` cpp +template +struct AggregatedStatElement +``` + +**6.** Nombres de argumentos constantes de plantilla: siga las reglas para los nombres de variables o use `N` en casos simples. + +``` cpp +template +struct ExtractDomain +``` + +**7.** Para clases abstractas (interfaces) puede agregar el `I` prefijo. + +``` cpp +class IBlockInputStream +``` + +**8.** Si usa una variable localmente, puede usar el nombre corto. + +En todos los demás casos, use un nombre que describa el significado. + +``` cpp +bool info_successfully_loaded = false; +``` + +**9.** Nombres de `define`s y las constantes globales usan ALL\_CAPS con guiones bajos. + +``` cpp +#define MAX_SRC_TABLE_NAMES_TO_STORE 1000 +``` + +**10.** Los nombres de archivo deben usar el mismo estilo que su contenido. + +Si un archivo contiene una sola clase, nombre el archivo de la misma manera que la clase (CamelCase). + +Si el archivo contiene una sola función, nombre el archivo de la misma manera que la función (camelCase). + +**11.** Si el nombre contiene una abreviatura, : + +- Para los nombres de variables, la abreviatura debe usar letras minúsculas `mysql_connection` (ni `mySQL_connection`). +- Para los nombres de clases y funciones, mantenga las letras mayúsculas en la abreviatura`MySQLConnection` (ni `MySqlConnection`). + +**12.** Los argumentos del constructor que se usan solo para inicializar los miembros de la clase deben nombrarse de la misma manera que los miembros de la clase, pero con un guión bajo al final. + +``` cpp +FileQueueProcessor( + const std::string & path_, + const std::string & prefix_, + std::shared_ptr handler_) + : path(path_), + prefix(prefix_), + handler(handler_), + log(&Logger::get("FileQueueProcessor")) +{ +} +``` + +El sufijo de subrayado se puede omitir si el argumento no se usa en el cuerpo del constructor. + +**13.** No hay diferencia en los nombres de las variables locales y los miembros de la clase (no se requieren prefijos). + +``` cpp +timer (not m_timer) +``` + +**14.** Para las constantes en un `enum`, usar CamelCase con una letra mayúscula. ALL\_CAPS también es aceptable. Si el `enum` no es local, utilice un `enum class`. + +``` cpp +enum class CompressionMethod +{ + QuickLZ = 0, + LZ4 = 1, +}; +``` + +**15.** Todos los nombres deben estar en inglés. La transliteración de palabras rusas no está permitida. + + not Stroka + +**16.** Las abreviaturas son aceptables si son bien conocidas (cuando puede encontrar fácilmente el significado de la abreviatura en Wikipedia o en un motor de búsqueda). + + `AST`, `SQL`. + + Not `NVDH` (some random letters) + +Las palabras incompletas son aceptables si la versión abreviada es de uso común. 
+ +También puede usar una abreviatura si el nombre completo se incluye junto a él en los comentarios. + +**17.** Los nombres de archivo con código fuente de C++ deben tener `.cpp` ampliación. Los archivos de encabezado deben tener `.h` ampliación. + +## Cómo escribir código {#how-to-write-code} + +**1.** Gestión de la memoria. + +Desasignación de memoria manual (`delete`) solo se puede usar en el código de la biblioteca. + +En el código de la biblioteca, el `delete` operador sólo se puede utilizar en destructores. + +En el código de la aplicación, la memoria debe ser liberada por el objeto que la posee. + +Ejemplos: + +- La forma más fácil es colocar un objeto en la pila o convertirlo en miembro de otra clase. +- Para una gran cantidad de objetos pequeños, use contenedores. +- Para la desasignación automática de un pequeño número de objetos que residen en el montón, use `shared_ptr/unique_ptr`. + +**2.** Gestión de recursos. + +Utilizar `RAII` y ver arriba. + +**3.** Manejo de errores. + +Utilice excepciones. En la mayoría de los casos, solo necesita lanzar una excepción y no necesita atraparla (debido a `RAII`). + +En las aplicaciones de procesamiento de datos fuera de línea, a menudo es aceptable no detectar excepciones. + +En los servidores que manejan las solicitudes de los usuarios, suele ser suficiente para detectar excepciones en el nivel superior del controlador de conexión. + +En las funciones de subproceso, debe capturar y mantener todas las excepciones para volver a lanzarlas en el subproceso principal después `join`. + +``` cpp +/// If there weren't any calculations yet, calculate the first block synchronously +if (!started) +{ + calculate(); + started = true; +} +else /// If calculations are already in progress, wait for the result + pool.wait(); + +if (exception) + exception->rethrow(); +``` + +Nunca oculte excepciones sin manejo. Nunca simplemente ponga ciegamente todas las excepciones para iniciar sesión. + +``` cpp +//Not correct +catch (...) {} +``` + +Si necesita ignorar algunas excepciones, hágalo solo para las específicas y vuelva a lanzar el resto. + +``` cpp +catch (const DB::Exception & e) +{ + if (e.code() == ErrorCodes::UNKNOWN_AGGREGATE_FUNCTION) + return nullptr; + else + throw; +} +``` + +Al usar funciones con códigos de respuesta o `errno`, siempre verifique el resultado y arroje una excepción en caso de error. + +``` cpp +if (0 != close(fd)) + throwFromErrno("Cannot close file " + file_name, ErrorCodes::CANNOT_CLOSE_FILE); +``` + +`Do not use assert`. + +**4.** Tipos de excepción. + +No es necesario utilizar una jerarquía de excepciones compleja en el código de la aplicación. El texto de excepción debe ser comprensible para un administrador del sistema. + +**5.** Lanzar excepciones de destructores. + +Esto no es recomendable, pero está permitido. + +Utilice las siguientes opciones: + +- Crear una función (`done()` o `finalize()`) que hará todo el trabajo de antemano que podría conducir a una excepción. Si se llamó a esa función, no debería haber excepciones en el destructor más adelante. +- Las tareas que son demasiado complejas (como enviar mensajes a través de la red) se pueden poner en un método separado al que el usuario de la clase tendrá que llamar antes de la destrucción. +- Si hay una excepción en el destructor, es mejor registrarla que ocultarla (si el registrador está disponible). +- En aplicaciones simples, es aceptable confiar en `std::terminate` (para los casos de `noexcept` de forma predeterminada en C ++ 11) para manejar excepciones. 
+ +**6.** Bloques de código anónimos. + +Puede crear un bloque de código separado dentro de una sola función para hacer que ciertas variables sean locales, de modo que se llame a los destructores al salir del bloque. + +``` cpp +Block block = data.in->read(); + +{ + std::lock_guard lock(mutex); + data.ready = true; + data.block = block; +} + +ready_any.set(); +``` + +**7.** Multithreading. + +En programas de procesamiento de datos fuera de línea: + +- Trate de obtener el mejor rendimiento posible en un solo núcleo de CPU. A continuación, puede paralelizar su código si es necesario. + +En aplicaciones de servidor: + +- Utilice el grupo de subprocesos para procesar solicitudes. En este punto, no hemos tenido ninguna tarea que requiera el cambio de contexto de espacio de usuario. + +La horquilla no se usa para la paralelización. + +**8.** Sincronización de hilos. + +A menudo es posible hacer que diferentes hilos usen diferentes celdas de memoria (incluso mejor: diferentes líneas de caché) y no usar ninguna sincronización de hilos (excepto `joinAll`). + +Si se requiere sincronización, en la mayoría de los casos, es suficiente usar mutex bajo `lock_guard`. + +En otros casos, use primitivas de sincronización del sistema. No utilice la espera ocupada. + +Las operaciones atómicas deben usarse solo en los casos más simples. + +No intente implementar estructuras de datos sin bloqueo a menos que sea su principal área de especialización. + +**9.** Punteros vs referencias. + +En la mayoría de los casos, prefiera referencias. + +**10.** Construir. + +Usar referencias constantes, punteros a constantes, `const_iterator`, y métodos const. + +Considerar `const` para ser predeterminado y usar no-`const` sólo cuando sea necesario. + +Al pasar variables por valor, usando `const` por lo general no tiene sentido. + +**11.** sin firmar. + +Utilizar `unsigned` si es necesario. + +**12.** Tipos numéricos. + +Utilice los tipos `UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, y `Int64`, así como `size_t`, `ssize_t`, y `ptrdiff_t`. + +No utilice estos tipos para los números: `signed/unsigned long`, `long long`, `short`, `signed/unsigned char`, `char`. + +**13.** Pasando argumentos. + +Pasar valores complejos por referencia (incluyendo `std::string`). + +Si una función captura la propiedad de un objeto creado en el montón, cree el tipo de argumento `shared_ptr` o `unique_ptr`. + +**14.** Valores devueltos. + +En la mayoría de los casos, sólo tiene que utilizar `return`. No escribir `[return std::move(res)]{.strike}`. + +Si la función asigna un objeto en el montón y lo devuelve, use `shared_ptr` o `unique_ptr`. + +En casos excepcionales, es posible que deba devolver el valor a través de un argumento. En este caso, el argumento debe ser una referencia. + +``` cpp +using AggregateFunctionPtr = std::shared_ptr; + +/** Allows creating an aggregate function by its name. + */ +class AggregateFunctionFactory +{ +public: + AggregateFunctionFactory(); + AggregateFunctionPtr get(const String & name, const DataTypes & argument_types) const; +``` + +**15.** espacio de nombres. + +No hay necesidad de usar un `namespace` para el código de aplicación. + +Las bibliotecas pequeñas tampoco necesitan esto. + +Para bibliotecas medianas a grandes, coloque todo en un `namespace`. + +En la biblioteca `.h` Archivo, se puede utilizar `namespace detail` para ocultar los detalles de implementación no necesarios para el código de la aplicación. 
+ +En un `.cpp` archivo, puede usar un `static` o espacio de nombres anónimo para ocultar símbolos. + +Además, un `namespace` Puede ser utilizado para un `enum` para evitar que los nombres correspondientes caigan en un `namespace` (pero es mejor usar un `enum class`). + +**16.** Inicialización diferida. + +Si se requieren argumentos para la inicialización, normalmente no debe escribir un constructor predeterminado. + +Si más adelante necesitará retrasar la inicialización, puede agregar un constructor predeterminado que creará un objeto no válido. O, para un pequeño número de objetos, puede usar `shared_ptr/unique_ptr`. + +``` cpp +Loader(DB::Connection * connection_, const std::string & query, size_t max_block_size_); + +/// For deferred initialization +Loader() {} +``` + +**17.** Funciones virtuales. + +Si la clase no está destinada para uso polimórfico, no necesita hacer que las funciones sean virtuales. Esto también se aplica al destructor. + +**18.** Codificación. + +Usa UTF-8 en todas partes. Utilizar `std::string`y`char *`. No uso `std::wstring`y`wchar_t`. + +**19.** Tala. + +Vea los ejemplos en todas partes del código. + +Antes de confirmar, elimine todo el registro de depuración y sin sentido, y cualquier otro tipo de salida de depuración. + +Se debe evitar el registro en ciclos, incluso en el nivel Trace. + +Los registros deben ser legibles en cualquier nivel de registro. + +El registro solo debe usarse en el código de la aplicación, en su mayor parte. + +Los mensajes de registro deben estar escritos en inglés. + +El registro debe ser preferiblemente comprensible para el administrador del sistema. + +No use blasfemias en el registro. + +Utilice la codificación UTF-8 en el registro. En casos excepcionales, puede usar caracteres que no sean ASCII en el registro. + +**20.** Entrada-salida. + +No utilice `iostreams` en ciclos internos que son críticos para el rendimiento de la aplicación (y nunca usan `stringstream`). + +Descripción `DB/IO` biblioteca en su lugar. + +**21.** Fecha y hora. + +Ver el `DateLUT` biblioteca. + +**22.** incluir. + +Utilice siempre `#pragma once` en lugar de incluir guardias. + +**23.** utilizar. + +`using namespace` no se utiliza. Usted puede utilizar `using` con algo específico. Pero hazlo local dentro de una clase o función. + +**24.** No uso `trailing return type` para funciones a menos que sea necesario. + +``` cpp +[auto f() -> void;]{.strike} +``` + +**25.** Declaración e inicialización de variables. + +``` cpp +//right way +std::string s = "Hello"; +std::string s{"Hello"}; + +//wrong way +auto s = std::string{"Hello"}; +``` + +**26.** Para funciones virtuales, escriba `virtual` es la clase base, pero escribe `override` es lugar de `virtual` en las clases descendientes. + +## Características no utilizadas de C ++ {#unused-features-of-c} + +**1.** La herencia virtual no se utiliza. + +**2.** Los especificadores de excepción de C ++ 03 no se usan. + +## Plataforma {#platform} + +**1.** Escribimos código para una plataforma específica. + +Pero en igualdad de condiciones, se prefiere el código multiplataforma o portátil. + +**2.** Idioma: hacer ++ 17. + +**3.** Compilación: `gcc`. En este momento (diciembre de 2017), el código se compila utilizando la versión 7.2. (También se puede compilar usando `clang 4`.) + +Se utiliza la biblioteca estándar (`libstdc++` o `libc++`). + +**4.**OS: Linux Ubuntu, no más viejo que Precise. + +**5.**El código está escrito para la arquitectura de CPU x86\_64. 
+ +El conjunto de instrucciones de CPU es el conjunto mínimo admitido entre nuestros servidores. Actualmente, es SSE 4.2. + +**6.** Utilizar `-Wall -Wextra -Werror` flags de compilación. + +**7.** Use enlaces estáticos con todas las bibliotecas, excepto aquellas a las que son difíciles de conectar estáticamente (consulte la salida de la `ldd` comando). + +**8.** El código se desarrolla y se depura con la configuración de la versión. + +## Herramienta {#tools} + +**1.** KDevelop es un buen IDE. + +**2.** Para la depuración, use `gdb`, `valgrind` (`memcheck`), `strace`, `-fsanitize=...`, o `tcmalloc_minimal_debug`. + +**3.** Para crear perfiles, uso `Linux Perf`, `valgrind` (`callgrind`Ciudad `strace -cf`. + +**4.** Las fuentes están en Git. + +**5.** Usos de ensamblaje `CMake`. + +**6.** Los programas se lanzan usando `deb` paquete. + +**7.** Los compromisos a dominar no deben romper la compilación. + +Aunque solo las revisiones seleccionadas se consideran viables. + +**8.** Realice confirmaciones tan a menudo como sea posible, incluso si el código está parcialmente listo. + +Use ramas para este propósito. + +Si su código en el `master` branch todavía no se puede construir, excluirlo de la compilación antes de la `push`. Tendrá que terminarlo o eliminarlo dentro de unos días. + +**9.** Para cambios no triviales, use ramas y publíquelas en el servidor. + +**10.** El código no utilizado se elimina del repositorio. + +## Biblioteca {#libraries} + +**1.** Se utiliza la biblioteca estándar de C ++ 14 (se permiten extensiones experimentales), así como `boost` y `Poco` marco. + +**2.** Si es necesario, puede usar cualquier biblioteca conocida disponible en el paquete del sistema operativo. + +Si ya hay una buena solución disponible, úsela, incluso si eso significa que debe instalar otra biblioteca. + +(Pero prepárese para eliminar las bibliotecas incorrectas del código.) + +**3.** Puede instalar una biblioteca que no está en los paquetes, si los paquetes no tienen lo que necesita o tienen una versión obsoleta o el tipo incorrecto de compilación. + +**4.** Si la biblioteca es pequeña y no tiene su propio sistema de compilación complejo, coloque los archivos `contrib` carpeta. + +**5.** Siempre se da preferencia a las bibliotecas que ya están en uso. + +## Recomendaciones generales {#general-recommendations-1} + +**1.** Escribe el menor código posible. + +**2.** Pruebe la solución más simple. + +**3.** No escriba código hasta que sepa cómo va a funcionar y cómo funcionará el bucle interno. + +**4.** En los casos más simples, use `using` en lugar de clases o estructuras. + +**5.** Si es posible, no escriba constructores de copia, operadores de asignación, destructores (que no sean virtuales, si la clase contiene al menos una función virtual), mueva constructores o mueva operadores de asignación. En otras palabras, las funciones generadas por el compilador deben funcionar correctamente. Usted puede utilizar `default`. + +**6.** Se fomenta la simplificación del código. Reduzca el tamaño de su código siempre que sea posible. + +## Recomendaciones adicionales {#additional-recommendations} + +**1.** Especificar explícitamente `std::` para tipos de `stddef.h` + +no se recomienda. En otras palabras, recomendamos escribir `size_t` es su lugar `std::size_t` porque es más corto. + +Es aceptable agregar `std::`. + +**2.** Especificar explícitamente `std::` para funciones de la biblioteca C estándar + +no se recomienda. En otras palabras, escribir `memcpy` es lugar de `std::memcpy`. 
+ +La razón es que hay funciones no estándar similares, tales como `memmem`. Utilizamos estas funciones en ocasiones. Estas funciones no existen en `namespace std`. + +Si usted escribe `std::memcpy` es lugar de `memcpy` es todas partes, entonces `memmem` pecado `std::` se verá extraño. + +Sin embargo, todavía puedes usar `std::` si lo prefieres. + +**3.** Usar funciones de C cuando las mismas están disponibles en la biblioteca estándar de C ++. + +Esto es aceptable si es más eficiente. + +Por ejemplo, uso `memcpy` es lugar de `std::copy` para copiar grandes trozos de memoria. + +**4.** Argumentos de función multilínea. + +Se permite cualquiera de los siguientes estilos de ajuste: + +``` cpp +function( + T1 x1, + T2 x2) +``` + +``` cpp +function( + size_t left, size_t right, + const & RangesInDataParts ranges, + size_t limit) +``` + +``` cpp +function(size_t left, size_t right, + const & RangesInDataParts ranges, + size_t limit) +``` + +``` cpp +function(size_t left, size_t right, + const & RangesInDataParts ranges, + size_t limit) +``` + +``` cpp +function( + size_t left, + size_t right, + const & RangesInDataParts ranges, + size_t limit) +``` + +[Artículo Original](https://clickhouse.tech/docs/es/development/style/) diff --git a/docs/es/development/tests.md b/docs/es/development/tests.md new file mode 100644 index 00000000000..9682682a9df --- /dev/null +++ b/docs/es/development/tests.md @@ -0,0 +1,244 @@ +# Pruebas de ClickHouse {#clickhouse-testing} + +## Pruebas funcionales {#functional-tests} + +Las pruebas funcionales son las más simples y cómodas de usar. La mayoría de las características de ClickHouse se pueden probar con pruebas funcionales y son obligatorias para cada cambio en el código de ClickHouse que se puede probar de esa manera. + +Cada prueba funcional envía una o varias consultas al servidor ClickHouse en ejecución y compara el resultado con la referencia. + +Las pruebas se encuentran en `dbms/tests/queries` Directorio. Hay dos subdirectorios: `stateless` y `stateful`. Las pruebas sin estado ejecutan consultas sin datos de prueba precargados: a menudo crean pequeños conjuntos de datos sintéticos sobre la marcha, dentro de la prueba misma. Las pruebas estatales requieren datos de prueba precargados de Yandex.Métrica y no está disponible para el público en general. Tendemos a usar sólo `stateless` pruebas y evitar la adición de nuevos `stateful` prueba. + +Cada prueba puede ser de dos tipos: `.sql` y `.sh`. `.sql` prueba es el script SQL simple que se canaliza a `clickhouse-client --multiquery --testmode`. `.sh` test es un script que se ejecuta por sí mismo. + +Para ejecutar todas las pruebas, use `dbms/tests/clickhouse-test` herramienta. Mira `--help` para la lista de posibles opciones. Simplemente puede ejecutar todas las pruebas o ejecutar un subconjunto de pruebas filtradas por subcadena en el nombre de la prueba: `./clickhouse-test substring`. + +La forma más sencilla de invocar pruebas funcionales es copiar `clickhouse-client` Naciones `/usr/bin/`, ejecutar `clickhouse-server` y luego ejecutar `./clickhouse-test` de su propio directorio. + +Atracciones cercanas al hotel `.sql` o `.sh` archivo en `dbms/tests/queries/0_stateless` directorio, compruébelo manualmente y luego genere `.reference` archivo de la siguiente manera: `clickhouse-client -n --testmode < 00000_test.sql > 00000_test.reference` o `./00000_test.sh > ./00000_test.reference`. + +Las pruebas deben usar (crear, soltar, etc.) 
solo tablas en `test` base de datos que se supone que se crea de antemano; también las pruebas pueden usar tablas temporales. + +Si desea utilizar consultas distribuidas en pruebas funcionales, puede aprovechar `remote` función de la tabla con `127.0.0.{1..2}` direcciones para que el servidor se consulte; o puede usar clústeres de prueba predefinidos en el archivo de configuración del servidor como `test_shard_localhost`. + +Algunas pruebas están marcadas con `zookeeper`, `shard` o `long` en sus nombres. +`zookeeper` es para pruebas que están usando ZooKeeper. `shard` es para pruebas que +Requiere servidor para escuchar `127.0.0.*`; `distributed` o `global` Tienen el mismo +significado. `long` es para pruebas que duran un poco más de un segundo. Usted puede +Deshabilitar estos grupos de pruebas utilizando `--no-zookeeper`, `--no-shard` y +`--no-long` opciones, respectivamente. + +## Errores conocidos {#known-bugs} + +Si conocemos algunos errores que se pueden reproducir fácilmente mediante pruebas funcionales, colocamos pruebas funcionales preparadas en `dbms/tests/queries/bugs` directorio. Estas pruebas se moverán a `dbms/tests/queries/0_stateless` cuando se corrigen errores. + +## Pruebas de integración {#integration-tests} + +Las pruebas de integración permiten probar ClickHouse en la configuración agrupada y la interacción de ClickHouse con otros servidores como MySQL, Postgres, MongoDB. Son útiles para emular divisiones de red, caídas de paquetes, etc. Estas pruebas se ejecutan bajo Docker y crean múltiples contenedores con varios software. + +Ver `dbms/tests/integration/README.md` sobre cómo ejecutar estas pruebas. + +Tenga en cuenta que la integración de ClickHouse con controladores de terceros no se ha probado. Además, actualmente no tenemos pruebas de integración con nuestros controladores JDBC y ODBC. + +## Pruebas unitarias {#unit-tests} + +Las pruebas unitarias son útiles cuando desea probar no ClickHouse como un todo, sino una sola biblioteca o clase aislada. Puede habilitar o deshabilitar la compilación de pruebas con `ENABLE_TESTS` Opción CMake. Las pruebas unitarias (y otros programas de prueba) se encuentran en `tests` subdirectorios en todo el código. Para ejecutar pruebas unitarias, escriba `ninja test`. Algunas pruebas usan `gtest`, pero algunos son solo programas que devuelven un código de salida distinto de cero en caso de fallo de prueba. + +No es necesariamente tener pruebas unitarias si el código ya está cubierto por pruebas funcionales (y las pruebas funcionales suelen ser mucho más simples de usar). + +## Pruebas de rendimiento {#performance-tests} + +Las pruebas de rendimiento permiten medir y comparar el rendimiento de alguna parte aislada de ClickHouse en consultas sintéticas. Las pruebas se encuentran en `dbms/tests/performance`. Cada prueba está representada por `.xml` archivo con la descripción del caso de prueba. Las pruebas se ejecutan con `clickhouse performance-test` herramienta (que está incrustada en `clickhouse` binario). Ver `--help` para la invocación. + +Cada prueba ejecuta una o múltiples consultas (posiblemente con combinaciones de parámetros) en un bucle con algunas condiciones para stop (como “maximum execution speed is not changing in three seconds”) y medir algunas métricas sobre el rendimiento de las consultas (como “maximum execution speed”). Algunas pruebas pueden contener condiciones previas en el conjunto de datos de pruebas precargado. 
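+
+A modo de esbozo (la ruta del archivo `.xml` es solo un nombre hipotético y los argumentos exactos pueden variar entre versiones; consulte `--help`), una invocación mínima podría ser así:
+
+``` bash
+# Show the options of the embedded performance-test tool.
+$ clickhouse performance-test --help
+
+# Hypothetical example: run a single test description file
+# (the .xml name is illustrative; pick any file from dbms/tests/performance).
+$ clickhouse performance-test dbms/tests/performance/example_test.xml
+```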
+ +Si desea mejorar el rendimiento de ClickHouse en algún escenario, y si se pueden observar mejoras en consultas simples, se recomienda encarecidamente escribir una prueba de rendimiento. Siempre tiene sentido usar `perf top` u otras herramientas de perf durante sus pruebas. + +## Herramientas de prueba y secuencias de comandos {#test-tools-and-scripts} + +Algunos programas en `tests` directorio no son pruebas preparadas, pero son herramientas de prueba. Por ejemplo, párr `Lexer` hay una herramienta `dbms/src/Parsers/tests/lexer` que solo hacen la tokenización de stdin y escriben el resultado coloreado en stdout. Puede usar este tipo de herramientas como ejemplos de código y para exploración y pruebas manuales. + +También puede colocar un par de archivos `.sh` y `.reference` junto con la herramienta para ejecutarlo en alguna entrada predefinida, entonces el resultado del script se puede comparar con `.reference` file. Este tipo de pruebas no están automatizadas. + +## Pruebas misceláneas {#miscellanous-tests} + +Hay pruebas para diccionarios externos ubicados en `dbms/tests/external_dictionaries` y para modelos aprendidos a máquina en `dbms/tests/external_models`. Estas pruebas no se actualizan y deben transferirse a pruebas de integración. + +Hay una prueba separada para inserciones de quórum. Esta prueba ejecuta el clúster ClickHouse en servidores separados y emula varios casos de fallas: división de red, caída de paquetes (entre nodos ClickHouse, entre ClickHouse y ZooKeeper, entre el servidor ClickHouse y el cliente, etc.), `kill -9`, `kill -STOP` y `kill -CONT` , como [Jepsen](https://aphyr.com/tags/Jepsen). A continuación, la prueba comprueba que todas las inserciones reconocidas se escribieron y todas las inserciones rechazadas no. + +La prueba de quórum fue escrita por un equipo separado antes de que ClickHouse fuera de código abierto. Este equipo ya no trabaja con ClickHouse. La prueba fue escrita accidentalmente en Java. Por estas razones, la prueba de quórum debe reescribirse y trasladarse a pruebas de integración. + +## Pruebas manuales {#manual-testing} + +Cuando desarrolla una nueva característica, es razonable probarla también manualmente. Puede hacerlo con los siguientes pasos: + +Construir ClickHouse. Ejecute ClickHouse desde el terminal: cambie el directorio a `dbms/src/programs/clickhouse-server` y ejecutarlo con `./clickhouse-server`. Se utilizará la configuración (`config.xml`, `users.xml` y archivos dentro de `config.d` y `users.d` directorios) desde el directorio actual de forma predeterminada. Para conectarse al servidor ClickHouse, ejecute `dbms/src/programs/clickhouse-client/clickhouse-client`. + +Tenga en cuenta que todas las herramientas de clickhouse (servidor, cliente, etc.) son solo enlaces simbólicos a un único binario llamado `clickhouse`. Puede encontrar este binario en `dbms/src/programs/clickhouse`. Todas las herramientas también se pueden invocar como `clickhouse tool` es lugar de `clickhouse-tool`. + +Alternativamente, puede instalar el paquete ClickHouse: ya sea una versión estable del repositorio de Yandex o puede crear un paquete para usted con `./release` en la raíz de fuentes de ClickHouse. Luego inicie el servidor con `sudo service clickhouse-server start` (o detener para detener el servidor). Busque registros en `/etc/clickhouse-server/clickhouse-server.log`. 
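+
+Como resumen de los pasos anteriores, un ciclo mínimo de prueba manual desde el árbol de fuentes podría quedar así (se asumen las rutas mencionadas en esta sección):
+
+``` bash
+# Run the freshly built server with the config files from its own directory.
+$ cd dbms/src/programs/clickhouse-server
+$ ./clickhouse-server
+
+# In another terminal, connect with the client built in the same source tree.
+$ cd dbms/src/programs/clickhouse-client
+$ ./clickhouse-client --query "SELECT version()"
+```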
+ +Cuando ClickHouse ya está instalado en su sistema, puede crear un nuevo `clickhouse` binario y reemplazar el binario existente: + +``` bash +$ sudo service clickhouse-server stop +$ sudo cp ./clickhouse /usr/bin/ +$ sudo service clickhouse-server start +``` + +También puede detener el servidor de clickhouse del sistema y ejecutar el suyo propio con la misma configuración pero con el registro en la terminal: + +``` bash +$ sudo service clickhouse-server stop +$ sudo -u clickhouse /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml +``` + +Ejemplo con gdb: + +``` bash +$ sudo -u clickhouse gdb --args /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml +``` + +Si el servidor de clickhouse del sistema ya se está ejecutando y no desea detenerlo, puede cambiar los números de `config.xml` (o anularlos en un archivo en `config.d` directorio), proporcione la ruta de datos adecuada y ejecútela. + +`clickhouse` binary casi no tiene dependencias y funciona en una amplia gama de distribuciones de Linux. Para probar rápidamente y sucio sus cambios en un servidor, simplemente puede `scp` su fresco construido `clickhouse` binario a su servidor y luego ejecútelo como en los ejemplos anteriores. + +## Entorno de prueba {#testing-environment} + +Antes de publicar la versión como estable, la implementamos en el entorno de prueba. El entorno de prueba es un clúster que procesa 1/39 parte de [El Yandex.Métrica](https://metrica.yandex.com/) datos. Compartimos nuestro entorno de pruebas con Yandex.Equipo de Metrica. ClickHouse se actualiza sin tiempo de inactividad sobre los datos existentes. Nos fijamos en un primer momento que los datos se procesan con éxito sin retraso de tiempo real, la replicación continúan trabajando y no hay problemas visibles para Yandex.Equipo de Metrica. La primera comprobación se puede hacer de la siguiente manera: + +``` sql +SELECT hostName() AS h, any(version()), any(uptime()), max(UTCEventTime), count() FROM remote('example01-01-{1..3}t', merge, hits) WHERE EventDate >= today() - 2 GROUP BY h ORDER BY h; +``` + +En algunos casos también implementamos en el entorno de prueba de nuestros equipos de amigos en Yandex: Market, Cloud, etc. También tenemos algunos servidores de hardware que se utilizan con fines de desarrollo. + +## Pruebas de carga {#load-testing} + +Después de implementar en el entorno de prueba, ejecutamos pruebas de carga con consultas del clúster de producción. Esto se hace manualmente. + +Asegúrese de que ha habilitado `query_log` en su clúster de producción. + +Recopilar el registro de consultas para un día o más: + +``` bash +$ clickhouse-client --query="SELECT DISTINCT query FROM system.query_log WHERE event_date = today() AND query LIKE '%ym:%' AND query NOT LIKE '%system.query_log%' AND type = 2 AND is_initial_query" > queries.tsv +``` + +Este es un ejemplo complicado. `type = 2` filtrará las consultas que se ejecutan correctamente. `query LIKE '%ym:%'` es seleccionar consultas relevantes de Yandex.Métrica. `is_initial_query` es seleccionar solo las consultas iniciadas por el cliente, no por ClickHouse (como partes del procesamiento de consultas distribuidas). + +`scp` este registro en su clúster de prueba y ejecútelo de la siguiente manera: + +``` bash +$ clickhouse benchmark --concurrency 16 < queries.tsv +``` + +(probablemente también desee especificar un `--user`) + +Luego déjalo por una noche o un fin de semana e ir a tomar un descanso. 
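+
+Mientras el benchmark se ejecuta, un esbozo sencillo y meramente orientativo para vigilar el servidor podría ser el siguiente:
+
+``` bash
+# Every 5 minutes, record that the server is still up (uptime keeps growing)
+# and a rough view of its tracked memory usage (the metric name may vary between versions).
+while sleep 300; do
+    date
+    clickhouse-client --query "SELECT uptime()"
+    clickhouse-client --query "SELECT formatReadableSize(value) FROM system.metrics WHERE metric = 'MemoryTracking'"
+done | tee -a load_test_monitor.log
+```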
+ +Usted debe comprobar que `clickhouse-server` no se bloquea, la huella de memoria está limitada y el rendimiento no se degrada con el tiempo. + +Los tiempos de ejecución de consultas precisos no se registran y no se comparan debido a la alta variabilidad de las consultas y el entorno. + +## Pruebas de construcción {#build-tests} + +Las pruebas de compilación permiten verificar que la compilación no esté rota en varias configuraciones alternativas y en algunos sistemas extranjeros. Las pruebas se encuentran en `ci` directorio. Ejecutan compilación desde la fuente dentro de Docker, Vagrant y, a veces, con `qemu-user-static` dentro de Docker. Estas pruebas están en desarrollo y las ejecuciones de pruebas no están automatizadas. + +Motivación: + +Normalmente lanzamos y ejecutamos todas las pruebas en una sola variante de compilación ClickHouse. Pero hay variantes de construcción alternativas que no se prueban a fondo. Ejemplos: + +- construir en FreeBSD; +- construir en Debian con bibliotecas de paquetes del sistema; +- Construir con enlaces compartidos de bibliotecas; +- construir en la plataforma AArch64; +- Construir en la plataforma PowerPc. + +Por ejemplo, construir con paquetes del sistema es una mala práctica, porque no podemos garantizar qué versión exacta de paquetes tendrá un sistema. Pero esto es realmente necesario para los mantenedores de Debian. Por esta razón, al menos tenemos que admitir esta variante de construcción. Otro ejemplo: la vinculación compartida es una fuente común de problemas, pero es necesaria para algunos entusiastas. + +Aunque no podemos ejecutar todas las pruebas en todas las variantes de compilaciones, queremos verificar al menos que varias variantes de compilación no estén rotas. Para este propósito utilizamos pruebas de construcción. + +## Pruebas de compatibilidad de protocolos {#testing-for-protocol-compatibility} + +Cuando ampliamos el protocolo de red ClickHouse, probamos manualmente que el antiguo clickhouse-client funciona con el nuevo clickhouse-server y el nuevo clickhouse-client funciona con el antiguo clickhouse-server (simplemente ejecutando binarios de los paquetes correspondientes). + +## Ayuda del compilador {#help-from-the-compiler} + +Código principal de ClickHouse (que se encuentra en `dbms` Directorio) se construye con `-Wall -Wextra -Werror` y con algunas advertencias habilitadas adicionales. Aunque estas opciones no están habilitadas para bibliotecas de terceros. + +Clang tiene advertencias aún más útiles: puedes buscarlas con `-Weverything` y elige algo para la compilación predeterminada. + +Para las compilaciones de producción, se usa gcc (todavía genera un código ligeramente más eficiente que clang). Para el desarrollo, el clang suele ser más conveniente de usar. Puede construir en su propia máquina con el modo de depuración (para ahorrar batería de su computadora portátil), pero tenga en cuenta que el compilador puede generar más advertencias con `-O3` debido a un mejor flujo de control y análisis entre procedimientos. Al construir con clang, `libc++` se utiliza en lugar de `libstdc++` y al construir con el modo de depuración, la versión de depuración de `libc++` se utiliza que permite detectar más errores en tiempo de ejecución. + +## Desinfectantes {#sanitizers} + +**Dirección desinfectante**. +Ejecutamos pruebas funcionales y de integración bajo ASan por compromiso. + +**Valgrind (Memcheck)**. +Realizamos pruebas funcionales bajo Valgrind durante la noche. Se tarda varias horas. 
Actualmente hay un falso positivo conocido en la biblioteca `re2`, ver [este artículo](https://research.swtch.com/sparse).
+
+**Desinfectante de comportamiento indefinido (UBSan).**
+Ejecutamos pruebas funcionales y de integración bajo ASan en cada commit.
+
+**Desinfectante de hilos (TSan).**
+Ejecutamos pruebas funcionales bajo TSan en cada commit. Todavía no ejecutamos pruebas de integración bajo TSan en cada commit.
+
+**Desinfectante de memoria (MSan).**
+Actualmente todavía no usamos MSan.
+
+**Asignador de depuración.**
+La versión de depuración de `jemalloc` se utiliza para la compilación de depuración.
+
+## Fuzzing {#fuzzing}
+
+Usamos una prueba de fuzz simple para generar consultas SQL aleatorias y verificar que el servidor no muera. Las pruebas de fuzz se realizan con AddressSanitizer. Puede encontrarla en `00746_sql_fuzzy.pl`. Esta prueba debe ejecutarse de forma continua (durante la noche y más).
+
+A partir de diciembre de 2018, todavía no usamos pruebas de fuzz aisladas del código de las bibliotecas.
+
+## Auditoría de seguridad {#security-audit}
+
+La gente del departamento de Yandex Cloud hace una revisión básica de las capacidades de ClickHouse desde el punto de vista de la seguridad.
+
+## Analizadores estáticos {#static-analyzers}
+
+Ejecutamos `PVS-Studio` en cada commit. Hemos evaluado `clang-tidy`, `Coverity`, `cppcheck`, `PVS-Studio` y `tscancode`. Encontrará instrucciones de uso en el directorio `dbms/tests/instructions/`. También puede leer [el artículo en ruso](https://habr.com/company/yandex/blog/342018/).
+
+Si usa `CLion` como IDE, puede aprovechar algunas comprobaciones de `clang-tidy` listas para usar.
+
+## Endurecimiento {#hardening}
+
+`FORTIFY_SOURCE` se utiliza de forma predeterminada. Es casi inútil, pero todavía tiene sentido en casos raros y no lo desactivamos.
+
+## Estilo de código {#code-style}
+
+Las reglas de estilo de código se describen [aquí](https://clickhouse.tech/docs/es/development/style/).
+
+Para comprobar algunas violaciones de estilo comunes, puede usar el script `utils/check-style`.
+
+Para forzar el estilo adecuado de su código, puede usar `clang-format`. El archivo `.clang-format` se encuentra en la raíz de las fuentes y se corresponde en su mayor parte con nuestro estilo de código real. Pero no se recomienda aplicar `clang-format` a los archivos existentes porque empeora el formato. Puede utilizar la herramienta `clang-format-diff`, que encontrará en el repositorio de fuentes de clang.
+
+Alternativamente, puede probar la herramienta `uncrustify` para reformatear su código. La configuración está en `uncrustify.cfg`, en la raíz de las fuentes. Está menos probada que `clang-format`.
+
+`CLion` tiene su propio formateador de código, que debe ajustarse a nuestro estilo de código.
+
+## Pruebas Metrica B2B {#metrica-b2b-tests}
+
+Cada lanzamiento de ClickHouse se prueba con los motores de Yandex Metrica y AppMetrica. Las versiones de prueba y las versiones estables de ClickHouse se despliegan en máquinas virtuales y se ejecutan con una copia pequeña del motor de Metrica que procesa una muestra fija de datos de entrada. A continuación, los resultados de las dos instancias del motor de Metrica se comparan entre sí.
+
+Estas pruebas están automatizadas por un equipo separado. Debido a la gran cantidad de piezas móviles, las pruebas fallan la mayor parte del tiempo por razones completamente no relacionadas, que son muy difíciles de descubrir. Lo más probable es que estas pruebas tengan un valor negativo para nosotros.
Sin embargo, se demostró que estas pruebas son útiles en aproximadamente una o dos veces de cada cientos. + +## Cobertura de prueba {#test-coverage} + +A partir de julio de 2018 no realizamos un seguimiento de la cobertura de las pruebas. + +## Automatización de pruebas {#test-automation} + +Realizamos pruebas con el CI interno de Yandex y el sistema de automatización de trabajos llamado “Sandbox”. + +Los trabajos de compilación y las pruebas se ejecutan en Sandbox por confirmación. Los paquetes resultantes y los resultados de las pruebas se publican en GitHub y se pueden descargar mediante enlaces directos. Los artefactos se almacenan eternamente. Cuando envías una solicitud de extracción en GitHub, la etiquetamos como “can be tested” y nuestro sistema CI construirá paquetes ClickHouse (liberación, depuración, con desinfectante de direcciones, etc.) para usted. + +No usamos Travis CI debido al límite de tiempo y potencia computacional. +No usamos Jenkins. Se usó antes y ahora estamos felices de no estar usando Jenkins. + +[Artículo Original](https://clickhouse.tech/docs/es/development/tests/) diff --git a/docs/es/faq/general.md b/docs/es/faq/general.md new file mode 100644 index 00000000000..580ae7d1241 --- /dev/null +++ b/docs/es/faq/general.md @@ -0,0 +1,53 @@ +# Preguntas generales {#general-questions} + +## ¿Por qué no usar algo como MapReduce? {#why-not-use-something-like-mapreduce} + +Podemos referirnos a sistemas como MapReduce como sistemas informáticos distribuidos en los que la operación de reducción se basa en la clasificación distribuida. La solución de código abierto más común en esta clase es [Acerca de nosotros](http://hadoop.apache.org). Yandex utiliza su solución interna, YT. + +Estos sistemas no son apropiados para consultas en línea debido a su alta latencia. En otras palabras, no se pueden usar como back-end para una interfaz web. Estos tipos de sistemas no son útiles para actualizaciones de datos en tiempo real. La clasificación distribuida no es la mejor manera de realizar operaciones de reducción si el resultado de la operación y todos los resultados intermedios (si los hay) se encuentran en la RAM de un único servidor, que suele ser el caso para las consultas en línea. En tal caso, una tabla hash es una forma óptima de realizar operaciones de reducción. Un enfoque común para optimizar las tareas de reducción de mapas es la preagregación (reducción parcial) utilizando una tabla hash en RAM. El usuario realiza esta optimización manualmente. La clasificación distribuida es una de las principales causas de un rendimiento reducido cuando se ejecutan tareas simples de reducción de mapas. + +La mayoría de las implementaciones de MapReduce le permiten ejecutar código arbitrario en un clúster. Pero un lenguaje de consulta declarativo es más adecuado para OLAP para ejecutar experimentos rápidamente. Por ejemplo, Hadoop tiene Hive y Pig. También considere Cloudera Impala o Shark (obsoleto) para Spark, así como Spark SQL, Presto y Apache Drill. El rendimiento cuando se ejecutan tales tareas es muy subóptimo en comparación con los sistemas especializados, pero la latencia relativamente alta hace que sea poco realista utilizar estos sistemas como back-end para una interfaz web. + +## ¿Qué sucede si tengo un problema con las codificaciones al usar Oracle a través de ODBC? {#oracle-odbc-encodings} + +Si utiliza Oracle a través del controlador ODBC como fuente de diccionarios externos, debe establecer el valor `NLS_LANG` variable de entorno en `/etc/default/clickhouse`. 
Para obtener más información, consulte [Oracle NLS\_LANG Preguntas frecuentes](https://www.oracle.com/technetwork/products/globalization/nls-lang-099431.html). + +**Ejemplo** + +``` sql +NLS_LANG=RUSSIAN_RUSSIA.UTF8 +``` + +## Cómo exporto datos de ClickHouse a un archivo? {#how-to-export-to-file} + +### Uso de la cláusula INTO OUTFILE {#using-into-outfile-clause} + +Añadir un [INTO OUTFILE](../query_language/select/#into-outfile-clause) cláusula a su consulta. + +Por ejemplo: + +``` sql +SELECT * FROM table INTO OUTFILE 'file' +``` + +De forma predeterminada, ClickHouse usa el [TabSeparated](../interfaces/formats.md#tabseparated) formato de datos de salida. Para seleccionar el [Formato de datos](../interfaces/formats.md), utilizar el [Cláusula FORMAT](../query_language/select/#format-clause). + +Por ejemplo: + +``` sql +SELECT * FROM table INTO OUTFILE 'file' FORMAT CSV +``` + +### Uso de una tabla de motor de archivo {#using-a-file-engine-table} + +Ver [File](../operations/table_engines/file.md). + +### Uso de la redirección de línea de comandos {#using-command-line-redirection} + +``` sql +$ clickhouse-client --query "SELECT * from table" --format FormatName > result.txt +``` + +Ver [Casa de clics-cliente](../interfaces/cli.md). + +[Artículo Original](https://clickhouse.tech/docs/es/faq/general/) diff --git a/docs/es/getting_started/example_datasets/amplab_benchmark.md b/docs/es/getting_started/example_datasets/amplab_benchmark.md new file mode 100644 index 00000000000..45dc417ae46 --- /dev/null +++ b/docs/es/getting_started/example_datasets/amplab_benchmark.md @@ -0,0 +1,122 @@ +# Referencia de Big Data de AMPLab {#amplab-big-data-benchmark} + +Ver https://amplab.cs.berkeley.edu/benchmark/ + +Regístrese para obtener una cuenta gratuita en https://aws.amazon.com. Necesitará una tarjeta de crédito, correo electrónico y número de teléfono. Obtenga una nueva clave de acceso en https://console.aws.amazon.com/iam/home?nc2=h\_m\_sc\#security\_credential + +Ejecute lo siguiente en la consola: + +``` bash +$ sudo apt-get install s3cmd +$ mkdir tiny; cd tiny; +$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ . +$ cd .. +$ mkdir 1node; cd 1node; +$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/1node/ . +$ cd .. +$ mkdir 5nodes; cd 5nodes; +$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ . +$ cd .. 
+``` + +Ejecute las siguientes consultas de ClickHouse: + +``` sql +CREATE TABLE rankings_tiny +( + pageURL String, + pageRank UInt32, + avgDuration UInt32 +) ENGINE = Log; + +CREATE TABLE uservisits_tiny +( + sourceIP String, + destinationURL String, + visitDate Date, + adRevenue Float32, + UserAgent String, + cCode FixedString(3), + lCode FixedString(6), + searchWord String, + duration UInt32 +) ENGINE = MergeTree(visitDate, visitDate, 8192); + +CREATE TABLE rankings_1node +( + pageURL String, + pageRank UInt32, + avgDuration UInt32 +) ENGINE = Log; + +CREATE TABLE uservisits_1node +( + sourceIP String, + destinationURL String, + visitDate Date, + adRevenue Float32, + UserAgent String, + cCode FixedString(3), + lCode FixedString(6), + searchWord String, + duration UInt32 +) ENGINE = MergeTree(visitDate, visitDate, 8192); + +CREATE TABLE rankings_5nodes_on_single +( + pageURL String, + pageRank UInt32, + avgDuration UInt32 +) ENGINE = Log; + +CREATE TABLE uservisits_5nodes_on_single +( + sourceIP String, + destinationURL String, + visitDate Date, + adRevenue Float32, + UserAgent String, + cCode FixedString(3), + lCode FixedString(6), + searchWord String, + duration UInt32 +) ENGINE = MergeTree(visitDate, visitDate, 8192); +``` + +Volver a la consola: + +``` bash +$ for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done +$ for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done +$ for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done +$ for i in 1node/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_1node FORMAT CSV"; done +$ for i in 5nodes/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_5nodes_on_single FORMAT CSV"; done +$ for i in 5nodes/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done +``` + +Consultas para obtener muestras de datos: + +``` sql +SELECT pageURL, pageRank FROM rankings_1node WHERE pageRank > 1000 + +SELECT substring(sourceIP, 1, 8), sum(adRevenue) FROM uservisits_1node GROUP BY substring(sourceIP, 1, 8) + +SELECT + sourceIP, + sum(adRevenue) AS totalRevenue, + avg(pageRank) AS pageRank +FROM rankings_1node ALL INNER JOIN +( + SELECT + sourceIP, + destinationURL AS pageURL, + adRevenue + FROM uservisits_1node + WHERE (visitDate > '1980-01-01') AND (visitDate < '1980-04-01') +) USING pageURL +GROUP BY sourceIP +ORDER BY totalRevenue DESC +LIMIT 1 +``` + +[Artículo Original](https://clickhouse.tech/docs/es/getting_started/example_datasets/amplab_benchmark/) diff --git a/docs/es/getting_started/example_datasets/criteo.md b/docs/es/getting_started/example_datasets/criteo.md new file mode 100644 index 00000000000..89922f00679 --- /dev/null +++ b/docs/es/getting_started/example_datasets/criteo.md @@ -0,0 +1,74 @@ +# Terabyte de registros de clics de Criteo {#terabyte-of-click-logs-from-criteo} + +Descargue los datos de http://labs.criteo.com/downloads/download-terabyte-click-logs/ + +Cree una tabla para importar el 
registro: + +``` sql +CREATE TABLE criteo_log (date Date, clicked UInt8, int1 Int32, int2 Int32, int3 Int32, int4 Int32, int5 Int32, int6 Int32, int7 Int32, int8 Int32, int9 Int32, int10 Int32, int11 Int32, int12 Int32, int13 Int32, cat1 String, cat2 String, cat3 String, cat4 String, cat5 String, cat6 String, cat7 String, cat8 String, cat9 String, cat10 String, cat11 String, cat12 String, cat13 String, cat14 String, cat15 String, cat16 String, cat17 String, cat18 String, cat19 String, cat20 String, cat21 String, cat22 String, cat23 String, cat24 String, cat25 String, cat26 String) ENGINE = Log +``` + +Descargar los datos: + +``` bash +$ for i in {00..23}; do echo $i; zcat datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done +``` + +Crear una tabla para los datos convertidos: + +``` sql +CREATE TABLE criteo +( + date Date, + clicked UInt8, + int1 Int32, + int2 Int32, + int3 Int32, + int4 Int32, + int5 Int32, + int6 Int32, + int7 Int32, + int8 Int32, + int9 Int32, + int10 Int32, + int11 Int32, + int12 Int32, + int13 Int32, + icat1 UInt32, + icat2 UInt32, + icat3 UInt32, + icat4 UInt32, + icat5 UInt32, + icat6 UInt32, + icat7 UInt32, + icat8 UInt32, + icat9 UInt32, + icat10 UInt32, + icat11 UInt32, + icat12 UInt32, + icat13 UInt32, + icat14 UInt32, + icat15 UInt32, + icat16 UInt32, + icat17 UInt32, + icat18 UInt32, + icat19 UInt32, + icat20 UInt32, + icat21 UInt32, + icat22 UInt32, + icat23 UInt32, + icat24 UInt32, + icat25 UInt32, + icat26 UInt32 +) ENGINE = MergeTree(date, intHash32(icat1), (date, intHash32(icat1)), 8192) +``` + +Transforme los datos del registro sin procesar y colóquelos en la segunda tabla: + +``` sql +INSERT INTO criteo SELECT date, clicked, int1, int2, int3, int4, int5, int6, int7, int8, int9, int10, int11, int12, int13, reinterpretAsUInt32(unhex(cat1)) AS icat1, reinterpretAsUInt32(unhex(cat2)) AS icat2, reinterpretAsUInt32(unhex(cat3)) AS icat3, reinterpretAsUInt32(unhex(cat4)) AS icat4, reinterpretAsUInt32(unhex(cat5)) AS icat5, reinterpretAsUInt32(unhex(cat6)) AS icat6, reinterpretAsUInt32(unhex(cat7)) AS icat7, reinterpretAsUInt32(unhex(cat8)) AS icat8, reinterpretAsUInt32(unhex(cat9)) AS icat9, reinterpretAsUInt32(unhex(cat10)) AS icat10, reinterpretAsUInt32(unhex(cat11)) AS icat11, reinterpretAsUInt32(unhex(cat12)) AS icat12, reinterpretAsUInt32(unhex(cat13)) AS icat13, reinterpretAsUInt32(unhex(cat14)) AS icat14, reinterpretAsUInt32(unhex(cat15)) AS icat15, reinterpretAsUInt32(unhex(cat16)) AS icat16, reinterpretAsUInt32(unhex(cat17)) AS icat17, reinterpretAsUInt32(unhex(cat18)) AS icat18, reinterpretAsUInt32(unhex(cat19)) AS icat19, reinterpretAsUInt32(unhex(cat20)) AS icat20, reinterpretAsUInt32(unhex(cat21)) AS icat21, reinterpretAsUInt32(unhex(cat22)) AS icat22, reinterpretAsUInt32(unhex(cat23)) AS icat23, reinterpretAsUInt32(unhex(cat24)) AS icat24, reinterpretAsUInt32(unhex(cat25)) AS icat25, reinterpretAsUInt32(unhex(cat26)) AS icat26 FROM criteo_log; + +DROP TABLE criteo_log; +``` + +[Artículo Original](https://clickhouse.tech/docs/es/getting_started/example_datasets/criteo/) diff --git a/docs/es/getting_started/example_datasets/metrica.md b/docs/es/getting_started/example_datasets/metrica.md new file mode 100644 index 00000000000..bb359351043 --- /dev/null +++ b/docs/es/getting_started/example_datasets/metrica.md @@ -0,0 +1,63 @@ +# Yandex anonimizado.Metrica Datos {#anonymized-yandex-metrica-data} + +El conjunto de datos consta de 
dos tablas que contienen datos anónimos sobre los hits (`hits_v1`) y visitas (`visits_v1`) el Yandex.Métrica. Puedes leer más sobre Yandex.Metrica en [Historial de ClickHouse](../../introduction/history.md) apartado. + +El conjunto de datos consta de dos tablas, cualquiera de ellas se puede descargar como `tsv.xz` o como particiones preparadas. Además, una versión extendida de la `hits` La tabla que contiene 100 millones de filas está disponible como TSV en https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits\_100m\_obfuscated\_v1.tsv.xz y como particiones preparadas en https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits\_100m\_obfuscated\_v1.tar.xz. + +## Obtención de tablas a partir de particiones preparadas {#obtaining-tables-from-prepared-partitions} + +Descargar e importar tabla de hits: + +``` bash +curl -O https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_v1.tar +tar xvf hits_v1.tar -C /var/lib/clickhouse # path to ClickHouse data directory +# check permissions on unpacked data, fix if required +sudo service clickhouse-server restart +clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1" +``` + +Descargar e importar visitas: + +``` bash +curl -O https://clickhouse-datasets.s3.yandex.net/visits/partitions/visits_v1.tar +tar xvf visits_v1.tar -C /var/lib/clickhouse # path to ClickHouse data directory +# check permissions on unpacked data, fix if required +sudo service clickhouse-server restart +clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" +``` + +## Obtención de tablas a partir de un archivo TSV comprimido {#obtaining-tables-from-compressed-tsv-file} + +Descargar e importar hits desde un archivo TSV comprimido: + +``` bash +curl https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv +# now create table +clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" +clickhouse-client --query "CREATE TABLE datasets.hits_v1 ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, 
HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192" +# import data +cat hits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.hits_v1 FORMAT TSV" --max_insert_block_size=100000 +# optionally you can optimize table +clickhouse-client --query "OPTIMIZE TABLE datasets.hits_v1 FINAL" +clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1" +``` + +Descargue e importe visitas desde un archivo tsv comprimido: + +``` bash +curl https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv +# now create table +clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" +clickhouse-client --query "CREATE TABLE datasets.visits_v1 ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, 
GeneralInterests Array(UInt16), Params Array(String), Goals Nested(ID UInt32, Serial UInt32, EventTime DateTime, Price Int64, OrderID String, CurrencyID UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, TraficSource Nested(ID Int8, SearchEngineID UInt16, AdvEngineID UInt8, PlaceID UInt16, SocialSourceNetworkID UInt8, Domain String, SearchPhrase String, SocialSourcePage String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), Market Nested(Type UInt8, GoalID UInt32, OrderID String, OrderPrice Int64, PP UInt32, DirectPlaceID UInt32, DirectOrderID UInt32, DirectBannerID UInt32, GoodID String, GoodName String, GoodQuantity Int32, GoodPrice Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign)" +# import data +cat visits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.visits_v1 FORMAT TSV" --max_insert_block_size=100000 +# optionally you can optimize table +clickhouse-client --query "OPTIMIZE TABLE datasets.visits_v1 FINAL" +clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" +``` + +## Consultas de ejemplo {#example-queries} + +[Tutorial de ClickHouse](../../getting_started/tutorial.md) se basa en Yandex.El conjunto de datos de Metrica y la forma recomendada de comenzar con este conjunto de datos es simplemente pasar por el tutorial. + +Se pueden encontrar ejemplos adicionales de consultas a estas tablas entre [pruebas estatales](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/tests/queries/1_stateful) de ClickHouse (se nombran `test.hists` y `test.visits` Todos los derechos reservados. 
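+
+A modo de ilustración, una consulta rápida de comprobación sobre la tabla importada podría ser:
+
+``` bash
+# Top 10 counters by number of hits in the imported sample.
+clickhouse-client --query "SELECT CounterID, count() AS hits FROM datasets.hits_v1 GROUP BY CounterID ORDER BY hits DESC LIMIT 10"
+```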
diff --git a/docs/es/getting_started/example_datasets/nyc_taxi.md b/docs/es/getting_started/example_datasets/nyc_taxi.md new file mode 100644 index 00000000000..9d2499e2f42 --- /dev/null +++ b/docs/es/getting_started/example_datasets/nyc_taxi.md @@ -0,0 +1,383 @@ +# Datos de taxis de Nueva York {#new-york-taxi-data} + +Este conjunto de datos se puede obtener de dos maneras: + +- importación de datos sin procesar +- descarga de particiones preparadas + +## Cómo importar los datos sin procesar {#how-to-import-the-raw-data} + +Consulte https://github.com/toddwschneider/nyc-taxi-data y http://tech.marksblogg.com/billion-nyc-taxi-rides-redshift.html para obtener la descripción de un conjunto de datos e instrucciones para descargar. + +La descarga dará como resultado aproximadamente 227 GB de datos sin comprimir en archivos CSV. La descarga tarda aproximadamente una hora en una conexión de 1 Gbit (la descarga paralela de s3.amazonaws.com recupera al menos la mitad de un canal de 1 Gbit). +Es posible que algunos de los archivos no se descarguen por completo. Verifique los tamaños de archivo y vuelva a descargar cualquiera que parezca dudoso. + +Algunos de los archivos pueden contener filas no válidas. Puede arreglarlos de la siguiente manera: + +``` bash +sed -E '/(.*,){18,}/d' data/yellow_tripdata_2010-02.csv > data/yellow_tripdata_2010-02.csv_ +sed -E '/(.*,){18,}/d' data/yellow_tripdata_2010-03.csv > data/yellow_tripdata_2010-03.csv_ +mv data/yellow_tripdata_2010-02.csv_ data/yellow_tripdata_2010-02.csv +mv data/yellow_tripdata_2010-03.csv_ data/yellow_tripdata_2010-03.csv +``` + +Entonces los datos deben ser preprocesados en PostgreSQL. Esto creará selecciones de puntos en los polígonos (para hacer coincidir los puntos en el mapa con los distritos de la ciudad de Nueva York) y combinará todos los datos en una única tabla plana desnormalizada mediante el uso de una unión. Para hacer esto, deberá instalar PostgreSQL con soporte PostGIS. + +Tenga cuidado al correr `initialize_database.sh` y volver a verificar manualmente que todas las tablas se crearon correctamente. + +Se tarda entre 20 y 30 minutos en procesar los datos de cada mes en PostgreSQL, por un total de aproximadamente 48 horas. + +Puede comprobar el número de filas descargadas de la siguiente manera: + +``` bash +$ time psql nyc-taxi-data -c "SELECT count(*) FROM trips;" +## Count + 1298979494 +(1 row) + +real 7m9.164s +``` + +(Esto es un poco más de 1.1 mil millones de filas reportadas por Mark Litwintschik en una serie de publicaciones de blog.) + +Los datos en PostgreSQL utilizan 370 GB de espacio. 
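+
+Si quiere contrastar esa cifra en su instalación, puede consultarla directamente en PostgreSQL (se usan funciones estándar de PostgreSQL; el valor exacto variará):
+
+``` bash
+$ psql nyc-taxi-data -c "SELECT pg_size_pretty(pg_total_relation_size('trips'));"
+```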
+ +Exportación de los datos de PostgreSQL: + +``` sql +COPY +( + SELECT trips.id, + trips.vendor_id, + trips.pickup_datetime, + trips.dropoff_datetime, + trips.store_and_fwd_flag, + trips.rate_code_id, + trips.pickup_longitude, + trips.pickup_latitude, + trips.dropoff_longitude, + trips.dropoff_latitude, + trips.passenger_count, + trips.trip_distance, + trips.fare_amount, + trips.extra, + trips.mta_tax, + trips.tip_amount, + trips.tolls_amount, + trips.ehail_fee, + trips.improvement_surcharge, + trips.total_amount, + trips.payment_type, + trips.trip_type, + trips.pickup, + trips.dropoff, + + cab_types.type cab_type, + + weather.precipitation_tenths_of_mm rain, + weather.snow_depth_mm, + weather.snowfall_mm, + weather.max_temperature_tenths_degrees_celsius max_temp, + weather.min_temperature_tenths_degrees_celsius min_temp, + weather.average_wind_speed_tenths_of_meters_per_second wind, + + pick_up.gid pickup_nyct2010_gid, + pick_up.ctlabel pickup_ctlabel, + pick_up.borocode pickup_borocode, + pick_up.boroname pickup_boroname, + pick_up.ct2010 pickup_ct2010, + pick_up.boroct2010 pickup_boroct2010, + pick_up.cdeligibil pickup_cdeligibil, + pick_up.ntacode pickup_ntacode, + pick_up.ntaname pickup_ntaname, + pick_up.puma pickup_puma, + + drop_off.gid dropoff_nyct2010_gid, + drop_off.ctlabel dropoff_ctlabel, + drop_off.borocode dropoff_borocode, + drop_off.boroname dropoff_boroname, + drop_off.ct2010 dropoff_ct2010, + drop_off.boroct2010 dropoff_boroct2010, + drop_off.cdeligibil dropoff_cdeligibil, + drop_off.ntacode dropoff_ntacode, + drop_off.ntaname dropoff_ntaname, + drop_off.puma dropoff_puma + FROM trips + LEFT JOIN cab_types + ON trips.cab_type_id = cab_types.id + LEFT JOIN central_park_weather_observations_raw weather + ON weather.date = trips.pickup_datetime::date + LEFT JOIN nyct2010 pick_up + ON pick_up.gid = trips.pickup_nyct2010_gid + LEFT JOIN nyct2010 drop_off + ON drop_off.gid = trips.dropoff_nyct2010_gid +) TO '/opt/milovidov/nyc-taxi-data/trips.tsv'; +``` + +La instantánea de datos se crea a una velocidad de aproximadamente 50 MB por segundo. Al crear la instantánea, PostgreSQL lee desde el disco a una velocidad de aproximadamente 28 MB por segundo. +Esto toma alrededor de 5 horas. El archivo TSV resultante es 590612904969 bytes. 
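+
+Antes de cargar el archivo en ClickHouse, puede verificar (opcionalmente) que la exportación se completó:
+
+``` bash
+$ ls -l trips.tsv    # should report roughly 590612904969 bytes
+$ wc -l trips.tsv    # number of exported rows
+```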
+ +Crear una tabla temporal en ClickHouse: + +``` sql +CREATE TABLE trips +( +trip_id UInt32, +vendor_id String, +pickup_datetime DateTime, +dropoff_datetime Nullable(DateTime), +store_and_fwd_flag Nullable(FixedString(1)), +rate_code_id Nullable(UInt8), +pickup_longitude Nullable(Float64), +pickup_latitude Nullable(Float64), +dropoff_longitude Nullable(Float64), +dropoff_latitude Nullable(Float64), +passenger_count Nullable(UInt8), +trip_distance Nullable(Float64), +fare_amount Nullable(Float32), +extra Nullable(Float32), +mta_tax Nullable(Float32), +tip_amount Nullable(Float32), +tolls_amount Nullable(Float32), +ehail_fee Nullable(Float32), +improvement_surcharge Nullable(Float32), +total_amount Nullable(Float32), +payment_type Nullable(String), +trip_type Nullable(UInt8), +pickup Nullable(String), +dropoff Nullable(String), +cab_type Nullable(String), +precipitation Nullable(UInt8), +snow_depth Nullable(UInt8), +snowfall Nullable(UInt8), +max_temperature Nullable(UInt8), +min_temperature Nullable(UInt8), +average_wind_speed Nullable(UInt8), +pickup_nyct2010_gid Nullable(UInt8), +pickup_ctlabel Nullable(String), +pickup_borocode Nullable(UInt8), +pickup_boroname Nullable(String), +pickup_ct2010 Nullable(String), +pickup_boroct2010 Nullable(String), +pickup_cdeligibil Nullable(FixedString(1)), +pickup_ntacode Nullable(String), +pickup_ntaname Nullable(String), +pickup_puma Nullable(String), +dropoff_nyct2010_gid Nullable(UInt8), +dropoff_ctlabel Nullable(String), +dropoff_borocode Nullable(UInt8), +dropoff_boroname Nullable(String), +dropoff_ct2010 Nullable(String), +dropoff_boroct2010 Nullable(String), +dropoff_cdeligibil Nullable(String), +dropoff_ntacode Nullable(String), +dropoff_ntaname Nullable(String), +dropoff_puma Nullable(String) +) ENGINE = Log; +``` + +Es necesario para convertir campos a tipos de datos más correctos y, si es posible, para eliminar NULL. + +``` bash +$ time clickhouse-client --query="INSERT INTO trips FORMAT TabSeparated" < trips.tsv + +real 75m56.214s +``` + +Los datos se leen a una velocidad de 112-140 Mb / segundo. +La carga de datos en una tabla de tipos de registro en una secuencia tardó 76 minutos. +Los datos de esta tabla utilizan 142 GB. + +(Importar datos directamente desde Postgres también es posible usando `COPY ... TO PROGRAM`.) + +Unfortunately, all the fields associated with the weather (precipitation…average\_wind\_speed) were filled with NULL. Because of this, we will remove them from the final data set. + +Para empezar, crearemos una tabla en un único servidor. Posteriormente haremos la mesa distribuida. 
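+
+Para comprobar la afirmación anterior sobre los campos meteorológicos, una verificación ilustrativa sobre la tabla temporal `trips` podría ser:
+
+``` bash
+$ clickhouse-client --query "SELECT count() AS total_rows, countIf(isNotNull(precipitation)) AS non_null_precipitation, countIf(isNotNull(average_wind_speed)) AS non_null_wind FROM trips"
+```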
+ +Crear y rellenar una tabla de resumen: + +``` sql +CREATE TABLE trips_mergetree +ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) +AS SELECT + +trip_id, +CAST(vendor_id AS Enum8('1' = 1, '2' = 2, 'CMT' = 3, 'VTS' = 4, 'DDS' = 5, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14)) AS vendor_id, +toDate(pickup_datetime) AS pickup_date, +ifNull(pickup_datetime, toDateTime(0)) AS pickup_datetime, +toDate(dropoff_datetime) AS dropoff_date, +ifNull(dropoff_datetime, toDateTime(0)) AS dropoff_datetime, +assumeNotNull(store_and_fwd_flag) IN ('Y', '1', '2') AS store_and_fwd_flag, +assumeNotNull(rate_code_id) AS rate_code_id, +assumeNotNull(pickup_longitude) AS pickup_longitude, +assumeNotNull(pickup_latitude) AS pickup_latitude, +assumeNotNull(dropoff_longitude) AS dropoff_longitude, +assumeNotNull(dropoff_latitude) AS dropoff_latitude, +assumeNotNull(passenger_count) AS passenger_count, +assumeNotNull(trip_distance) AS trip_distance, +assumeNotNull(fare_amount) AS fare_amount, +assumeNotNull(extra) AS extra, +assumeNotNull(mta_tax) AS mta_tax, +assumeNotNull(tip_amount) AS tip_amount, +assumeNotNull(tolls_amount) AS tolls_amount, +assumeNotNull(ehail_fee) AS ehail_fee, +assumeNotNull(improvement_surcharge) AS improvement_surcharge, +assumeNotNull(total_amount) AS total_amount, +CAST((assumeNotNull(payment_type) AS pt) IN ('CSH', 'CASH', 'Cash', 'CAS', 'Cas', '1') ? 'CSH' : (pt IN ('CRD', 'Credit', 'Cre', 'CRE', 'CREDIT', '2') ? 'CRE' : (pt IN ('NOC', 'No Charge', 'No', '3') ? 'NOC' : (pt IN ('DIS', 'Dispute', 'Dis', '4') ? 'DIS' : 'UNK'))) AS Enum8('CSH' = 1, 'CRE' = 2, 'UNK' = 0, 'NOC' = 3, 'DIS' = 4)) AS payment_type_, +assumeNotNull(trip_type) AS trip_type, +ifNull(toFixedString(unhex(pickup), 25), toFixedString('', 25)) AS pickup, +ifNull(toFixedString(unhex(dropoff), 25), toFixedString('', 25)) AS dropoff, +CAST(assumeNotNull(cab_type) AS Enum8('yellow' = 1, 'green' = 2, 'uber' = 3)) AS cab_type, + +assumeNotNull(pickup_nyct2010_gid) AS pickup_nyct2010_gid, +toFloat32(ifNull(pickup_ctlabel, '0')) AS pickup_ctlabel, +assumeNotNull(pickup_borocode) AS pickup_borocode, +CAST(assumeNotNull(pickup_boroname) AS Enum8('Manhattan' = 1, 'Queens' = 4, 'Brooklyn' = 3, '' = 0, 'Bronx' = 2, 'Staten Island' = 5)) AS pickup_boroname, +toFixedString(ifNull(pickup_ct2010, '000000'), 6) AS pickup_ct2010, +toFixedString(ifNull(pickup_boroct2010, '0000000'), 7) AS pickup_boroct2010, +CAST(assumeNotNull(ifNull(pickup_cdeligibil, ' ')) AS Enum8(' ' = 0, 'E' = 1, 'I' = 2)) AS pickup_cdeligibil, +toFixedString(ifNull(pickup_ntacode, '0000'), 4) AS pickup_ntacode, + +CAST(assumeNotNull(pickup_ntaname) AS Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 
'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 
157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195)) AS pickup_ntaname, + +toUInt16(ifNull(pickup_puma, '0')) AS pickup_puma, + +assumeNotNull(dropoff_nyct2010_gid) AS dropoff_nyct2010_gid, +toFloat32(ifNull(dropoff_ctlabel, '0')) AS dropoff_ctlabel, +assumeNotNull(dropoff_borocode) AS dropoff_borocode, +CAST(assumeNotNull(dropoff_boroname) AS Enum8('Manhattan' = 1, 'Queens' = 4, 'Brooklyn' = 3, '' = 0, 'Bronx' = 2, 'Staten Island' = 5)) AS dropoff_boroname, +toFixedString(ifNull(dropoff_ct2010, '000000'), 6) AS dropoff_ct2010, +toFixedString(ifNull(dropoff_boroct2010, '0000000'), 7) AS dropoff_boroct2010, +CAST(assumeNotNull(ifNull(dropoff_cdeligibil, ' ')) AS Enum8(' ' = 0, 'E' = 1, 'I' = 2)) AS dropoff_cdeligibil, +toFixedString(ifNull(dropoff_ntacode, '0000'), 4) AS dropoff_ntacode, + +CAST(assumeNotNull(dropoff_ntaname) AS Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 
'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. 
Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195)) AS dropoff_ntaname, + +toUInt16(ifNull(dropoff_puma, '0')) AS dropoff_puma + +FROM trips +``` + +Esto toma 3030 segundos a una velocidad de aproximadamente 428,000 filas por segundo. +Para cargarlo más rápido, puede crear la tabla con el `Log` motor en lugar de `MergeTree`. En este caso, la descarga funciona más rápido que 200 segundos. + +La tabla utiliza 126 GB de espacio en disco. + +``` sql +SELECT formatReadableSize(sum(bytes)) FROM system.parts WHERE table = 'trips_mergetree' AND active +``` + +``` text +┌─formatReadableSize(sum(bytes))─┐ +│ 126.18 GiB │ +└────────────────────────────────┘ +``` + +Entre otras cosas, puede ejecutar la consulta OPTIMIZE en MergeTree. Pero no es necesario ya que todo estará bien sin él. + +## Descarga de Particiones Preparadas {#download-of-prepared-partitions} + +``` bash +$ curl -O https://clickhouse-datasets.s3.yandex.net/trips_mergetree/partitions/trips_mergetree.tar +$ tar xvf trips_mergetree.tar -C /var/lib/clickhouse # path to ClickHouse data directory +$ # check permissions of unpacked data, fix if required +$ sudo service clickhouse-server restart +$ clickhouse-client --query "select count(*) from datasets.trips_mergetree" +``` + +!!! info "INFO" + Si va a ejecutar las consultas que se describen a continuación, debe usar el nombre completo de la tabla, `datasets.trips_mergetree`. + +## Resultados en un solo servidor {#results-on-single-server} + +Q1: + +``` sql +SELECT cab_type, count(*) FROM trips_mergetree GROUP BY cab_type +``` + +0.490 segundos. + +Q2: + +``` sql +SELECT passenger_count, avg(total_amount) FROM trips_mergetree GROUP BY passenger_count +``` + +1.224 segundos. + +Q3: + +``` sql +SELECT passenger_count, toYear(pickup_date) AS year, count(*) FROM trips_mergetree GROUP BY passenger_count, year +``` + +2.104 segundos. + +Q4: + +``` sql +SELECT passenger_count, toYear(pickup_date) AS year, round(trip_distance) AS distance, count(*) +FROM trips_mergetree +GROUP BY passenger_count, year, distance +ORDER BY year, count(*) DESC +``` + +3.593 segundos. + +Se utilizó el siguiente servidor: + +Dos CPU Intel (R) Xeon (R) E5-2650 v2 @ 2.60GHz, 16 núcleos físicos en total, 128 GiB RAM, 8x6 TB HD en hardware RAID-5 + +El tiempo de ejecución es el mejor de tres carreras. Pero a partir de la segunda ejecución, las consultas leen datos de la memoria caché del sistema de archivos. 
No se produce más almacenamiento en caché: los datos se leen y procesan en cada ejecución. + +Creación de una tabla en tres servidores: + +En cada servidor: + +``` sql +CREATE TABLE default.trips_mergetree_third ( trip_id UInt32, vendor_id Enum8('1' = 1, '2' = 2, 'CMT' = 3, 'VTS' = 4, 'DDS' = 5, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14), pickup_date Date, pickup_datetime DateTime, dropoff_date Date, dropoff_datetime DateTime, store_and_fwd_flag UInt8, rate_code_id UInt8, pickup_longitude Float64, pickup_latitude Float64, dropoff_longitude Float64, dropoff_latitude Float64, passenger_count UInt8, trip_distance Float64, fare_amount Float32, extra Float32, mta_tax Float32, tip_amount Float32, tolls_amount Float32, ehail_fee Float32, improvement_surcharge Float32, total_amount Float32, payment_type_ Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4), trip_type UInt8, pickup FixedString(25), dropoff FixedString(25), cab_type Enum8('yellow' = 1, 'green' = 2, 'uber' = 3), pickup_nyct2010_gid UInt8, pickup_ctlabel Float32, pickup_borocode UInt8, pickup_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), pickup_ct2010 FixedString(6), pickup_boroct2010 FixedString(7), pickup_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), pickup_ntacode FixedString(4), pickup_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), pickup_puma UInt16, dropoff_nyct2010_gid UInt8, dropoff_ctlabel Float32, dropoff_borocode UInt8, dropoff_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), dropoff_ct2010 FixedString(6), dropoff_boroct2010 FixedString(7), dropoff_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), dropoff_ntacode FixedString(4), dropoff_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), dropoff_puma UInt16) ENGINE = MergeTree(pickup_date, pickup_datetime, 8192)
+```
+
+En el servidor de origen:
+
+``` sql
+CREATE TABLE trips_mergetree_x3 AS trips_mergetree_third ENGINE = Distributed(perftest, default, trips_mergetree_third, rand())
+```
+
+La siguiente consulta redistribuye los datos:
+
+``` sql
+INSERT INTO trips_mergetree_x3 SELECT * FROM trips_mergetree
+```
+
+Esto tarda 2454 segundos.
+
+En tres servidores:
+
+Q1: 0.212 segundos.
+Q2: 0.438 segundos.
+Q3: 0.733 segundos.
+Q4: 1.241 segundos.
+
+No hay sorpresas aquí, ya que las consultas se escalan linealmente.
+
+También tenemos los resultados de un clúster de 140 servidores:
+
+Q1: 0,028 seg.
+Q2: 0,043 seg.
+Q3: 0,051 seg.
+Q4: 0,072 seg.
+
+En este caso, el tiempo de procesamiento de la consulta está determinado sobre todo por la latencia de la red.
+Ejecutamos consultas utilizando un cliente ubicado en un centro de datos de Yandex en Finlandia en un clúster en Rusia, que agregó aproximadamente 20 ms de latencia.
+
+## Resumen {#summary}
+
+| servidores | Q1    | Q2    | Q3    | Q4    |
+|------------|-------|-------|-------|-------|
+| 1          | 0.490 | 1.224 | 2.104 | 3.593 |
+| 3          | 0.212 | 0.438 | 0.733 | 1.241 |
+| 140        | 0.028 | 0.043 | 0.051 | 0.072 |
+
+[Artículo Original](https://clickhouse.tech/docs/es/getting_started/example_datasets/nyc_taxi/)
diff --git a/docs/es/getting_started/example_datasets/ontime.md b/docs/es/getting_started/example_datasets/ontime.md
new file mode 100644
index 00000000000..a3d22cf747a
--- /dev/null
+++ b/docs/es/getting_started/example_datasets/ontime.md
@@ -0,0 +1,405 @@
+# OnTime {#ontime}
+
+Este conjunto de datos se puede obtener de dos maneras:
+
+- importación de datos sin procesar
+- descarga de particiones preparadas
+
+## Importar desde datos sin procesar {#import-from-raw-data}
+
+Descarga de datos:
+
+``` bash
+for s in `seq 1987 2018`
+do
+for m in `seq 1 12`
+do
+wget https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_${s}_${m}.zip
+done
+done
+```
+
+(a partir de https://github.com/Percona-Lab/ontime-airline-performance/blob/master/download.sh )
+
+Creación de una tabla:
+
+``` sql
+CREATE TABLE `ontime` (
+ `Year` UInt16,
+ `Quarter` UInt8,
+ `Month` UInt8,
+ `DayofMonth` UInt8,
+ `DayOfWeek` UInt8,
+ `FlightDate` Date,
+ `UniqueCarrier` FixedString(7),
+ `AirlineID` Int32,
+ `Carrier` FixedString(2),
+ `TailNum` String,
+ `FlightNum` String,
+ `OriginAirportID` Int32,
+ `OriginAirportSeqID` Int32,
+ `OriginCityMarketID` Int32,
+ `Origin` FixedString(5),
+ `OriginCityName` String,
+ `OriginState` FixedString(2),
+ `OriginStateFips` String,
+ `OriginStateName` String,
+ `OriginWac` Int32,
+ `DestAirportID` Int32,
+ `DestAirportSeqID` Int32,
+ `DestCityMarketID` Int32,
+ `Dest` FixedString(5),
+ `DestCityName` String,
+ `DestState` FixedString(2),
+ `DestStateFips` String,
+ `DestStateName` String,
+ `DestWac` Int32,
+ `CRSDepTime` Int32,
+ `DepTime` Int32,
+ `DepDelay` Int32,
+
`DepDelayMinutes` Int32, + `DepDel15` Int32, + `DepartureDelayGroups` String, + `DepTimeBlk` String, + `TaxiOut` Int32, + `WheelsOff` Int32, + `WheelsOn` Int32, + `TaxiIn` Int32, + `CRSArrTime` Int32, + `ArrTime` Int32, + `ArrDelay` Int32, + `ArrDelayMinutes` Int32, + `ArrDel15` Int32, + `ArrivalDelayGroups` Int32, + `ArrTimeBlk` String, + `Cancelled` UInt8, + `CancellationCode` FixedString(1), + `Diverted` UInt8, + `CRSElapsedTime` Int32, + `ActualElapsedTime` Int32, + `AirTime` Int32, + `Flights` Int32, + `Distance` Int32, + `DistanceGroup` UInt8, + `CarrierDelay` Int32, + `WeatherDelay` Int32, + `NASDelay` Int32, + `SecurityDelay` Int32, + `LateAircraftDelay` Int32, + `FirstDepTime` String, + `TotalAddGTime` String, + `LongestAddGTime` String, + `DivAirportLandings` String, + `DivReachedDest` String, + `DivActualElapsedTime` String, + `DivArrDelay` String, + `DivDistance` String, + `Div1Airport` String, + `Div1AirportID` Int32, + `Div1AirportSeqID` Int32, + `Div1WheelsOn` String, + `Div1TotalGTime` String, + `Div1LongestGTime` String, + `Div1WheelsOff` String, + `Div1TailNum` String, + `Div2Airport` String, + `Div2AirportID` Int32, + `Div2AirportSeqID` Int32, + `Div2WheelsOn` String, + `Div2TotalGTime` String, + `Div2LongestGTime` String, + `Div2WheelsOff` String, + `Div2TailNum` String, + `Div3Airport` String, + `Div3AirportID` Int32, + `Div3AirportSeqID` Int32, + `Div3WheelsOn` String, + `Div3TotalGTime` String, + `Div3LongestGTime` String, + `Div3WheelsOff` String, + `Div3TailNum` String, + `Div4Airport` String, + `Div4AirportID` Int32, + `Div4AirportSeqID` Int32, + `Div4WheelsOn` String, + `Div4TotalGTime` String, + `Div4LongestGTime` String, + `Div4WheelsOff` String, + `Div4TailNum` String, + `Div5Airport` String, + `Div5AirportID` Int32, + `Div5AirportSeqID` Int32, + `Div5WheelsOn` String, + `Div5TotalGTime` String, + `Div5LongestGTime` String, + `Div5WheelsOff` String, + `Div5TailNum` String +) ENGINE = MergeTree +PARTITION BY Year +ORDER BY (Carrier, FlightDate) +SETTINGS index_granularity = 8192; +``` + +Carga de datos: + +``` bash +$ for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done +``` + +## Descarga de Particiones Preparadas {#download-of-prepared-partitions} + +``` bash +$ curl -O https://clickhouse-datasets.s3.yandex.net/ontime/partitions/ontime.tar +$ tar xvf ontime.tar -C /var/lib/clickhouse # path to ClickHouse data directory +$ # check permissions of unpacked data, fix if required +$ sudo service clickhouse-server restart +$ clickhouse-client --query "select count(*) from datasets.ontime" +``` + +!!! info "INFO" + Si va a ejecutar las consultas que se describen a continuación, debe usar el nombre completo de la tabla, `datasets.ontime`. + +## Consulta {#queries} + +Q0. + +``` sql +SELECT avg(c1) +FROM +( + SELECT Year, Month, count(*) AS c1 + FROM ontime + GROUP BY Year, Month +); +``` + +Q1. El número de vuelos por día desde el año 2000 hasta 2008 + +``` sql +SELECT DayOfWeek, count(*) AS c +FROM ontime +WHERE Year>=2000 AND Year<=2008 +GROUP BY DayOfWeek +ORDER BY c DESC; +``` + +Preguntas frecuentes El número de vuelos retrasados por más de 10 minutos, agrupados por el día de la semana, para 2000-2008 + +``` sql +SELECT DayOfWeek, count(*) AS c +FROM ontime +WHERE DepDelay>10 AND Year>=2000 AND Year<=2008 +GROUP BY DayOfWeek +ORDER BY c DESC; +``` + +Q3. 
El número de retrasos por aeropuerto para 2000-2008
+
+``` sql
+SELECT Origin, count(*) AS c
+FROM ontime
+WHERE DepDelay>10 AND Year>=2000 AND Year<=2008
+GROUP BY Origin
+ORDER BY c DESC
+LIMIT 10;
+```
+
+Q4. Número de retrasos por transportista para 2007
+
+``` sql
+SELECT Carrier, count(*)
+FROM ontime
+WHERE DepDelay>10 AND Year=2007
+GROUP BY Carrier
+ORDER BY count(*) DESC;
+```
+
+Q5. El porcentaje de retrasos por transportista para 2007
+
+``` sql
+SELECT Carrier, c, c2, c*100/c2 as c3
+FROM
+(
+ SELECT
+ Carrier,
+ count(*) AS c
+ FROM ontime
+ WHERE DepDelay>10
+ AND Year=2007
+ GROUP BY Carrier
+)
+JOIN
+(
+ SELECT
+ Carrier,
+ count(*) AS c2
+ FROM ontime
+ WHERE Year=2007
+ GROUP BY Carrier
+) USING Carrier
+ORDER BY c3 DESC;
+```
+
+Mejor versión de la misma consulta:
+
+``` sql
+SELECT Carrier, avg(DepDelay>10)*100 AS c3
+FROM ontime
+WHERE Year=2007
+GROUP BY Carrier
+ORDER BY c3 DESC
+```
+
+Q6. La consulta anterior para un rango más amplio de años, 2000-2008
+
+``` sql
+SELECT Carrier, c, c2, c*100/c2 as c3
+FROM
+(
+ SELECT
+ Carrier,
+ count(*) AS c
+ FROM ontime
+ WHERE DepDelay>10
+ AND Year>=2000 AND Year<=2008
+ GROUP BY Carrier
+)
+JOIN
+(
+ SELECT
+ Carrier,
+ count(*) AS c2
+ FROM ontime
+ WHERE Year>=2000 AND Year<=2008
+ GROUP BY Carrier
+) USING Carrier
+ORDER BY c3 DESC;
+```
+
+Mejor versión de la misma consulta:
+
+``` sql
+SELECT Carrier, avg(DepDelay>10)*100 AS c3
+FROM ontime
+WHERE Year>=2000 AND Year<=2008
+GROUP BY Carrier
+ORDER BY c3 DESC;
+```
+
+Q7. Porcentaje de vuelos retrasados por más de 10 minutos, por año
+
+``` sql
+SELECT Year, c1/c2
+FROM
+(
+ select
+ Year,
+ count(*)*100 as c1
+ from ontime
+ WHERE DepDelay>10
+ GROUP BY Year
+)
+JOIN
+(
+ select
+ Year,
+ count(*) as c2
+ from ontime
+ GROUP BY Year
+) USING (Year)
+ORDER BY Year;
+```
+
+Mejor versión de la misma consulta:
+
+``` sql
+SELECT Year, avg(DepDelay>10)*100
+FROM ontime
+GROUP BY Year
+ORDER BY Year;
+```
+
+Q8. Los destinos más populares por el número de ciudades conectadas directamente, para varios rangos de años
+
+``` sql
+SELECT DestCityName, uniqExact(OriginCityName) AS u
+FROM ontime
+WHERE Year >= 2000 and Year <= 2010
+GROUP BY DestCityName
+ORDER BY u DESC LIMIT 10;
+```
+
+Q9.
+
+``` sql
+SELECT Year, count(*) AS c1
+FROM ontime
+GROUP BY Year;
+```
+
+Q10.
+
+``` sql
+SELECT
+ min(Year), max(Year), Carrier, count(*) AS cnt,
+ sum(ArrDelayMinutes>30) AS flights_delayed,
+ round(sum(ArrDelayMinutes>30)/count(*),2) AS rate
+FROM ontime
+WHERE
+ DayOfWeek NOT IN (6,7) AND OriginState NOT IN ('AK', 'HI', 'PR', 'VI')
+ AND DestState NOT IN ('AK', 'HI', 'PR', 'VI')
+ AND FlightDate < '2010-01-01'
+GROUP by Carrier
+HAVING cnt>100000 and max(Year)>1990
+ORDER by rate DESC
+LIMIT 1000;
+```
+
+Bonificación:
+
+``` sql
+SELECT avg(cnt)
+FROM
+(
+ SELECT Year,Month,count(*) AS cnt
+ FROM ontime
+ WHERE DepDel15=1
+ GROUP BY Year,Month
+);
+
+SELECT avg(c1) FROM
+(
+ SELECT Year,Month,count(*) AS c1
+ FROM ontime
+ GROUP BY Year,Month
+);
+
+SELECT DestCityName, uniqExact(OriginCityName) AS u
+FROM ontime
+GROUP BY DestCityName
+ORDER BY u DESC
+LIMIT 10;
+
+SELECT OriginCityName, DestCityName, count() AS c
+FROM ontime
+GROUP BY OriginCityName, DestCityName
+ORDER BY c DESC
+LIMIT 10;
+
+SELECT OriginCityName, count() AS c
+FROM ontime
+GROUP BY OriginCityName
+ORDER BY c DESC
+LIMIT 10;
+```
+
+Esta prueba de rendimiento fue creada por Vadim Tkachenko.
Ver: + +- https://www.percona.com/blog/2009/10/02/analyzing-air-traffic-performance-with-infobright-and-monetdb/ +- https://www.percona.com/blog/2009/10/26/air-traffic-queries-in-luciddb/ +- https://www.percona.com/blog/2009/11/02/air-traffic-queries-in-infinidb-early-alpha/ +- https://www.percona.com/blog/2014/04/21/using-apache-hadoop-and-impala-together-with-mysql-for-data-analysis/ +- https://www.percona.com/blog/2016/01/07/apache-spark-with-air-ontime-performance-data/ +- http://nickmakos.blogspot.ru/2012/08/analyzing-air-traffic-performance-with.html + +[Artículo Original](https://clickhouse.tech/docs/es/getting_started/example_datasets/ontime/) diff --git a/docs/es/getting_started/example_datasets/star_schema.md b/docs/es/getting_started/example_datasets/star_schema.md new file mode 100644 index 00000000000..45a2d1a76e9 --- /dev/null +++ b/docs/es/getting_started/example_datasets/star_schema.md @@ -0,0 +1,363 @@ +# Estrella Schema Benchmark {#star-schema-benchmark} + +Compilación de dbgen: + +``` bash +$ git clone git@github.com:vadimtk/ssb-dbgen.git +$ cd ssb-dbgen +$ make +``` + +Generación de datos: + +!!! warning "Atención" + Desventaja `-s 100` dbgen genera 600 millones de filas (67 GB), mientras que `-s 1000` genera 6 mil millones de filas (lo que lleva mucho tiempo) + +``` bash +$ ./dbgen -s 1000 -T c +$ ./dbgen -s 1000 -T l +$ ./dbgen -s 1000 -T p +$ ./dbgen -s 1000 -T s +$ ./dbgen -s 1000 -T d +``` + +Creación de tablas en ClickHouse: + +``` sql +CREATE TABLE customer +( + C_CUSTKEY UInt32, + C_NAME String, + C_ADDRESS String, + C_CITY LowCardinality(String), + C_NATION LowCardinality(String), + C_REGION LowCardinality(String), + C_PHONE String, + C_MKTSEGMENT LowCardinality(String) +) +ENGINE = MergeTree ORDER BY (C_CUSTKEY); + +CREATE TABLE lineorder +( + LO_ORDERKEY UInt32, + LO_LINENUMBER UInt8, + LO_CUSTKEY UInt32, + LO_PARTKEY UInt32, + LO_SUPPKEY UInt32, + LO_ORDERDATE Date, + LO_ORDERPRIORITY LowCardinality(String), + LO_SHIPPRIORITY UInt8, + LO_QUANTITY UInt8, + LO_EXTENDEDPRICE UInt32, + LO_ORDTOTALPRICE UInt32, + LO_DISCOUNT UInt8, + LO_REVENUE UInt32, + LO_SUPPLYCOST UInt32, + LO_TAX UInt8, + LO_COMMITDATE Date, + LO_SHIPMODE LowCardinality(String) +) +ENGINE = MergeTree PARTITION BY toYear(LO_ORDERDATE) ORDER BY (LO_ORDERDATE, LO_ORDERKEY); + +CREATE TABLE part +( + P_PARTKEY UInt32, + P_NAME String, + P_MFGR LowCardinality(String), + P_CATEGORY LowCardinality(String), + P_BRAND LowCardinality(String), + P_COLOR LowCardinality(String), + P_TYPE LowCardinality(String), + P_SIZE UInt8, + P_CONTAINER LowCardinality(String) +) +ENGINE = MergeTree ORDER BY P_PARTKEY; + +CREATE TABLE supplier +( + S_SUPPKEY UInt32, + S_NAME String, + S_ADDRESS String, + S_CITY LowCardinality(String), + S_NATION LowCardinality(String), + S_REGION LowCardinality(String), + S_PHONE String +) +ENGINE = MergeTree ORDER BY S_SUPPKEY; +``` + +Inserte datos: + +``` bash +$ clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl +$ clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl +$ clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl +$ clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl +``` + +Conversión “star schema” a desnormalizado “flat schema”: + +``` sql +SET max_memory_usage = 20000000000; + +CREATE TABLE lineorder_flat +ENGINE = MergeTree +PARTITION BY toYear(LO_ORDERDATE) +ORDER BY (LO_ORDERDATE, LO_ORDERKEY) AS +SELECT + l.LO_ORDERKEY AS LO_ORDERKEY, + l.LO_LINENUMBER AS LO_LINENUMBER, + 
l.LO_CUSTKEY AS LO_CUSTKEY,
+ l.LO_PARTKEY AS LO_PARTKEY,
+ l.LO_SUPPKEY AS LO_SUPPKEY,
+ l.LO_ORDERDATE AS LO_ORDERDATE,
+ l.LO_ORDERPRIORITY AS LO_ORDERPRIORITY,
+ l.LO_SHIPPRIORITY AS LO_SHIPPRIORITY,
+ l.LO_QUANTITY AS LO_QUANTITY,
+ l.LO_EXTENDEDPRICE AS LO_EXTENDEDPRICE,
+ l.LO_ORDTOTALPRICE AS LO_ORDTOTALPRICE,
+ l.LO_DISCOUNT AS LO_DISCOUNT,
+ l.LO_REVENUE AS LO_REVENUE,
+ l.LO_SUPPLYCOST AS LO_SUPPLYCOST,
+ l.LO_TAX AS LO_TAX,
+ l.LO_COMMITDATE AS LO_COMMITDATE,
+ l.LO_SHIPMODE AS LO_SHIPMODE,
+ c.C_NAME AS C_NAME,
+ c.C_ADDRESS AS C_ADDRESS,
+ c.C_CITY AS C_CITY,
+ c.C_NATION AS C_NATION,
+ c.C_REGION AS C_REGION,
+ c.C_PHONE AS C_PHONE,
+ c.C_MKTSEGMENT AS C_MKTSEGMENT,
+ s.S_NAME AS S_NAME,
+ s.S_ADDRESS AS S_ADDRESS,
+ s.S_CITY AS S_CITY,
+ s.S_NATION AS S_NATION,
+ s.S_REGION AS S_REGION,
+ s.S_PHONE AS S_PHONE,
+ p.P_NAME AS P_NAME,
+ p.P_MFGR AS P_MFGR,
+ p.P_CATEGORY AS P_CATEGORY,
+ p.P_BRAND AS P_BRAND,
+ p.P_COLOR AS P_COLOR,
+ p.P_TYPE AS P_TYPE,
+ p.P_SIZE AS P_SIZE,
+ p.P_CONTAINER AS P_CONTAINER
+FROM lineorder AS l
+INNER JOIN customer AS c ON c.C_CUSTKEY = l.LO_CUSTKEY
+INNER JOIN supplier AS s ON s.S_SUPPKEY = l.LO_SUPPKEY
+INNER JOIN part AS p ON p.P_PARTKEY = l.LO_PARTKEY;
+```
+
+Las consultas:
+
+Q1.1
+
+``` sql
+SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue
+FROM lineorder_flat
+WHERE toYear(LO_ORDERDATE) = 1993 AND LO_DISCOUNT BETWEEN 1 AND 3 AND LO_QUANTITY < 25;
+```
+
+Q1.2
+
+``` sql
+SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue
+FROM lineorder_flat
+WHERE toYYYYMM(LO_ORDERDATE) = 199401 AND LO_DISCOUNT BETWEEN 4 AND 6 AND LO_QUANTITY BETWEEN 26 AND 35;
+```
+
+Q1.3
+
+``` sql
+SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue
+FROM lineorder_flat
+WHERE toISOWeek(LO_ORDERDATE) = 6 AND toYear(LO_ORDERDATE) = 1994
+ AND LO_DISCOUNT BETWEEN 5 AND 7 AND LO_QUANTITY BETWEEN 26 AND 35;
+```
+
+Q2.1
+
+``` sql
+SELECT
+ sum(LO_REVENUE),
+ toYear(LO_ORDERDATE) AS year,
+ P_BRAND
+FROM lineorder_flat
+WHERE P_CATEGORY = 'MFGR#12' AND S_REGION = 'AMERICA'
+GROUP BY
+ year,
+ P_BRAND
+ORDER BY
+ year,
+ P_BRAND;
+```
+
+Q2.2
+
+``` sql
+SELECT
+ sum(LO_REVENUE),
+ toYear(LO_ORDERDATE) AS year,
+ P_BRAND
+FROM lineorder_flat
+WHERE P_BRAND >= 'MFGR#2221' AND P_BRAND <= 'MFGR#2228' AND S_REGION = 'ASIA'
+GROUP BY
+ year,
+ P_BRAND
+ORDER BY
+ year,
+ P_BRAND;
+```
+
+Q2.3
+
+``` sql
+SELECT
+ sum(LO_REVENUE),
+ toYear(LO_ORDERDATE) AS year,
+ P_BRAND
+FROM lineorder_flat
+WHERE P_BRAND = 'MFGR#2239' AND S_REGION = 'EUROPE'
+GROUP BY
+ year,
+ P_BRAND
+ORDER BY
+ year,
+ P_BRAND;
+```
+
+Q3.1
+
+``` sql
+SELECT
+ C_NATION,
+ S_NATION,
+ toYear(LO_ORDERDATE) AS year,
+ sum(LO_REVENUE) AS revenue
+FROM lineorder_flat
+WHERE C_REGION = 'ASIA' AND S_REGION = 'ASIA' AND year >= 1992 AND year <= 1997
+GROUP BY
+ C_NATION,
+ S_NATION,
+ year
+ORDER BY
+ year ASC,
+ revenue DESC;
+```
+
+Q3.2
+
+``` sql
+SELECT
+ C_CITY,
+ S_CITY,
+ toYear(LO_ORDERDATE) AS year,
+ sum(LO_REVENUE) AS revenue
+FROM lineorder_flat
+WHERE C_NATION = 'UNITED STATES' AND S_NATION = 'UNITED STATES' AND year >= 1992 AND year <= 1997
+GROUP BY
+ C_CITY,
+ S_CITY,
+ year
+ORDER BY
+ year ASC,
+ revenue DESC;
+```
+
+Q3.3
+
+``` sql
+SELECT
+ C_CITY,
+ S_CITY,
+ toYear(LO_ORDERDATE) AS year,
+ sum(LO_REVENUE) AS revenue
+FROM lineorder_flat
+WHERE (C_CITY = 'UNITED KI1' OR
C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND year >= 1992 AND year <= 1997
+GROUP BY
+ C_CITY,
+ S_CITY,
+ year
+ORDER BY
+ year ASC,
+ revenue DESC;
+```
+
+Q3.4
+
+``` sql
+SELECT
+ C_CITY,
+ S_CITY,
+ toYear(LO_ORDERDATE) AS year,
+ sum(LO_REVENUE) AS revenue
+FROM lineorder_flat
+WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND toYYYYMM(LO_ORDERDATE) = 199712
+GROUP BY
+ C_CITY,
+ S_CITY,
+ year
+ORDER BY
+ year ASC,
+ revenue DESC;
+```
+
+Q4.1
+
+``` sql
+SELECT
+ toYear(LO_ORDERDATE) AS year,
+ C_NATION,
+ sum(LO_REVENUE - LO_SUPPLYCOST) AS profit
+FROM lineorder_flat
+WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2')
+GROUP BY
+ year,
+ C_NATION
+ORDER BY
+ year ASC,
+ C_NATION ASC;
+```
+
+Q4.2
+
+``` sql
+SELECT
+ toYear(LO_ORDERDATE) AS year,
+ S_NATION,
+ P_CATEGORY,
+ sum(LO_REVENUE - LO_SUPPLYCOST) AS profit
+FROM lineorder_flat
+WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (year = 1997 OR year = 1998) AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2')
+GROUP BY
+ year,
+ S_NATION,
+ P_CATEGORY
+ORDER BY
+ year ASC,
+ S_NATION ASC,
+ P_CATEGORY ASC;
+```
+
+Q4.3
+
+``` sql
+SELECT
+ toYear(LO_ORDERDATE) AS year,
+ S_CITY,
+ P_BRAND,
+ sum(LO_REVENUE - LO_SUPPLYCOST) AS profit
+FROM lineorder_flat
+WHERE S_NATION = 'UNITED STATES' AND (year = 1997 OR year = 1998) AND P_CATEGORY = 'MFGR#14'
+GROUP BY
+ year,
+ S_CITY,
+ P_BRAND
+ORDER BY
+ year ASC,
+ S_CITY ASC,
+ P_BRAND ASC;
+```
+
+[Artículo Original](https://clickhouse.tech/docs/es/getting_started/example_datasets/star_schema/)
diff --git a/docs/es/getting_started/example_datasets/wikistat.md b/docs/es/getting_started/example_datasets/wikistat.md
new file mode 100644
index 00000000000..92ecf267a7b
--- /dev/null
+++ b/docs/es/getting_started/example_datasets/wikistat.md
@@ -0,0 +1,28 @@
+# WikiStat {#wikistat}
+
+Ver: http://dumps.wikimedia.org/other/pagecounts-raw/
+
+Creación de una tabla:
+
+``` sql
+CREATE TABLE wikistat
+(
+ date Date,
+ time DateTime,
+ project String,
+ subproject String,
+ path String,
+ hits UInt64,
+ size UInt64
+) ENGINE = MergeTree(date, (path, time), 8192);
+```
+
+Carga de datos:
+
+``` bash
+$ for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt
+$ cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done
+$ ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done
+```
+
+[Artículo Original](https://clickhouse.tech/docs/es/getting_started/example_datasets/wikistat/)
diff --git a/docs/es/getting_started/index.md b/docs/es/getting_started/index.md
new file mode 100644
index 00000000000..54db5af4ebf
--- /dev/null
+++ b/docs/es/getting_started/index.md
@@ -0,0 +1,8 @@
+# Primeros pasos {#getting-started}
+
+Si eres
nuevo en ClickHouse y quieres tener una idea práctica de su rendimiento, antes que nada debes pasar por el [proceso de instalación](install.md). Después de eso puedes:
+
+- [Seguir el tutorial detallado](tutorial.md)
+- [Experimentar con conjuntos de datos de ejemplo](example_datasets/ontime.md)
+
+[Artículo Original](https://clickhouse.tech/docs/es/getting_started/)
diff --git a/docs/es/getting_started/install.md b/docs/es/getting_started/install.md
new file mode 100644
index 00000000000..1b93e4ac93e
--- /dev/null
+++ b/docs/es/getting_started/install.md
@@ -0,0 +1,184 @@
+# Instalación {#installation}
+
+## Requisitos del sistema {#system-requirements}
+
+ClickHouse puede ejecutarse en cualquier Linux, FreeBSD o Mac OS X con arquitectura de CPU x86\_64, AArch64 o PowerPC64LE.
+
+Los binarios oficiales preconstruidos generalmente se compilan para x86\_64 y aprovechan el conjunto de instrucciones SSE 4.2, por lo que, a menos que se indique lo contrario, el uso de una CPU que lo admita se convierte en un requisito adicional del sistema. Aquí está el comando para verificar si la CPU actual tiene soporte para SSE 4.2:
+
+``` bash
+$ grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported"
+```
+
+Para ejecutar ClickHouse en procesadores que no admiten SSE 4.2 o que tienen arquitectura AArch64 o PowerPC64LE, debe [construir ClickHouse a partir de las fuentes](#from-sources) con los ajustes de configuración adecuados.
+
+## Opciones de instalación disponibles {#available-installation-options}
+
+### De paquetes DEB {#install-from-deb-packages}
+
+Se recomienda utilizar los paquetes `deb` oficiales precompilados para Debian o Ubuntu.
+
+Para instalar los paquetes oficiales, agregue el repositorio de Yandex en `/etc/apt/sources.list` o en un archivo aparte `/etc/apt/sources.list.d/clickhouse.list`:
+
+ deb http://repo.clickhouse.tech/deb/stable/ main/
+
+Si desea utilizar la versión más reciente, reemplace `stable` por `testing` (esto se recomienda para sus entornos de prueba).
+
+A continuación, ejecute estos comandos para instalar realmente los paquetes:
+
+``` bash
+sudo apt-get install dirmngr # optional
+sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 # optional
+sudo apt-get update
+sudo apt-get install clickhouse-client clickhouse-server
+```
+
+También puede descargar e instalar los paquetes manualmente desde aquí: https://repo.yandex.ru/clickhouse/deb/stable/main/.
+
+#### Paquetes {#packages}
+
+- `clickhouse-common-static` — Instala los archivos binarios compilados de ClickHouse.
+- `clickhouse-server` — Crea un enlace simbólico para `clickhouse-server`. Instala la configuración del servidor.
+- `clickhouse-client` — Crea un enlace simbólico para `clickhouse-client` y otras herramientas relacionadas con el cliente. Instala las configuraciones del cliente.
+- `clickhouse-common-static-dbg` — Instala los archivos binarios compilados de ClickHouse con información de depuración.
+
+### De paquetes RPM {#from-rpm-packages}
+
+Se recomienda utilizar los paquetes `rpm` oficiales precompilados para CentOS, RedHat y todas las demás distribuciones de Linux basadas en rpm.
+
+Primero, necesita agregar el repositorio oficial:
+
+``` bash
+sudo yum install yum-utils
+sudo rpm --import https://repo.clickhouse.tech/CLICKHOUSE-KEY.GPG
+sudo yum-config-manager --add-repo https://repo.clickhouse.tech/rpm/stable/x86_64
+```
+
+Si desea utilizar la versión más reciente, reemplace `stable` por `testing` (esto se recomienda para sus entornos de prueba).
La etiqueta `prestable` también está disponible a veces.
+
+A continuación, ejecute estos comandos para instalar los paquetes:
+
+``` bash
+sudo yum install clickhouse-server clickhouse-client
+```
+
+También puede descargar e instalar los paquetes manualmente desde aquí: https://repo.clickhouse.tech/rpm/stable/x86\_64.
+
+### De archivos tgz {#from-tgz-archives}
+
+Se recomienda utilizar los archivos `tgz` oficiales precompilados para todas las distribuciones de Linux en las que no sea posible instalar los paquetes `deb` o `rpm`.
+
+La versión requerida se puede descargar con `curl` o `wget` desde el repositorio https://repo.yandex.ru/clickhouse/tgz/.
+Después de eso, los archivos descargados deben desempaquetarse e instalarse con los scripts de instalación. Ejemplo para la última versión:
+
+``` bash
+export LATEST_VERSION=`curl https://api.github.com/repos/ClickHouse/ClickHouse/tags 2>/dev/null | grep -Eo '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | head -n 1`
+curl -O https://repo.clickhouse.tech/tgz/clickhouse-common-static-$LATEST_VERSION.tgz
+curl -O https://repo.clickhouse.tech/tgz/clickhouse-common-static-dbg-$LATEST_VERSION.tgz
+curl -O https://repo.clickhouse.tech/tgz/clickhouse-server-$LATEST_VERSION.tgz
+curl -O https://repo.clickhouse.tech/tgz/clickhouse-client-$LATEST_VERSION.tgz
+
+tar -xzvf clickhouse-common-static-$LATEST_VERSION.tgz
+sudo clickhouse-common-static-$LATEST_VERSION/install/doinst.sh
+
+tar -xzvf clickhouse-common-static-dbg-$LATEST_VERSION.tgz
+sudo clickhouse-common-static-dbg-$LATEST_VERSION/install/doinst.sh
+
+tar -xzvf clickhouse-server-$LATEST_VERSION.tgz
+sudo clickhouse-server-$LATEST_VERSION/install/doinst.sh
+sudo /etc/init.d/clickhouse-server start
+
+tar -xzvf clickhouse-client-$LATEST_VERSION.tgz
+sudo clickhouse-client-$LATEST_VERSION/install/doinst.sh
+```
+
+Para entornos de producción, se recomienda utilizar la última versión `stable`. Puede encontrar su número en la página de GitHub https://github.com/ClickHouse/ClickHouse/tags con el sufijo `-stable`.
+
+### Desde la imagen de Docker {#from-docker-image}
+
+Para ejecutar ClickHouse dentro de Docker, siga la guía en [Docker Hub](https://hub.docker.com/r/yandex/clickhouse-server/). Esas imágenes usan por dentro los paquetes `deb` oficiales.
+
+### De fuentes {#from-sources}
+
+Para compilar manualmente ClickHouse, siga las instrucciones para [Linux](../development/build.md) o [Mac OS X](../development/build_osx.md).
+
+Puede compilar los paquetes e instalarlos, o usar los programas sin instalar paquetes. Además, al compilar manualmente puede deshabilitar el requisito de SSE 4.2 o compilar para CPU AArch64.
+
+ Client: dbms/programs/clickhouse-client
+ Server: dbms/programs/clickhouse-server
+
+Tendrá que crear las carpetas de datos y metadatos y hacer `chown` al usuario deseado. Sus rutas se pueden cambiar en la configuración del servidor (src/dbms/programs/server/config.xml); por defecto son:
+
+ /opt/clickhouse/data/default/
+ /opt/clickhouse/metadata/default/
+
+En Gentoo, puede usar `emerge clickhouse` para instalar ClickHouse desde las fuentes.
+
+## Lanzar {#launch}
+
+Para iniciar el servidor como demonio, ejecute:
+
+``` bash
+$ sudo service clickhouse-server start
+```
+
+Si no tiene el comando `service`, ejecútelo como:
+
+``` bash
+$ sudo /etc/init.d/clickhouse-server start
+```
+
+Vea los registros en el directorio `/var/log/clickhouse-server/`.
+
+Si el servidor no se inicia, compruebe las configuraciones en el archivo `/etc/clickhouse-server/config.xml`.
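+
+Para comprobar rápidamente que el servidor quedó en funcionamiento, se puede hacer una petición HTTP vacía y lanzar una consulta trivial (un esbozo mínimo, suponiendo la instalación por defecto en `localhost` con los puertos estándar 8123 para HTTP y 9000 para el cliente nativo):
+
+``` bash
+# La interfaz HTTP responde "Ok." si el servidor está vivo
+curl http://localhost:8123/
+
+# Una consulta trivial a través del cliente nativo
+clickhouse-client --query "SELECT 1"
+```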
+
+También puede iniciar manualmente el servidor desde la consola:
+
+``` bash
+$ clickhouse-server --config-file=/etc/clickhouse-server/config.xml
+```
+
+En este caso, el registro se imprimirá en la consola, lo cual es conveniente durante el desarrollo.
+Si el archivo de configuración está en el directorio actual, no es necesario el parámetro `--config-file`. De forma predeterminada, utiliza `./config.xml`.
+
+ClickHouse admite configuraciones de restricción de acceso. Están ubicadas en el archivo `users.xml` (junto a `config.xml`).
+De forma predeterminada, se permite el acceso desde cualquier lugar al usuario `default`, sin contraseña. Ver `user/default/networks`.
+Para obtener más información, consulte la sección [“Configuration Files”](../operations/configuration_files.md).
+
+Después de iniciar el servidor, puede usar el cliente de línea de comandos para conectarse a él:
+
+``` bash
+$ clickhouse-client
+```
+
+Por defecto, se conecta a `localhost:9000` en nombre del usuario `default`, sin contraseña. También se puede usar para conectarse a un servidor remoto mediante el argumento `--host`.
+
+El terminal debe usar codificación UTF-8.
+Para obtener más información, consulte la sección [“Command-line client”](../interfaces/cli.md).
+
+Ejemplo:
+
+``` bash
+$ ./clickhouse-client
+ClickHouse client version 0.0.18749.
+Connecting to localhost:9000.
+Connected to ClickHouse server version 0.0.18749.
+
+:) SELECT 1
+
+SELECT 1
+
+┌─1─┐
+│ 1 │
+└───┘
+
+1 rows in set. Elapsed: 0.003 sec.
+
+:)
+```
+
+**¡Felicidades, el sistema funciona!**
+
+Para continuar experimentando, puede descargar uno de los conjuntos de datos de prueba o seguir el [tutorial](https://clickhouse.tech/tutorial.html).
+
+[Artículo Original](https://clickhouse.tech/docs/es/getting_started/install/)
diff --git a/docs/es/getting_started/tutorial.md b/docs/es/getting_started/tutorial.md
new file mode 100644
index 00000000000..28c31cc1214
--- /dev/null
+++ b/docs/es/getting_started/tutorial.md
@@ -0,0 +1,665 @@
+# Tutorial de ClickHouse {#clickhouse-tutorial}
+
+## ¿Qué esperar de este tutorial? {#what-to-expect-from-this-tutorial}
+
+Al seguir este tutorial, aprenderá cómo configurar un clúster básico de ClickHouse: pequeño, pero tolerante a fallos y escalable. Usaremos uno de los conjuntos de datos de ejemplo para llenarlo con datos y ejecutar algunas consultas de demostración.
+
+## Configuración de nodo único {#single-node-setup}
+
+Para posponer las complejidades del entorno distribuido, comenzaremos con la implementación de ClickHouse en un único servidor o máquina virtual. ClickHouse generalmente se instala desde paquetes [deb](index.md#install-from-deb-packages) o [RPM](index.md#from-rpm-packages), pero hay [alternativas](index.md#from-docker-image) para los sistemas operativos que no los admiten.
+
+Por ejemplo, si ha elegido los paquetes `deb`, ejecute:
+
+``` bash
+sudo apt-get install dirmngr
+sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4
+
+echo "deb http://repo.clickhouse.tech/deb/stable/ main/" | sudo tee /etc/apt/sources.list.d/clickhouse.list
+sudo apt-get update
+
+sudo apt-get install -y clickhouse-server clickhouse-client
+```
+
+¿Qué contienen los paquetes que acabamos de instalar?
+
+- El paquete `clickhouse-client` contiene la aplicación [clickhouse-client](../interfaces/cli.md), el cliente interactivo de consola de ClickHouse.
+- El paquete `clickhouse-common` contiene el archivo ejecutable de ClickHouse.
+- El paquete `clickhouse-server` contiene los archivos de configuración para ejecutar ClickHouse como servidor.
+
+Los archivos de configuración del servidor se encuentran en `/etc/clickhouse-server/`. Antes de seguir adelante, tenga en cuenta el elemento `` en `config.xml`. La ruta determina la ubicación del almacenamiento de datos, por lo que debe estar en un volumen con gran capacidad de disco; el valor predeterminado es `/var/lib/clickhouse/`. Si desea ajustar la configuración, no es conveniente editar directamente el archivo `config.xml`, teniendo en cuenta que podría ser reescrito en futuras actualizaciones de paquetes. La forma recomendada de anular los elementos de configuración es crear [archivos en el directorio config.d](../operations/configuration_files.md) que sirven como “parches” de config.xml.
+
+Como habrá notado, `clickhouse-server` no se inicia automáticamente después de la instalación del paquete. Tampoco se reinicia automáticamente después de las actualizaciones. La forma de iniciar el servidor depende de su sistema de init; por lo general, es:
+
+``` bash
+sudo service clickhouse-server start
+```
+
+o
+
+``` bash
+sudo /etc/init.d/clickhouse-server start
+```
+
+La ubicación predeterminada para los registros del servidor es `/var/log/clickhouse-server/`. El servidor estará listo para manejar conexiones de clientes una vez que se registre el mensaje `Ready for connections`.
+
+Una vez que `clickhouse-server` esté en funcionamiento, podemos usar `clickhouse-client` para conectarnos al servidor y ejecutar algunas consultas de prueba como `SELECT "Hello, world!";`.
+
+
+Consejos rápidos para clickhouse-client
+Modo interactivo:
+
+``` bash
+clickhouse-client
+clickhouse-client --host=... --port=... --user=... --password=...
+```
+
+Habilitar consultas multilínea:
+
+``` bash
+clickhouse-client -m
+clickhouse-client --multiline
+```
+
+Ejecutar consultas en modo por lotes:
+
+``` bash
+clickhouse-client --query='SELECT 1'
+echo 'SELECT 1' | clickhouse-client
+clickhouse-client <<< 'SELECT 1'
+```
+
+Insertar datos de un archivo en el formato especificado:
+
+``` bash
+clickhouse-client --query='INSERT INTO table VALUES' < data.txt
+clickhouse-client --query='INSERT INTO table FORMAT TabSeparated' < data.tsv
+```
+
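+De forma análoga, el modo por lotes también sirve para exportar el resultado de una consulta a un archivo en el formato que se indique (un esbozo ilustrativo; `table` es aquí, como en los ejemplos anteriores, un nombre de tabla hipotético):
+
+``` bash
+# Exportar a CSV
+clickhouse-client --query='SELECT * FROM table FORMAT CSV' > data.csv
+# Exportar a TSV, el mismo formato que aceptan los INSERT anteriores
+clickhouse-client --query='SELECT * FROM table FORMAT TabSeparated' > data.tsv
+```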
+ +## Importar conjunto de datos de muestra {#import-sample-dataset} + +Ahora es el momento de llenar nuestro servidor ClickHouse con algunos datos de muestra. En este tutorial, usaremos datos anónimos de Yandex.Metrica, el primer servicio que ejecuta ClickHouse en forma de producción antes de que se convirtiera en código abierto (más sobre eso en [sección de historia](../introduction/history.md)). Hay [múltiples formas de importar Yandex.Conjunto de datos de Metrica](example_datasets/metrica.md) y por el bien del tutorial, iremos con el más realista. + +### Descargar y extraer datos de tabla {#download-and-extract-table-data} + +``` bash +curl https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv +curl https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv +``` + +Los archivos extraídos tienen un tamaño de aproximadamente 10 GB. + +### Crear tablas {#create-tables} + +Las tablas se agrupan lógicamente en “databases”. Hay un `default` base de datos, pero crearemos una nueva llamada `tutorial`: + +``` bash +clickhouse-client --query "CREATE DATABASE IF NOT EXISTS tutorial" +``` + +La sintaxis para crear tablas es mucho más complicada en comparación con las bases de datos (ver [referencia](../query_language/create.md). En general `CREATE TABLE` declaración tiene que especificar tres cosas clave: + +1. Nombre de la tabla que se va a crear. +2. Table schema, i.e. list of columns and their [tipos de datos](../data_types/index.md). +3. [Motor de tabla](../operations/table_engines/index.md) y su configuración, que determina todos los detalles sobre cómo se ejecutarán físicamente las consultas a esta tabla. + +El Yandex.Metrica es un servicio de análisis web y el conjunto de datos de muestra no cubre toda su funcionalidad, por lo que solo hay dos tablas para crear: + +- `hits` es una tabla con cada acción realizada por todos los usuarios en todos los sitios web cubiertos por el servicio. +- `visits` es una tabla que contiene sesiones precompiladas en lugar de acciones individuales. 
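+
+Antes de crear las tablas, se puede comprobar que la base de datos `tutorial` existe (una verificación mínima y opcional; `SHOW TABLES FROM tutorial` devolverá una lista vacía hasta que se ejecuten las consultas de creación del paso siguiente):
+
+``` bash
+clickhouse-client --query "SHOW DATABASES"
+clickhouse-client --query "SHOW TABLES FROM tutorial"
+```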
+ +Veamos y ejecutemos las consultas de tabla de creación real para estas tablas: + +``` sql +CREATE TABLE tutorial.hits_v1 +( + `WatchID` UInt64, + `JavaEnable` UInt8, + `Title` String, + `GoodEvent` Int16, + `EventTime` DateTime, + `EventDate` Date, + `CounterID` UInt32, + `ClientIP` UInt32, + `ClientIP6` FixedString(16), + `RegionID` UInt32, + `UserID` UInt64, + `CounterClass` Int8, + `OS` UInt8, + `UserAgent` UInt8, + `URL` String, + `Referer` String, + `URLDomain` String, + `RefererDomain` String, + `Refresh` UInt8, + `IsRobot` UInt8, + `RefererCategories` Array(UInt16), + `URLCategories` Array(UInt16), + `URLRegions` Array(UInt32), + `RefererRegions` Array(UInt32), + `ResolutionWidth` UInt16, + `ResolutionHeight` UInt16, + `ResolutionDepth` UInt8, + `FlashMajor` UInt8, + `FlashMinor` UInt8, + `FlashMinor2` String, + `NetMajor` UInt8, + `NetMinor` UInt8, + `UserAgentMajor` UInt16, + `UserAgentMinor` FixedString(2), + `CookieEnable` UInt8, + `JavascriptEnable` UInt8, + `IsMobile` UInt8, + `MobilePhone` UInt8, + `MobilePhoneModel` String, + `Params` String, + `IPNetworkID` UInt32, + `TraficSourceID` Int8, + `SearchEngineID` UInt16, + `SearchPhrase` String, + `AdvEngineID` UInt8, + `IsArtifical` UInt8, + `WindowClientWidth` UInt16, + `WindowClientHeight` UInt16, + `ClientTimeZone` Int16, + `ClientEventTime` DateTime, + `SilverlightVersion1` UInt8, + `SilverlightVersion2` UInt8, + `SilverlightVersion3` UInt32, + `SilverlightVersion4` UInt16, + `PageCharset` String, + `CodeVersion` UInt32, + `IsLink` UInt8, + `IsDownload` UInt8, + `IsNotBounce` UInt8, + `FUniqID` UInt64, + `HID` UInt32, + `IsOldCounter` UInt8, + `IsEvent` UInt8, + `IsParameter` UInt8, + `DontCountHits` UInt8, + `WithHash` UInt8, + `HitColor` FixedString(1), + `UTCEventTime` DateTime, + `Age` UInt8, + `Sex` UInt8, + `Income` UInt8, + `Interests` UInt16, + `Robotness` UInt8, + `GeneralInterests` Array(UInt16), + `RemoteIP` UInt32, + `RemoteIP6` FixedString(16), + `WindowName` Int32, + `OpenerName` Int32, + `HistoryLength` Int16, + `BrowserLanguage` FixedString(2), + `BrowserCountry` FixedString(2), + `SocialNetwork` String, + `SocialAction` String, + `HTTPError` UInt16, + `SendTiming` Int32, + `DNSTiming` Int32, + `ConnectTiming` Int32, + `ResponseStartTiming` Int32, + `ResponseEndTiming` Int32, + `FetchTiming` Int32, + `RedirectTiming` Int32, + `DOMInteractiveTiming` Int32, + `DOMContentLoadedTiming` Int32, + `DOMCompleteTiming` Int32, + `LoadEventStartTiming` Int32, + `LoadEventEndTiming` Int32, + `NSToDOMContentLoadedTiming` Int32, + `FirstPaintTiming` Int32, + `RedirectCount` Int8, + `SocialSourceNetworkID` UInt8, + `SocialSourcePage` String, + `ParamPrice` Int64, + `ParamOrderID` String, + `ParamCurrency` FixedString(3), + `ParamCurrencyID` UInt16, + `GoalsReached` Array(UInt32), + `OpenstatServiceName` String, + `OpenstatCampaignID` String, + `OpenstatAdID` String, + `OpenstatSourceID` String, + `UTMSource` String, + `UTMMedium` String, + `UTMCampaign` String, + `UTMContent` String, + `UTMTerm` String, + `FromTag` String, + `HasGCLID` UInt8, + `RefererHash` UInt64, + `URLHash` UInt64, + `CLID` UInt32, + `YCLID` UInt64, + `ShareService` String, + `ShareURL` String, + `ShareTitle` String, + `ParsedParams` Nested( + Key1 String, + Key2 String, + Key3 String, + Key4 String, + Key5 String, + ValueDouble Float64), + `IslandID` FixedString(16), + `RequestNum` UInt32, + `RequestTry` UInt8 +) +ENGINE = MergeTree() +PARTITION BY toYYYYMM(EventDate) +ORDER BY (CounterID, EventDate, intHash32(UserID)) +SAMPLE BY intHash32(UserID) 
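+-- Comentarios descriptivos sobre las cláusulas anteriores (no alteran la consulta):
+--   PARTITION BY toYYYYMM(EventDate): una partición por mes de evento
+--   ORDER BY (CounterID, EventDate, intHash32(UserID)): clave de ordenación y, por defecto, clave primaria
+--   SAMPLE BY intHash32(UserID): habilita consultas con SELECT ... SAMPLE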
+SETTINGS index_granularity = 8192 +``` + +``` sql +CREATE TABLE tutorial.visits_v1 +( + `CounterID` UInt32, + `StartDate` Date, + `Sign` Int8, + `IsNew` UInt8, + `VisitID` UInt64, + `UserID` UInt64, + `StartTime` DateTime, + `Duration` UInt32, + `UTCStartTime` DateTime, + `PageViews` Int32, + `Hits` Int32, + `IsBounce` UInt8, + `Referer` String, + `StartURL` String, + `RefererDomain` String, + `StartURLDomain` String, + `EndURL` String, + `LinkURL` String, + `IsDownload` UInt8, + `TraficSourceID` Int8, + `SearchEngineID` UInt16, + `SearchPhrase` String, + `AdvEngineID` UInt8, + `PlaceID` Int32, + `RefererCategories` Array(UInt16), + `URLCategories` Array(UInt16), + `URLRegions` Array(UInt32), + `RefererRegions` Array(UInt32), + `IsYandex` UInt8, + `GoalReachesDepth` Int32, + `GoalReachesURL` Int32, + `GoalReachesAny` Int32, + `SocialSourceNetworkID` UInt8, + `SocialSourcePage` String, + `MobilePhoneModel` String, + `ClientEventTime` DateTime, + `RegionID` UInt32, + `ClientIP` UInt32, + `ClientIP6` FixedString(16), + `RemoteIP` UInt32, + `RemoteIP6` FixedString(16), + `IPNetworkID` UInt32, + `SilverlightVersion3` UInt32, + `CodeVersion` UInt32, + `ResolutionWidth` UInt16, + `ResolutionHeight` UInt16, + `UserAgentMajor` UInt16, + `UserAgentMinor` UInt16, + `WindowClientWidth` UInt16, + `WindowClientHeight` UInt16, + `SilverlightVersion2` UInt8, + `SilverlightVersion4` UInt16, + `FlashVersion3` UInt16, + `FlashVersion4` UInt16, + `ClientTimeZone` Int16, + `OS` UInt8, + `UserAgent` UInt8, + `ResolutionDepth` UInt8, + `FlashMajor` UInt8, + `FlashMinor` UInt8, + `NetMajor` UInt8, + `NetMinor` UInt8, + `MobilePhone` UInt8, + `SilverlightVersion1` UInt8, + `Age` UInt8, + `Sex` UInt8, + `Income` UInt8, + `JavaEnable` UInt8, + `CookieEnable` UInt8, + `JavascriptEnable` UInt8, + `IsMobile` UInt8, + `BrowserLanguage` UInt16, + `BrowserCountry` UInt16, + `Interests` UInt16, + `Robotness` UInt8, + `GeneralInterests` Array(UInt16), + `Params` Array(String), + `Goals` Nested( + ID UInt32, + Serial UInt32, + EventTime DateTime, + Price Int64, + OrderID String, + CurrencyID UInt32), + `WatchIDs` Array(UInt64), + `ParamSumPrice` Int64, + `ParamCurrency` FixedString(3), + `ParamCurrencyID` UInt16, + `ClickLogID` UInt64, + `ClickEventID` Int32, + `ClickGoodEvent` Int32, + `ClickEventTime` DateTime, + `ClickPriorityID` Int32, + `ClickPhraseID` Int32, + `ClickPageID` Int32, + `ClickPlaceID` Int32, + `ClickTypeID` Int32, + `ClickResourceID` Int32, + `ClickCost` UInt32, + `ClickClientIP` UInt32, + `ClickDomainID` UInt32, + `ClickURL` String, + `ClickAttempt` UInt8, + `ClickOrderID` UInt32, + `ClickBannerID` UInt32, + `ClickMarketCategoryID` UInt32, + `ClickMarketPP` UInt32, + `ClickMarketCategoryName` String, + `ClickMarketPPName` String, + `ClickAWAPSCampaignName` String, + `ClickPageName` String, + `ClickTargetType` UInt16, + `ClickTargetPhraseID` UInt64, + `ClickContextType` UInt8, + `ClickSelectType` Int8, + `ClickOptions` String, + `ClickGroupBannerID` Int32, + `OpenstatServiceName` String, + `OpenstatCampaignID` String, + `OpenstatAdID` String, + `OpenstatSourceID` String, + `UTMSource` String, + `UTMMedium` String, + `UTMCampaign` String, + `UTMContent` String, + `UTMTerm` String, + `FromTag` String, + `HasGCLID` UInt8, + `FirstVisit` DateTime, + `PredLastVisit` Date, + `LastVisit` Date, + `TotalVisits` UInt32, + `TraficSource` Nested( + ID Int8, + SearchEngineID UInt16, + AdvEngineID UInt8, + PlaceID UInt16, + SocialSourceNetworkID UInt8, + Domain String, + SearchPhrase String, + SocialSourcePage String), 
+ `Attendance` FixedString(16), + `CLID` UInt32, + `YCLID` UInt64, + `NormalizedRefererHash` UInt64, + `SearchPhraseHash` UInt64, + `RefererDomainHash` UInt64, + `NormalizedStartURLHash` UInt64, + `StartURLDomainHash` UInt64, + `NormalizedEndURLHash` UInt64, + `TopLevelDomain` UInt64, + `URLScheme` UInt64, + `OpenstatServiceNameHash` UInt64, + `OpenstatCampaignIDHash` UInt64, + `OpenstatAdIDHash` UInt64, + `OpenstatSourceIDHash` UInt64, + `UTMSourceHash` UInt64, + `UTMMediumHash` UInt64, + `UTMCampaignHash` UInt64, + `UTMContentHash` UInt64, + `UTMTermHash` UInt64, + `FromHash` UInt64, + `WebVisorEnabled` UInt8, + `WebVisorActivity` UInt32, + `ParsedParams` Nested( + Key1 String, + Key2 String, + Key3 String, + Key4 String, + Key5 String, + ValueDouble Float64), + `Market` Nested( + Type UInt8, + GoalID UInt32, + OrderID String, + OrderPrice Int64, + PP UInt32, + DirectPlaceID UInt32, + DirectOrderID UInt32, + DirectBannerID UInt32, + GoodID String, + GoodName String, + GoodQuantity Int32, + GoodPrice Int64), + `IslandID` FixedString(16) +) +ENGINE = CollapsingMergeTree(Sign) +PARTITION BY toYYYYMM(StartDate) +ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID) +SAMPLE BY intHash32(UserID) +SETTINGS index_granularity = 8192 +``` + +Puede ejecutar esas consultas utilizando el modo interactivo de `clickhouse-client` (simplemente ejecútelo en un terminal sin especificar una consulta por adelantado) o pruebe algunos [interfaz alternativa](../interfaces/index.md) Si quieres. + +Como podemos ver, `hits_v1` utiliza el [motor básico MergeTree](../operations/table_engines/mergetree.md), mientras que el `visits_v1` utiliza el [Derrumbar](../operations/table_engines/collapsingmergetree.md) variante. + +### Importar datos {#import-data} + +La importación de datos a ClickHouse se realiza a través de [INSERTAR EN](../query_language/insert_into.md) consulta como en muchas otras bases de datos SQL. Sin embargo, los datos generalmente se proporcionan en uno de los [Formatos soportados](../interfaces/formats.md) en lugar de `VALUES` cláusula (que también es compatible). + +Los archivos que descargamos anteriormente están en formato separado por tabuladores, así que aquí le mostramos cómo importarlos a través del cliente de la consola: + +``` bash +clickhouse-client --query "INSERT INTO tutorial.hits_v1 FORMAT TSV" --max_insert_block_size=100000 < hits_v1.tsv +clickhouse-client --query "INSERT INTO tutorial.visits_v1 FORMAT TSV" --max_insert_block_size=100000 < visits_v1.tsv +``` + +ClickHouse tiene un montón de [ajustes para sintonizar](../operations/settings/index.md) y una forma de especificarlos en el cliente de la consola es a través de argumentos, como podemos ver con `--max_insert_block_size`. La forma más fácil de averiguar qué configuraciones están disponibles, qué significan y cuáles son los valores predeterminados es consultar el `system.settings` tabla: + +``` sql +SELECT name, value, changed, description +FROM system.settings +WHERE name LIKE '%max_insert_b%' +FORMAT TSV + +max_insert_block_size 1048576 0 "The maximum block size for insertion, if we control the creation of blocks for insertion." +``` + +Opcionalmente se puede [OPTIMIZAR](../query_language/misc/#misc_operations-optimize) las tablas después de la importación. Las tablas que están configuradas con el motor de la familia MergeTree siempre fusionan partes de datos en segundo plano para optimizar el almacenamiento de datos (o al menos verificar si tiene sentido). 
Estas consultas solo obligarán al motor de tablas a realizar la optimización del almacenamiento en este momento en lugar de algún tiempo después: + +``` bash +clickhouse-client --query "OPTIMIZE TABLE tutorial.hits_v1 FINAL" +clickhouse-client --query "OPTIMIZE TABLE tutorial.visits_v1 FINAL" +``` + +Esta es una operación intensiva de E / S y CPU, por lo que si la tabla recibe constantemente datos nuevos, es mejor dejarlo solo y dejar que las fusiones se ejecuten en segundo plano. + +Ahora podemos comprobar que las tablas se han importado correctamente: + +``` bash +clickhouse-client --query "SELECT COUNT(*) FROM tutorial.hits_v1" +clickhouse-client --query "SELECT COUNT(*) FROM tutorial.visits_v1" +``` + +## Consultas de ejemplo {#example-queries} + +``` sql +SELECT + StartURL AS URL, + AVG(Duration) AS AvgDuration +FROM tutorial.visits_v1 +WHERE StartDate BETWEEN '2014-03-23' AND '2014-03-30' +GROUP BY URL +ORDER BY AvgDuration DESC +LIMIT 10 +``` + +``` sql +SELECT + sum(Sign) AS visits, + sumIf(Sign, has(Goals.ID, 1105530)) AS goal_visits, + (100. * goal_visits) / visits AS goal_percent +FROM tutorial.visits_v1 +WHERE (CounterID = 912887) AND (toYYYYMM(StartDate) = 201403) AND (domain(StartURL) = 'yandex.ru') +``` + +## Implementación de clúster {#cluster-deployment} + +El clúster ClickHouse es un clúster homogéneo. Pasos para configurar: + +1. Instale el servidor ClickHouse en todas las máquinas del clúster +2. Configurar configuraciones de clúster en archivos de configuración +3. Crear tablas locales en cada instancia +4. Crear un [Tabla distribuida](../operations/table_engines/distributed.md) + +[Tabla distribuida](../operations/table_engines/distributed.md) es en realidad una especie de “view” a las tablas locales del clúster ClickHouse. La consulta SELECT de una tabla distribuida se ejecutará utilizando recursos de todos los fragmentos del clúster. Puede especificar configuraciones para varios clústeres y crear varias tablas distribuidas que proporcionen vistas a diferentes clústeres. + +Ejemplo de configuración para un clúster con tres fragmentos, una réplica cada uno: + +``` xml + + + + + example-perftest01j.yandex.ru + 9000 + + + + + example-perftest02j.yandex.ru + 9000 + + + + + example-perftest03j.yandex.ru + 9000 + + + + +``` + +Para más demostraciones, creemos una nueva tabla local con la misma `CREATE TABLE` consulta que utilizamos para `hits_v1`, pero nombre de tabla diferente: + +``` sql +CREATE TABLE tutorial.hits_local (...) ENGINE = MergeTree() ... +``` + +Creación de una tabla distribuida que proporcione una vista en las tablas locales del clúster: + +``` sql +CREATE TABLE tutorial.hits_all AS tutorial.hits_local +ENGINE = Distributed(perftest_3shards_1replicas, tutorial, hits_local, rand()); +``` + +Una práctica común es crear tablas distribuidas similares en todas las máquinas del clúster. Esto permitiría ejecutar consultas distribuidas en cualquier máquina del clúster. También hay una opción alternativa para crear una tabla distribuida temporal para una consulta SELECT determinada usando [remoto](../query_language/table_functions/remote.md) función de la tabla. + +Vamos a correr [INSERTAR SELECCIONAR](../query_language/insert_into.md) en la tabla Distributed para extender la tabla a varios servidores. + +``` sql +INSERT INTO tutorial.hits_all SELECT * FROM tutorial.hits_v1; +``` + +!!! warning "Aviso" + Este enfoque no es adecuado para la fragmentación de tablas grandes. 
Hay una herramienta separada [clickhouse-copier](../operations/utils/clickhouse-copier.md) que puede volver a fragmentar tablas grandes arbitrarias.
+
+Como era de esperar, las consultas computacionalmente pesadas se ejecutan N veces más rápido si se lanzan en 3 servidores en lugar de uno.
+
+En este caso, hemos utilizado un clúster con 3 fragmentos, cada uno con una única réplica.
+
+Para proporcionar resiliencia en un entorno de producción, recomendamos que cada fragmento contenga 2-3 réplicas distribuidas entre varios centros de datos. Tenga en cuenta que ClickHouse admite un número ilimitado de réplicas.
+
+Ejemplo de configuración para un clúster de un fragmento que contiene tres réplicas:
+
+``` xml
+<remote_servers>
+    ...
+    <perftest_1shards_3replicas>
+        <shard>
+            <replica>
+                <host>example-perftest01j.yandex.ru</host>
+                <port>9000</port>
+            </replica>
+            <replica>
+                <host>example-perftest02j.yandex.ru</host>
+                <port>9000</port>
+            </replica>
+            <replica>
+                <host>example-perftest03j.yandex.ru</host>
+                <port>9000</port>
+            </replica>
+        </shard>
+    </perftest_1shards_3replicas>
+</remote_servers>
+```
+
+Para habilitar la replicación nativa se requiere ZooKeeper. ClickHouse se encargará de la coherencia de los datos en todas las réplicas y ejecutará automáticamente el procedimiento de restauración después de un fallo. Se recomienda implementar el clúster ZooKeeper en servidores separados.
+
+ZooKeeper no es un requisito estricto: en algunos casos simples, puede duplicar los datos escribiéndolos en todas las réplicas desde el código de su aplicación. Este enfoque **no** es recomendable; en ese caso, ClickHouse no podrá garantizar la coherencia de los datos en todas las réplicas. Esto sigue siendo responsabilidad de su aplicación.
+
+Las ubicaciones de ZooKeeper deben especificarse en el archivo de configuración:
+
+``` xml
+<zookeeper>
+    <node>
+        <host>zoo01.yandex.ru</host>
+        <port>2181</port>
+    </node>
+    <node>
+        <host>zoo02.yandex.ru</host>
+        <port>2181</port>
+    </node>
+    <node>
+        <host>zoo03.yandex.ru</host>
+        <port>2181</port>
+    </node>
+</zookeeper>
+```
+
+Además, necesitamos establecer macros para identificar cada fragmento y réplica; se usarán en la creación de la tabla:
+
+``` xml
+<macros>
+    <shard>01</shard>
+    <replica>01</replica>
+</macros>
+```
+
+Si no hay réplicas en el momento de crear la tabla replicada, se crea una nueva primera réplica. Si ya hay réplicas activas, la nueva réplica clona los datos de las existentes. Tiene la opción de crear primero todas las tablas replicadas e insertar los datos después. Otra opción es crear algunas réplicas y agregar las otras después o durante la inserción de datos.
+
+``` sql
+CREATE TABLE tutorial.hits_replica (...)
+ENGINE = ReplicatedMergeTree(
+    '/clickhouse_perftest/tables/{shard}/hits',
+    '{replica}'
+)
+...
+```
+
+Aquí usamos el motor de tablas [ReplicatedMergeTree](../operations/table_engines/replication.md). En los parámetros especificamos la ruta de ZooKeeper, que contiene los identificadores de fragmento y réplica.
+
+``` sql
+INSERT INTO tutorial.hits_replica SELECT * FROM tutorial.hits_local;
+```
+
+La replicación funciona en modo multi-master. Los datos se pueden cargar en cualquier réplica y se sincronizarán con las otras instancias automáticamente. La replicación es asíncrona, por lo que en un momento dado no todas las réplicas contienen necesariamente los datos insertados recientemente. Al menos una réplica debe estar activa para permitir la inserción de datos. Las otras sincronizarán los datos y repararán la coherencia una vez que vuelvan a activarse. Tenga en cuenta que este enfoque admite una baja posibilidad de pérdida de los datos recién insertados.
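+
+Para comprobar el estado de la replicación se puede consultar la tabla de sistema `system.replicas`, por ejemplo desde `clickhouse-client`. Lo siguiente es solo un esquema mínimo que asume la tabla `tutorial.hits_replica` creada arriba:
+
+``` bash
+# Estado de la replicación (una fila por cada tabla replicada presente en este servidor)
+clickhouse-client --query "
+    SELECT database, table, is_leader, total_replicas, active_replicas, queue_size, absolute_delay
+    FROM system.replicas
+    WHERE database = 'tutorial' AND table = 'hits_replica'
+    FORMAT Vertical"
+```
+
+Si `queue_size` crece de forma sostenida o `active_replicas` es menor que `total_replicas`, alguna réplica está retrasada o inactiva.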
+ +[Artículo Original](https://clickhouse.tech/docs/es/getting_started/tutorial/) diff --git a/docs/es/guides/apply_catboost_model.md b/docs/es/guides/apply_catboost_model.md new file mode 100644 index 00000000000..aa717e53da6 --- /dev/null +++ b/docs/es/guides/apply_catboost_model.md @@ -0,0 +1,232 @@ +# Aplicación de un modelo Catboost en ClickHouse {#applying-catboost-model-in-clickhouse} + +[CatBoost](https://catboost.ai) es una biblioteca de impulso de gradiente libre y de código abierto desarrollada en [Yandex](https://yandex.com/company/) para el aprendizaje automático. + +Con esta instrucción, aprenderá a aplicar modelos preentrenados en ClickHouse ejecutando la inferencia de modelos desde SQL. + +Para aplicar un modelo CatBoost en ClickHouse: + +1. [Crear una tabla](#create-table). +2. [Insertar los datos en la tabla](#insert-data-to-table). +3. [Integrar CatBoost en ClickHouse](#integrate-catboost-into-clickhouse) (Paso opcional). +4. [Ejecute la inferencia del modelo desde SQL](#run-model-inference). + +Para obtener más información sobre la formación de modelos CatBoost, consulte [Entrenamiento y aplicación de modelos](https://catboost.ai/docs/features/training.html#training). + +## Requisito {#prerequisites} + +Si usted no tiene el [Acoplador](https://docs.docker.com/install/) sin embargo, instalarlo. + +!!! note "Nota" + [Acoplador](https://www.docker.com) es una plataforma de software que le permite crear contenedores que aíslan una instalación de CatBoost y ClickHouse del resto del sistema. + +Antes de aplicar un modelo CatBoost: + +**1.** Neumático de la [Imagen de acoplador](https://hub.docker.com/r/yandex/tutorial-catboost-clickhouse) del registro: + +``` bash +$ docker pull yandex/tutorial-catboost-clickhouse +``` + +Esta imagen de Docker contiene todo lo que necesita para ejecutar CatBoost y ClickHouse: código, tiempo de ejecución, bibliotecas, variables de entorno y archivos de configuración. + +**2.** Asegúrese de que la imagen de Docker se haya extraído correctamente: + +``` bash +$ docker image ls +REPOSITORY TAG IMAGE ID CREATED SIZE +yandex/tutorial-catboost-clickhouse latest 622e4d17945b 22 hours ago 1.37GB +``` + +**3.** Inicie un contenedor Docker basado en esta imagen: + +``` bash +$ docker run -it -p 8888:8888 yandex/tutorial-catboost-clickhouse +``` + +## 1. Crear una tabla {#create-table} + +Para crear una tabla ClickHouse para el ejemplo de capacitación: + +**1.** Inicie el cliente de consola ClickHouse en el modo interactivo: + +``` bash +$ clickhouse client +``` + +!!! note "Nota" + El servidor ClickHouse ya se está ejecutando dentro del contenedor Docker. + +**2.** Cree la tabla usando el comando: + +``` sql +:) CREATE TABLE amazon_train +( + date Date MATERIALIZED today(), + ACTION UInt8, + RESOURCE UInt32, + MGR_ID UInt32, + ROLE_ROLLUP_1 UInt32, + ROLE_ROLLUP_2 UInt32, + ROLE_DEPTNAME UInt32, + ROLE_TITLE UInt32, + ROLE_FAMILY_DESC UInt32, + ROLE_FAMILY UInt32, + ROLE_CODE UInt32 +) +ENGINE = MergeTree ORDER BY date +``` + +**3.** Salir del cliente de la consola ClickHouse: + +``` sql +:) exit +``` + +## 2. 
Insertar los datos en la tabla {#insert-data-to-table} + +Para insertar los datos: + +**1.** Ejecute el siguiente comando: + +``` bash +$ clickhouse client --host 127.0.0.1 --query 'INSERT INTO amazon_train FORMAT CSVWithNames' < ~/amazon/train.csv +``` + +**2.** Inicie el cliente de consola ClickHouse en el modo interactivo: + +``` bash +$ clickhouse client +``` + +**3.** Asegúrese de que los datos se hayan cargado: + +``` sql +:) SELECT count() FROM amazon_train + +SELECT count() +FROM amazon_train + ++-count()-+ +| 65538 | ++---------+ +``` + +## 3. Integrar CatBoost en ClickHouse {#integrate-catboost-into-clickhouse} + +!!! note "Nota" + **Paso opcional.** La imagen de Docker contiene todo lo que necesita para ejecutar CatBoost y ClickHouse. + +Para integrar CatBoost en ClickHouse: + +**1.** Construir la biblioteca de evaluación. + +La forma más rápida de evaluar un modelo CatBoost es compilar `libcatboostmodel.` biblioteca. Para obtener más información acerca de cómo construir la biblioteca, vea [Documentación de CatBoost](https://catboost.ai/docs/concepts/c-plus-plus-api_dynamic-c-pluplus-wrapper.html). + +**2.** Cree un nuevo directorio en cualquier lugar y con cualquier nombre, por ejemplo, `data` y poner la biblioteca creada en ella. La imagen de Docker ya contiene la biblioteca `data/libcatboostmodel.so`. + +**3.** Cree un nuevo directorio para el modelo de configuración en cualquier lugar y con cualquier nombre, por ejemplo, `models`. + +**4.** Cree un archivo de configuración de modelo con cualquier nombre, por ejemplo, `models/amazon_model.xml`. + +**5.** Describir la configuración del modelo: + +``` xml + + + + catboost + + amazon + + /home/catboost/tutorial/catboost_model.bin + + 0 + + +``` + +**6.** Agregue la ruta de acceso a CatBoost y la configuración del modelo a la configuración de ClickHouse: + +``` xml + +/home/catboost/data/libcatboostmodel.so +/home/catboost/models/*_model.xml +``` + +## 4. Ejecute la inferencia del modelo desde SQL {#run-model-inference} + +Para el modelo de prueba, ejecute el cliente ClickHouse `$ clickhouse client`. + +Vamos a asegurarnos de que el modelo está funcionando: + +``` sql +:) SELECT + modelEvaluate('amazon', + RESOURCE, + MGR_ID, + ROLE_ROLLUP_1, + ROLE_ROLLUP_2, + ROLE_DEPTNAME, + ROLE_TITLE, + ROLE_FAMILY_DESC, + ROLE_FAMILY, + ROLE_CODE) > 0 AS prediction, + ACTION AS target +FROM amazon_train +LIMIT 10 +``` + +!!! note "Nota" + Función [modelEvaluar](../query_language/functions/other_functions.md#function-modelevaluate) devuelve tupla con predicciones sin procesar por clase para modelos multiclase. + +Vamos a predecir la probabilidad: + +``` sql +:) SELECT + modelEvaluate('amazon', + RESOURCE, + MGR_ID, + ROLE_ROLLUP_1, + ROLE_ROLLUP_2, + ROLE_DEPTNAME, + ROLE_TITLE, + ROLE_FAMILY_DESC, + ROLE_FAMILY, + ROLE_CODE) AS prediction, + 1. / (1 + exp(-prediction)) AS probability, + ACTION AS target +FROM amazon_train +LIMIT 10 +``` + +!!! note "Nota" + Más información sobre [exp()](../query_language/functions/math_functions.md) función. + +Vamos a calcular LogLoss en la muestra: + +``` sql +:) SELECT -avg(tg * log(prob) + (1 - tg) * log(1 - prob)) AS logloss +FROM +( + SELECT + modelEvaluate('amazon', + RESOURCE, + MGR_ID, + ROLE_ROLLUP_1, + ROLE_ROLLUP_2, + ROLE_DEPTNAME, + ROLE_TITLE, + ROLE_FAMILY_DESC, + ROLE_FAMILY, + ROLE_CODE) AS prediction, + 1. / (1. + exp(-prediction)) AS prob, + ACTION AS tg + FROM amazon_train +) +``` + +!!! 
note "Nota" + Más información sobre [avg()](../query_language/agg_functions/reference.md#agg_function-avg) y [Registro()](../query_language/functions/math_functions.md) función. + +[Artículo Original](https://clickhouse.tech/docs/es/guides/apply_catboost_model/) diff --git a/docs/es/guides/index.md b/docs/es/guides/index.md new file mode 100644 index 00000000000..953251d1b17 --- /dev/null +++ b/docs/es/guides/index.md @@ -0,0 +1,7 @@ +# Guías de ClickHouse {#clickhouse-guides} + +Instrucciones detalladas paso a paso que lo ayudarán a resolver varias tareas usando ClickHouse. + +- [Aplicación de un modelo CatBoost en ClickHouse](apply_catboost_model.md) + +[Artículo Original](https://clickhouse.tech/docs/es/guides/) diff --git a/docs/es/images/column_oriented.gif b/docs/es/images/column_oriented.gif new file mode 100644 index 00000000000..15f4b12e697 Binary files /dev/null and b/docs/es/images/column_oriented.gif differ diff --git a/docs/es/images/logo.svg b/docs/es/images/logo.svg new file mode 100644 index 00000000000..865b96d98c7 --- /dev/null +++ b/docs/es/images/logo.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/es/images/row_oriented.gif b/docs/es/images/row_oriented.gif new file mode 100644 index 00000000000..53daa20f322 Binary files /dev/null and b/docs/es/images/row_oriented.gif differ diff --git a/docs/es/index.md b/docs/es/index.md new file mode 100644 index 00000000000..b777996b67d --- /dev/null +++ b/docs/es/index.md @@ -0,0 +1,132 @@ +# ¿Qué es ClickHouse? {#what-is-clickhouse} + +ClickHouse es un sistema de gestión de bases de datos orientado a columnas (DBMS) para el procesamiento analítico en línea de consultas (OLAP). + +En un “normal” DBMS orientado a filas, los datos se almacenan en este orden: + +| Fila | Argumento | JavaEnable | Titular | GoodEvent | EventTime | +|--------|------------------------------------|------------|---------------------------|-----------|-----------------------------------| +| \#0 | Sistema abierto. | Uno | Relaciones con inversores | Uno | ¿Qué puedes encontrar en Neodigit | +| \#1 | Sistema abierto. | Cero | Contáctenos | Uno | ¿Qué puedes encontrar en Neodigit | +| Nombre | Método de codificación de datos: | Uno | Misión | Uno | ¿Qué puedes encontrar en Neodigit | +| \#Y | … | … | … | … | … | + +En otras palabras, todos los valores relacionados con una fila se almacenan físicamente uno junto al otro. + +Ejemplos de un DBMS orientado a filas son MySQL, Postgres y MS SQL Server. ¿Cómo?gris } + +En un DBMS orientado a columnas, los datos se almacenan así: + +| Fila: | \#0 | \#1 | Nombre | \#Y | +|-------------|-----------------------------------|-----------------------------------|------------------------------------|-----| +| Argumento: | Sistema abierto. | Sistema abierto. | Método de codificación de datos: | … | +| JavaEnable: | Uno | Cero | Uno | … | +| Titular: | Relaciones con inversores | Contáctenos | Misión | … | +| GoodEvent: | Uno | Uno | Uno | … | +| EventTime: | ¿Qué puedes encontrar en Neodigit | ¿Qué puedes encontrar en Neodigit | ¿Qué puedes encontrar en Neodigit | … | + +Estos ejemplos solo muestran el orden en el que se organizan los datos. Los valores de diferentes columnas se almacenan por separado y los datos de la misma columna se almacenan juntos. + +Ejemplos de un DBMS orientado a columnas: Vertica, Paraccel (Actian Matrix y Amazon Redshift), Sybase IQ, Exasol, Infobright, InfiniDB, MonetDB (VectorWise y Actian Vector), LucidDB, SAP HANA, Google Dremel, Google PowerDrill, Druid y kdb+. 
¿Cómo?gris } + +Los diferentes pedidos para almacenar datos se adaptan mejor a diferentes escenarios. El escenario de acceso a datos se refiere a qué consultas se realizan, con qué frecuencia y en qué proporción; cuántos datos se leen para cada tipo de consulta: filas, columnas y bytes; la relación entre la lectura y la actualización de datos; el tamaño de trabajo de los datos y cómo se utilizan localmente; si se utilizan las transacciones y qué tan aisladas están; + +Cuanto mayor sea la carga en el sistema, más importante es personalizar el sistema configurado para que coincida con los requisitos del escenario de uso, y más fino será esta personalización. No existe un sistema que sea igualmente adecuado para escenarios significativamente diferentes. Si un sistema es adaptable a un amplio conjunto de escenarios, bajo una carga alta, el sistema manejará todos los escenarios igualmente mal, o funcionará bien para solo uno o algunos de los escenarios posibles. + +## Propiedades clave del escenario OLAP {#key-properties-of-olap-scenario} + +- La gran mayoría de las solicitudes son para acceso de lectura. +- Los datos se actualizan en lotes bastante grandes (\> 1000 filas), no por filas individuales; o no se actualiza en absoluto. +- Los datos se agregan a la base de datos pero no se modifican. +- Para las lecturas, se extrae un número bastante grande de filas de la base de datos, pero solo un pequeño subconjunto de columnas. +- Las tablas hijo “wide,” lo que significa que contienen un gran número de columnas. +- Las consultas son relativamente raras (generalmente cientos de consultas por servidor o menos por segundo). +- Para consultas simples, se permiten latencias de alrededor de 50 ms. +- Los valores de columna son bastante pequeños: números y cadenas cortas (por ejemplo, 60 bytes por URL). +- Requiere un alto rendimiento al procesar una sola consulta (hasta miles de millones de filas por segundo por servidor). +- Las transacciones no son necesarias. +- Bajos requisitos para la coherencia de los datos. +- Hay una tabla grande por consulta. Todas las mesas son pequeñas, excepto una. +- Un resultado de consulta es significativamente menor que los datos de origen. En otras palabras, los datos se filtran o se agregan, por lo que el resultado se ajusta a la memoria RAM de un solo servidor. + +Es fácil ver que el escenario OLAP es muy diferente de otros escenarios populares (como el acceso OLTP o Key-Value). Por lo tanto, no tiene sentido intentar usar OLTP o una base de datos de valor clave para procesar consultas analíticas si desea obtener un rendimiento decente. Por ejemplo, si intenta usar MongoDB o Redis para análisis, obtendrá un rendimiento muy bajo en comparación con las bases de datos OLAP. + +## Por qué las bases de datos orientadas a columnas funcionan mejor en el escenario OLAP {#why-column-oriented-databases-work-better-in-the-olap-scenario} + +Las bases de datos orientadas a columnas son más adecuadas para los escenarios OLAP: son al menos 100 veces más rápidas en el procesamiento de la mayoría de las consultas. Las razones se explican en detalle a continuación, pero el hecho es más fácil de demostrar visualmente: + +**DBMS orientado a filas** + +![Row-oriented](images/row_oriented.gif#) + +**DBMS orientado a columnas** + +![Column-oriented](images/column_oriented.gif#) + +Ver la diferencia? + +### Entrada/salida {#inputoutput} + +1. Para una consulta analítica, solo es necesario leer un pequeño número de columnas de tabla. 
En una base de datos orientada a columnas, puede leer solo los datos que necesita. Por ejemplo, si necesita 5 columnas de 100, puede esperar una reducción de 20 veces en E/S.
+2. Dado que los datos se leen en paquetes, son más fáciles de comprimir. Los datos en columnas también son más fáciles de comprimir. Esto reduce aún más el volumen de E/S.
+3. Debido a la reducción de E/S, más datos caben en la memoria caché del sistema.
+
+Por ejemplo, la consulta “count the number of records for each advertising platform” requiere leer una sola columna, “advertising platform ID”, que ocupa 1 byte sin comprimir. Si la mayor parte del tráfico no proviene de plataformas publicitarias, puede esperar al menos una compresión de 10 veces de esta columna. Cuando se utiliza un algoritmo de compresión rápida, la descompresión de datos es posible a una velocidad de al menos varios gigabytes de datos sin comprimir por segundo. En otras palabras, esta consulta se puede procesar a una velocidad de aproximadamente varios miles de millones de filas por segundo en un único servidor. Esta velocidad se logra realmente en la práctica.
+
+ +Ejemplo + +``` bash +$ clickhouse-client +ClickHouse client version 0.0.52053. +Connecting to localhost:9000. +Connected to ClickHouse server version 0.0.52053. +``` + +``` sql +SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 +``` + +``` text +┌─CounterID─┬──count()─┐ +│ 114208 │ 56057344 │ +│ 115080 │ 51619590 │ +│ 3228 │ 44658301 │ +│ 38230 │ 42045932 │ +│ 145263 │ 42042158 │ +│ 91244 │ 38297270 │ +│ 154139 │ 26647572 │ +│ 150748 │ 24112755 │ +│ 242232 │ 21302571 │ +│ 338158 │ 13507087 │ +│ 62180 │ 12229491 │ +│ 82264 │ 12187441 │ +│ 232261 │ 12148031 │ +│ 146272 │ 11438516 │ +│ 168777 │ 11403636 │ +│ 4120072 │ 11227824 │ +│ 10938808 │ 10519739 │ +│ 74088 │ 9047015 │ +│ 115079 │ 8837972 │ +│ 337234 │ 8205961 │ +└───────────┴──────────┘ +``` + +
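+
+La relación de compresión por columna puede comprobarse a través de la tabla de sistema `system.columns`. Lo siguiente es solo un esquema mínimo que asume una tabla llamada `hits` como la del ejemplo anterior; el nombre real de la base de datos y de la tabla puede variar:
+
+``` bash
+# Bytes sin comprimir frente a bytes comprimidos de cada columna de la tabla 'hits'
+clickhouse-client --query "
+    SELECT name, type, data_uncompressed_bytes, data_compressed_bytes,
+        round(data_uncompressed_bytes / data_compressed_bytes, 2) AS ratio
+    FROM system.columns
+    WHERE table = 'hits'
+    ORDER BY data_uncompressed_bytes DESC
+    LIMIT 10"
+```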
+ +### CPU {#cpu} + +Dado que la ejecución de una consulta requiere procesar un gran número de filas, ayuda enviar todas las operaciones para vectores completos en lugar de para filas separadas, o implementar el motor de consultas para que casi no haya costo de envío. Si no lo hace, con cualquier subsistema de disco medio decente, el intérprete de consultas inevitablemente detiene la CPU. Tiene sentido almacenar datos en columnas y procesarlos, cuando sea posible, por columnas. + +Hay dos formas de hacer esto: + +1. Un vector motor. Todas las operaciones se escriben para vectores, en lugar de para valores separados. Esto significa que no es necesario llamar a las operaciones muy a menudo, y los costos de envío son insignificantes. El código de operación contiene un ciclo interno optimizado. + +2. Generación de código. El código generado para la consulta tiene todas las llamadas indirectas. + +Esto no se hace en “normal” bases de datos, porque no tiene sentido cuando se ejecutan consultas simples. Sin embargo, hay excepciones. Por ejemplo, MemSQL utiliza la generación de código para reducir la latencia al procesar consultas SQL. (A modo de comparación, los DBMS analíticos requieren la optimización del rendimiento, no la latencia.) + +Tenga en cuenta que para la eficiencia de la CPU, el lenguaje de consulta debe ser declarativo (SQL o MDX), o al menos un vector (J, K). La consulta solo debe contener bucles implícitos, lo que permite la optimización. + +[Artículo Original](https://clickhouse.tech/docs/es/) diff --git a/docs/es/interfaces/cli.md b/docs/es/interfaces/cli.md new file mode 100644 index 00000000000..ce56a423d53 --- /dev/null +++ b/docs/es/interfaces/cli.md @@ -0,0 +1,142 @@ +# Cliente de línea de comandos {#command-line-client} + +ClickHouse proporciona un cliente de línea de comandos nativo: `clickhouse-client`. El cliente admite opciones de línea de comandos y archivos de configuración. Para obtener más información, consulte [Configuración](#interfaces_cli_configuration). + +[Instalar](../getting_started/index.md) desde el `clickhouse-client` paquete y ejecútelo con el comando `clickhouse-client`. + +``` bash +$ clickhouse-client +ClickHouse client version 19.17.1.1579 (official build). +Connecting to localhost:9000 as user default. +Connected to ClickHouse server version 19.17.1 revision 54428. + +:) +``` + +Las diferentes versiones de cliente y servidor son compatibles entre sí, pero es posible que algunas funciones no estén disponibles en clientes anteriores. Se recomienda utilizar la misma versión del cliente que la aplicación de servidor. Cuando intenta usar un cliente de la versión anterior, entonces el servidor, `clickhouse-client` muestra el mensaje: + + ClickHouse client version is older than ClickHouse server. It may lack support for new features. + +## Uso {#cli-usage} + +El cliente se puede utilizar en modo interactivo y no interactivo (por lotes). Para utilizar el modo por lotes, especifique el ‘query’ parámetro, o enviar datos a ‘stdin’ (verifica que ‘stdin’ no es un terminal), o ambos. Similar a la interfaz HTTP, cuando se utiliza el ‘query’ parámetro y el envío de datos a ‘stdin’ la solicitud es una concatenación de la ‘query’ parámetro, un avance de línea y los datos en ‘stdin’. Esto es conveniente para grandes consultas INSERT. 
+ +Ejemplo de uso del cliente para insertar datos: + +``` bash +$ echo -ne "1, 'some text', '2016-08-14 00:00:00'\n2, 'some more text', '2016-08-14 00:00:01'" | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; + +$ cat <<_EOF | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; +3, 'some text', '2016-08-14 00:00:00' +4, 'some more text', '2016-08-14 00:00:01' +_EOF + +$ cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; +``` + +En el modo por lotes, el formato de datos predeterminado es TabSeparated. Puede establecer el formato en la cláusula FORMAT de la consulta. + +De forma predeterminada, solo puede procesar una única consulta en modo por lotes. Para realizar múltiples consultas desde un “script,” utilizar el `--multiquery` parámetro. Esto funciona para todas las consultas excepto INSERT . Los resultados de la consulta se generan consecutivamente sin separadores adicionales. Del mismo modo, para procesar un gran número de consultas, puede ejecutar ‘clickhouse-client’ para cada consulta. Tenga en cuenta que puede tomar decenas de milisegundos para iniciar el ‘clickhouse-client’ programa. + +En el modo interactivo, obtiene una línea de comandos donde puede ingresar consultas. + +Si ‘multiline’ no se especifica (el valor predeterminado): Para ejecutar la consulta, pulse Intro. El punto y coma no es necesario al final de la consulta. Para introducir una consulta de varias líneas, introduzca una barra invertida `\` antes de la alimentación de línea. Después de presionar Enter, se le pedirá que ingrese la siguiente línea de la consulta. + +Si se especifica multilínea: Para ejecutar una consulta, finalícela con un punto y coma y presione Intro. Si se omitió el punto y coma al final de la línea ingresada, se le pedirá que ingrese la siguiente línea de la consulta. + +Solo se ejecuta una sola consulta, por lo que se ignora todo después del punto y coma. + +Puede especificar `\G` en lugar o después del punto y coma. Esto indica el formato vertical. En este formato, cada valor se imprime en una línea separada, lo cual es conveniente para tablas anchas. Esta característica inusual se agregó por compatibilidad con la CLI de MySQL. + +La línea de comandos se basa en ‘replxx’ (similar a ‘readline’). En otras palabras, utiliza los atajos de teclado familiares y mantiene un historial. La historia está escrita para `~/.clickhouse-client-history`. + +De forma predeterminada, el formato utilizado es PrettyCompact. Puede cambiar el formato en la cláusula FORMAT de la consulta o especificando `\G` al final de la consulta, utilizando el `--format` o `--vertical` en la línea de comandos, o utilizando el archivo de configuración del cliente. + +Para salir del cliente, presione Ctrl+D (o Ctrl+C) o introduzca una de las siguientes opciones en lugar de una consulta: “exit”, “quit”, “logout”, “exit;”, “quit;”, “logout;”, “q”, “Q”, “:q” + +Al procesar una consulta, el cliente muestra: + +1. Progreso, que se actualiza no más de 10 veces por segundo (de forma predeterminada). Para consultas rápidas, es posible que el progreso no tenga tiempo para mostrarse. +2. La consulta con formato después del análisis, para la depuración. +3. El resultado en el formato especificado. +4. El número de líneas en el resultado, el tiempo transcurrido y la velocidad promedio de procesamiento de consultas. + +Puede cancelar una consulta larga presionando Ctrl + C. Sin embargo, aún tendrá que esperar un poco para que el servidor aborte la solicitud. 
No es posible cancelar una consulta en determinadas etapas. Si no espera y presiona Ctrl + C por segunda vez, el cliente saldrá. + +El cliente de línea de comandos permite pasar datos externos (tablas temporales externas) para consultar. Para obtener más información, consulte la sección “External data for query processing”. + +### Consultas con parámetros {#cli-queries-with-parameters} + +Puede crear una consulta con parámetros y pasarles valores desde la aplicación cliente. Esto permite evitar formatear consultas con valores dinámicos específicos en el lado del cliente. Por ejemplo: + +``` bash +$ clickhouse-client --param_parName="[1, 2]" -q "SELECT * FROM table WHERE a = {parName:Array(UInt16)}" +``` + +#### Sintaxis de consulta {#cli-queries-with-parameters-syntax} + +Formatee una consulta como de costumbre, luego coloque los valores que desea pasar de los parámetros de la aplicación a la consulta entre llaves en el siguiente formato: + +``` sql +{:} +``` + +- `name` — Placeholder identifier. In the console client it should be used in app parameters as `--param_ = value`. +- `data type` — [Tipo de datos](../data_types/index.md) del valor del parámetro de la aplicación. Por ejemplo, una estructura de datos como `(integer, ('string', integer))` puede tener el `Tuple(UInt8, Tuple(String, UInt8))` tipo de datos (también puede usar otro [Entero](../data_types/int_uint.md) tipo). + +#### Ejemplo {#example} + +``` bash +$ clickhouse-client --param_tuple_in_tuple="(10, ('dt', 10))" -q "SELECT * FROM table WHERE val = {tuple_in_tuple:Tuple(UInt8, Tuple(String, UInt8))}" +``` + +## Configuración {#interfaces-cli-configuration} + +Puede pasar parámetros a `clickhouse-client` (todos los parámetros tienen un valor predeterminado) usando: + +- Desde la línea de comandos + + Las opciones de la línea de comandos anulan los valores y valores predeterminados de los archivos de configuración. + +- Archivos de configuración. + + Los valores de los archivos de configuración anulan los valores predeterminados. + +### Opciones de línea de comandos {#command-line-options} + +- `--host, -h` -– The server name, ‘localhost’ predeterminada. Puede utilizar el nombre o la dirección IPv4 o IPv6. +- `--port` – The port to connect to. Default value: 9000. Note that the HTTP interface and the native interface use different ports. +- `--user, -u` – The username. Default value: default. +- `--password` – The password. Default value: empty string. +- `--query, -q` – The query to process when using non-interactive mode. +- `--database, -d` – Select the current default database. Default value: the current database from the server settings (‘default’ predeterminada). +- `--multiline, -m` – If specified, allow multiline queries (do not send the query on Enter). +- `--multiquery, -n` – If specified, allow processing multiple queries separated by semicolons. +- `--format, -f` – Use the specified default format to output the result. +- `--vertical, -E` – If specified, use the Vertical format by default to output the result. This is the same as ‘–format=Vertical’. En este formato, cada valor se imprime en una línea separada, lo que es útil cuando se muestran tablas anchas. +- `--time, -t` – If specified, print the query execution time to ‘stderr’ en modo no interactivo. +- `--stacktrace` – If specified, also print the stack trace if an exception occurs. +- `--config-file` – The name of the configuration file. +- `--secure` – If specified, will connect to server over secure connection. 
+- `--param_` — Value for a [consulta con parámetros](#cli-queries-with-parameters). + +### Archivos de configuración {#configuration-files} + +`clickhouse-client` utiliza el primer archivo existente de los siguientes: + +- Definido en el `--config-file` parámetro. +- `./clickhouse-client.xml` +- `~/.clickhouse-client/config.xml` +- `/etc/clickhouse-client/config.xml` + +Ejemplo de un archivo de configuración: + +``` xml + + username + password + False + +``` + +[Artículo Original](https://clickhouse.tech/docs/es/interfaces/cli/) diff --git a/docs/es/interfaces/cpp.md b/docs/es/interfaces/cpp.md new file mode 100644 index 00000000000..9de79433ba7 --- /dev/null +++ b/docs/es/interfaces/cpp.md @@ -0,0 +1,5 @@ +# Biblioteca de clientes de C++ {#c-client-library} + +Ver README en [Bienvenidos](https://github.com/ClickHouse/clickhouse-cpp) repositorio. + +[Artículo Original](https://clickhouse.tech/docs/es/interfaces/cpp/) diff --git a/docs/es/interfaces/http.md b/docs/es/interfaces/http.md new file mode 100644 index 00000000000..5d45c3755e5 --- /dev/null +++ b/docs/es/interfaces/http.md @@ -0,0 +1,281 @@ +# Interfaz HTTP {#http-interface} + +La interfaz HTTP le permite usar ClickHouse en cualquier plataforma desde cualquier lenguaje de programación. Lo usamos para trabajar desde Java y Perl, así como scripts de shell. En otros departamentos, la interfaz HTTP se usa desde Perl, Python y Go. La interfaz HTTP es más limitada que la interfaz nativa, pero tiene una mejor compatibilidad. + +De forma predeterminada, clickhouse-server escucha HTTP en el puerto 8123 (esto se puede cambiar en la configuración). + +Si realiza una solicitud GET / sin parámetros, devuelve 200 códigos de respuesta y la cadena que definió en [http\_server\_default\_response](../operations/server_settings/settings.md#server_settings-http_server_default_response) valor predeterminado “Ok.” (con un avance de línea al final) + +``` bash +$ curl 'http://localhost:8123/' +Ok. +``` + +Use la solicitud GET / ping en los scripts de comprobación de estado. Este controlador siempre devuelve “Ok.” (con un avance de línea al final). Disponible a partir de la versión 18.12.13. + +``` bash +$ curl 'http://localhost:8123/ping' +Ok. +``` + +Enviar la solicitud como una URL ‘query’ parámetro, o como un POST. O envíe el comienzo de la consulta en el ‘query’ parámetro, y el resto en el POST (explicaremos más adelante por qué esto es necesario). El tamaño de la URL está limitado a 16 KB, así que tenga esto en cuenta al enviar consultas grandes. + +Si tiene éxito, recibirá el código de respuesta 200 y el resultado en el cuerpo de respuesta. +Si se produce un error, recibirá el código de respuesta 500 y un texto de descripción de error en el cuerpo de la respuesta. + +Al usar el método GET, ‘readonly’ se establece. En otras palabras, para consultas que modifican datos, solo puede usar el método POST. Puede enviar la consulta en sí misma en el cuerpo POST o en el parámetro URL. 
+ +Ejemplos: + +``` bash +$ curl 'http://localhost:8123/?query=SELECT%201' +1 + +$ wget -O- -q 'http://localhost:8123/?query=SELECT 1' +1 + +$ echo -ne 'GET /?query=SELECT%201 HTTP/1.0\r\n\r\n' | nc localhost 8123 +HTTP/1.0 200 OK +Date: Wed, 27 Nov 2019 10:30:18 GMT +Connection: Close +Content-Type: text/tab-separated-values; charset=UTF-8 +X-ClickHouse-Server-Display-Name: clickhouse.ru-central1.internal +X-ClickHouse-Query-Id: 5abe861c-239c-467f-b955-8a201abb8b7f +X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"} + +1 +``` + +Como puede ver, curl es algo inconveniente ya que los espacios deben ser URL escapadas. +Aunque wget escapa de todo en sí, no recomendamos usarlo porque no funciona bien sobre HTTP 1.1 cuando se usa keep-alive y Transfer-Encoding: chunked . + +``` bash +$ echo 'SELECT 1' | curl 'http://localhost:8123/' --data-binary @- +1 + +$ echo 'SELECT 1' | curl 'http://localhost:8123/?query=' --data-binary @- +1 + +$ echo '1' | curl 'http://localhost:8123/?query=SELECT' --data-binary @- +1 +``` + +Si se envía parte de la consulta en el parámetro y parte en el POST, se inserta un avance de línea entre estas dos partes de datos. +Ejemplo (esto no funcionará): + +``` bash +$ echo 'ECT 1' | curl 'http://localhost:8123/?query=SEL' --data-binary @- +Code: 59, e.displayText() = DB::Exception: Syntax error: failed at position 0: SEL +ECT 1 +, expected One of: SHOW TABLES, SHOW DATABASES, SELECT, INSERT, CREATE, ATTACH, RENAME, DROP, DETACH, USE, SET, OPTIMIZE., e.what() = DB::Exception +``` + +De forma predeterminada, los datos se devuelven en formato TabSeparated (para obtener más información, “Formats” apartado). +Utilice la cláusula FORMAT de la consulta para solicitar cualquier otro formato. + +``` bash +$ echo 'SELECT 1 FORMAT Pretty' | curl 'http://localhost:8123/?' --data-binary @- +┏━━━┓ +┃ 1 ┃ +┡━━━┩ +│ 1 │ +└───┘ +``` + +El método POST de transmitir datos es necesario para las consultas INSERT. En este caso, puede escribir el comienzo de la consulta en el parámetro URL y usar POST para pasar los datos a insertar. Los datos a insertar podrían ser, por ejemplo, un volcado separado por tabuladores de MySQL. De esta manera, la consulta INSERT reemplaza LOAD DATA LOCAL INFILE de MySQL. + +Ejemplos: Crear una tabla: + +``` bash +$ echo 'CREATE TABLE t (a UInt8) ENGINE = Memory' | curl 'http://localhost:8123/' --data-binary @- +``` + +Uso de la consulta INSERT familiar para la inserción de datos: + +``` bash +$ echo 'INSERT INTO t VALUES (1),(2),(3)' | curl 'http://localhost:8123/' --data-binary @- +``` + +Los datos se pueden enviar por separado de la consulta: + +``` bash +$ echo '(4),(5),(6)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20VALUES' --data-binary @- +``` + +Puede especificar cualquier formato de datos. El ‘Values’ el formato es el mismo que el que se usa al escribir INSERT INTO t VALUES: + +``` bash +$ echo '(7),(8),(9)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20Values' --data-binary @- +``` + +Para insertar datos de un volcado separado por tabuladores, especifique el formato correspondiente: + +``` bash +$ echo -ne '10\n11\n12\n' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20TabSeparated' --data-binary @- +``` + +Lectura del contenido de la tabla. 
Los datos se emiten en orden aleatorio debido al procesamiento de consultas paralelas: + +``` bash +$ curl 'http://localhost:8123/?query=SELECT%20a%20FROM%20t' +7 +8 +9 +10 +11 +12 +1 +2 +3 +4 +5 +6 +``` + +Eliminando la mesa. + +``` bash +$ echo 'DROP TABLE t' | curl 'http://localhost:8123/' --data-binary @- +``` + +Para las solicitudes correctas que no devuelven una tabla de datos, se devuelve un cuerpo de respuesta vacío. + +Puede utilizar el formato interno de compresión ClickHouse al transmitir datos. Los datos comprimidos tienen un formato no estándar, y deberá usar el `clickhouse-compressor` programa para trabajar con él (se instala con el `clickhouse-client` paquete). Para aumentar la eficiencia de la inserción de datos, puede deshabilitar la verificación de suma de comprobación [http\_native\_compression\_disable\_checksumming\_on\_decompress](../operations/settings/settings.md#settings-http_native_compression_disable_checksumming_on_decompress) configuración. + +Si ha especificado `compress=1` en la URL, el servidor comprime los datos que le envía. +Si ha especificado `decompress=1` en la dirección URL, el servidor descomprime los mismos datos que `POST` método. + +También puede optar por utilizar [Compresión HTTP](https://en.wikipedia.org/wiki/HTTP_compression). Para enviar un `POST` solicitud, agregue el encabezado de solicitud `Content-Encoding: compression_method`. Para que ClickHouse comprima la respuesta, debe agregar `Accept-Encoding: compression_method`. Soporte ClickHouse `gzip`, `br`, y `deflate` [métodos de compresión](https://en.wikipedia.org/wiki/HTTP_compression#Content-Encoding_tokens). Para habilitar la compresión HTTP, debe usar ClickHouse [enable\_http\_compression](../operations/settings/settings.md#settings-enable_http_compression) configuración. Puede configurar el nivel de compresión de datos [http\_zlib\_compression\_level](#settings-http_zlib_compression_level) para todos los métodos de compresión. + +Puede usar esto para reducir el tráfico de red al transmitir una gran cantidad de datos o para crear volcados que se comprimen inmediatamente. + +Ejemplos de envío de datos con compresión: + +``` bash +#Sending data to the server: +$ curl -vsS "http://localhost:8123/?enable_http_compression=1" -d 'SELECT number FROM system.numbers LIMIT 10' -H 'Accept-Encoding: gzip' + +#Sending data to the client: +$ echo "SELECT 1" | gzip -c | curl -sS --data-binary @- -H 'Content-Encoding: gzip' 'http://localhost:8123/' +``` + +!!! note "Nota" + Algunos clientes HTTP pueden descomprimir datos del servidor de forma predeterminada (con `gzip` y `deflate`) y puede obtener datos descomprimidos incluso si usa la configuración de compresión correctamente. + +Puede usar el ‘database’ Parámetro URL para especificar la base de datos predeterminada. + +``` bash +$ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?database=system' --data-binary @- +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +``` + +De forma predeterminada, la base de datos que está registrada en la configuración del servidor se utiliza como base de datos predeterminada. De forma predeterminada, esta es la base de datos llamada ‘default’. Como alternativa, siempre puede especificar la base de datos utilizando un punto antes del nombre de la tabla. + +El nombre de usuario y la contraseña se pueden indicar de una de estas tres maneras: + +1. Uso de la autenticación básica HTTP. Ejemplo: + + + +``` bash +$ echo 'SELECT 1' | curl 'http://user:password@localhost:8123/' -d @- +``` + +1. 
En el ‘user’ y ‘password’ Parámetros de URL. Ejemplo: + + + +``` bash +$ echo 'SELECT 1' | curl 'http://localhost:8123/?user=user&password=password' -d @- +``` + +1. Utilizar ‘X-ClickHouse-User’ y ‘X-ClickHouse-Key’ cabecera. Ejemplo: + + + +``` bash +$ echo 'SELECT 1' | curl -H 'X-ClickHouse-User: user' -H 'X-ClickHouse-Key: password' 'http://localhost:8123/' -d @- +``` + +Si no se especifica el nombre de usuario, `default` se utiliza el nombre. Si no se especifica la contraseña, se utiliza la contraseña vacía. +También puede utilizar los parámetros de URL para especificar cualquier configuración para procesar una sola consulta o perfiles completos de configuración. Ejemplo:http://localhost:8123/?perfil=web&max\_rows\_to\_read=1000000000&consulta=SELECCIONA+1 + +Para obtener más información, consulte [Configuración](../operations/settings/index.md) apartado. + +``` bash +$ echo 'SELECT number FROM system.numbers LIMIT 10' | curl 'http://localhost:8123/?' --data-binary @- +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +``` + +Para obtener información sobre otros parámetros, consulte la sección “SET”. + +Del mismo modo, puede utilizar sesiones ClickHouse en el protocolo HTTP. Para hacer esto, debe agregar el `session_id` GET parámetro a la solicitud. Puede usar cualquier cadena como ID de sesión. De forma predeterminada, la sesión finaliza después de 60 segundos de inactividad. Para cambiar este tiempo de espera, modifique `default_session_timeout` configuración en la configuración del servidor, o `session_timeout` GET parámetro a la solicitud. Para comprobar el estado de la sesión, `session_check=1` parámetro. Solo se puede ejecutar una consulta a la vez en una sola sesión. + +Puede recibir información sobre el progreso de una consulta en `X-ClickHouse-Progress` encabezados de respuesta. Para hacer esto, habilite [send\_progress\_in\_http\_headers](../operations/settings/settings.md#settings-send_progress_in_http_headers). Ejemplo de la secuencia de encabezado: + +``` text +X-ClickHouse-Progress: {"read_rows":"2752512","read_bytes":"240570816","total_rows_to_read":"8880128"} +X-ClickHouse-Progress: {"read_rows":"5439488","read_bytes":"482285394","total_rows_to_read":"8880128"} +X-ClickHouse-Progress: {"read_rows":"8783786","read_bytes":"819092887","total_rows_to_read":"8880128"} +``` + +Posibles campos de encabezado: + +- `read_rows` — Número de filas leídas. +- `read_bytes` — Volumen de datos leídos en bytes. +- `total_rows_to_read` — Número total de filas a leer. +- `written_rows` — Número de filas escritas. +- `written_bytes` — Volumen de datos escritos en bytes. + +Las solicitudes en ejecución no se detienen automáticamente si se pierde la conexión HTTP. El análisis y el formato de datos se realizan en el lado del servidor, y el uso de la red puede ser ineficaz. +Opcional ‘query\_id’ parámetro se puede pasar como el ID de consulta (cualquier cadena). Para obtener más información, consulte la sección “Settings, replace\_running\_query”. + +Opcional ‘quota\_key’ parámetro se puede pasar como la clave de cuota (cualquier cadena). Para obtener más información, consulte la sección “Quotas”. + +La interfaz HTTP permite pasar datos externos (tablas temporales externas) para consultar. Para obtener más información, consulte la sección “External data for query processing”. + +## Almacenamiento en búfer de respuesta {#response-buffering} + +Puede habilitar el almacenamiento en búfer de respuestas en el lado del servidor. 
El `buffer_size` y `wait_end_of_query` Los parámetros URL se proporcionan para este propósito. + +`buffer_size` determina el número de bytes en el resultado para almacenar en búfer en la memoria del servidor. Si un cuerpo de resultado es mayor que este umbral, el búfer se escribe en el canal HTTP y los datos restantes se envían directamente al canal HTTP. + +Para asegurarse de que toda la respuesta se almacena en búfer, establezca `wait_end_of_query=1`. En este caso, los datos que no se almacenan en la memoria se almacenarán en un archivo de servidor temporal. + +Ejemplo: + +``` bash +$ curl -sS 'http://localhost:8123/?max_result_bytes=4000000&buffer_size=3000000&wait_end_of_query=1' -d 'SELECT toUInt8(number) FROM system.numbers LIMIT 9000000 FORMAT RowBinary' +``` + +Utilice el almacenamiento en búfer para evitar situaciones en las que se produjo un error de procesamiento de consultas después de enviar al cliente el código de respuesta y los encabezados HTTP. En esta situación, se escribe un mensaje de error al final del cuerpo de la respuesta y, en el lado del cliente, el error solo se puede detectar en la etapa de análisis. + +### Consultas con parámetros {#cli-queries-with-parameters} + +Puede crear una consulta con parámetros y pasar valores para ellos desde los parámetros de solicitud HTTP correspondientes. Para obtener más información, consulte [Consultas con parámetros para CLI](cli.md#cli-queries-with-parameters). + +### Ejemplo {#example} + +``` bash +$ curl -sS "
?param_id=2¶m_phrase=test" -d "SELECT * FROM table WHERE int_column = {id:UInt8} and string_column = {phrase:String}" +``` + +[Artículo Original](https://clickhouse.tech/docs/es/interfaces/http_interface/) diff --git a/docs/es/interfaces/index.md b/docs/es/interfaces/index.md new file mode 100644 index 00000000000..e0b5257a280 --- /dev/null +++ b/docs/es/interfaces/index.md @@ -0,0 +1,21 @@ +# Interfaz {#interfaces} + +ClickHouse proporciona dos interfaces de red (ambas se pueden ajustar opcionalmente en TLS para mayor seguridad): + +- [HTTP](http.md), que está documentado y fácil de usar directamente. +- [TCP nativo](tcp.md), que tiene menos sobrecarga. + +En la mayoría de los casos, se recomienda utilizar la herramienta o biblioteca apropiada en lugar de interactuar con ellos directamente. Oficialmente apoyados por Yandex son los siguientes: + +- [Cliente de línea de comandos](cli.md) +- [Controlador JDBC](jdbc.md) +- [Controlador ODBC](odbc.md) +- [Biblioteca cliente de C++](cpp.md) + +También hay una amplia gama de bibliotecas de terceros para trabajar con ClickHouse: + +- [Bibliotecas de clientes](third-party/client_libraries.md) +- [Integración](third-party/integrations.md) +- [Interfaces visuales](third-party/gui.md) + +[Artículo Original](https://clickhouse.tech/docs/es/interfaces/) diff --git a/docs/es/interfaces/jdbc.md b/docs/es/interfaces/jdbc.md new file mode 100644 index 00000000000..3f155da865e --- /dev/null +++ b/docs/es/interfaces/jdbc.md @@ -0,0 +1,8 @@ +# Controlador JDBC {#jdbc-driver} + +- **[Conductor oficial](https://github.com/ClickHouse/clickhouse-jdbc)** +- Controladores de terceros: + - [Sistema abierto.](https://github.com/housepower/ClickHouse-Native-JDBC) + - [Método de codificación de datos:](https://github.com/blynkkk/clickhouse4j) + +[Artículo Original](https://clickhouse.tech/docs/es/interfaces/jdbc/) diff --git a/docs/es/interfaces/mysql.md b/docs/es/interfaces/mysql.md new file mode 100644 index 00000000000..cedbac38e4d --- /dev/null +++ b/docs/es/interfaces/mysql.md @@ -0,0 +1,42 @@ +# Interfaz de MySQL {#mysql-interface} + +ClickHouse soporta el protocolo de cable MySQL. Puede ser habilitado por [mysql\_port](../operations/server_settings/settings.md#server_settings-mysql_port) configuración en el archivo de configuración: + +``` xml +9004 +``` + +Ejemplo de conexión mediante la herramienta de línea de comandos `mysql`: + +``` bash +$ mysql --protocol tcp -u default -P 9004 +``` + +Salida si una conexión se realizó correctamente: + +``` text +Welcome to the MySQL monitor. Commands end with ; or \g. +Your MySQL connection id is 4 +Server version: 20.2.1.1-ClickHouse + +Copyright (c) 2000, 2019, Oracle and/or its affiliates. All rights reserved. + +Oracle is a registered trademark of Oracle Corporation and/or its +affiliates. Other names may be trademarks of their respective +owners. + +Type 'help;' or '\h' for help. Type '\c' to clear the current input statement. + +mysql> +``` + +Para la compatibilidad con todos los clientes MySQL, se recomienda especificar la contraseña de usuario con [Doble SHA1](../operations/settings/settings_users.md#password_double_sha1_hex) en el archivo de configuración. +Si la contraseña de usuario se especifica usando [SHA256](../operations/settings/settings_users.md#password_sha256_hex), algunos clientes no podrán autenticarse (mysqljs y versiones antiguas de la herramienta de línea de comandos mysql). 
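+
+Como referencia, un esquema mínimo en bash para generar una contraseña y su hash doble SHA1 en hexadecimal (asume que `sha1sum` y `xxd` están disponibles en el sistema); el hash resultante se colocaría en el elemento `password_double_sha1_hex` del usuario:
+
+``` bash
+# Genera una contraseña aleatoria de 8 caracteres y la muestra
+PASSWORD=$(base64 < /dev/urandom | head -c8)
+echo "$PASSWORD"
+# SHA1 de la contraseña en binario y, a continuación, SHA1 del resultado en hexadecimal
+echo -n "$PASSWORD" | sha1sum | tr -d ' -' | xxd -r -p | sha1sum | tr -d ' -'
+```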
+ +Restricciones: + +- las consultas preparadas no son compatibles + +- algunos tipos de datos se envían como cadenas + +[Artículo Original](https://clickhouse.tech/docs/es/interfaces/mysql/) diff --git a/docs/es/interfaces/odbc.md b/docs/es/interfaces/odbc.md new file mode 100644 index 00000000000..8e221d4919c --- /dev/null +++ b/docs/es/interfaces/odbc.md @@ -0,0 +1,5 @@ +# Controlador ODBC {#odbc-driver} + +- [Controlador oficial](https://github.com/ClickHouse/clickhouse-odbc). + +[Artículo Original](https://clickhouse.tech/docs/es/interfaces/odbc/) diff --git a/docs/es/interfaces/tcp.md b/docs/es/interfaces/tcp.md new file mode 100644 index 00000000000..87f91719463 --- /dev/null +++ b/docs/es/interfaces/tcp.md @@ -0,0 +1,5 @@ +# Interfaz nativa (TCP) {#native-interface-tcp} + +El protocolo nativo se utiliza en el [cliente de línea de comandos](cli.md), en la comunicación entre servidores durante el procesamiento de consultas distribuidas y también en otros programas de C++. Desafortunadamente, el protocolo nativo de ClickHouse aún no tiene una especificación formal, pero puede deducirse mediante ingeniería inversa a partir del código fuente de ClickHouse (comenzando [por aquí](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/src/Client)) y/o mediante la interceptación y el análisis del tráfico TCP. + +[Artículo Original](https://clickhouse.tech/docs/es/interfaces/tcp/) diff --git a/docs/es/interfaces/third-party/client_libraries.md b/docs/es/interfaces/third-party/client_libraries.md new file mode 100644 index 00000000000..ff66962f87b --- /dev/null +++ b/docs/es/interfaces/third-party/client_libraries.md @@ -0,0 +1,51 @@ +# Bibliotecas de clientes de desarrolladores de terceros {#client-libraries-from-third-party-developers} + +!!! warning "Descargo" + Yandex **no** mantiene las bibliotecas enumeradas a continuación ni ha realizado pruebas exhaustivas para garantizar su calidad.
+ +- Python + - [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm) + - [clickhouse-driver](https://github.com/mymarilyn/clickhouse-driver) + - [clickhouse-client](https://github.com/yurial/clickhouse-client) + - [aiochclient](https://github.com/maximdanilchenko/aiochclient) +- PHP + - [smi2/phpclickhouse](https://packagist.org/packages/smi2/phpClickHouse) + - [8bitov/clickhouse-php-client](https://packagist.org/packages/8bitov/clickhouse-php-client) + - [bozerkins/clickhouse-client](https://packagist.org/packages/bozerkins/clickhouse-client) + - [simpod/clickhouse-client](https://packagist.org/packages/simpod/clickhouse-client) + - [seva-code/php-click-house-client](https://packagist.org/packages/seva-code/php-click-house-client) + - [SeasClick C++ client](https://github.com/SeasX/SeasClick) +- Go + - [clickhouse (Go)](https://github.com/kshvakov/clickhouse/) + - [go-clickhouse](https://github.com/roistat/go-clickhouse) + - [mailru/go-clickhouse](https://github.com/mailru/go-clickhouse) + - [golang-clickhouse](https://github.com/leprosus/golang-clickhouse) +- NodeJs + - [clickhouse (NodeJs)](https://github.com/TimonKK/clickhouse) + - [node-clickhouse](https://github.com/apla/node-clickhouse) +- Perl + - [perl-DBD-ClickHouse](https://github.com/elcamlost/perl-DBD-ClickHouse) + - [HTTP-ClickHouse](https://metacpan.org/release/HTTP-ClickHouse) + - [AnyEvent-ClickHouse](https://metacpan.org/release/AnyEvent-ClickHouse) +- Ruby + - [ClickHouse (Ruby)](https://github.com/shlima/click_house) +- R + - [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r) + - [RClickhouse](https://github.com/IMSMWU/RClickhouse) +- Java + - [clickhouse-client-java](https://github.com/VirtusAI/clickhouse-client-java) + - [clickhouse-client](https://github.com/Ecwid/clickhouse-client) +- Scala + - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) +- Kotlin + - [AORM](https://github.com/TanVD/AORM) +- C\# + - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) + - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) + - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) +- Elixir + - [clickhousex](https://github.com/appodeal/clickhousex/) +- Nim + - [nim-clickhouse](https://github.com/leonardoce/nim-clickhouse) + +[Artículo Original](https://clickhouse.tech/docs/es/interfaces/third-party/client_libraries/) diff --git a/docs/es/interfaces/third-party/gui.md b/docs/es/interfaces/third-party/gui.md new file mode 100644 index 00000000000..1148825c0f9 --- /dev/null +++ b/docs/es/interfaces/third-party/gui.md @@ -0,0 +1,145 @@ +# Interfaces visuales de desarrolladores de terceros {#visual-interfaces-from-third-party-developers} + +## De código abierto {#open-source} + +### Tabix {#tabix} + +Interfaz web para ClickHouse del proyecto [Tabix](https://github.com/tabixio/tabix). + +Función: + +- Funciona con ClickHouse directamente desde el navegador, sin necesidad de instalar software adicional. +- Editor de consultas con resaltado de sintaxis. +- Autocompletado de comandos. +- Herramientas para el análisis gráfico de la ejecución de consultas. +- Opciones de esquema de color.
+ +[Documentación de Tabix](https://tabix.io/doc/). + +### HouseOps {#houseops} + +[HouseOps](https://github.com/HouseOps/HouseOps) es una interfaz de usuario/IDE para OSX, Linux y Windows. + +Función: + +- Generador de consultas con resaltado de sintaxis. Ver la respuesta en una tabla o en vista JSON. +- Exportar resultados de consultas como CSV o JSON. +- Lista de procesos con descripciones. Modo de escritura. Capacidad de detener (`KILL`) un proceso. +- Gráfico de base de datos. Muestra todas las tablas y sus columnas con información adicional. +- Una vista rápida del tamaño de la columna. +- Configuración del servidor. + +Las siguientes características están planificadas para el desarrollo: + +- Gestión de bases de datos. +- Gestión de usuarios. +- Análisis de datos en tiempo real. +- Supervisión de clúster. +- Gestión de clústeres. +- Monitoreo de tablas replicadas y Kafka. + +### LightHouse {#lighthouse} + +[LightHouse](https://github.com/VKCOM/lighthouse) es una interfaz web ligera para ClickHouse. + +Función: + +- Lista de tablas con filtrado y metadatos. +- Vista previa de la tabla con filtrado y ordenación. +- Ejecución de consultas de solo lectura. + +### Redash {#redash} + +[Redash](https://github.com/getredash/redash) es una plataforma para la visualización de datos. + +Admite múltiples fuentes de datos, incluido ClickHouse. Redash puede combinar los resultados de consultas de diferentes fuentes de datos en un conjunto de datos final. + +Función: + +- Potente editor de consultas. +- Explorador de base de datos. +- Herramientas de visualización que permiten representar los datos de diferentes formas. + +### DBeaver {#dbeaver} + +[DBeaver](https://dbeaver.io/) - cliente de base de datos de escritorio universal con soporte para ClickHouse. + +Función: + +- Desarrollo de consultas con resaltado de sintaxis y autocompletado. +- Lista de tablas con filtros y búsqueda de metadatos. +- Vista previa de datos de tabla. +- Búsqueda de texto completo. + +### clickhouse-cli {#clickhouse-cli} + +[clickhouse-cli](https://github.com/hatarist/clickhouse-cli) es un cliente de línea de comandos alternativo para ClickHouse, escrito en Python 3. + +Función: + +- Autocompletado. +- Resaltado de sintaxis para las consultas y la salida de datos. +- Soporte de paginador (pager) para la salida de datos. +- Comandos personalizados similares a los de PostgreSQL. + +### clickhouse-flamegraph {#clickhouse-flamegraph} + +[clickhouse-flamegraph](https://github.com/Slach/clickhouse-flamegraph) es una herramienta especializada para visualizar `system.trace_log` como un [gráfico de llamas (flame graph)](http://www.brendangregg.com/flamegraphs.html). + +## Comercial {#commercial} + +### DataGrip {#datagrip} + +[DataGrip](https://www.jetbrains.com/datagrip/) es un IDE de base de datos de JetBrains con soporte dedicado para ClickHouse. También está integrado en otras herramientas basadas en IntelliJ: PyCharm, IntelliJ IDEA, GoLand, PhpStorm y otras. + +Función: + +- Autocompletado de código muy rápido. +- Resaltado de sintaxis de ClickHouse. +- Soporte para características específicas de ClickHouse, por ejemplo, columnas anidadas y motores de tablas. +- Editor de datos. +- Refactorizaciones. +- Búsqueda y navegación. + +### Yandex DataLens {#yandex-datalens} + +[Yandex DataLens](https://cloud.yandex.ru/services/datalens) es un servicio de visualización y análisis de datos. + +Función: + +- Amplia gama de visualizaciones disponibles, desde simples gráficos de barras hasta paneles complejos. +- Los paneles pueden ponerse a disposición del público.
+- Soporte para múltiples fuentes de datos, incluyendo ClickHouse. +- Almacenamiento de datos materializados basados en ClickHouse. + +DataLens está [disponible de forma gratuita](https://cloud.yandex.com/docs/datalens/pricing) para proyectos de baja carga, incluso para uso comercial. + +- [Documentación de DataLens](https://cloud.yandex.com/docs/datalens/). +- [Tutorial](https://cloud.yandex.com/docs/solutions/datalens/data-from-ch-visualization) sobre la visualización de datos de una base de datos ClickHouse. + +### Holistics Software {#holistics-software} + +[Holistics](https://www.holistics.io/) es una plataforma de datos de pila completa y una herramienta de inteligencia de negocios. + +Función: + +- Programación automatizada de informes por correo electrónico, Slack y Google Sheets. +- Editor SQL con visualizaciones, control de versiones, autocompletado, componentes de consulta reutilizables y filtros dinámicos. +- Análisis integrado de informes y cuadros de mando a través de iframe. +- Preparación de datos y capacidades ETL. +- Soporte de modelado de datos SQL para el mapeo relacional de datos. + +### Looker {#looker} + +[Looker](https://looker.com) es una plataforma de datos y una herramienta de inteligencia de negocios con soporte para más de 50 dialectos de bases de datos, incluido ClickHouse. Looker está disponible como plataforma SaaS y autoalojada. Los usuarios pueden utilizar Looker a través del navegador para explorar datos, crear visualizaciones y paneles, programar informes y compartir sus conocimientos con colegas. Looker proporciona un amplio conjunto de herramientas para incrustar estas características en otras aplicaciones, y una API +para integrar datos con otras aplicaciones. + +Función: + +- Desarrollo fácil y ágil utilizando LookML, un lenguaje que admite el + [Modelado de datos](https://looker.com/platform/data-modeling) curado para dar soporte a los redactores de informes y a los usuarios finales. +- Potente integración de flujos de trabajo a través de las [Acciones de datos](https://looker.com/platform/actions) de Looker. + +[Cómo configurar ClickHouse en Looker.](https://docs.looker.com/setup-and-management/database-config/clickhouse) + +[Artículo Original](https://clickhouse.tech/docs/es/interfaces/third-party/gui/) diff --git a/docs/es/interfaces/third-party/integrations.md b/docs/es/interfaces/third-party/integrations.md new file mode 100644 index 00000000000..b73b4a2ba7a --- /dev/null +++ b/docs/es/interfaces/third-party/integrations.md @@ -0,0 +1,88 @@ +# Bibliotecas de integración de desarrolladores externos {#integration-libraries-from-third-party-developers} + +!!! warning "Descargo" + Yandex **no** mantiene las herramientas y bibliotecas que se enumeran a continuación ni ha realizado pruebas exhaustivas para garantizar su calidad.
+ +## Productos de infraestructura {#infrastructure-products} + +- Sistemas de gestión de bases de datos relacionales + - [MySQL](https://www.mysql.com) + - [Nombre de la red inalámbrica (SSID):](https://github.com/sysown/proxysql/wiki/ClickHouse-Support) + - [Casa de clic-mysql-lector de datos](https://github.com/Altinity/clickhouse-mysql-data-reader) + - [Horgh-replicador](https://github.com/larsnovikov/horgh-replicator) + - [PostgreSQL](https://www.postgresql.org) + - [Haga clickhousedb\_fdw](https://github.com/Percona-Lab/clickhousedb_fdw) + - [InformaciónSistema abierto.](https://github.com/Infinidat/infi.clickhouse_fdw) (utilizar [InformaciónSistema abierto.](https://github.com/Infinidat/infi.clickhouse_orm)) + - [Descripción](https://github.com/mkabilov/pg2ch) + - [Sistema abierto.](https://github.com/adjust/clickhouse_fdw) + - [MSSQL](https://en.wikipedia.org/wiki/Microsoft_SQL_Server) + - [Método de codificación de datos:](https://github.com/zlzforever/ClickHouseMigrator) +- Colas de mensajes + - [Kafka](https://kafka.apache.org) + - [Método de codificación de datos:](https://github.com/housepower/clickhouse_sinker) (utilizar [Ir cliente](https://github.com/kshvakov/clickhouse/)) +- Almacenamiento de objetos + - [S3](https://en.wikipedia.org/wiki/Amazon_S3) + - [Haga clic en el botón de copia de seguridad](https://github.com/AlexAkulov/clickhouse-backup) +- Orquestación de contenedores + - [Kubernetes](https://kubernetes.io) + - [Operador de clickhouse](https://github.com/Altinity/clickhouse-operator) +- Gestión de configuración + - [marioneta](https://puppet.com) + - [Bienvenidos al Portal de Licitación Electrónica de Licitación Electrónica](https://forge.puppet.com/innogames/clickhouse) + - [Sistema abierto.](https://forge.puppet.com/mfedotov/clickhouse) +- Monitoreo + - [Grafito](https://graphiteapp.org) + - [graphouse](https://github.com/yandex/graphouse) + - [de carbono-clickhouse](https://github.com/lomik/carbon-clickhouse) + + - [Sistema abierto.](https://github.com/lomik/graphite-clickhouse) + - [Grafito-ch-optimizador](https://github.com/innogames/graphite-ch-optimizer) - optimiza las particiones [\*GraphiteMergeTree](../../operations/table_engines/graphitemergetree.md#graphitemergetree) reglas de [Configuración de rollup](../../operations/table_engines/graphitemergetree.md#rollup-configuration) podría ser aplicado + - [Grafana](https://grafana.com/) + - [Bienvenidos al Portal de Licitación Electrónica de Licitación Electrónica](https://github.com/Vertamedia/clickhouse-grafana) + - [Prometeo](https://prometheus.io/) + - [Sistema abierto.](https://github.com/f1yegor/clickhouse_exporter) + - [Bienvenido](https://github.com/Percona-Lab/PromHouse) + - [Sistema abierto.](https://github.com/hot-wifi/clickhouse_exporter) (utilizar [Ir cliente](https://github.com/kshvakov/clickhouse/)) + - [Nagios](https://www.nagios.org/) + - [Bienvenidos al Portal de Licitación Electrónica de Licitación Electrónica](https://github.com/exogroup/check_clickhouse/) + - [Zabbix](https://www.zabbix.com) + - [Sistema abierto.](https://github.com/Altinity/clickhouse-zabbix-template) + - [Sematext](https://sematext.com/) + - [integración clickhouse](https://github.com/sematext/sematext-agent-integrations/tree/master/clickhouse) +- Tala + - [rsyslog](https://www.rsyslog.com/) + - [Bienvenidos al Portal de Licitación Electrónica de Licitación Electrónica](https://www.rsyslog.com/doc/master/configuration/modules/omclickhouse.html) + - [fluentd](https://www.fluentd.org) + - [Casa de 
campo](https://github.com/flant/loghouse) (para [Kubernetes](https://kubernetes.io)) + - [Información](https://www.sematext.com/logagent) + - [Sistema de tabiquería interior y exterior](https://sematext.com/docs/logagent/output-plugin-clickhouse/) +- Geo + - [MaxMind](https://dev.maxmind.com/geoip/) + - [Para que usted pueda encontrar](https://github.com/AlexeyKupershtokh/clickhouse-maxmind-geoip) + +## Programación de ecosistemas de lenguaje {#programming-language-ecosystems} + +- Película + - [SQLAlchemy](https://www.sqlalchemy.org) + - [sqlalchemy-clickhouse](https://github.com/cloudflare/sqlalchemy-clickhouse) (utilizar [InformaciónSistema abierto.](https://github.com/Infinidat/infi.clickhouse_orm)) + - [pandas](https://pandas.pydata.org) + - [Nuestros Servicios](https://github.com/kszucs/pandahouse) +- R + - [Dplyr](https://db.rstudio.com/dplyr/) + - [Bienvenidos al Portal de Licitación Electrónica de Licitación Electrónica](https://github.com/IMSMWU/RClickhouse) (utilizar [Bienvenidos](https://github.com/artpaul/clickhouse-cpp)) +- Java + - [Hadoop](http://hadoop.apache.org) + - [Sistema abierto.](https://github.com/jaykelin/clickhouse-hdfs-loader) (utilizar [JDBC](../../query_language/table_functions/jdbc.md)) +- Ciudad + - [Akka](https://akka.io) + - [Sistema abierto.](https://github.com/crobox/clickhouse-scala-client) +- C\# + - [Nivel de Cifrado WEP](https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/ado-net-overview) + - [Sistema abierto.Ado](https://github.com/killwort/ClickHouse-Net) + - [Sistema abierto.Cliente](https://github.com/DarkWanderer/ClickHouse.Client) + - [Sistema abierto.](https://github.com/ilyabreev/ClickHouse.Net) + - [Bienvenidos al Portal de Licitación Electrónica de Licitación Electrónica](https://github.com/ilyabreev/ClickHouse.Net.Migrations) +- Elixir + - [Ecto](https://github.com/elixir-ecto/ecto) + - [Método de codificación de datos:](https://github.com/appodeal/clickhouse_ecto) + +[Artículo Original](https://clickhouse.tech/docs/es/interfaces/third-party/integrations/) diff --git a/docs/es/interfaces/third-party/proxy.md b/docs/es/interfaces/third-party/proxy.md new file mode 100644 index 00000000000..49f165a4a09 --- /dev/null +++ b/docs/es/interfaces/third-party/proxy.md @@ -0,0 +1,39 @@ +# Servidores proxy de desarrolladores de terceros {#proxy-servers-from-third-party-developers} + +## chproxy {#chproxy} + +[chproxy](https://github.com/Vertamedia/chproxy), es un proxy HTTP y equilibrador de carga para la base de datos ClickHouse. + +Función: + +- Enrutamiento por usuario y almacenamiento en caché de respuestas. +- Flexible límites. +- Renovación automática del certificado SSL. + +Implementado en Go. + +## Bienvenido a WordPress. {#kittenhouse} + +[Bienvenido a WordPress.](https://github.com/VKCOM/kittenhouse) está diseñado para ser un proxy local entre ClickHouse y el servidor de aplicaciones en caso de que sea imposible o inconveniente almacenar los datos INSERT en el lado de su aplicación. + +Función: + +- Almacenamiento en búfer de datos en memoria y en disco. +- Enrutamiento por tabla. +- Equilibrio de carga y comprobación de estado. + +Implementado en Go. + +## Bienvenidos al Portal de Licitación Electrónica de Licitación Electrónica {#clickhouse-bulk} + +[Bienvenidos al Portal de Licitación Electrónica de Licitación Electrónica](https://github.com/nikepan/clickhouse-bulk) es un simple colector de insertos ClickHouse. + +Función: + +- Agrupe las solicitudes y envíe por umbral o intervalo. +- Múltiples servidores remotos. 
+- Autenticación básica. + +Implementado en Go. + +[Artículo Original](https://clickhouse.tech/docs/es/interfaces/third-party/proxy/) diff --git a/docs/es/introduction/adopters.md b/docs/es/introduction/adopters.md new file mode 100644 index 00000000000..eab07cbe893 --- /dev/null +++ b/docs/es/introduction/adopters.md @@ -0,0 +1,75 @@ +# Adoptadores de ClickHouse {#clickhouse-adopters} + +!!! warning "Descargo" + La siguiente lista de empresas que utilizan ClickHouse y sus historias de éxito se recopila a partir de fuentes públicas, por lo que podría diferir de la realidad actual. Realmente agradeceríamos que compartiera la historia de adoptar ClickHouse en su empresa y [agregarlo a la lista](https://github.com/ClickHouse/ClickHouse/edit/master/docs/es/introduction/adopters.md), pero por favor asegúrese de que usted no tendrá ningunos problemas de NDA haciendo así. Proporcionar actualizaciones con publicaciones de otras compañías también es útil. + +| Empresa | Industria | Usecase | Tamaño de clúster | (Un)Tamaño de datos comprimidos\* | Referencia | +|--------------------------------------------------------------------------------------------|------------------------------------|-----------------------------|------------------------------------------------------------------|-------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [2gis](https://2gis.ru) | Asignar | Monitoreo | — | — | [Charla en ruso, julio 2019](https://youtu.be/58sPkXfq6nw) | +| [Navegador Aloha](https://alohabrowser.com/) | Aplicación móvil | Backend del navegador | — | — | [Diapositivas en ruso, mayo 2019](https://github.com/yandex/clickhouse-presentations/blob/master/meetup22/aloha.pdf) | +| [Amadeus](https://amadeus.com/) | Viaje | Analítica | — | — | [Comunicado de prensa, abril de 2018](https://www.altinity.com/blog/2018/4/5/amadeus-technologies-launches-investment-and-insights-tool-based-on-machine-learning-and-strategy-algorithms) | +| [Appsflyer](https://www.appsflyer.com) | Análisis móvil | Producto principal | — | — | [Charla en ruso, julio 2019](https://www.youtube.com/watch?v=M3wbRlcpBbY) | +| [ArenaData](https://arenadata.tech/) | Plataforma de datos | Producto principal | — | — | [Diapositivas en ruso, diciembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup38/indexes.pdf) | +| [Badoo](https://badoo.com) | Citas | Serie de tiempo | — | — | [Diapositivas en ruso, diciembre 2019](https://presentations.clickhouse.tech/meetup38/forecast.pdf) | +| [Benocs](https://www.benocs.com/) | Telemetría y análisis de red | Producto principal | — | — | [Diapositivas en español, octubre de 2017](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup9/lpm.pdf) | +| [Bloomberg](https://www.bloomberg.com/) | Finanzas, Medios | Monitoreo | 102 servidores | — | [Diapositivas, Mayo 2018](https://www.slideshare.net/Altinity/http-analytics-for-6m-requests-per-second-using-clickhouse-by-alexander-bocharov) | +| [Bloxy](https://bloxy.info) | Blockchain | Analítica | — | — | [Diapositivas en ruso, agosto 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/4_bloxy.pptx) | +| `Dataliance/UltraPower` | Telecomunicaciones | Analítica | — | — | [Diapositivas en chino, enero 
2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup12/telecom.pdf) | +| [CARTO](https://carto.com/) | Inteligencia de negocios | Análisis geográfico | — | — | [Procesamiento geoespacial con Clickhouse](https://carto.com/blog/geospatial-processing-with-clickhouse/) | +| [CERN](http://public.web.cern.ch/public/) | Investigación | Experimento | — | — | [Comunicado de prensa, abril de 2012](https://www.yandex.com/company/press_center/press_releases/2012/2012-04-10/) | +| [Cisco](http://cisco.com/) | Rojo | Análisis de tráfico | — | — | [Charla relámpago, octubre 2019](https://youtu.be/-hI1vDR2oPY?t=5057) | +| [Valores de la ciudadela](https://www.citadelsecurities.com/) | Financiación | — | — | — | [Contribución, marzo 2019](https://github.com/ClickHouse/ClickHouse/pull/4774) | +| [Más información](https://city-mobil.ru) | Taxi | Analítica | — | — | [Entrada de Blog en ruso, marzo de 2020](https://habr.com/en/company/citymobil/blog/490660/) | +| [ContentSquare](https://contentsquare.com) | Análisis web | Producto principal | — | — | [Publicación de blog en francés, noviembre 2018](http://souslecapot.net/2018/11/21/patrick-chatain-vp-engineering-chez-contentsquare-penser-davantage-amelioration-continue-que-revolution-constante/) | +| [Cloudflare](https://cloudflare.com) | CDN | Análisis de tráfico | 36 servidores | — | [Mensaje del blog, Mayo 2017](https://blog.cloudflare.com/how-cloudflare-analyzes-1m-dns-queries-per-second/), [Mensaje del blog, marzo 2018](https://blog.cloudflare.com/http-analytics-for-6m-requests-per-second-using-clickhouse/) | +| [Corunet](https://coru.net/) | Analítica | Producto principal | — | — | [Diapositivas en español, Abril 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup21/predictive_models.pdf) | +| [CraiditX 氪信](https://creditx.com) | Finanzas AI | Análisis | — | — | [Diapositivas en español, noviembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/udf.pptx) | +| [Criteo/Storetail](https://www.criteo.com/) | Menor | Producto principal | — | — | [Diapositivas en español, octubre 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup18/3_storetail.pptx) | +| [Banco de Deutsche](https://db.com) | Financiación | BI Analytics | — | — | [Diapositivas en español, octubre 2019](https://bigdatadays.ru/wp-content/uploads/2019/10/D2-H3-3_Yakunin-Goihburg.pdf) | +| [Diva-e](https://www.diva-e.com) | Consultoría digital | Producto principal | — | — | [Diapositivas en español, septiembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup29/ClickHouse-MeetUp-Unusual-Applications-sd-2019-09-17.pdf) | +| [Exness](https://www.exness.com) | Comercio | Métricas, Registro | — | — | [Charla en ruso, mayo 2019](https://youtu.be/_rpU-TvSfZ8?t=3215) | +| [Sistema abierto.](https://geniee.co.jp) | Anuncio rojo | Producto principal | — | — | [Publicación de blog en japonés, julio 2017](https://tech.geniee.co.jp/entry/2017/07/20/160100) | +| [HUYA](https://www.huya.com/) | Video Streaming | Analítica | — | — | [Diapositivas en chino, octubre 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/7.%20ClickHouse万亿数据分析实践%20李本旺(sundy-li)%20虎牙.pdf) | +| [Idealista](https://www.idealista.com) | Inmobiliario | Analítica | — | — | [Blog Post en Inglés, Abril 2019](https://clickhouse.yandex/blog/en/clickhouse-meetup-in-madrid-on-april-2-2019) | +| [Infovista](https://www.infovista.com/) | Rojo | Analítica | — | — | 
[Diapositivas en español, octubre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup30/infovista.pdf) | +| [InnoGames](https://www.innogames.com) | Juego | Métricas, Registro | — | — | [Diapositivas en ruso, septiembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup28/graphite_and_clickHouse.pdf) | +| [Integros](https://integros.com) | Plataforma para servicios de video | Analítica | — | — | [Diapositivas en ruso, mayo 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup22/strategies.pdf) | +| [Datos de Kodiak](https://www.kodiakdata.com/) | Nube | Producto principal | — | — | [Diapositivas en Engish, Abril 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup13/kodiak_data.pdf) | +| [Kontur](https://kontur.ru) | Desarrollo de software | Métricas | — | — | [Charla en ruso, noviembre 2018](https://www.youtube.com/watch?v=U4u4Bd0FtrY) | +| [Sistema abierto.](https://lifestreet.com/) | Anuncio rojo | Producto principal | 75 servidores (3 réplicas) | 5.27 PiB | [Publicación de blog en ruso, febrero 2017](https://habr.com/en/post/322620/) | +| [Soluciones en la nube de Mail.ru](https://mcs.mail.ru/) | Servicios en la nube | Producto principal | — | — | [Ejecución de ClickHouse Instance, en ruso](https://mcs.mail.ru/help/db-create/clickhouse#) | +| [Mensaje de pájaro](https://www.messagebird.com) | Telecomunicaciones | Estadísticas | — | — | [Diapositivas en español, noviembre 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup20/messagebird.pdf) | +| [MGID](https://www.mgid.com/) | Anuncio rojo | Analítica Web | — | — | [Nuestra experiencia en la implementación analítica DBMS ClickHouse, en ruso](http://gs-studio.com/news-about-it/32777----clickhouse---c) | +| [UnoAPM](https://www.oneapm.com/) | Supervisión y análisis de datos | Producto principal | — | — | [Diapositivas en chino, octubre 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/8.%20clickhouse在OneAPM的应用%20杜龙.pdf) | +| [Pragma Innovación](http://www.pragma-innovation.fr/) | Telemetría y Análisis de Big Data | Producto principal | — | — | [Diapositivas en español, octubre 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup18/4_pragma_innovation.pdf) | +| [QINGCLOUD](https://www.qingcloud.com/) | Servicios en la nube | Producto principal | — | — | [Diapositivas en chino, octubre 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/4.%20Cloud%20%2B%20TSDB%20for%20ClickHouse%20张健%20QingCloud.pdf) | +| [Qrator](https://qrator.net) | Protección DDoS | Producto principal | — | — | [Mensaje del blog, marzo 2019](https://blog.qrator.net/en/clickhouse-ddos-mitigation_37/) | +| [Tecnología de la información del PORCIMIENTO de Pekín Co., Ltd.](https://www.percent.cn/) | Analítica | Producto principal | — | — | [Diapositivas en chino, junio 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup24/4.%20ClickHouse万亿数据双中心的设计与实践%20.pdf) | +| [Rambler](https://rambler.ru) | Servicios de Internet | Analítica | — | — | [Charla en ruso, abril 2018](https://medium.com/@ramblertop/разработка-api-clickhouse-для-рамблер-топ-100-f4c7e56f3141) | +| [Tencent](https://www.tencent.com) | Mensajería | Tala | — | — | [Charla en chino, noviembre 2019](https://youtu.be/T-iVQRuw-QY?t=5050) | +| [Estrellas de tráfico](https://trafficstars.com/) | Anuncio rojo | — | — | — | [Diapositivas en ruso, mayo 
2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup15/lightning/ninja.pdf) | +| [S7 Aerolíneas](https://www.s7.ru) | Aérea | Métricas, Registro | — | — | [Charla en ruso, marzo 2019](https://www.youtube.com/watch?v=nwG68klRpPg&t=15s) | +| [SEMrush](https://www.semrush.com/) | Marketing | Producto principal | — | — | [Diapositivas en ruso, agosto 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/5_semrush.pdf) | +| [Inicio](https://www.scireum.de/) | Comercio electrónico | Producto principal | — | — | [Charla en alemán, febrero de 2020](https://www.youtube.com/watch?v=7QWAn5RbyR4) | +| [Centinela](https://sentry.io/) | Desarrollador de software | Backend para el producto | — | — | [Publicación de blog en inglés, mayo 2019](https://blog.sentry.io/2019/05/16/introducing-snuba-sentrys-new-search-infrastructure) | +| [SGK](http://www.sgk.gov.tr/wps/portal/sgk/tr) | Gobierno Seguridad Social | Analítica | — | — | [Diapositivas en español, noviembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup35/ClickHouse%20Meetup-Ramazan%20POLAT.pdf) | +| [el seo.¿](https://seo.do/) | Analítica | Producto principal | — | — | [Diapositivas en español, noviembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup35/CH%20Presentation-%20Metehan%20Çetinkaya.pdf) | +| [Sina](http://english.sina.com/index.html) | Noticia | — | — | — | [Diapositivas en chino, octubre 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/6.%20ClickHouse最佳实践%20高鹏_新浪.pdf) | +| [SMI2](https://smi2.ru/) | Noticia | Analítica | — | — | [Blog Post en ruso, noviembre 2017](https://habr.com/ru/company/smi2/blog/314558/) | +| [Salto](https://www.splunk.com/) | Análisis de negocios | Producto principal | — | — | [Diapositivas en español, enero 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup12/splunk.pdf) | +| [Spotify](https://www.spotify.com) | Sica | Experimentación | — | — | [Diapositivas, julio 2018](https://www.slideshare.net/glebus/using-clickhouse-for-experimentation-104247173) | +| [Tencent](https://www.tencent.com) | Grandes Datos | Procesamiento de datos | — | — | [Diapositivas en chino, octubre 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/5.%20ClickHouse大数据集群应用_李俊飞腾讯网媒事业部.pdf) | +| [Más información](https://www.uber.com) | Taxi | Tala | — | — | [Diapositivas, febrero de 2020](https://presentations.clickhouse.tech/meetup40/ml.pdf) | +| [VKontakte](https://vk.com) | Red social | Estadísticas, Registro | — | — | [Diapositivas en ruso, agosto 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/3_vk.pdf) | +| [Método de codificación de datos:](https://wisebits.com/) | Soluciones de TI | Analítica | — | — | [Diapositivas en ruso, mayo 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup22/strategies.pdf) | +| [Tecnología de Xiaoxin.](https://www.xiaoheiban.cn/) | Educación | Propósito común | — | — | [Diapositivas en español, noviembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/sync-clickhouse-with-mysql-mongodb.pptx) | +| [Ximalaya](https://www.ximalaya.com/) | Compartir audio | OLAP | — | — | [Diapositivas en español, noviembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/ximalaya.pdf) | +| [Nube de Yandex](https://cloud.yandex.ru/services/managed-clickhouse) | Nube pública | Producto 
principal | — | — | [Charla en ruso, diciembre 2019](https://www.youtube.com/watch?v=pgnak9e_E0o) | +| [Yandex DataLens](https://cloud.yandex.ru/services/datalens) | Inteligencia de negocios | Producto principal | — | — | [Diapositivas en ruso, diciembre 2019](https://presentations.clickhouse.tech/meetup38/datalens.pdf) | +| [Mercado de Yandex](https://market.yandex.ru/) | Comercio electrónico | Métricas, Registro | — | — | [Charla en ruso, enero 2019](https://youtu.be/_l1qP0DyBcA?t=478) | +| [Yandex Metrica](https://metrica.yandex.com) | Análisis web | Producto principal | 360 servidores en un clúster, 1862 servidores en un departamento | 66.41 PiB / 5.68 PiB | [Diapositivas, febrero de 2020](https://presentations.clickhouse.tech/meetup40/introduction/#13) | +| [Método de codificación de datos:](https://htc-cs.ru/) | Desarrollo de software | Métricas, Registro | — | — | [Blog Post, marzo 2019, en ruso](https://vc.ru/dev/62715-kak-my-stroili-monitoring-na-prometheus-clickhouse-i-elk) | +| [МКБ](https://mkb.ru/) | Banco | Supervisión del sistema web | — | — | [Diapositivas en ruso, septiembre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup28/mkb.pdf) | +| [Método de codificación de datos:](https://jinshuju.net) | BI Analytics | Producto principal | — | — | [Diapositivas en chino, octubre 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup24/3.%20金数据数据架构调整方案Public.pdf) | + +[Artículo Original](https://clickhouse.tech/docs/es/introduction/adopters/) diff --git a/docs/es/introduction/distinctive_features.md b/docs/es/introduction/distinctive_features.md new file mode 100644 index 00000000000..0e53b8935cd --- /dev/null +++ b/docs/es/introduction/distinctive_features.md @@ -0,0 +1,64 @@ +# Características distintivas de ClickHouse {#distinctive-features-of-clickhouse} + +## DBMS orientado a columnas verdaderas {#true-column-oriented-dbms} + +En un verdadero DBMS orientado a columnas, no se almacenan datos adicionales con los valores. Entre otras cosas, esto significa que los valores de longitud constante deben ser compatibles, para evitar almacenar su longitud “number” al lado de los valores. Como ejemplo, mil millones de valores de tipo UInt8 deberían consumir alrededor de 1 GB sin comprimir, o esto afectará fuertemente el uso de la CPU. Es muy importante almacenar los datos de forma compacta (sin “garbage”) incluso sin comprimir, ya que la velocidad de descompresión (uso de CPU) depende principalmente del volumen de datos sin comprimir. + +Esto vale la pena señalar porque hay sistemas que pueden almacenar valores de diferentes columnas por separado, pero que no pueden procesar eficazmente las consultas analíticas debido a su optimización para otros escenarios. Los ejemplos son HBase, BigTable, Cassandra e HyperTable. En estos sistemas, obtendrá un rendimiento de alrededor de cien mil filas por segundo, pero no cientos de millones de filas por segundo. + +También vale la pena señalar que ClickHouse es un sistema de gestión de bases de datos, no una sola base de datos. ClickHouse permite crear tablas y bases de datos en tiempo de ejecución, cargar datos y ejecutar consultas sin volver a configurar y reiniciar el servidor. + +## Compresión de datos {#data-compression} + +Algunos DBMS orientados a columnas (InfiniDB CE y MonetDB) no utilizan la compresión de datos. Sin embargo, la compresión de datos juega un papel clave para lograr un rendimiento excelente. 
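+ +Por ejemplo, un esquema orientativo para comparar el tamaño comprimido y sin comprimir de las columnas de una tabla mediante `system.columns` (el nombre de la tabla `hits` y la base de datos `default` son solo ilustrativos): + +``` bash +# La tabla 'hits' es un nombre de ejemplo; sustitúyalo por una tabla existente +$ clickhouse-client --query "SELECT table, formatReadableSize(sum(data_uncompressed_bytes)) AS sin_comprimir, formatReadableSize(sum(data_compressed_bytes)) AS comprimido FROM system.columns WHERE database = 'default' AND table = 'hits' GROUP BY table" +```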
+ +## Almacenamiento en disco de datos {#disk-storage-of-data} + +Mantener los datos físicamente ordenados por clave principal permite extraer datos para sus valores específicos o rangos de valores con baja latencia, menos de pocas docenas de milisegundos. Algunos DBMS orientados a columnas (como SAP HANA y Google PowerDrill) solo pueden funcionar en RAM. Este enfoque fomenta la asignación de un presupuesto de hardware más grande que el realmente necesario para el análisis en tiempo real. ClickHouse está diseñado para funcionar en discos duros normales, lo que significa que el costo por GB de almacenamiento de datos es bajo, pero SSD y RAM adicional también se utilizan completamente si están disponibles. + +## Procesamiento paralelo en varios núcleos {#parallel-processing-on-multiple-cores} + +Las consultas de gran tamaño se paralelizan de forma natural, tomando todos los recursos necesarios que estén disponibles en el servidor actual. + +## Procesamiento distribuido en varios servidores {#distributed-processing-on-multiple-servers} + +Casi ninguno de los DBMS columnar mencionados anteriormente tiene soporte para el procesamiento de consultas distribuidas. +En ClickHouse, los datos pueden residir en diferentes fragmentos. Cada fragmento puede ser un grupo de réplicas que se utilizan para la tolerancia a errores. La consulta se procesa en todos los fragmentos en paralelo. Esto es transparente para el usuario. + +## Soporte SQL {#sql-support} + +ClickHouse admite un lenguaje de consulta declarativo basado en SQL que es idéntico al estándar SQL en muchos casos. +Las consultas admitidas incluyen GROUP BY, ORDER BY, subconsultas en cláusulas FROM, IN y JOIN y subconsultas escalares. +No se admiten subconsultas y funciones de ventana dependientes. + +## Motor del vector {#vector-engine} + +Los datos no solo se almacenan mediante columnas, sino que se procesan mediante vectores (partes de columnas). Esto nos permite lograr una alta eficiencia de CPU. + +## Actualizaciones de datos en tiempo real {#real-time-data-updates} + +ClickHouse admite tablas con una clave principal. Para realizar consultas rápidamente en el rango de la clave principal, los datos se ordenan incrementalmente utilizando el árbol de combinación. Debido a esto, los datos se pueden agregar continuamente a la tabla. No se toman bloqueos cuando se ingieren nuevos datos. + +## Indice {#index} + +Tener un dato ordenado físicamente por clave principal hace posible extraer datos para sus valores específicos o rangos de valores con baja latencia, menos de pocas docenas de milisegundos. + +## Adecuado para consultas en línea {#suitable-for-online-queries} + +La baja latencia significa que las consultas se pueden procesar sin demora y sin intentar preparar la respuesta con anticipación, justo en el mismo momento mientras se carga la página de la interfaz de usuario. En otras palabras, en línea. + +## Soporte para cálculos aproximados {#support-for-approximated-calculations} + +ClickHouse proporciona varias formas de intercambiar precisión por rendimiento: + +1. Funciones agregadas para el cálculo aproximado del número de valores distintos, medianas y cuantiles. +2. Ejecutar una consulta basada en una parte (muestra) de datos y obtener un resultado aproximado. En este caso, se recuperan proporcionalmente menos datos del disco. +3. Ejecutar una agregación para un número limitado de claves aleatorias, en lugar de para todas las claves. 
Bajo ciertas condiciones para la distribución de claves en los datos, esto proporciona un resultado razonablemente preciso mientras se utilizan menos recursos. + +## Replicación de datos y soporte de integridad de datos {#data-replication-and-data-integrity-support} + +Utiliza la replicación multimaster asincrónica. Después de escribir en cualquier réplica disponible, los datos se distribuyen a todas las réplicas restantes en segundo plano. El sistema mantiene datos idénticos en diferentes réplicas. La recuperación después de la mayoría de las fallas se realiza automáticamente, y en casos complejos, semiautomáticamente. + +Para obtener más información, consulte la sección [Replicación de datos](../operations/table_engines/replication.md). + +[Artículo Original](https://clickhouse.tech/docs/es/introduction/distinctive_features/) diff --git a/docs/es/introduction/features_considered_disadvantages.md b/docs/es/introduction/features_considered_disadvantages.md new file mode 100644 index 00000000000..0a9f56083c4 --- /dev/null +++ b/docs/es/introduction/features_considered_disadvantages.md @@ -0,0 +1,7 @@ +# Características de ClickHouse que pueden considerarse desventajas {#clickhouse-features-that-can-be-considered-disadvantages} + +1. No hay transacciones completas. +2. Falta de capacidad para modificar o eliminar datos ya insertados con alta tasa y baja latencia. Hay eliminaciones y actualizaciones por lotes disponibles para limpiar o modificar datos, por ejemplo, para cumplir con [GDPR](https://gdpr-info.eu). +3. El índice disperso hace que ClickHouse no sea tan adecuado para consultas de puntos que recuperan filas individuales por sus claves. + +[Artículo Original](https://clickhouse.tech/docs/es/introduction/features_considered_disadvantages/) diff --git a/docs/es/introduction/history.md b/docs/es/introduction/history.md new file mode 100644 index 00000000000..469b084454b --- /dev/null +++ b/docs/es/introduction/history.md @@ -0,0 +1,49 @@ +# Historial de ClickHouse {#clickhouse-history} + +ClickHouse fue desarrollado originalmente para alimentar [El Yandex.Métrica](https://metrica.yandex.com/), [la segunda plataforma de análisis web más grande del mundo](http://w3techs.com/technologies/overview/traffic_analysis/all), y sigue siendo el componente central de este sistema. Con más de 13 billones de registros en la base de datos y más de 20 mil millones de eventos diarios, ClickHouse permite generar informes personalizados sobre la marcha directamente a partir de datos no agregados. Este artículo cubre brevemente los objetivos de ClickHouse en las primeras etapas de su desarrollo. + +El Yandex.Metrica construye informes personalizados sobre la marcha basados en hits y sesiones, con segmentos arbitrarios definidos por el usuario. Esto a menudo requiere la creación de agregados complejos, como el número de usuarios únicos. Los nuevos datos para crear un informe se reciben en tiempo real. + +A partir de abril de 2014, Yandex.Metrica estaba rastreando alrededor de 12 mil millones de eventos (vistas de páginas y clics) diariamente. Todos estos eventos deben almacenarse para crear informes personalizados. Una sola consulta puede requerir escanear millones de filas en unos pocos cientos de milisegundos, o cientos de millones de filas en solo unos segundos. + +## Uso en el Yandex.Metrica y otros servicios de Yandex {#usage-in-yandex-metrica-and-other-yandex-services} + +ClickHouse se utiliza para múltiples propósitos en Yandex.Métrica. 
+Su tarea principal es crear informes en modo en línea utilizando datos no agregados. Utiliza un clúster de 374 servidores, que almacenan más de 20,3 billones de filas en la base de datos. El volumen de datos comprimidos, sin contar la duplicación y la replicación, es de aproximadamente 2 PB. El volumen de datos sin comprimir (en formato TSV) sería de aproximadamente 17 PB. + +ClickHouse también se utiliza para: + +- Almacenamiento de datos para Session Replay de Yandex.Métrica. +- Procesamiento de datos intermedios. +- Creación de informes globales con Analytics. +- Ejecutar consultas para depurar el Yandex.Motor Metrica. +- Análisis de registros desde la API y la interfaz de usuario. + +ClickHouse tiene al menos una docena de instalaciones en otros servicios de Yandex: en verticales de búsqueda, Market, Direct, análisis de negocios, desarrollo móvil, AdFox, servicios personales y otros. + +## Datos agregados y no agregados {#aggregated-and-non-aggregated-data} + +Existe una opinión popular de que para calcular efectivamente las estadísticas, debe agregar datos ya que esto reduce el volumen de datos. + +Pero la agregación de datos es una solución muy limitada, por las siguientes razones: + +- Debe tener una lista predefinida de informes que el usuario necesitará. +- El usuario no puede hacer informes personalizados. +- Al agregar una gran cantidad de claves, el volumen de datos no se reduce y la agregación es inútil. +- Para un gran número de informes, hay demasiadas variaciones de agregación (explosión combinatoria). +- Al agregar claves con alta cardinalidad (como las URL), el volumen de datos no se reduce en mucho (menos del doble). +- Por esta razón, el volumen de datos con agregación podría crecer en lugar de reducirse. +- Los usuarios no ven todos los informes que generamos para ellos. Una gran parte de esos cálculos es inútil. +- La integridad lógica de los datos puede ser violada para varias agregaciones. + +Si no agregamos nada y trabajamos con datos no agregados, esto podría reducir el volumen de cálculos. + +Sin embargo, con la agregación, una parte significativa del trabajo se desconecta y se completa con relativa calma. Por el contrario, los cálculos en línea requieren calcular lo más rápido posible, ya que el usuario está esperando el resultado. + +El Yandex.Metrica tiene un sistema especializado para agregar datos llamado Metrage, que se utiliza para la mayoría de los informes. +A partir de 2009, Yandex.Metrica también utilizó una base de datos OLAP especializada para datos no agregados llamada OLAPServer, que anteriormente se usaba para el generador de informes. +OLAPServer funcionó bien para datos no agregados, pero tenía muchas restricciones que no permitían que se utilizara para todos los informes según lo deseado. Estos incluyeron la falta de soporte para tipos de datos (solo números) y la incapacidad de actualizar datos de forma incremental en tiempo real (solo se podía hacer reescribiendo datos diariamente). OLAPServer no es un DBMS, sino una base de datos especializada. + +Para eliminar las limitaciones de OLAPServer y resolver el problema de trabajar con datos no agregados para todos los informes, desarrollamos el DBMS ClickHouse. 
+ +[Artículo Original](https://clickhouse.tech/docs/es/introduction/history/) diff --git a/docs/es/introduction/performance.md b/docs/es/introduction/performance.md new file mode 100644 index 00000000000..a819e285369 --- /dev/null +++ b/docs/es/introduction/performance.md @@ -0,0 +1,25 @@ +# Rendimiento {#performance} + +De acuerdo con los resultados de las pruebas internas en Yandex, ClickHouse muestra el mejor rendimiento (tanto el mayor rendimiento para consultas largas como la menor latencia en consultas cortas) para escenarios operativos comparables entre los sistemas de su clase que estaban disponibles para pruebas. Puede ver los resultados de la prueba en un [página separada](https://clickhouse.tech/benchmark.html). + +Esto también ha sido confirmado por numerosos puntos de referencia independientes. No son difíciles de encontrar mediante una búsqueda en Internet, o se puede ver [nuestra pequeña colección de enlaces relacionados](https://clickhouse.tech/#independent-benchmarks). + +## Rendimiento para una única consulta grande {#throughput-for-a-single-large-query} + +El rendimiento se puede medir en filas por segundo o en megabytes por segundo. Si los datos se colocan en la caché de la página, una consulta que no es demasiado compleja se procesa en hardware moderno a una velocidad de aproximadamente 2-10 GB / s de datos sin comprimir en un solo servidor (para los casos más simples, la velocidad puede alcanzar los 30 GB / s). Si los datos no se colocan en la memoria caché de la página, la velocidad depende del subsistema de disco y la velocidad de compresión de datos. Por ejemplo, si el subsistema de disco permite leer datos a 400 MB/s y la tasa de compresión de datos es de 3, la velocidad será de aproximadamente 1,2 GB/s. Para obtener la velocidad en filas por segundo, divida la velocidad en bytes por segundo por el tamaño total de las columnas utilizadas en la consulta. Por ejemplo, si se extraen 10 bytes de columnas, la velocidad será de alrededor de 100-200 millones de filas por segundo. + +La velocidad de procesamiento aumenta casi linealmente para el procesamiento distribuido, pero solo si el número de filas resultantes de la agregación o la clasificación no es demasiado grande. + +## Latencia al procesar consultas cortas {#latency-when-processing-short-queries} + +Si una consulta usa una clave principal y no selecciona demasiadas filas para procesar (cientos de miles) y no usa demasiadas columnas, podemos esperar menos de 50 milisegundos de latencia (dígitos individuales de milisegundos en el mejor de los casos) si los datos se colocan en la caché de la página. De lo contrario, la latencia se calcula a partir del número de búsquedas. Si utiliza unidades giratorias, para un sistema que no está sobrecargado, la latencia se calcula mediante esta fórmula: tiempo de búsqueda (10 ms) \* número de columnas consultadas \* número de partes de datos. + +## Rendimiento al procesar una gran cantidad de consultas cortas {#throughput-when-processing-a-large-quantity-of-short-queries} + +En las mismas condiciones, ClickHouse puede manejar varios cientos de consultas por segundo en un solo servidor (hasta varios miles en el mejor de los casos). Dado que este escenario no es típico para DBMS analíticos, se recomienda esperar un máximo de 100 consultas por segundo. + +## Rendimiento al insertar datos {#performance-when-inserting-data} + +Recomendamos insertar datos en paquetes de al menos 1000 filas o no más de una sola solicitud por segundo. 
Al insertar en una tabla MergeTree desde un volcado separado por tabuladores, la velocidad de inserción será de 50 a 200 MB / s. Si las filas insertadas tienen un tamaño de aproximadamente 1 Kb, la velocidad será de 50,000 a 200,000 filas por segundo. Si las filas son pequeñas, el rendimiento será mayor en filas por segundo (en los datos del sistema Banner -`>` 500.000 filas por segundo; en datos de grafito -`>` 1.000.000 de filas por segundo). Para mejorar el rendimiento, puede realizar varias consultas INSERT en paralelo, y el rendimiento aumentará linealmente. + +[Artículo Original](https://clickhouse.tech/docs/es/introduction/performance/) diff --git a/docs/es/operations/access_rights.md b/docs/es/operations/access_rights.md new file mode 100644 index 00000000000..1e60285c259 --- /dev/null +++ b/docs/es/operations/access_rights.md @@ -0,0 +1,106 @@ +# Derechos de acceso {#access-rights} + +Los usuarios y los derechos de acceso se configuran en la configuración del usuario. Esto suele ser `users.xml`. + +Los usuarios se registran en el `users` apartado. Aquí hay un fragmento de la `users.xml` file: + +``` xml + + + + + + + + + + + + default + + + default + + + + + + + web + default + + test + + + test + + + +``` + +Puede ver una declaración de dos usuarios: `default`y`web`. Hemos añadido el `web` usuario por separado. + +El `default` usuario se elige en los casos en que no se pasa el nombre de usuario. El `default` usuario también se utiliza para el procesamiento de consultas distribuidas, si la configuración del servidor o clúster no `user` y `password` (véase la sección sobre el [Distribuido](../operations/table_engines/distributed.md) motor). + +El usuario que se utiliza para intercambiar información entre servidores combinados en un clúster no debe tener restricciones o cuotas sustanciales; de lo contrario, las consultas distribuidas fallarán. + +La contraseña se especifica en texto sin cifrar (no recomendado) o en SHA-256. El hash no es salado. En este sentido, no debe considerar estas contraseñas como proporcionar seguridad contra posibles ataques maliciosos. Más bien, son necesarios para la protección de los empleados. + +Se especifica una lista de redes desde las que se permite el acceso. En este ejemplo, la lista de redes para ambos usuarios se carga desde un archivo independiente (`/etc/metrika.xml`) que contiene el `networks` sustitución. Aquí hay un fragmento de eso: + +``` xml + + ... + + ::/64 + 203.0.113.0/24 + 2001:DB8::/32 + ... + + +``` + +Puede definir esta lista de redes directamente en `users.xml` o en un archivo en el `users.d` (para obtener más información, consulte la sección “[Archivos de configuración](configuration_files.md#configuration_files)”). + +La configuración incluye comentarios que explican cómo abrir el acceso desde todas partes. + +Para su uso en producción, sólo especifique `ip` elementos (direcciones IP y sus máscaras), ya que usan `host` y `hoost_regexp` podría causar latencia adicional. + +A continuación se especifica el perfil de configuración de usuario (consulte la sección “[Perfiles de configuración](settings/settings_profiles.md)”. Puede especificar el perfil predeterminado, `default'`. El perfil puede tener cualquier nombre. Puede especificar el mismo perfil para diferentes usuarios. Lo más importante que puede escribir en el perfil de configuración es `readonly=1`, que asegura el acceso de sólo lectura. A continuación, especifique la cuota que se utilizará (consulte la sección “[Cuota](quotas.md#quotas)”). 
Puede especificar la cuota predeterminada: `default`. Se establece en la configuración de forma predeterminada para contar solo el uso de recursos, sin restringirlo. La cuota puede tener cualquier nombre. Puede especificar la misma cuota para diferentes usuarios; en este caso, el uso de recursos se calcula para cada usuario individualmente. + +En el opcional `` sección, también puede especificar una lista de bases de datos a las que el usuario puede acceder. De forma predeterminada, todas las bases de datos están disponibles para el usuario. Puede especificar el `default` base. En este caso, el usuario recibirá acceso a la base de datos de forma predeterminada. + +En el opcional `` sección, también puede especificar una lista de diccionarios a los que el usuario puede acceder. De forma predeterminada, todos los diccionarios están disponibles para el usuario. + +Acceso a la `system` base de datos siempre está permitida (ya que esta base de datos se utiliza para procesar consultas). + +El usuario puede obtener una lista de todas las bases de datos y tablas en ellos mediante el uso de `SHOW` Consultas o tablas del sistema, incluso si no se permite el acceso a bases de datos individuales. + +El acceso a la base de datos no está [sólo lectura](settings/permissions_for_queries.md#settings_readonly) configuración. No puede conceder acceso completo a una base de datos y `readonly` acceso a otro. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/access_rights/) diff --git a/docs/es/operations/backup.md b/docs/es/operations/backup.md new file mode 100644 index 00000000000..94eaa606052 --- /dev/null +++ b/docs/es/operations/backup.md @@ -0,0 +1,34 @@ +# Copia de seguridad de datos {#data-backup} + +Mientras [replicación](table_engines/replication.md) proporciona protección contra fallas de hardware, no protege contra errores humanos: eliminación accidental de datos, eliminación de la tabla incorrecta o una tabla en el clúster incorrecto y errores de software que resultan en un procesamiento incorrecto de datos o daños en los datos. En muchos casos, errores como estos afectarán a todas las réplicas. ClickHouse tiene protecciones integradas para evitar algunos tipos de errores, por ejemplo, de forma predeterminada [no puede simplemente colocar tablas con un motor similar a MergeTree que contenga más de 50 Gb de datos](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/dbms/programs/server/config.xml#L322-L330). Sin embargo, estas garantías no cubren todos los casos posibles y pueden eludirse. + +Para mitigar eficazmente los posibles errores humanos, debe preparar cuidadosamente una estrategia para realizar copias de seguridad y restaurar sus datos **previamente**. + +Cada empresa tiene diferentes recursos disponibles y requisitos comerciales, por lo que no existe una solución universal para las copias de seguridad y restauraciones de ClickHouse que se adapten a cada situación. Lo que funciona para un gigabyte de datos probablemente no funcionará para decenas de petabytes. Hay una variedad de posibles enfoques con sus propios pros y contras, que se discutirán a continuación. Es una buena idea utilizar varios enfoques en lugar de solo uno para compensar sus diversas deficiencias. + +!!! note "Nota" + Tenga en cuenta que si realizó una copia de seguridad de algo y nunca intentó restaurarlo, es probable que la restauración no funcione correctamente cuando realmente la necesite (o al menos tomará más tiempo de lo que las empresas pueden tolerar). 
Por lo tanto, cualquiera que sea el enfoque de copia de seguridad que elija, asegúrese de automatizar también el proceso de restauración y de practicarlo con regularidad en un clúster de ClickHouse de repuesto. + +## Duplicar los datos de origen en otro lugar {#duplicating-source-data-somewhere-else} + +A menudo, los datos que se ingieren en ClickHouse se entregan a través de algún tipo de cola persistente, como [Apache Kafka](https://kafka.apache.org). En este caso, es posible configurar un conjunto adicional de suscriptores que lea el mismo flujo de datos mientras se escribe en ClickHouse y lo almacene en algún almacenamiento en frío. La mayoría de las empresas ya cuentan con algún almacenamiento en frío recomendado por defecto, que puede ser un almacén de objetos o un sistema de archivos distribuido como [HDFS](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html). + +## Instantáneas del sistema de archivos {#filesystem-snapshots} + +Algunos sistemas de archivos locales proporcionan funcionalidad de instantáneas (por ejemplo, [ZFS](https://en.wikipedia.org/wiki/ZFS)), pero podrían no ser la mejor opción para servir consultas en vivo. Una posible solución es crear réplicas adicionales con este tipo de sistema de archivos y excluirlas de las tablas [Distribuido](table_engines/distributed.md) que se utilizan para las consultas `SELECT`. Las instantáneas en tales réplicas quedarán fuera del alcance de cualquier consulta que modifique los datos. Como beneficio adicional, estas réplicas podrían tener configuraciones de hardware especiales con más discos conectados por servidor, lo que resultaría rentable. + +## clickhouse-copier {#clickhouse-copier} + +[clickhouse-copier](utils/clickhouse-copier.md) es una herramienta versátil que se creó inicialmente para volver a fragmentar (re-shard) tablas de tamaño de petabytes. También se puede usar con fines de copia de seguridad y restauración, porque copia datos de forma fiable entre tablas y clústeres de ClickHouse. + +Para volúmenes de datos más pequeños, un simple `INSERT INTO ... SELECT ...` hacia tablas remotas también podría funcionar. + +## Manipulaciones con partes {#manipulations-with-parts} + +ClickHouse permite usar la consulta `ALTER TABLE ... FREEZE PARTITION ...` para crear una copia local de las particiones de una tabla. Esto se implementa mediante enlaces duros a la carpeta `/var/lib/clickhouse/shadow/`, por lo que generalmente no consume espacio adicional en disco para los datos antiguos. Las copias de archivos creadas no son gestionadas por el servidor ClickHouse, por lo que puede dejarlas allí: tendrá una copia de seguridad simple que no requiere ningún sistema externo adicional, pero seguirá siendo propensa a problemas de hardware. Por esta razón, es mejor copiarlas de forma remota a otra ubicación y luego eliminar las copias locales. Los sistemas de archivos distribuidos y los almacenes de objetos siguen siendo una buena opción para esto, aunque los servidores de archivos convencionales con capacidad suficiente también podrían funcionar (en este caso, la transferencia se realizará a través del sistema de archivos de red o quizá con [rsync](https://en.wikipedia.org/wiki/Rsync)). + +Para obtener más información sobre las consultas relacionadas con las manipulaciones de particiones, consulte la [documentación de ALTER](../query_language/alter.md#alter_manipulations-with-partitions).
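+A modo de ilustración, el siguiente esbozo (el nombre de la tabla `visits` y la expresión de partición `201902` son hipotéticos y dependen de su esquema y de su clave de partición) crea una copia local de una sola partición:
+
+``` sql
+-- Crea enlaces duros de la partición 201902 de la tabla visits
+-- bajo /var/lib/clickhouse/shadow/, sin copiar los datos.
+ALTER TABLE visits FREEZE PARTITION 201902;
+```
+
+Después, el contenido de `shadow/` puede copiarse al almacenamiento remoto elegido y eliminarse localmente, como se describe arriba.
+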
+ +Una herramienta de terceros está disponible para automatizar este enfoque: [Haga clic en el botón de copia de seguridad](https://github.com/AlexAkulov/clickhouse-backup). + +[Artículo Original](https://clickhouse.tech/docs/es/operations/backup/) diff --git a/docs/es/operations/configuration_files.md b/docs/es/operations/configuration_files.md new file mode 100644 index 00000000000..57a6a122bd9 --- /dev/null +++ b/docs/es/operations/configuration_files.md @@ -0,0 +1,47 @@ +# Archivos de configuración {#configuration-files} + +ClickHouse admite la administración de configuración de varios archivos. El archivo de configuración del servidor principal es `/etc/clickhouse-server/config.xml`. Otros archivos deben estar en el `/etc/clickhouse-server/config.d` Directorio. + +!!! note "Nota" + Todos los archivos de configuración deben estar en formato XML. Además, deben tener el mismo elemento raíz, generalmente ``. + +Algunos valores especificados en el archivo de configuración principal se pueden anular en otros archivos de configuración. El `replace` o `remove` se pueden especificar atributos para los elementos de estos archivos de configuración. + +Si no se especifica ninguno, combina el contenido de los elementos de forma recursiva, reemplazando los valores de los elementos secundarios duplicados. + +Si `replace` se especifica, reemplaza todo el elemento por el especificado. + +Si `remove` se especifica, elimina el elemento. + +La configuración también puede definir “substitutions”. Si un elemento tiene el `incl` atributo, la sustitución correspondiente del archivo se utilizará como el valor. De forma predeterminada, la ruta al archivo con sustituciones es `/etc/metrika.xml`. Esto se puede cambiar en el [include\_from](server_settings/settings.md#server_settings-include_from) elemento en la configuración del servidor. Los valores de sustitución se especifican en `/yandex/substitution_name` elementos en este archivo. Si una sustitución especificada en `incl` No existe, se registra en el registro. Para evitar que ClickHouse registre las sustituciones que faltan, especifique `optional="true"` atributo (por ejemplo, ajustes para [macro](server_settings/settings.md)). + +Las sustituciones también se pueden realizar desde ZooKeeper. Para hacer esto, especifique el atributo `from_zk = "/path/to/node"`. El valor del elemento se sustituye por el contenido del nodo en `/path/to/node` en ZooKeeper. También puede colocar un subárbol XML completo en el nodo ZooKeeper y se insertará completamente en el elemento de origen. + +El `config.xml` file puede especificar una configuración separada con configuraciones de usuario, perfiles y cuotas. La ruta relativa a esta configuración se establece en el ‘users\_config’ elemento. Por defecto, es `users.xml`. Si `users_config` se omite, la configuración de usuario, los perfiles y las cuotas se especifican directamente en `config.xml`. + +Además, `users_config` puede tener anulaciones en los archivos `users_config.d` Directorio (por ejemplo, `users.d`) y sustituciones. Por ejemplo, puede tener un archivo de configuración separado para cada usuario como este: + +``` bash +$ cat /etc/clickhouse-server/users.d/alice.xml +``` + +``` xml + + + + analytics + + ::/0 + + ... + analytics + + + +``` + +Para cada archivo de configuración, el servidor también genera `file-preprocessed.xml` archivos al iniciar. Estos archivos contienen todas las sustituciones y anulaciones completadas, y están destinados para uso informativo. 
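+Para ilustrar los atributos `replace` y `remove` descritos arriba, un archivo de anulación hipotético en `config.d` podría tener este aspecto (el elemento raíz `yandex` y el ajuste `logger` se usan solo como ejemplo):
+
+``` xml
+<!-- /etc/clickhouse-server/config.d/logger.xml -->
+<yandex>
+    <!-- "replace" sustituye el elemento <logger> completo de config.xml
+         en lugar de combinarlo de forma recursiva -->
+    <logger replace="replace">
+        <level>warning</level>
+        <log>/var/log/clickhouse-server/clickhouse-server.log</log>
+    </logger>
+</yandex>
+```
+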
Si se utilizaron sustituciones de ZooKeeper en los archivos de configuración pero ZooKeeper no está disponible en el inicio del servidor, el servidor carga la configuración desde el archivo preprocesado. + +El servidor realiza un seguimiento de los cambios en los archivos de configuración, así como archivos y nodos ZooKeeper que se utilizaron al realizar sustituciones y anulaciones, y vuelve a cargar la configuración de los usuarios y clústeres sobre la marcha. Esto significa que puede modificar el clúster, los usuarios y su configuración sin reiniciar el servidor. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/configuration_files/) diff --git a/docs/es/operations/index.md b/docs/es/operations/index.md new file mode 100644 index 00000000000..aaa98a6bf05 --- /dev/null +++ b/docs/es/operations/index.md @@ -0,0 +1,20 @@ +# Operación {#operations} + +El manual de operaciones de ClickHouse consta de las siguientes secciones principales: + +- [Requisito](requirements.md) +- [Monitoreo](monitoring.md) +- [Solución de problemas](troubleshooting.md) +- [Recomendaciones de uso](tips.md) +- [Procedimiento de actualización](update.md) +- [Derechos de acceso](access_rights.md) +- [Copia de seguridad de datos](backup.md) +- [Archivos de configuración](configuration_files.md) +- [Cuota](quotas.md) +- [Tablas del sistema](system_tables.md) +- [Parámetros de configuración del servidor](server_settings/index.md) +- [Cómo probar su hardware con ClickHouse](performance_test.md) +- [Configuración](settings/index.md) +- [Utilidad](utils/index.md) + +[Artículo Original](https://clickhouse.tech/docs/es/operations/) diff --git a/docs/es/operations/monitoring.md b/docs/es/operations/monitoring.md new file mode 100644 index 00000000000..0584ed258fa --- /dev/null +++ b/docs/es/operations/monitoring.md @@ -0,0 +1,37 @@ +# Monitoreo {#monitoring} + +Usted puede monitorear: + +- Utilización de recursos de hardware. +- Métricas del servidor ClickHouse. + +## Utilización de recursos {#resource-utilization} + +ClickHouse no supervisa el estado de los recursos de hardware por sí mismo. + +Se recomienda encarecidamente configurar la supervisión para: + +- Carga y temperatura en los procesadores. + + Usted puede utilizar [dmesg](https://en.wikipedia.org/wiki/Dmesg), [Turbostat](https://www.linux.org/docs/man8/turbostat.html) u otros instrumentos. + +- Utilización del sistema de almacenamiento, RAM y red. + +## Métricas del servidor ClickHouse {#clickhouse-server-metrics} + +El servidor ClickHouse tiene instrumentos integrados para el monitoreo de estado propio. + +Para realizar un seguimiento de los eventos del servidor, use los registros del servidor. Ver el [registrador](server_settings/settings.md#server_settings-logger) sección del archivo de configuración. + +Recoge de ClickHouse: + +- Diferentes métricas de cómo el servidor utiliza recursos computacionales. +- Estadísticas comunes sobre el procesamiento de consultas. + +Puede encontrar métricas en el [sistema.métricas](system_tables.md#system_tables-metrics), [sistema.evento](system_tables.md#system_tables-events), y [sistema.asynchronous\_metrics](system_tables.md#system_tables-asynchronous_metrics) tabla. + +Puede configurar ClickHouse para exportar métricas a [Grafito](https://github.com/graphite-project). Ver el [Sección de grafito](server_settings/settings.md#server_settings-graphite) en el archivo de configuración del servidor ClickHouse. 
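+Como orientación, un esbozo hipotético de esa sección `graphite` en `config.xml` (el host, el puerto y el intervalo son solo valores de ejemplo) podría ser:
+
+``` xml
+<graphite>
+    <host>localhost</host>
+    <port>42000</port>
+    <timeout>0.1</timeout>
+    <interval>60</interval>
+    <root_path>one_min</root_path>
+    <!-- Qué datos enviar: métricas instantáneas, eventos y métricas asíncronas -->
+    <metrics>true</metrics>
+    <events>true</events>
+    <events_cumulative>false</events_cumulative>
+    <asynchronous_metrics>true</asynchronous_metrics>
+</graphite>
+```
+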
Antes de configurar la exportación de métricas, debe configurar Graphite siguiendo su [guía oficial](https://graphite.readthedocs.io/en/latest/install.html). + +Además, puede supervisar la disponibilidad del servidor a través de la API HTTP. Envíe una solicitud `HTTP GET` a `/ping`. Si el servidor está disponible, responde con `200 OK`. + +Para supervisar los servidores en una configuración de clúster, debe establecer el parámetro [max\_replica\_delay\_for\_distributed\_queries](settings/settings.md#settings-max_replica_delay_for_distributed_queries) y utilizar el recurso HTTP `/replicas_status`. Una solicitud a `/replicas_status` devuelve `200 OK` si la réplica está disponible y no se retrasa respecto a las demás réplicas. Si una réplica se retrasa, devuelve `503 HTTP_SERVICE_UNAVAILABLE` con información sobre la brecha. diff --git a/docs/es/operations/performance/sampling_query_profiler.md b/docs/es/operations/performance/sampling_query_profiler.md new file mode 100644 index 00000000000..639dbfe972e --- /dev/null +++ b/docs/es/operations/performance/sampling_query_profiler.md @@ -0,0 +1,57 @@ +# Analizador de consultas de muestreo {#sampling-query-profiler} + +ClickHouse ejecuta un generador de perfiles de muestreo que permite analizar la ejecución de las consultas. Con el generador de perfiles puede encontrar las rutinas de código fuente que se utilizan con más frecuencia durante la ejecución de una consulta. Puede rastrear tanto el tiempo de CPU como el tiempo real (wall-clock) empleado, incluido el tiempo de inactividad. + +Para usar el generador de perfiles: + +- Configure la sección [trace\_log](../server_settings/settings.md#server_settings-trace_log) de la configuración del servidor. + + Esta sección configura la tabla del sistema [trace\_log](../system_tables.md#system_tables-trace_log), que contiene los resultados del funcionamiento del generador de perfiles. Está configurada de forma predeterminada. Recuerde que los datos de esta tabla solo son válidos para un servidor en ejecución. Después de reiniciar el servidor, ClickHouse no limpia la tabla y todas las direcciones de memoria virtual almacenadas pueden dejar de ser válidas. + +- Configure los ajustes [query\_profiler\_cpu\_time\_period\_ns](../settings/settings.md#query_profiler_cpu_time_period_ns) o [query\_profiler\_real\_time\_period\_ns](../settings/settings.md#query_profiler_real_time_period_ns). Ambos ajustes se pueden utilizar simultáneamente. + + Estos ajustes le permiten configurar los temporizadores del generador de perfiles. Como son ajustes de sesión, puede obtener diferentes frecuencias de muestreo para todo el servidor, para usuarios individuales o perfiles de usuario, para su sesión interactiva y para cada consulta individual. + +La frecuencia de muestreo predeterminada es una muestra por segundo y tanto el temporizador de CPU como el temporizador real están habilitados. Esta frecuencia permite recopilar suficiente información sobre el clúster de ClickHouse. Al mismo tiempo, al trabajar con esta frecuencia, el generador de perfiles no afecta al rendimiento del servidor ClickHouse. Si necesita perfilar cada consulta individual, intente usar una frecuencia de muestreo más alta. + +Para analizar la tabla del sistema `trace_log`: + +- Instale el paquete `clickhouse-common-static-dbg`. Ver [Instalar desde paquetes DEB](../../getting_started/install.md#install-from-deb-packages). + +- Habilite las funciones de introspección con el ajuste [allow\_introspection\_functions](../settings/settings.md#settings-allow_introspection_functions).
+ + Por razones de seguridad, las funciones de introspección están deshabilitadas de forma predeterminada. + +- Descripción `addressToLine`, `addressToSymbol` y `demangle` [funciones de la introspección](../../query_language/functions/introspection.md) para obtener nombres de funciones y sus posiciones en el código ClickHouse. Para obtener un perfil para alguna consulta, debe agregar datos del `trace_log` tabla. Puede agregar datos por funciones individuales o por los seguimientos de pila completos. + +Si necesita visualizar `trace_log` información, intente [Gráfico de llamas](../../interfaces/third-party/gui/#clickhouse-flamegraph) y [Nivel de Cifrado WEP](https://github.com/laplab/clickhouse-speedscope). + +## Ejemplo {#example} + +En este ejemplo nos: + +- Filtrado `trace_log` datos por un identificador de consulta y la fecha actual. + +- Agregando por seguimiento de pila. + +- Usando funciones de introspección, obtendremos un informe de: + + - Nombres de símbolos y funciones de código fuente correspondientes. + - Ubicaciones del código fuente de estas funciones. + + + +``` sql +SELECT + count(), + arrayStringConcat(arrayMap(x -> concat(demangle(addressToSymbol(x)), '\n ', addressToLine(x)), trace), '\n') AS sym +FROM system.trace_log +WHERE (query_id = 'ebca3574-ad0a-400a-9cbc-dca382f5998c') AND (event_date = today()) +GROUP BY trace +ORDER BY count() DESC +LIMIT 10 +``` + +``` text +{% include "operations/performance/sampling_query_profiler_example_result.txt" %} +``` diff --git a/docs/es/operations/performance/sampling_query_profiler_example_result.txt b/docs/es/operations/performance/sampling_query_profiler_example_result.txt new file mode 100644 index 00000000000..8e4e0e0fd70 --- /dev/null +++ b/docs/es/operations/performance/sampling_query_profiler_example_result.txt @@ -0,0 +1,556 @@ +Row 1: +────── +count(): 6344 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +read + +DB::ReadBufferFromFileDescriptor::nextImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/ReadBufferFromFileDescriptor.cpp:56 +DB::CompressedReadBufferBase::readCompressedData(unsigned long&, unsigned long&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/ReadBuffer.h:54 +DB::CompressedReadBufferFromFile::nextImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Compression/CompressedReadBufferFromFile.cpp:22 +DB::CompressedReadBufferFromFile::seek(unsigned long, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Compression/CompressedReadBufferFromFile.cpp:63 +DB::MergeTreeReaderStream::seekToMark(unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp:200 +std::_Function_handler > const&), DB::MergeTreeReader::readData(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, DB::IColumn&, unsigned long, bool, unsigned long, bool)::{lambda(bool)#1}::operator()(bool) const::{lambda(std::vector > const&)#1}>::_M_invoke(std::_Any_data const&, std::vector > const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:212 +DB::IDataType::deserializeBinaryBulkWithMultipleStreams(DB::IColumn&, unsigned long, DB::IDataType::DeserializeBinaryBulkSettings&, std::shared_ptr&) const + 
/usr/local/include/c++/9.1.0/bits/std_function.h:690 +DB::MergeTreeReader::readData(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, DB::IColumn&, unsigned long, bool, unsigned long, bool) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:232 +DB::MergeTreeReader::readRows(unsigned long, bool, unsigned long, DB::Block&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:111 +DB::MergeTreeRangeReader::DelayedStream::finalize(DB::Block&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:35 +DB::MergeTreeRangeReader::continueReadingChain(DB::MergeTreeRangeReader::ReadResult&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:219 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:487 +DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 2: +────── +count(): 3295 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +__pthread_cond_wait + +std::condition_variable::wait(std::unique_lock&) + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/src/c++11/../../../../../gcc-9.1.0/libstdc++-v3/src/c++11/condition_variable.cc:55 +Poco::Semaphore::wait() + /home/milovidov/ClickHouse/build_gcc9/../contrib/poco/Foundation/src/Semaphore.cpp:61 +DB::UnionBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/x86_64-pc-linux-gnu/bits/gthr-default.h:748 
+DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Core/Block.h:90 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::LimitBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::AsynchronousBlockInputStream::calculate() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +std::_Function_handler::_M_invoke(std::_Any_data const&) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:551 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/x86_64-pc-linux-gnu/bits/gthr-default.h:748 +ThreadFromGlobalPool::ThreadFromGlobalPool::scheduleImpl(std::function, int, std::optional)::{lambda()#3}>(ThreadPoolImpl::scheduleImpl(std::function, int, std::optional)::{lambda()#3}&&)::{lambda()#1}::operator()() const + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/ThreadPool.h:146 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 3: +────── +count(): 1978 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +DB::VolnitskyBase >::search(unsigned char const*, unsigned long) const + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::MatchImpl::vector_constant(DB::PODArray, 15ul, 16ul> const&, DB::PODArray, 15ul, 16ul> const&, std::__cxx11::basic_string, std::allocator > const&, DB::PODArray, 15ul, 16ul>&) + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::FunctionsStringSearch, DB::NameLike>::executeImpl(DB::Block&, std::vector > const&, unsigned long, unsigned long) + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::PreparedFunctionImpl::execute(DB::Block&, std::vector > const&, unsigned long, unsigned long, bool) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Functions/IFunction.cpp:464 +DB::ExpressionAction::execute(DB::Block&, bool) const + /usr/local/include/c++/9.1.0/bits/stl_vector.h:677 +DB::ExpressionActions::execute(DB::Block&, bool) const + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Interpreters/ExpressionActions.cpp:739 +DB::MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(DB::MergeTreeRangeReader::ReadResult&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:660 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:546 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() + 
/home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 4: +────── +count(): 1913 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +DB::VolnitskyBase >::search(unsigned char const*, unsigned long) const + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::MatchImpl::vector_constant(DB::PODArray, 15ul, 16ul> const&, DB::PODArray, 15ul, 16ul> const&, std::__cxx11::basic_string, std::allocator > const&, DB::PODArray, 15ul, 16ul>&) + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::FunctionsStringSearch, DB::NameLike>::executeImpl(DB::Block&, std::vector > const&, unsigned long, unsigned long) + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::PreparedFunctionImpl::execute(DB::Block&, std::vector > const&, unsigned long, unsigned long, bool) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Functions/IFunction.cpp:464 +DB::ExpressionAction::execute(DB::Block&, bool) const + /usr/local/include/c++/9.1.0/bits/stl_vector.h:677 +DB::ExpressionActions::execute(DB::Block&, bool) const + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Interpreters/ExpressionActions.cpp:739 +DB::MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(DB::MergeTreeRangeReader::ReadResult&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:660 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:546 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 
+DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 5: +────── +count(): 1672 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +DB::VolnitskyBase >::search(unsigned char const*, unsigned long) const + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::MatchImpl::vector_constant(DB::PODArray, 15ul, 16ul> const&, DB::PODArray, 15ul, 16ul> const&, std::__cxx11::basic_string, std::allocator > const&, DB::PODArray, 15ul, 16ul>&) + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::FunctionsStringSearch, DB::NameLike>::executeImpl(DB::Block&, std::vector > const&, unsigned long, unsigned long) + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::PreparedFunctionImpl::execute(DB::Block&, std::vector > const&, unsigned long, unsigned long, bool) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Functions/IFunction.cpp:464 +DB::ExpressionAction::execute(DB::Block&, bool) const + /usr/local/include/c++/9.1.0/bits/stl_vector.h:677 +DB::ExpressionActions::execute(DB::Block&, bool) const + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Interpreters/ExpressionActions.cpp:739 +DB::MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(DB::MergeTreeRangeReader::ReadResult&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:660 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:546 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + 
/usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 6: +────── +count(): 1531 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +read + +DB::ReadBufferFromFileDescriptor::nextImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/ReadBufferFromFileDescriptor.cpp:56 +DB::CompressedReadBufferBase::readCompressedData(unsigned long&, unsigned long&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/ReadBuffer.h:54 +DB::CompressedReadBufferFromFile::nextImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Compression/CompressedReadBufferFromFile.cpp:22 +void DB::deserializeBinarySSE2<4>(DB::PODArray, 15ul, 16ul>&, DB::PODArray, 15ul, 16ul>&, DB::ReadBuffer&, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/ReadBuffer.h:53 +DB::DataTypeString::deserializeBinaryBulk(DB::IColumn&, DB::ReadBuffer&, unsigned long, double) const + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataTypes/DataTypeString.cpp:202 +DB::MergeTreeReader::readData(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, DB::IColumn&, unsigned long, bool, unsigned long, bool) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:232 +DB::MergeTreeReader::readRows(unsigned long, bool, unsigned long, DB::Block&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:111 +DB::MergeTreeRangeReader::DelayedStream::finalize(DB::Block&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:35 
+DB::MergeTreeRangeReader::startReadingChain(unsigned long, std::vector >&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:219 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 7: +────── +count(): 1034 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +DB::VolnitskyBase >::search(unsigned char const*, unsigned long) const + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::MatchImpl::vector_constant(DB::PODArray, 15ul, 16ul> const&, DB::PODArray, 15ul, 16ul> const&, std::__cxx11::basic_string, std::allocator > const&, DB::PODArray, 15ul, 16ul>&) + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::FunctionsStringSearch, DB::NameLike>::executeImpl(DB::Block&, std::vector > const&, unsigned long, unsigned long) + /opt/milovidov/ClickHouse/build_gcc9/dbms/programs/clickhouse +DB::PreparedFunctionImpl::execute(DB::Block&, std::vector > const&, unsigned long, unsigned long, bool) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Functions/IFunction.cpp:464 +DB::ExpressionAction::execute(DB::Block&, bool) const + /usr/local/include/c++/9.1.0/bits/stl_vector.h:677 +DB::ExpressionActions::execute(DB::Block&, bool) const + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Interpreters/ExpressionActions.cpp:739 
+DB::MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(DB::MergeTreeRangeReader::ReadResult&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:660 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:546 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 8: +────── +count(): 989 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +__lll_lock_wait + +pthread_mutex_lock + +DB::MergeTreeReaderStream::loadMarks() + /usr/local/include/c++/9.1.0/bits/std_mutex.h:103 +DB::MergeTreeReaderStream::MergeTreeReaderStream(std::__cxx11::basic_string, std::allocator > const&, std::__cxx11::basic_string, std::allocator > const&, unsigned long, std::vector > const&, DB::MarkCache*, bool, DB::UncompressedCache*, unsigned long, unsigned long, unsigned long, DB::MergeTreeIndexGranularityInfo const*, std::function const&, int) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp:107 +std::_Function_handler > const&), DB::MergeTreeReader::addStreams(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, std::function const&, int)::{lambda(std::vector > const&)#1}>::_M_invoke(std::_Any_data const&, std::vector > const&) + /usr/local/include/c++/9.1.0/bits/unique_ptr.h:147 +DB::MergeTreeReader::addStreams(std::__cxx11::basic_string, std::allocator > const&, 
DB::IDataType const&, std::function const&, int) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:677 +DB::MergeTreeReader::MergeTreeReader(std::__cxx11::basic_string, std::allocator > const&, std::shared_ptr const&, DB::NamesAndTypesList const&, DB::UncompressedCache*, DB::MarkCache*, bool, DB::MergeTreeData const&, std::vector > const&, unsigned long, unsigned long, std::map, std::allocator >, double, std::less, std::allocator > >, std::allocator, std::allocator > const, double> > > const&, std::function const&, int) + /usr/local/include/c++/9.1.0/bits/stl_list.h:303 +DB::MergeTreeThreadSelectBlockInputStream::getNewTask() + /usr/local/include/c++/9.1.0/bits/std_function.h:259 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:54 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 9: +─────── +count(): 779 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +void DB::deserializeBinarySSE2<4>(DB::PODArray, 15ul, 16ul>&, DB::PODArray, 15ul, 16ul>&, DB::ReadBuffer&, unsigned long) + /usr/local/lib/gcc/x86_64-pc-linux-gnu/9.1.0/include/emmintrin.h:727 +DB::DataTypeString::deserializeBinaryBulk(DB::IColumn&, DB::ReadBuffer&, unsigned long, double) const + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataTypes/DataTypeString.cpp:202 +DB::MergeTreeReader::readData(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, DB::IColumn&, unsigned long, bool, unsigned long, bool) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:232 +DB::MergeTreeReader::readRows(unsigned long, bool, unsigned long, DB::Block&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:111 +DB::MergeTreeRangeReader::DelayedStream::finalize(DB::Block&) + 
/home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:35 +DB::MergeTreeRangeReader::startReadingChain(unsigned long, std::vector >&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:219 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone + + +Row 10: +─────── +count(): 666 +sym: StackTrace::StackTrace(ucontext_t const&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Common/StackTrace.cpp:208 +DB::(anonymous namespace)::writeTraceInfo(DB::TimerType, int, siginfo_t*, void*) [clone .isra.0] + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/IO/BufferBase.h:99 + + +void DB::deserializeBinarySSE2<4>(DB::PODArray, 15ul, 16ul>&, DB::PODArray, 15ul, 16ul>&, DB::ReadBuffer&, unsigned long) + /usr/local/lib/gcc/x86_64-pc-linux-gnu/9.1.0/include/emmintrin.h:727 +DB::DataTypeString::deserializeBinaryBulk(DB::IColumn&, DB::ReadBuffer&, unsigned long, double) const + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataTypes/DataTypeString.cpp:202 +DB::MergeTreeReader::readData(std::__cxx11::basic_string, std::allocator > const&, DB::IDataType const&, DB::IColumn&, unsigned long, bool, unsigned long, bool) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:232 +DB::MergeTreeReader::readRows(unsigned long, bool, unsigned long, DB::Block&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeReader.cpp:111 +DB::MergeTreeRangeReader::DelayedStream::finalize(DB::Block&) + 
/home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:35 +DB::MergeTreeRangeReader::startReadingChain(unsigned long, std::vector >&) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeRangeReader.cpp:219 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeRangeReader::read(unsigned long, std::vector >&) + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::MergeTreeBaseSelectBlockInputStream::readFromPartImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/Storages/MergeTree/MergeTreeBaseSelectBlockInputStream.cpp:158 +DB::MergeTreeBaseSelectBlockInputStream::readImpl() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ExpressionBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ExpressionBlockInputStream.cpp:34 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::PartialSortingBlockInputStream::readImpl() + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/PartialSortingBlockInputStream.cpp:13 +DB::IBlockInputStream::read() + /usr/local/include/c++/9.1.0/bits/stl_vector.h:108 +DB::ParallelInputsProcessor::loop(unsigned long) + /usr/local/include/c++/9.1.0/bits/atomic_base.h:419 +DB::ParallelInputsProcessor::thread(std::shared_ptr, unsigned long) + /home/milovidov/ClickHouse/build_gcc9/../dbms/src/DataStreams/ParallelInputsProcessor.h:215 +ThreadFromGlobalPool::ThreadFromGlobalPool::*)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*, std::shared_ptr, unsigned long&>(void (DB::ParallelInputsProcessor::*&&)(std::shared_ptr, unsigned long), DB::ParallelInputsProcessor*&&, std::shared_ptr&&, unsigned long&)::{lambda()#1}::operator()() const + /usr/local/include/c++/9.1.0/bits/shared_ptr_base.h:729 +ThreadPoolImpl::worker(std::_List_iterator) + /usr/local/include/c++/9.1.0/bits/unique_lock.h:69 +execute_native_thread_routine + /home/milovidov/ClickHouse/ci/workspace/gcc/gcc-build/x86_64-pc-linux-gnu/libstdc++-v3/include/bits/unique_ptr.h:81 +start_thread + +__clone diff --git a/docs/es/operations/performance_test.md b/docs/es/operations/performance_test.md new file mode 100644 index 00000000000..95bbef1be6c --- /dev/null +++ b/docs/es/operations/performance_test.md @@ -0,0 +1,75 @@ +# Cómo probar su hardware con ClickHouse {#how-to-test-your-hardware-with-clickhouse} + +Con esta instrucción, puede ejecutar una prueba de rendimiento básica de ClickHouse en cualquier servidor sin instalar paquetes de ClickHouse. + +1. Ir a un “commits” página: https://github.com/ClickHouse/ClickHouse/commits/master + +2. Haga clic en la primera marca de verificación verde o cruz roja con verde “ClickHouse Build Check” y haga clic en el “Details” enlace cerca “ClickHouse Build Check”. + +3. Copie el enlace a “clickhouse” binario para amd64 o aarch64. + +4. ssh al servidor y descargarlo con wget: + + + + # For amd64: + wget https://clickhouse-builds.s3.yandex.net/0/00ba767f5d2a929394ea3be193b1f79074a1c4bc/1578163263_binary/clickhouse + # For aarch64: + wget https://clickhouse-builds.s3.yandex.net/0/00ba767f5d2a929394ea3be193b1f79074a1c4bc/1578161264_binary/clickhouse + # Then do: + chmod a+x clickhouse + +1. 
Descargar configuraciones: + + + + wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/programs/server/config.xml + wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/programs/server/users.xml + mkdir config.d + wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/programs/server/config.d/path.xml -O config.d/path.xml + wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/programs/server/config.d/log_to_console.xml -O config.d/log_to_console.xml + +1. Descargar archivos de referencia: + + + + wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/benchmark/clickhouse/benchmark-new.sh + chmod a+x benchmark-new.sh + wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/dbms/benchmark/clickhouse/queries.sql + +1. Descargue los datos de prueba de acuerdo con el [El Yandex.Conjunto de datos de Metrica](../getting_started/example_datasets/metrica.md) instrucción (“hits” tabla que contiene 100 millones de filas). + + + + wget https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_100m_obfuscated_v1.tar.xz + tar xvf hits_100m_obfuscated_v1.tar.xz -C . + mv hits_100m_obfuscated_v1/* . + +1. Ejecute el servidor: + + + + ./clickhouse server + +1. Verifique los datos: ssh al servidor en otro terminal + + + + ./clickhouse client --query "SELECT count() FROM hits_100m_obfuscated" + 100000000 + +1. Edite el benchmark-new.sh, cambie “clickhouse-client” Naciones “./clickhouse client” y añadir “–max\_memory\_usage 100000000000” parámetro. + + + + mcedit benchmark-new.sh + +1. Ejecute el punto de referencia: + + + + ./benchmark-new.sh hits_100m_obfuscated + +1. Envíe los números y la información sobre la configuración de su hardware a clickhouse-feedback@yandex-team.com + +Todos los resultados se publican aquí: https://clickhouse.tecnología/benchmark\_hardware.HTML diff --git a/docs/es/operations/quotas.md b/docs/es/operations/quotas.md new file mode 100644 index 00000000000..89955f2b84a --- /dev/null +++ b/docs/es/operations/quotas.md @@ -0,0 +1,105 @@ +# Cuota {#quotas} + +Las cuotas le permiten limitar el uso de recursos durante un período de tiempo, o simplemente realizar un seguimiento del uso de recursos. +Las cuotas se configuran en la configuración del usuario. Esto suele ser ‘users.xml’. + +El sistema también tiene una característica para limitar la complejidad de una sola consulta. Vea la sección “Restrictions on query complexity”). + +A diferencia de las restricciones de complejidad de consultas, las cuotas: + +- Coloque restricciones en un conjunto de consultas que se pueden ejecutar durante un período de tiempo, en lugar de limitar una sola consulta. +- Tenga en cuenta los recursos gastados en todos los servidores remotos para el procesamiento de consultas distribuidas. + +Echemos un vistazo a la sección de la ‘users.xml’ fichero que define las cuotas. + +``` xml + + + + + + + + 3600 + + + 0 + 0 + 0 + 0 + 0 + + +``` + +De forma predeterminada, la cuota solo realiza un seguimiento del consumo de recursos para cada hora, sin limitar el uso. +El consumo de recursos calculado para cada intervalo se envía al registro del servidor después de cada solicitud. + +``` xml + + + + + 3600 + + 1000 + 100 + 1000000000 + 100000000000 + 900 + + + + 86400 + + 10000 + 1000 + 5000000000 + 500000000000 + 7200 + + +``` + +Para el ‘statbox’ Las restricciones se establecen por cada hora y por cada 24 horas (86.400 segundos). 
El intervalo de tiempo se cuenta a partir de un momento fijo definido por la implementación en el tiempo. En otras palabras, el intervalo de 24 horas no necesariamente comienza a medianoche. + +Cuando finaliza el intervalo, se borran todos los valores recopilados. Para la siguiente hora, el cálculo de la cuota comienza de nuevo. + +Estas son las cantidades que se pueden restringir: + +`queries` – El número total de solicitudes. + +`errors` – El número de consultas que lanzaron una excepción. + +`result_rows` – El número total de filas dadas como resultado. + +`read_rows` – El número total de filas de origen leídas de las tablas para ejecutar la consulta, en todos los servidores remotos. + +`execution_time` – El tiempo total de ejecución de la consulta, en segundos (tiempo de pared). + +Si se excede el límite durante al menos un intervalo de tiempo, se lanza una excepción con un texto sobre qué restricción se excedió, para qué intervalo y cuándo comienza el nuevo intervalo (cuando se pueden enviar consultas nuevamente). + +Las cuotas pueden usar el “quota key” característica con el fin de informar sobre los recursos para múltiples claves de forma independiente. Aquí hay un ejemplo de esto: + +``` xml + + + + +``` + +La cuota se asigna a los usuarios ‘users’ sección de la configuración. Vea la sección “Access rights”. + +Para el procesamiento de consultas distribuidas, los importes acumulados se almacenan en el servidor del solicitante. Entonces, si el usuario va a otro servidor, la cuota allí “start over”. + +Cuando se reinicia el servidor, las cuotas se restablecen. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/quotas/) diff --git a/docs/es/operations/requirements.md b/docs/es/operations/requirements.md new file mode 100644 index 00000000000..43c21fa14a7 --- /dev/null +++ b/docs/es/operations/requirements.md @@ -0,0 +1,54 @@ +# Requisito {#requirements} + +## CPU {#cpu} + +Para la instalación desde paquetes deb precompilados, utilice una CPU con arquitectura x86\_64 y soporte para las instrucciones de SSE 4.2. Para ejecutar ClickHouse con procesadores que no admiten SSE 4.2 o tienen arquitectura AArch64 o PowerPC64LE, debe compilar ClickHouse a partir de fuentes. + +ClickHouse implementa el procesamiento de datos paralelo y utiliza todos los recursos de hardware disponibles. Al elegir un procesador, tenga en cuenta que ClickHouse funciona de manera más eficiente en configuraciones con un gran número de núcleos pero con una velocidad de reloj más baja que en configuraciones con menos núcleos y una velocidad de reloj más alta. Por ejemplo, 16 núcleos con 2600 MHz es preferible a 8 núcleos con 3600 MHz. + +Uso de **Impulso de Turbo** y **Hiper-threading** tecnologías se recomienda. Mejora significativamente el rendimiento con una carga típica. + +## RAM {#ram} + +Recomendamos utilizar un mínimo de 4 GB de RAM para realizar consultas no triviales. El servidor ClickHouse puede ejecutarse con una cantidad mucho menor de RAM, pero requiere memoria para procesar consultas. + +El volumen requerido de RAM depende de: + +- La complejidad de las consultas. +- La cantidad de datos que se procesan en las consultas. + +Para calcular el volumen requerido de RAM, debe estimar el tamaño de los datos temporales para [GRUPO POR](../query_language/select.md#select-group-by-clause), [DISTINTO](../query_language/select.md#select-distinct), [UNIR](../query_language/select.md#select-join) y otras operaciones que utilice. + +ClickHouse puede usar memoria externa para datos temporales. 
Ver [GRUPO POR en memoria externa](../query_language/select.md#select-group-by-in-external-memory) para más detalles. + +## Archivo de intercambio {#swap-file} + +Deshabilite el archivo de intercambio para entornos de producción. + +## Subsistema de almacenamiento {#storage-subsystem} + +Necesita tener 2 GB de espacio libre en disco para instalar ClickHouse. + +El volumen de almacenamiento requerido para sus datos debe calcularse por separado. La evaluación debe incluir: + +- Estimación del volumen de datos. + + Puede tomar una muestra de los datos y obtener el tamaño promedio de una fila de ella. Luego multiplique el valor por el número de filas que planea almacenar. + +- El coeficiente de compresión de datos. + + Para estimar el coeficiente de compresión de datos, cargue una muestra de sus datos en ClickHouse y compare el tamaño real de los datos con el tamaño de la tabla almacenada. Por ejemplo, los datos de clickstream generalmente se comprimen de 6 a 10 veces. + +Para calcular el volumen final de datos que se almacenarán, aplique el coeficiente de compresión al volumen de datos estimado. Si planea almacenar datos en varias réplicas, multiplique el volumen estimado por el número de réplicas. + +## Rojo {#network} + +Si es posible, utilice redes de 10G o clase superior. + +El ancho de banda de la red es fundamental para procesar consultas distribuidas con una gran cantidad de datos intermedios. Además, la velocidad de la red afecta a los procesos de replicación. + +## Software {#software} + +ClickHouse está desarrollado para la familia de sistemas operativos Linux. La distribución de Linux recomendada es Ubuntu. El `tzdata` paquete debe ser instalado en el sistema. + +ClickHouse también puede funcionar en otras familias de sistemas operativos. Ver detalles en el [Primeros pasos](../getting_started/index.md) sección de la documentación. diff --git a/docs/es/operations/server_settings/index.md b/docs/es/operations/server_settings/index.md new file mode 100644 index 00000000000..6fe199aaf89 --- /dev/null +++ b/docs/es/operations/server_settings/index.md @@ -0,0 +1,11 @@ +# Parámetros de configuración del servidor {#server-settings} + +Esta sección contiene descripciones de la configuración del servidor que no se puede cambiar en el nivel de sesión o consulta. + +Estos ajustes se almacenan en el `config.xml` archivo en el servidor ClickHouse. + +Otros ajustes se describen en el “[Configuración](../settings/index.md#settings)” apartado. + +Antes de estudiar la configuración, lea el [Archivos de configuración](../configuration_files.md#configuration_files) sección y tomar nota del uso de sustituciones (el `incl` y `optional` atributo). + +[Artículo Original](https://clickhouse.tech/docs/es/operations/server_settings/) diff --git a/docs/es/operations/server_settings/settings.md b/docs/es/operations/server_settings/settings.md new file mode 100644 index 00000000000..a7e8c90b930 --- /dev/null +++ b/docs/es/operations/server_settings/settings.md @@ -0,0 +1,865 @@ +# Configuración del servidor {#server-settings} + +## builtin\_dictionaries\_reload\_interval {#builtin-dictionaries-reload-interval} + +El intervalo en segundos antes de volver a cargar los diccionarios integrados. + +ClickHouse recarga los diccionarios incorporados cada x segundos. Esto hace posible editar diccionarios “on the fly” sin reiniciar el servidor. + +Valor predeterminado: 3600. 
+ +**Ejemplo** + +``` xml +3600 +``` + +## compresión {#server-settings-compression} + +Ajustes de compresión de datos para [Método de codificación de datos:](../table_engines/mergetree.md)-mesas de motor. + +!!! warning "Advertencia" + No lo use si acaba de comenzar a usar ClickHouse. + +Plantilla de configuración: + +``` xml + + + ... + ... + ... + + ... + +``` + +`` campo: + +- `min_part_size` – El tamaño mínimo de una parte de datos. +- `min_part_size_ratio` – La relación entre el tamaño de la parte de datos y el tamaño de la tabla. +- `method` – Método de compresión. Valores aceptables: `lz4` o `zstd`. + +Puede configurar múltiples `` apartado. + +Acciones cuando se cumplen las condiciones: + +- Si un elemento de datos coincide con un conjunto de condiciones, ClickHouse utiliza el método de compresión especificado. +- Si un elemento de datos coinciden con varios conjuntos de condiciones, ClickHouse utiliza el primer conjunto de condiciones coincidente. + +Si no se cumplen condiciones para un elemento de datos, ClickHouse utiliza el `lz4` compresión. + +**Ejemplo** + +``` xml + + + 10000000000 + 0.01 + zstd + + +``` + +## default\_database {#default-database} + +La base de datos predeterminada. + +Para obtener una lista de bases de datos, [MOSTRAR BASAS DE DATOS](../../query_language/show.md#show-databases) consulta. + +**Ejemplo** + +``` xml +default +``` + +## default\_profile {#default-profile} + +Perfil de configuración predeterminado. + +Los perfiles de configuración se encuentran en el archivo especificado en el parámetro `user_config`. + +**Ejemplo** + +``` xml +default +``` + +## Diccionarios\_config {#server-settings-dictionaries-config} + +La ruta de acceso al archivo de configuración para diccionarios externos. + +Camino: + +- Especifique la ruta absoluta o la ruta relativa al archivo de configuración del servidor. +- La ruta puede contener comodines \* y ?. + +Ver también “[Diccionarios externos](../../query_language/dicts/external_dicts.md)”. + +**Ejemplo** + +``` xml +*_dictionary.xml +``` + +## Diccionarios\_lazy\_load {#server-settings-dictionaries-lazy-load} + +La carga perezosa de los diccionarios. + +Si `true`, entonces cada diccionario es creado en el primer uso. Si se produce un error en la creación del diccionario, la función que estaba utilizando el diccionario produce una excepción. + +Si `false`, todos los diccionarios se crean cuando se inicia el servidor, y si hay un error, el servidor se apaga. + +El valor predeterminado es `true`. + +**Ejemplo** + +``` xml +true +``` + +## format\_schema\_path {#server-settings-format-schema-path} + +La ruta de acceso al directorio con los esquemas para los datos de entrada, como los esquemas [CapnProto](../../interfaces/formats.md#capnproto) formato. + +**Ejemplo** + +``` xml + + format_schemas/ +``` + +## Grafito {#server-settings-graphite} + +Envío de datos a [Grafito](https://github.com/graphite-project). + +Configuración: + +- host – El servidor de grafito. +- el puerto del servidor grafito. +- intervalo – El intervalo para el envío, en segundos. +- timeout – El tiempo de espera para el envío de datos, en segundos. +- root\_path – Prefijo para las claves. +- métricas – Envío de datos desde el [sistema.métricas](../system_tables.md#system_tables-metrics) tabla. +- eventos – Envío de datos deltas acumulados para el período de tiempo [sistema.evento](../system_tables.md#system_tables-events) tabla. +- events\_cumulative: envío de datos acumulativos desde el [sistema.evento](../system_tables.md#system_tables-events) tabla. 
+- asynchronous\_metrics – Envío de datos desde el [sistema.asynchronous\_metrics](../system_tables.md#system_tables-asynchronous_metrics) tabla. + +Puede configurar múltiples `` clausula. Por ejemplo, puede usar esto para enviar datos diferentes a intervalos diferentes. + +**Ejemplo** + +``` xml + + localhost + 42000 + 0.1 + 60 + one_min + true + true + false + true + +``` + +## graphite\_rollup {#server-settings-graphite-rollup} + +Ajustes para reducir los datos de grafito. + +Para obtener más información, consulte [GraphiteMergeTree](../table_engines/graphitemergetree.md). + +**Ejemplo** + +``` xml + + + max + + 0 + 60 + + + 3600 + 300 + + + 86400 + 3600 + + + +``` + +## http\_port/https\_port {#http-porthttps-port} + +El puerto para conectarse al servidor a través de HTTP(s). + +Si `https_port` se especifica, [openSSL](#server_settings-openssl) debe ser configurado. + +Si `http_port` se especifica, la configuración de OpenSSL se ignora incluso si está establecida. + +**Ejemplo** + +``` xml +0000 +``` + +## http\_server\_default\_response {#server-settings-http-server-default-response} + +La página que se muestra de forma predeterminada al acceder al servidor HTTP de ClickHouse. +El valor predeterminado es “Ok.” (con un avance de línea al final) + +**Ejemplo** + +Abrir `https://tabix.io/` al acceder `http://localhost: http_port`. + +``` xml + +
]]> +
+``` + +## include\_from {#server-settings-include-from} + +La ruta al archivo con sustituciones. + +Para obtener más información, consulte la sección “[Archivos de configuración](../configuration_files.md#configuration_files)”. + +**Ejemplo** + +``` xml +/etc/metrica.xml +``` + +## Interesante {#interserver-http-port} + +Puerto para el intercambio de datos entre servidores ClickHouse. + +**Ejemplo** + +``` xml +9009 +``` + +## Sistema abierto. {#interserver-http-host} + +El nombre de host que pueden utilizar otros servidores para acceder a este servidor. + +Si se omite, se define de la misma manera que el `hostname-f` comando. + +Útil para separarse de una interfaz de red específica. + +**Ejemplo** + +``` xml +example.yandex.ru +``` + +## interserver\_http\_credentials {#server-settings-interserver-http-credentials} + +El nombre de usuario y la contraseña utilizados para [replicación](../table_engines/replication.md) con los motores Replicated\*. Estas credenciales sólo se utilizan para la comunicación entre réplicas y no están relacionadas con las credenciales de los clientes de ClickHouse. El servidor está comprobando estas credenciales para conectar réplicas y utiliza las mismas credenciales cuando se conecta a otras réplicas. Por lo tanto, estas credenciales deben establecerse igual para todas las réplicas de un clúster. +De forma predeterminada, la autenticación no se utiliza. + +Esta sección contiene los siguientes parámetros: + +- `user` — nombre de usuario. +- `password` — contraseña. + +**Ejemplo** + +``` xml + + admin + 222 + +``` + +## keep\_alive\_timeout {#keep-alive-timeout} + +El número de segundos que ClickHouse espera las solicitudes entrantes antes de cerrar la conexión. El valor predeterminado es de 3 segundos. + +**Ejemplo** + +``` xml +3 +``` + +## listen\_host {#server-settings-listen-host} + +Restricción en hosts de los que pueden provenir las solicitudes. Si desea que el servidor responda a todos ellos, especifique `::`. + +Ejemplos: + +``` xml +::1 +127.0.0.1 +``` + +## registrador {#server-settings-logger} + +Configuración de registro. + +Claves: + +- level – Nivel de registro. Valores aceptables: `trace`, `debug`, `information`, `warning`, `error`. +- log – El archivo de registro. Contiene todas las entradas según `level`. +- errorlog – Archivo de registro de errores. +- size – Tamaño del archivo. Se aplica a `log`y`errorlog`. Una vez que el archivo alcanza `size`, ClickHouse archiva y cambia el nombre, y crea un nuevo archivo de registro en su lugar. +- count: el número de archivos de registro archivados que almacena ClickHouse. + +**Ejemplo** + +``` xml + + trace + /var/log/clickhouse-server/clickhouse-server.log + /var/log/clickhouse-server/clickhouse-server.err.log + 1000M + 10 + +``` + +También se admite la escritura en el syslog. Config ejemplo: + +``` xml + + 1 + +
syslog.remote:10514
+ myhost.local + LOG_LOCAL6 + syslog +
+
+``` + +Claves: + +- use\_syslog — Ajuste requerido si desea escribir en el syslog. +- address — El host\[:port\] de syslogd. Si se omite, se utiliza el daemon local. +- hostname — Opcional. El nombre del host desde el que se envían los registros. +- instalación — [La palabra clave syslog instalación](https://en.wikipedia.org/wiki/Syslog#Facility) en letras mayúsculas con el “LOG\_” prefijo: (`LOG_USER`, `LOG_DAEMON`, `LOG_LOCAL3` y así sucesivamente). + Valor predeterminado: `LOG_USER` si `address` se especifica, `LOG_DAEMON otherwise.` +- format – Formato de mensaje. Valores posibles: `bsd` y `syslog.` + +## macro {#macros} + +Sustituciones de parámetros para tablas replicadas. + +Se puede omitir si no se utilizan tablas replicadas. + +Para obtener más información, consulte la sección “[Creación de tablas replicadas](../../operations/table_engines/replication.md)”. + +**Ejemplo** + +``` xml + +``` + +## Método de codificación de datos: {#server-mark-cache-size} + +Tamaño aproximado (en bytes) de la memoria caché de marcas utilizadas por los motores de [Método de codificación de datos:](../table_engines/mergetree.md) Familia. + +La memoria caché se comparte para el servidor y la memoria se asigna según sea necesario. El tamaño de la memoria caché debe ser al menos 5368709120. + +**Ejemplo** + +``` xml +5368709120 +``` + +## max\_concurrent\_queries {#max-concurrent-queries} + +El número máximo de solicitudes procesadas simultáneamente. + +**Ejemplo** + +``` xml +100 +``` + +## max\_connections {#max-connections} + +El número máximo de conexiones entrantes. + +**Ejemplo** + +``` xml +4096 +``` + +## max\_open\_files {#max-open-files} + +El número máximo de archivos abiertos. + +Predeterminada: `maximum`. + +Recomendamos usar esta opción en Mac OS X desde el `getrlimit()` función devuelve un valor incorrecto. + +**Ejemplo** + +``` xml +262144 +``` + +## max\_table\_size\_to\_drop {#max-table-size-to-drop} + +Restricción en la eliminación de tablas. + +Si el tamaño de un [Método de codificación de datos:](../table_engines/mergetree.md) más caliente `max_table_size_to_drop` (en bytes), no puede eliminarlo mediante una consulta DROP. + +Si aún necesita eliminar la tabla sin reiniciar el servidor ClickHouse, cree el `/flags/force_drop_table` y ejecute la consulta DROP. + +Valor predeterminado: 50 GB. + +El valor 0 significa que puede eliminar todas las tablas sin restricciones. + +**Ejemplo** + +``` xml +0 +``` + +## merge\_tree {#server-settings-merge-tree} + +Ajuste fino para tablas en el [Método de codificación de datos:](../table_engines/mergetree.md). + +Para obtener más información, vea MergeTreeSettings.h archivo de encabezado. + +**Ejemplo** + +``` xml + + 5 + +``` + +## openSSL {#server-settings-openssl} + +Configuración cliente/servidor SSL. + +El soporte para SSL es proporcionado por el `libpoco` biblioteca. La interfaz se describe en el archivo [Nombre de la red inalámbrica (SSID):h](https://github.com/ClickHouse-Extras/poco/blob/master/NetSSL_OpenSSL/include/Poco/Net/SSLManager.h) + +Claves para la configuración del servidor/cliente: + +- privateKeyFile: la ruta de acceso al archivo con la clave secreta del certificado PEM. El archivo puede contener una clave y un certificado al mismo tiempo. +- certificateFile: la ruta de acceso al archivo de certificado cliente/servidor en formato PEM. Puede omitirlo si `privateKeyFile` contiene el certificado. +- caConfig: la ruta de acceso al archivo o directorio que contiene certificados raíz de confianza. 
+- verificationMode: el método para verificar los certificados del nodo. Los detalles están en la descripción del [Contexto](https://github.com/ClickHouse-Extras/poco/blob/master/NetSSL_OpenSSL/include/Poco/Net/Context.h) Clase. Valores posibles: `none`, `relaxed`, `strict`, `once`. +- Profundidad de verificación: la longitud máxima de la cadena de verificación. La verificación fallará si la longitud de la cadena del certificado supera el valor establecido. +- loadDefaultCAFile: indica que se usarán certificados de CA integrados para OpenSSL. Valores aceptables: `true`, `false`. \| +- cipherList: encriptaciones compatibles con OpenSSL. Por ejemplo: `ALL:!ADH:!LOW:!EXP:!MD5:@STRENGTH`. +- cacheSessions: habilita o deshabilita las sesiones de almacenamiento en caché. Debe usarse en combinación con `sessionIdContext`. Valores aceptables: `true`, `false`. +- sessionIdContext: un conjunto único de caracteres aleatorios que el servidor agrega a cada identificador generado. La longitud de la cuerda no debe exceder `SSL_MAX_SSL_SESSION_ID_LENGTH`. Este parámetro siempre se recomienda ya que ayuda a evitar problemas tanto si el servidor almacena en caché la sesión como si el cliente solicita el almacenamiento en caché. Valor predeterminado: `${application.name}`. +- sessionCacheSize: el número máximo de sesiones que el servidor almacena en caché. Valor predeterminado: 1024\*20. 0 – Sesiones ilimitadas. +- sessionTimeout - Tiempo para almacenar en caché la sesión en el servidor. +- extendedVerification : la verificación extendida automáticamente de los certificados después de que finalice la sesión. Valores aceptables: `true`, `false`. +- requireTLSv1: requiere una conexión TLSv1. Valores aceptables: `true`, `false`. +- requireTLSv1\_1: requiere una conexión TLSv1.1. Valores aceptables: `true`, `false`. +- requireTLSv1: requiere una conexión TLSv1.2. Valores aceptables: `true`, `false`. +- fips: activa el modo FIPS OpenSSL. Se admite si la versión OpenSSL de la biblioteca admite FIPS. +- privateKeyPassphraseHandler: clase (subclase PrivateKeyPassphraseHandler) que solicita la frase de contraseña para acceder a la clave privada. Por ejemplo: ``, `KeyFileHandler`, `test`, ``. +- invalidCertificateHandler: clase (una subclase de CertificateHandler) para verificar certificados no válidos. Por ejemplo: ` ConsoleCertificateHandler ` . +- disableProtocols: protocolos que no pueden usarse. +- preferServerCiphers: cifras de servidor preferidas en el cliente. + +**Ejemplo de configuración:** + +``` xml + + + + /etc/clickhouse-server/server.crt + /etc/clickhouse-server/server.key + + /etc/clickhouse-server/dhparam.pem + none + true + true + sslv2,sslv3 + true + + + true + true + sslv2,sslv3 + true + + + + RejectCertificateHandler + + + +``` + +## part\_log {#server-settings-part-log} + +Registro de eventos asociados con [Método de codificación de datos:](../table_engines/mergetree.md). Por ejemplo, agregar o fusionar datos. Puede utilizar el registro para simular algoritmos de combinación y comparar sus características. Puede visualizar el proceso de fusión. + +Las consultas se registran en el [sistema.part\_log](../system_tables.md#system_tables-part-log) tabla, no en un archivo separado. Puede configurar el nombre de esta tabla en el `table` parámetro (ver más abajo). + +Utilice los siguientes parámetros para configurar el registro: + +- `database` – Nombre de la base de datos. +- `table` – Nombre de la tabla del sistema. 
+- `partition_by` – Establece un [clave de partición personalizada](../../operations/table_engines/custom_partitioning_key.md). +- `flush_interval_milliseconds` – Intervalo para el vaciado de datos desde el búfer en la memoria a la tabla. + +**Ejemplo** + +``` xml + + system +
part_log
+ toMonday(event_date) + 7500 + +``` + +## camino {#server-settings-path} + +La ruta de acceso al directorio que contiene los datos. + +!!! note "Nota" + La barra diagonal es obligatoria. + +**Ejemplo** + +``` xml +/var/lib/clickhouse/ +``` + +## query\_log {#server-settings-query-log} + +Configuración de las consultas de registro recibidas con [log\_queries=1](../settings/settings.md) configuración. + +Las consultas se registran en el [sistema.query\_log](../system_tables.md#system_tables-query_log) tabla, no en un archivo separado. Puede cambiar el nombre de la tabla en el `table` parámetro (ver más abajo). + +Utilice los siguientes parámetros para configurar el registro: + +- `database` – Nombre de la base de datos. +- `table` – Nombre de la tabla del sistema en la que se registrarán las consultas. +- `partition_by` – Establece un [clave de partición personalizada](../../operations/table_engines/custom_partitioning_key.md) para una mesa. +- `flush_interval_milliseconds` – Intervalo para el vaciado de datos desde el búfer en la memoria a la tabla. + +Si la tabla no existe, ClickHouse la creará. Si la estructura del registro de consultas cambió cuando se actualizó el servidor ClickHouse, se cambia el nombre de la tabla con la estructura anterior y se crea una nueva tabla automáticamente. + +**Ejemplo** + +``` xml + + system + query_log
+ toMonday(event_date) + 7500 +
+``` + +## Sistema abierto. {#server-settings-query-thread-log} + +Configuración de subprocesos de registro de consultas recibidas con [Log\_query\_threads = 1](../settings/settings.md#settings-log-query-threads) configuración. + +Las consultas se registran en el [sistema.Sistema abierto.](../system_tables.md#system_tables-query-thread-log) tabla, no en un archivo separado. Puede cambiar el nombre de la tabla en el `table` parámetro (ver más abajo). + +Utilice los siguientes parámetros para configurar el registro: + +- `database` – Nombre de la base de datos. +- `table` – Nombre de la tabla del sistema en la que se registrarán las consultas. +- `partition_by` – Establece un [clave de partición personalizada](../../operations/table_engines/custom_partitioning_key.md) para una tabla del sistema. +- `flush_interval_milliseconds` – Intervalo para el vaciado de datos desde el búfer en la memoria a la tabla. + +Si la tabla no existe, ClickHouse la creará. Si la estructura del registro de subprocesos de consulta cambió cuando se actualizó el servidor ClickHouse, se cambia el nombre de la tabla con la estructura anterior y se crea una nueva tabla automáticamente. + +**Ejemplo** + +``` xml + + system + query_thread_log
+ toMonday(event_date) + 7500 +
+``` + +## trace\_log {#server-settings-trace-log} + +Ajustes para el [trace\_log](../system_tables.md#system_tables-trace_log) operación de la tabla del sistema. + +Parámetros: + +- `database` — Base de datos para almacenar una tabla. +- `table` — Nombre de la tabla. +- `partition_by` — [Clave de partición personalizada](../../operations/table_engines/custom_partitioning_key.md) para una tabla del sistema. +- `flush_interval_milliseconds` — Intervalo para el vaciado de datos del búfer en la memoria a la tabla. + +El archivo de configuración del servidor predeterminado `config.xml` contiene la siguiente sección de configuración: + +``` xml + + system + trace_log
+ toYYYYMM(event_date) + 7500 +
+``` + +## query\_masking\_rules {#query-masking-rules} + +Reglas basadas en Regexp, que se aplicarán a las consultas, así como a todos los mensajes de registro antes de almacenarlos en los registros del servidor, +`system.query_log`, `system.text_log`, `system.processes` tabla, y en los registros enviados al cliente. Eso permite prevenir +fuga de datos sensible de consultas SQL (como nombres, correos electrónicos, +identificadores o números de tarjetas de crédito) a los registros. + +**Ejemplo** + +``` xml + + + hide SSN + (^|\D)\d{3}-\d{2}-\d{4}($|\D) + 000-00-0000 + + +``` + +Campos de configuración: +- `name` - nombre de la regla (opcional) +- `regexp` - Expresión regular compatible con RE2 (obligatoria) +- `replace` - cadena de sustitución para datos confidenciales (opcional, por defecto - seis asteriscos) + +Las reglas de enmascaramiento se aplican a toda la consulta (para evitar fugas de datos confidenciales de consultas mal formadas / no analizables). + +`system.events` la tabla tiene contador `QueryMaskingRulesMatch` que tienen un número total de coincidencias de reglas de enmascaramiento de consultas. + +Para consultas distribuidas, cada servidor debe configurarse por separado; de lo contrario, las subconsultas pasan a otros +los nodos se almacenarán sin enmascarar. + +## remote\_servers {#server-settings-remote-servers} + +Configuración de los clústeres utilizados por [Distribuido](../../operations/table_engines/distributed.md) motor de mesa y por el `cluster` función de la tabla. + +**Ejemplo** + +``` xml + +``` + +Para el valor de la `incl` atributo, consulte la sección “[Archivos de configuración](../configuration_files.md#configuration_files)”. + +**Ver también** + +- [skip\_unavailable\_shards](../settings/settings.md#settings-skip_unavailable_shards) + +## Zona horaria {#server-settings-timezone} + +La zona horaria del servidor. + +Especificado como un identificador de la IANA para la zona horaria UTC o la ubicación geográfica (por ejemplo, África/Abidjan). + +La zona horaria es necesaria para las conversiones entre los formatos String y DateTime cuando los campos DateTime se envían al formato de texto (impreso en la pantalla o en un archivo) y cuando se obtiene DateTime de una cadena. Además, la zona horaria se usa en funciones que funcionan con la hora y la fecha si no recibieron la zona horaria en los parámetros de entrada. + +**Ejemplo** + +``` xml +Europe/Moscow +``` + +## Tcp\_port {#server-settings-tcp-port} + +Puerto para comunicarse con clientes a través del protocolo TCP. + +**Ejemplo** + +``` xml +9000 +``` + +## Tcp\_port\_secure {#server-settings-tcp-port-secure} + +Puerto TCP para una comunicación segura con los clientes. Úselo con [OpenSSL](#server_settings-openssl) configuración. + +**Valores posibles** + +Entero positivo. + +**Valor predeterminado** + +``` xml +9440 +``` + +## mysql\_port {#server-settings-mysql-port} + +Puerto para comunicarse con clientes a través del protocolo MySQL. + +**Valores posibles** + +Entero positivo. + +Ejemplo + +``` xml +9004 +``` + +## tmp\_path {#server-settings-tmp-path} + +Ruta de acceso a datos temporales para procesar consultas grandes. + +!!! note "Nota" + La barra diagonal es obligatoria. + +**Ejemplo** + +``` xml +/var/lib/clickhouse/tmp/ +``` + +## tmp\_policy {#server-settings-tmp-policy} + +Política de [`storage_configuration`](../table_engines/mergetree.md#table_engine-mergetree-multiple-volumes) para almacenar archivos temporales. 
+Si no se establece [`tmp_path`](#server-settings-tmp_path) se utiliza, de lo contrario se ignora. + +!!! note "Nota" + - `move_factor` se ignora +- `keep_free_space_bytes` se ignora +- `max_data_part_size_bytes` se ignora +- debe tener exactamente un volumen en esa política + +## Uncompressed\_cache\_size {#server-settings-uncompressed-cache-size} + +Tamaño de la memoria caché (en bytes) para los datos sin comprimir utilizados por los motores de [Método de codificación de datos:](../table_engines/mergetree.md). + +Hay una caché compartida para el servidor. La memoria se asigna a pedido. La caché se usa si la opción [Use\_uncompressed\_cache](../settings/settings.md#setting-use_uncompressed_cache) está habilitado. + +La caché sin comprimir es ventajosa para consultas muy cortas en casos individuales. + +**Ejemplo** + +``` xml +8589934592 +``` + +## user\_files\_path {#server-settings-user-files-path} + +El directorio con archivos de usuario. Utilizado en la función de tabla [file()](../../query_language/table_functions/file.md). + +**Ejemplo** + +``` xml +/var/lib/clickhouse/user_files/ +``` + +## users\_config {#users-config} + +Ruta de acceso al archivo que contiene: + +- Configuraciones de usuario. +- Derechos de acceso. +- Perfiles de configuración. +- Configuración de cuota. + +**Ejemplo** + +``` xml +users.xml +``` + +## Zookeeper {#server-settings-zookeeper} + +Contiene la configuración que permite a ClickHouse interactuar con [ZooKeeper](http://zookeeper.apache.org/) Cluster. + +ClickHouse utiliza ZooKeeper para almacenar metadatos de réplicas cuando se utilizan tablas replicadas. Si no se utilizan tablas replicadas, se puede omitir esta sección de parámetros. + +Esta sección contiene los siguientes parámetros: + +- `node` — Punto final ZooKeeper. Puede establecer varios puntos finales. + + Por ejemplo: + + + +``` xml + + example_host + 2181 + +``` + + The `index` attribute specifies the node order when trying to connect to the ZooKeeper cluster. + +- `session_timeout` — Tiempo de espera máximo para la sesión del cliente en milisegundos. +- `root` — El [Znode](http://zookeeper.apache.org/doc/r3.5.5/zookeeperOver.html#Nodes+and+ephemeral+nodes) que se utiliza como la raíz de los znodes utilizados por el servidor ClickHouse. Opcional. +- `identity` — Usuario y contraseña, que puede ser requerido por ZooKeeper para dar acceso a los znodes solicitados. Opcional. + +**Ejemplo de configuración** + +``` xml + + + example1 + 2181 + + + example2 + 2181 + + 30000 + 10000 + + /path/to/zookeeper/node + + user:password + +``` + +**Ver también** + +- [Replicación](../../operations/table_engines/replication.md) +- [Guía del programador ZooKeeper](http://zookeeper.apache.org/doc/current/zookeeperProgrammers.html) + +## use\_minimalistic\_part\_header\_in\_zookeeper {#server-settings-use-minimalistic-part-header-in-zookeeper} + +Método de almacenamiento para encabezados de parte de datos en ZooKeeper. + +Esta configuración sólo se aplica a `MergeTree` Familia. Se puede especificar: + +- A nivel mundial en el [merge\_tree](#server_settings-merge_tree) sección de la `config.xml` file. + + ClickHouse utiliza la configuración para todas las tablas del servidor. Puede cambiar la configuración en cualquier momento. Las tablas existentes cambian su comportamiento cuando cambia la configuración. + +- Para cada tabla. + + Al crear una tabla, especifique la correspondiente [ajuste del motor](../table_engines/mergetree.md#table_engine-mergetree-creating-a-table). 
El comportamiento de una tabla existente con esta configuración no cambia, incluso si la configuración global cambia. + +**Valores posibles** + +- 0 — Funcionalidad está desactivada. +- 1 — Funcionalidad está activada. + +Si `use_minimalistic_part_header_in_zookeeper = 1`, entonces [repetición](../table_engines/replication.md) las tablas almacenan los encabezados de las partes de datos de forma compacta `znode`. Si la tabla contiene muchas columnas, este método de almacenamiento reduce significativamente el volumen de los datos almacenados en Zookeeper. + +!!! attention "Atención" + Después de aplicar `use_minimalistic_part_header_in_zookeeper = 1`, no puede degradar el servidor ClickHouse a una versión que no admite esta configuración. Tenga cuidado al actualizar ClickHouse en servidores de un clúster. No actualice todos los servidores a la vez. Es más seguro probar nuevas versiones de ClickHouse en un entorno de prueba o solo en unos pocos servidores de un clúster. + + Data part headers already stored with this setting can't be restored to their previous (non-compact) representation. + +**Valor predeterminado:** 0. + +## disable\_internal\_dns\_cache {#server-settings-disable-internal-dns-cache} + +Deshabilita la memoria caché DNS interna. Recomendado para operar ClickHouse en sistemas +con infraestructura que cambia frecuentemente como Kubernetes. + +**Valor predeterminado:** 0. + +## dns\_cache\_update\_period {#server-settings-dns-cache-update-period} + +El período de actualización de las direcciones IP almacenadas en la caché DNS interna de ClickHouse (en segundos). +La actualización se realiza de forma asíncrona, en un subproceso del sistema separado. + +**Valor predeterminado**: 15. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/server_settings/settings/) diff --git a/docs/es/operations/settings/constraints_on_settings.md b/docs/es/operations/settings/constraints_on_settings.md new file mode 100644 index 00000000000..941fc8fc693 --- /dev/null +++ b/docs/es/operations/settings/constraints_on_settings.md @@ -0,0 +1,68 @@ +# Restricciones en la configuración {#constraints-on-settings} + +Las restricciones en los ajustes se pueden definir en el `profiles` sección de la `user.xml` el archivo de configuración y prohíba a los usuarios cambiar algunos de los ajustes `SET` consulta. +Las restricciones se definen como las siguientes: + +``` xml + + + + + lower_boundary + + + upper_boundary + + + lower_boundary + upper_boundary + + + + + + + +``` + +Si el usuario intenta violar las restricciones, se produce una excepción y no se cambia la configuración. +Se admiten tres tipos de restricciones: `min`, `max`, `readonly`. El `min` y `max` Las restricciones especifican los límites superior e inferior para una configuración numérica y se pueden usar en combinación. El `readonly` constraint especifica que el usuario no puede cambiar la configuración correspondiente en absoluto. + +**Ejemplo:** Dejar `users.xml` incluye líneas: + +``` xml + + + 10000000000 + 0 + ... + + + 5000000000 + 20000000000 + + + + + + + +``` + +Las siguientes consultas arrojan excepciones: + +``` sql +SET max_memory_usage=20000000001; +SET max_memory_usage=4999999999; +SET force_index_by_date=1; +``` + +``` text +Code: 452, e.displayText() = DB::Exception: Setting max_memory_usage should not be greater than 20000000000. +Code: 452, e.displayText() = DB::Exception: Setting max_memory_usage should not be less than 5000000000. 
+Code: 452, e.displayText() = DB::Exception: Setting force_index_by_date should not be changed. +``` + +**Nota:** el `default` perfil tiene un manejo especial: todas las restricciones definidas para el `default` perfil se convierten en las restricciones predeterminadas, por lo que restringen todos los usuarios hasta que se anulan explícitamente para estos usuarios. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/settings/constraints_on_settings/) diff --git a/docs/es/operations/settings/index.md b/docs/es/operations/settings/index.md new file mode 100644 index 00000000000..5fcfe99fd3d --- /dev/null +++ b/docs/es/operations/settings/index.md @@ -0,0 +1,24 @@ +# Configuración {#settings} + +Hay varias formas de realizar todos los ajustes que se describen a continuación. +Los ajustes se configuran en capas, por lo que cada capa subsiguiente redefine los ajustes anteriores. + +Formas de configurar los ajustes, por orden de prioridad: + +- Ajustes en el `users.xml` archivo de configuración del servidor. + + Establecer en el elemento ``. + +- Configuración de la sesión. + + Enviar `SET setting=value` desde el cliente de consola ClickHouse en modo interactivo. + Del mismo modo, puede utilizar sesiones ClickHouse en el protocolo HTTP. Para hacer esto, debe especificar el `session_id` Parámetro HTTP. + +- Configuración de consulta. + + - Al iniciar el cliente de consola de ClickHouse en modo no interactivo, establezca el parámetro de inicio `--setting=value`. + - Al usar la API HTTP, pase los parámetros CGI (`URL?setting_1=value&setting_2=value...`). + +Los ajustes que solo se pueden realizar en el archivo de configuración del servidor no se tratan en esta sección. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/settings/) diff --git a/docs/es/operations/settings/permissions_for_queries.md b/docs/es/operations/settings/permissions_for_queries.md new file mode 100644 index 00000000000..50c64abf6bf --- /dev/null +++ b/docs/es/operations/settings/permissions_for_queries.md @@ -0,0 +1,54 @@ +# Permisos para consultas {#permissions-for-queries} + +Las consultas en ClickHouse se pueden dividir en varios tipos: + +1. Leer consultas de datos: `SELECT`, `SHOW`, `DESCRIBE`, `EXISTS`. +2. Escribir consultas de datos: `INSERT`, `OPTIMIZE`. +3. Cambiar la consulta de configuración: `SET`, `USE`. +4. [DDL](https://en.wikipedia.org/wiki/Data_definition_language) Consulta: `CREATE`, `ALTER`, `RENAME`, `ATTACH`, `DETACH`, `DROP` `TRUNCATE`. +5. `KILL QUERY`. + +La siguiente configuración regula los permisos de usuario según el tipo de consulta: + +- [sólo lectura](#settings_readonly) — Restringe los permisos para todos los tipos de consultas excepto las consultas DDL. +- [Método de codificación de datos:](#settings_allow_ddl) — Restringe los permisos para consultas DDL. + +`KILL QUERY` se puede realizar con cualquier configuración. + +## sólo lectura {#settings-readonly} + +Restringe los permisos para leer datos, escribir datos y cambiar las consultas de configuración. + +Vea cómo las consultas se dividen en tipos [Arriba](#permissions_for_queries). + +Valores posibles: + +- 0 — Se permiten todas las consultas. +- 1 — Sólo se permiten consultas de datos de lectura. +- 2 — Leer datos y cambiar las consultas de configuración están permitidos. + +Después de configurar `readonly = 1` el usuario no puede cambiar `readonly` y `allow_ddl` configuración en la sesión actual. 
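A modo de ilustración, un esquema mínimo de una sesión con `readonly = 1` (los mensajes de error exactos dependen de la versión del servidor y se muestran aquí solo de forma orientativa):

``` sql
SET readonly = 1;

-- Las consultas de solo lectura siguen permitidas:
SELECT count() FROM system.tables;

-- Cualquier intento de cambiar una configuración se rechaza con una excepción
-- de tipo READONLY, incluido el propio ajuste 'readonly':
SET max_threads = 8;
SET readonly = 0;
```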
+ +Cuando se utiliza el `GET` método en el [Interfaz HTTP](../../interfaces/http.md), `readonly = 1` se establece automáticamente. Para modificar los datos, `POST` método. + +Configuración `readonly = 1` prohibir al usuario cambiar todas las configuraciones. Hay una manera de prohibir al usuario +de cambiar sólo ajustes específicos, para más detalles ver [restricciones en la configuración](constraints_on_settings.md). + +Valor predeterminado: 0 + +## Método de codificación de datos: {#settings-allow-ddl} + +Permite o niega [DDL](https://en.wikipedia.org/wiki/Data_definition_language) consulta. + +Vea cómo las consultas se dividen en tipos [Arriba](#permissions_for_queries). + +Valores posibles: + +- 0 — No se permiten consultas DDL. +- 1 — Se permiten consultas DDL. + +No se puede ejecutar `SET allow_ddl = 1` si `allow_ddl = 0` para la sesión actual. + +Valor predeterminado: 1 + +[Artículo Original](https://clickhouse.tech/docs/es/operations/settings/permissions_for_queries/) diff --git a/docs/es/operations/settings/query_complexity.md b/docs/es/operations/settings/query_complexity.md new file mode 100644 index 00000000000..26579890ffa --- /dev/null +++ b/docs/es/operations/settings/query_complexity.md @@ -0,0 +1,294 @@ +# Restricciones en la complejidad de consultas {#restrictions-on-query-complexity} + +Las restricciones en la complejidad de la consulta forman parte de la configuración. +Se utilizan para proporcionar una ejecución más segura desde la interfaz de usuario. +Casi todas las restricciones solo se aplican a `SELECT`. Para el procesamiento de consultas distribuidas, las restricciones se aplican en cada servidor por separado. + +ClickHouse comprueba las restricciones para las partes de datos, no para cada fila. Significa que puede exceder el valor de restricción con el tamaño de la parte de datos. + +Restricciones en el “maximum amount of something” puede tomar el valor 0, lo que significa “unrestricted”. +La mayoría de las restricciones también tienen un ‘overflow\_mode’ establecer, lo que significa qué hacer cuando se excede el límite. +Puede tomar uno de dos valores: `throw` o `break`. Las restricciones en la agregación (group\_by\_overflow\_mode) también tienen el valor `any`. + +`throw` – Lanzar una excepción (por defecto). + +`break` – Detener la ejecución de la consulta y devolver el resultado parcial, como si los datos de origen se agotaron. + +`any (only for group_by_overflow_mode)` – Continuar la agregación de las claves que se metieron en el conjunto, pero no añadir nuevas claves al conjunto. + +## Método de codificación de datos: {#settings-max-memory-usage} + +La cantidad máxima de RAM que se utiliza para ejecutar una consulta en un único servidor. + +En el archivo de configuración predeterminado, el máximo es de 10 GB. + +El ajuste no tiene en cuenta el volumen de memoria disponible o el volumen total de memoria en la máquina. +La restricción se aplica a una sola consulta dentro de un único servidor. +Usted puede utilizar `SHOW PROCESSLIST` para ver el consumo de memoria actual para cada consulta. +Además, el consumo máximo de memoria se rastrea para cada consulta y se escribe en el registro. + +El uso de memoria no se supervisa para los estados de ciertas funciones agregadas. + +El uso de memoria no se realiza un seguimiento completo de los estados de las funciones agregadas `min`, `max`, `any`, `anyLast`, `argMin`, `argMax` de `String` y `Array` argumento. 
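Como esbozo orientativo, así se puede limitar la memoria de una sola consulta y revisar después el consumo registrado (los umbrales concretos dependen del hardware y de la versión):

``` sql
-- Límite de ~1 GB para la consulta actual (el valor se indica en bytes).
SET max_memory_usage = 1000000000;

-- Una agregación con decenas de millones de claves únicas puede superar este
-- límite y terminar con una excepción de tipo MEMORY_LIMIT_EXCEEDED.
SELECT number AS k, count()
FROM numbers(100000000)
GROUP BY k
FORMAT Null;

-- El consumo máximo de cada consulta queda registrado y puede revisarse,
-- por ejemplo, en la columna memory_usage de system.query_log.
```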
+ +El consumo de memoria también está restringido por los parámetros `max_memory_usage_for_user` y `max_memory_usage_for_all_queries`. + +## Max\_memory\_usage\_for\_user {#max-memory-usage-for-user} + +La cantidad máxima de RAM que se utilizará para ejecutar las consultas de un usuario en un único servidor. + +Los valores predeterminados se definen en [Configuración.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Core/Settings.h#L288). De forma predeterminada, el importe no está restringido (`max_memory_usage_for_user = 0`). + +Ver también la descripción de [Método de codificación de datos:](#settings_max_memory_usage). + +## Todos los derechos reservados. {#max-memory-usage-for-all-queries} + +La cantidad máxima de RAM que se utilizará para ejecutar todas las consultas en un único servidor. + +Los valores predeterminados se definen en [Configuración.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Core/Settings.h#L289). De forma predeterminada, el importe no está restringido (`max_memory_usage_for_all_queries = 0`). + +Ver también la descripción de [Método de codificación de datos:](#settings_max_memory_usage). + +## ¿Qué puedes encontrar en Neodigit {#max-rows-to-read} + +Las siguientes restricciones se pueden verificar en cada bloque (en lugar de en cada fila). Es decir, las restricciones se pueden romper un poco. +Al ejecutar una consulta en varios subprocesos, las siguientes restricciones se aplican a cada subproceso por separado. + +Un número máximo de filas que se pueden leer de una tabla al ejecutar una consulta. + +## ¿Qué puedes encontrar en Neodigit {#max-bytes-to-read} + +Un número máximo de bytes (datos sin comprimir) que se pueden leer de una tabla al ejecutar una consulta. + +## Método de codificación de datos: {#read-overflow-mode} + +Qué hacer cuando el volumen de datos leídos excede uno de los límites: ‘throw’ o ‘break’. Por defecto, tirar. + +## Método de codificación de datos: {#settings-max-rows-to-group-by} + +Un número máximo de claves únicas recibidas de la agregación. Esta configuración le permite limitar el consumo de memoria al agregar. + +## Grupo\_by\_overflow\_mode {#group-by-overflow-mode} + +Qué hacer cuando el número de claves únicas para la agregación excede el límite: ‘throw’, ‘break’, o ‘any’. Por defecto, tirar. +Uso de la ‘any’ valor le permite ejecutar una aproximación de GROUP BY. La calidad de esta aproximación depende de la naturaleza estadística de los datos. + +## max\_bytes\_before\_external\_group\_by {#settings-max-bytes-before-external-group-by} + +Habilita o deshabilita la ejecución de `GROUP BY` en la memoria externa. Ver [GRUPO POR en memoria externa](../../query_language/select.md#select-group-by-in-external-memory). + +Valores posibles: + +- Volumen máximo de RAM (en bytes) que puede ser utilizado por el único [GRUPO POR](../../query_language/select.md#select-group-by-clause) operación. +- Cero — `GROUP BY` en la memoria externa deshabilitada. + +Valor predeterminado: 0. + +## Método de codificación de datos: {#max-rows-to-sort} + +Un número máximo de filas antes de ordenar. Esto le permite limitar el consumo de memoria al ordenar. + +## Método de codificación de datos: {#max-bytes-to-sort} + +Un número máximo de bytes antes de ordenar. + +## sort\_overflow\_mode {#sort-overflow-mode} + +Qué hacer si el número de filas recibidas antes de ordenar excede uno de los límites: ‘throw’ o ‘break’. Por defecto, tirar. 
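Un esquema mínimo de cómo se combinan los límites de ordenación con el modo de desbordamiento (los valores son solo ilustrativos):

``` sql
-- Dejar de ordenar al alcanzar un millón de filas y devolver un resultado
-- parcial en lugar de lanzar una excepción.
SET max_rows_to_sort = 1000000;
SET sort_overflow_mode = 'break';

SELECT number
FROM numbers(10000000)
ORDER BY number DESC
FORMAT Null;

-- Con sort_overflow_mode = 'throw' (valor por defecto), la misma consulta
-- terminaría con una excepción al superar max_rows_to_sort.
```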
+ +## max\_result\_rows {#setting-max-result-rows} + +Límite en el número de filas en el resultado. También se comprueba si hay subconsultas y en servidores remotos cuando se ejecutan partes de una consulta distribuida. + +## max\_result\_bytes {#max-result-bytes} + +Límite en el número de bytes en el resultado. Lo mismo que el ajuste anterior. + +## result\_overflow\_mode {#result-overflow-mode} + +Qué hacer si el volumen del resultado excede uno de los límites: ‘throw’ o ‘break’. Por defecto, tirar. + +Utilizar ‘break’ es similar a un límite de usar. `Break` interrumpe la ejecución sólo en el nivel de bloque. Esto significa que la cantidad de filas devueltas es mayor que [max\_result\_rows](#setting-max_result_rows), múltiplo de [max\_block\_size](settings.md#setting-max_block_size) y depende de [max\_threads](settings.md#settings-max_threads). + +Ejemplo: + +``` sql +SET max_threads = 3, max_block_size = 3333; +SET max_result_rows = 3334, result_overflow_mode = 'break'; + +SELECT * +FROM numbers_mt(100000) +FORMAT Null; +``` + +Resultado: + +``` text +6666 rows in set. ... +``` + +## max\_execution\_time {#max-execution-time} + +Tiempo máximo de ejecución de la consulta en segundos. +En este momento, no se comprueba una de las etapas de clasificación, o al fusionar y finalizar funciones agregadas. + +## timeout\_overflow\_mode {#timeout-overflow-mode} + +Qué hacer si la consulta se ejecuta más de ‘max\_execution\_time’: ‘throw’ o ‘break’. Por defecto, tirar. + +## Método de codificación de datos: {#min-execution-speed} + +Velocidad de ejecución mínima en filas por segundo. Comprobado en cada bloque de datos cuando ‘timeout\_before\_checking\_execution\_speed’ expirar. Si la velocidad de ejecución es menor, se produce una excepción. + +## Todos los derechos reservados. {#min-execution-speed-bytes} + +Un número mínimo de bytes de ejecución por segundo. Comprobado en cada bloque de datos cuando ‘timeout\_before\_checking\_execution\_speed’ expirar. Si la velocidad de ejecución es menor, se produce una excepción. + +## Max\_execution\_speed {#max-execution-speed} + +Un número máximo de filas de ejecución por segundo. Comprobado en cada bloque de datos cuando ‘timeout\_before\_checking\_execution\_speed’ expirar. Si la velocidad de ejecución es alta, la velocidad de ejecución se reducirá. + +## Max\_execution\_speed\_bytes {#max-execution-speed-bytes} + +Un número máximo de bytes de ejecución por segundo. Comprobado en cada bloque de datos cuando ‘timeout\_before\_checking\_execution\_speed’ expirar. Si la velocidad de ejecución es alta, la velocidad de ejecución se reducirá. + +## Tiempo de espera antes de comprobar\_ejecución\_velocidad {#timeout-before-checking-execution-speed} + +Comprueba que la velocidad de ejecución no sea demasiado lenta (no menos de ‘min\_execution\_speed’), después de que el tiempo especificado en segundos haya expirado. + +## Max\_columns\_to\_read {#max-columns-to-read} + +Un número máximo de columnas que se pueden leer de una tabla en una sola consulta. Si una consulta requiere leer un mayor número de columnas, produce una excepción. + +## max\_temporary\_columns {#max-temporary-columns} + +Un número máximo de columnas temporales que se deben mantener en la memoria RAM al mismo tiempo cuando se ejecuta una consulta, incluidas las columnas constantes. Si hay más columnas temporales que esto, arroja una excepción. 
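Para ilustrar los límites sobre columnas descritos arriba, un esbozo con una tabla hipotética `wide_table` (tanto el nombre de la tabla como las columnas `col_a` y `col_b` son solo un ejemplo):

``` sql
SET max_columns_to_read = 10;

-- Si wide_table tuviera cientos de columnas, esta consulta se rechazaría
-- con una excepción por superar max_columns_to_read:
SELECT * FROM wide_table LIMIT 1;

-- Una consulta que lee pocas columnas no se ve afectada:
SELECT col_a, col_b FROM wide_table LIMIT 1;
```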
+ +## max\_temporary\_non\_const\_columns {#max-temporary-non-const-columns} + +Lo mismo que ‘max\_temporary\_columns’, pero sin contar columnas constantes. +Tenga en cuenta que las columnas constantes se forman con bastante frecuencia cuando se ejecuta una consulta, pero requieren aproximadamente cero recursos informáticos. + +## max\_subquery\_depth {#max-subquery-depth} + +Profundidad máxima de anidamiento de subconsultas. Si las subconsultas son más profundas, se produce una excepción. De forma predeterminada, 100. + +## max\_pipeline\_depth {#max-pipeline-depth} + +Profundidad máxima de la tubería. Corresponde al número de transformaciones que realiza cada bloque de datos durante el procesamiento de consultas. Contado dentro de los límites de un único servidor. Si la profundidad de la canalización es mayor, se produce una excepción. Por defecto, 1000. + +## max\_ast\_depth {#max-ast-depth} + +Profundidad máxima de anidamiento de un árbol sintáctico de consulta. Si se supera, se produce una excepción. +En este momento, no se comprueba durante el análisis, sino solo después de analizar la consulta. Es decir, se puede crear un árbol sintáctico demasiado profundo durante el análisis, pero la consulta fallará. Por defecto, 1000. + +## max\_ast\_elements {#max-ast-elements} + +Un número máximo de elementos en un árbol sintáctico de consulta. Si se supera, se produce una excepción. +De la misma manera que la configuración anterior, se verifica solo después de analizar la consulta. De forma predeterminada, 50.000. + +## Método de codificación de datos: {#max-rows-in-set} + +Un número máximo de filas para un conjunto de datos en la cláusula IN creada a partir de una subconsulta. + +## Método de codificación de datos: {#max-bytes-in-set} + +Número máximo de bytes (datos sin comprimir) utilizados por un conjunto en la cláusula IN creada a partir de una subconsulta. + +## set\_overflow\_mode {#set-overflow-mode} + +Qué hacer cuando la cantidad de datos excede uno de los límites: ‘throw’ o ‘break’. Por defecto, tirar. + +## Método de codificación de datos: {#max-rows-in-distinct} + +Un número máximo de filas diferentes al usar DISTINCT. + +## Método de codificación de datos: {#max-bytes-in-distinct} + +Un número máximo de bytes utilizados por una tabla hash cuando se utiliza DISTINCT. + +## distinct\_overflow\_mode {#distinct-overflow-mode} + +Qué hacer cuando la cantidad de datos excede uno de los límites: ‘throw’ o ‘break’. Por defecto, tirar. + +## max\_rows\_to\_transfer {#max-rows-to-transfer} + +Un número máximo de filas que se pueden pasar a un servidor remoto o guardar en una tabla temporal cuando se utiliza GLOBAL IN. + +## max\_bytes\_to\_transfer {#max-bytes-to-transfer} + +Un número máximo de bytes (datos sin comprimir) que se pueden pasar a un servidor remoto o guardar en una tabla temporal cuando se utiliza GLOBAL IN. + +## transfer\_overflow\_mode {#transfer-overflow-mode} + +Qué hacer cuando la cantidad de datos excede uno de los límites: ‘throw’ o ‘break’. Por defecto, tirar. + +## Método de codificación de datos: {#settings-max-rows-in-join} + +Limita el número de filas de la tabla hash que se utiliza al unir tablas. + +Esta configuración se aplica a [SELECCIONAR … UNIRSE](../../query_language/select.md#select-join) operaciones y la [Unir](../table_engines/join.md) motor de mesa. + +Si una consulta contiene varias combinaciones, ClickHouse comprueba esta configuración para cada resultado intermedio. + +ClickHouse puede proceder con diferentes acciones cuando se alcanza el límite. 
Utilice el [join\_overflow\_mode](#settings-join_overflow_mode) configuración para elegir la acción. + +Valores posibles: + +- Entero positivo. +- 0 — Número ilimitado de filas. + +Valor predeterminado: 0. + +## Método de codificación de datos: {#settings-max-bytes-in-join} + +Limita el tamaño en bytes de la tabla hash utilizada al unir tablas. + +Esta configuración se aplica a [SELECCIONAR … UNIRSE](../../query_language/select.md#select-join) operaciones y [Unirse al motor de tabla](../table_engines/join.md). + +Si la consulta contiene combinaciones, ClickHouse comprueba esta configuración para cada resultado intermedio. + +ClickHouse puede proceder con diferentes acciones cuando se alcanza el límite. Utilizar [join\_overflow\_mode](#settings-join_overflow_mode) para elegir la acción. + +Valores posibles: + +- Entero positivo. +- 0 — El control de memoria está desactivado. + +Valor predeterminado: 0. + +## join\_overflow\_mode {#settings-join-overflow-mode} + +Define qué acción realiza ClickHouse cuando se alcanza cualquiera de los siguientes límites de combinación: + +- [Método de codificación de datos:](#settings-max_bytes_in_join) +- [Método de codificación de datos:](#settings-max_rows_in_join) + +Valores posibles: + +- `THROW` - ClickHouse lanza una excepción y rompe la operación. +- `BREAK` - ClickHouse interrumpe la operación y no lanza una excepción. + +Valor predeterminado: `THROW`. + +**Ver también** + +- [Cláusula JOIN](../../query_language/select.md#select-join) +- [Unirse al motor de tabla](../table_engines/join.md) + +## max\_partitions\_per\_insert\_block {#max-partitions-per-insert-block} + +Limita el número máximo de particiones en un único bloque insertado. + +- Entero positivo. +- 0 — número Ilimitado de particiones. + +Valor predeterminado: 100. + +**Detalles** + +Al insertar datos, ClickHouse calcula el número de particiones en el bloque insertado. Si el número de particiones es mayor que `max_partitions_per_insert_block`, ClickHouse lanza una excepción con el siguiente texto: + +> “Too many partitions for single INSERT block (more than” ¿Cómo puedo hacerlo? “). The limit is controlled by ‘max\_partitions\_per\_insert\_block’ setting. A large number of partitions is a common misconception. It will lead to severe negative performance impact, including slow server startup, slow INSERT queries and slow SELECT queries. Recommended total number of partitions for a table is under 1000..10000. Please note, that partitioning is not intended to speed up SELECT queries (ORDER BY key is sufficient to make range queries fast). Partitions are intended for data manipulation (DROP PARTITION, etc).” + +[Artículo Original](https://clickhouse.tech/docs/es/operations/settings/query_complexity/) diff --git a/docs/es/operations/settings/settings.md b/docs/es/operations/settings/settings.md new file mode 100644 index 00000000000..ee43904d353 --- /dev/null +++ b/docs/es/operations/settings/settings.md @@ -0,0 +1,1184 @@ +# Configuración {#settings} + +## distributed\_product\_mode {#distributed-product-mode} + +Cambia el comportamiento de [subconsultas distribuidas](../../query_language/select.md). + +ClickHouse aplica esta configuración cuando la consulta contiene el producto de tablas distribuidas, es decir, cuando la consulta para una tabla distribuida contiene una subconsulta no GLOBAL para la tabla distribuida. + +Restricción: + +- Solo se aplica para las subconsultas IN y JOIN. +- Solo si la sección FROM utiliza una tabla distribuida que contiene más de un fragmento. 
+- Si la subconsulta se refiere a una tabla distribuida que contiene más de un fragmento. +- No se usa para un valor de tabla [remoto](../../query_language/table_functions/remote.md) función. + +Valores posibles: + +- `deny` — Valor predeterminado. Prohíbe el uso de estos tipos de subconsultas (devuelve el “Double-distributed in/JOIN subqueries is denied” Salvedad). +- `local` — Sustituye la base de datos y la tabla de la subconsulta por locales para el servidor de destino (fragmento), dejando `IN`/`JOIN.` +- `global` — Sustituye el `IN`/`JOIN` Consulta con `GLOBAL IN`/`GLOBAL JOIN.` +- `allow` — Permite la utilización de este tipo de subconsultas. + +## enable\_optimize\_predicate\_expression {#enable-optimize-predicate-expression} + +Activa el pushdown de predicado en `SELECT` consulta. + +La extracción de predicados puede reducir significativamente el tráfico de red para consultas distribuidas. + +Valores posibles: + +- 0 — Desactivado. +- 1 — Habilitado. + +Valor predeterminado: 1. + +Uso + +Considere las siguientes consultas: + +1. `SELECT count() FROM test_table WHERE date = '2018-10-10'` +2. `SELECT count() FROM (SELECT * FROM test_table) WHERE date = '2018-10-10'` + +Si `enable_optimize_predicate_expression = 1`, entonces el tiempo de ejecución de estas consultas es igual porque se aplica ClickHouse `WHERE` a la subconsulta al procesarla. + +Si `enable_optimize_predicate_expression = 0`, entonces el tiempo de ejecución de la segunda consulta es mucho más largo, porque el `WHERE` cláusula se aplica a todos los datos después de que finalice la subconsulta. + +## fallback\_to\_stale\_replicas\_for\_distributed\_queries {#settings-fallback-to-stale-replicas-for-distributed-queries} + +Fuerza una consulta a una réplica obsoleta si los datos actualizados no están disponibles. Ver [Replicación](../table_engines/replication.md). + +ClickHouse selecciona la más relevante de las réplicas obsoletas de la tabla. + +Se utiliza al realizar `SELECT` desde una tabla distribuida que apunta a tablas replicadas. + +De forma predeterminada, 1 (habilitado). + +## Fecha de nacimiento {#settings-force-index-by-date} + +Deshabilita la ejecución de consultas si el índice no se puede usar por fecha. + +Funciona con tablas de la familia MergeTree. + +Si `force_index_by_date=1`, ClickHouse comprueba si la consulta tiene una condición de clave de fecha que se puede usar para restringir intervalos de datos. Si no hay una condición adecuada, arroja una excepción. Sin embargo, no comprueba si la condición reduce la cantidad de datos a leer. Por ejemplo, la condición `Date != ' 2000-01-01 '` es aceptable incluso cuando coincide con todos los datos de la tabla (es decir, ejecutar la consulta requiere un escaneo completo). Para obtener más información acerca de los intervalos de datos en las tablas MergeTree, vea [Método de codificación de datos:](../table_engines/mergetree.md). + +## force\_primary\_key {#force-primary-key} + +Deshabilita la ejecución de consultas si no es posible la indexación mediante la clave principal. + +Funciona con tablas de la familia MergeTree. + +Si `force_primary_key=1`, ClickHouse comprueba si la consulta tiene una condición de clave principal que se puede usar para restringir rangos de datos. Si no hay una condición adecuada, arroja una excepción. Sin embargo, no comprueba si la condición reduce la cantidad de datos a leer. Para obtener más información acerca de los intervalos de datos en las tablas MergeTree, consulte [Método de codificación de datos:](../table_engines/mergetree.md). 
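Como ejemplo orientativo de ambos ajustes, supongamos una tabla hipotética `visits` de la familia MergeTree con `PARTITION BY toYYYYMM(Date)` y `ORDER BY (Date, UserID)`; el nombre de la tabla y sus columnas (`Date`, `UserID`, `RegionID`) son solo ilustrativos:

``` sql
SET force_index_by_date = 1;
SET force_primary_key = 1;

-- Permitida: la condición sobre la clave de fecha y la clave principal
-- acota los rangos de datos a leer.
SELECT count() FROM visits WHERE Date = '2020-01-01';

-- Rechazada con una excepción: no hay ninguna condición utilizable sobre la
-- fecha ni sobre la clave principal, por lo que haría falta un escaneo completo.
SELECT count() FROM visits WHERE RegionID = 123;
```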
+ +## Formato\_esquema {#format-schema} + +Este parámetro es útil cuando se utilizan formatos que requieren una definición de esquema, como [Cap'n Proto](https://capnproto.org/) o [Protobuf](https://developers.google.com/protocol-buffers/). El valor depende del formato. + +## fsync\_metadata {#fsync-metadata} + +Habilita o deshabilita [fsync](http://pubs.opengroup.org/onlinepubs/9699919799/functions/fsync.html) al escribir `.sql` file. Habilitado de forma predeterminada. + +Tiene sentido desactivarlo si el servidor tiene millones de pequeñas tablas que se crean y destruyen constantemente. + +## enable\_http\_compression {#settings-enable-http-compression} + +Habilita o deshabilita la compresión de datos en la respuesta a una solicitud HTTP. + +Para obtener más información, lea el [Descripción de la interfaz HTTP](../../interfaces/http.md). + +Valores posibles: + +- 0 — Desactivado. +- 1 — Habilitado. + +Valor predeterminado: 0. + +## http\_zlib\_compression\_level {#settings-http-zlib-compression-level} + +Establece el nivel de compresión de datos en la respuesta a una solicitud HTTP si [enable\_http\_compression = 1](#settings-enable_http_compression). + +Valores posibles: Números del 1 al 9. + +Valor predeterminado: 3. + +## http\_native\_compression\_disable\_checksumming\_on\_decompress {#settings-http-native-compression-disable-checksumming-on-decompress} + +Habilita o deshabilita la verificación de suma de comprobación al descomprimir los datos HTTP POST del cliente. Se usa solo para el formato de compresión nativa ClickHouse (no se usa con `gzip` o `deflate`). + +Para obtener más información, lea el [Descripción de la interfaz HTTP](../../interfaces/http.md). + +Valores posibles: + +- 0 — Desactivado. +- 1 — Habilitado. + +Valor predeterminado: 0. + +## send\_progress\_in\_http\_headers {#settings-send-progress-in-http-headers} + +Habilita o deshabilita `X-ClickHouse-Progress` Encabezados de respuesta HTTP en `clickhouse-server` respuesta. + +Para obtener más información, lea el [Descripción de la interfaz HTTP](../../interfaces/http.md). + +Valores posibles: + +- 0 — Desactivado. +- 1 — Habilitado. + +Valor predeterminado: 0. + +## Nombre de la red inalámbrica (SSID): {#setting-max-http-get-redirects} + +Limita el número máximo de saltos de redirección HTTP GET para [URL](../table_engines/url.md)-mesas de motor. La configuración se aplica a ambos tipos de tablas: las creadas por [CREAR TABLA](../../query_language/create/#create-table-query) Consulta y por el [URL](../../query_language/table_functions/url.md) función de la tabla. + +Valores posibles: + +- Cualquier número entero positivo de saltos. +- 0 — No se permiten saltos. + +Valor predeterminado: 0. + +## Entrada\_format\_allow\_errors\_num {#settings-input-format-allow-errors-num} + +Establece el número máximo de errores aceptables al leer desde formatos de texto (CSV, TSV, etc.). + +El valor predeterminado es 0. + +Siempre emparejarlo con `input_format_allow_errors_ratio`. + +Si se produjo un error al leer filas, pero el contador de errores sigue siendo menor que `input_format_allow_errors_num`, ClickHouse ignora la fila y pasa a la siguiente. + +Si ambos `input_format_allow_errors_num` y `input_format_allow_errors_ratio` se exceden, ClickHouse lanza una excepción. + +## Entrada\_format\_allow\_errors\_ratio {#settings-input-format-allow-errors-ratio} + +Establece el porcentaje máximo de errores permitidos al leer desde formatos de texto (CSV, TSV, etc.). 
+El porcentaje de errores se establece como un número de punto flotante entre 0 y 1. + +El valor predeterminado es 0. + +Siempre emparejarlo con `input_format_allow_errors_num`. + +Si se produjo un error al leer filas, pero el contador de errores sigue siendo menor que `input_format_allow_errors_ratio`, ClickHouse ignora la fila y pasa a la siguiente. + +Si ambos `input_format_allow_errors_num` y `input_format_allow_errors_ratio` se exceden, ClickHouse lanza una excepción. + +## input\_format\_values\_interpret\_expressions {#settings-input-format-values-interpret-expressions} + +Habilita o deshabilita el analizador SQL completo si el analizador de flujo rápido no puede analizar los datos. Esta configuración sólo se utiliza para [Valor](../../interfaces/formats.md#data-format-values) formato en la inserción de datos. Para obtener más información sobre el análisis de sintaxis, consulte [Sintaxis](../../query_language/syntax.md) apartado. + +Valores posibles: + +- 0 — Desactivado. + + En este caso, debe proporcionar datos con formato. Ver el [Formato](../../interfaces/formats.md) apartado. + +- 1 — Habilitado. + + En este caso, puede usar una expresión SQL como valor, pero la inserción de datos es mucho más lenta de esta manera. Si inserta solo datos con formato, ClickHouse se comporta como si el valor de configuración fuera 0. + +Valor predeterminado: 1. + +Ejemplo de uso + +Inserte el [FechaHora](../../data_types/datetime.md) valor de tipo con los diferentes ajustes. + +``` sql +SET input_format_values_interpret_expressions = 0; +INSERT INTO datetime_t VALUES (now()) +``` + +``` text +Exception on client: +Code: 27. DB::Exception: Cannot parse input: expected ) before: now()): (at row 1) +``` + +``` sql +SET input_format_values_interpret_expressions = 1; +INSERT INTO datetime_t VALUES (now()) +``` + +``` text +Ok. +``` + +La última consulta es equivalente a la siguiente: + +``` sql +SET input_format_values_interpret_expressions = 0; +INSERT INTO datetime_t SELECT now() +``` + +``` text +Ok. +``` + +## input\_format\_values\_deduce\_templates\_of\_expressions {#settings-input-format-values-deduce-templates-of-expressions} + +Habilita o deshabilita la deducción de plantilla para expresiones SQL en [Valor](../../interfaces/formats.md#data-format-values) formato. Permite analizar e interpretar expresiones en `Values` mucho más rápido si las expresiones en filas consecutivas tienen la misma estructura. ClickHouse intentará deducir la plantilla de una expresión, analizar las siguientes filas utilizando esta plantilla y evaluar la expresión en un lote de filas analizadas correctamente. Para la siguiente consulta: + +``` sql +INSERT INTO test VALUES (lower('Hello')), (lower('world')), (lower('INSERT')), (upper('Values')), ... 
+``` + +- si `input_format_values_interpret_expressions=1` y `format_values_deduce_templates_of_expressions=0` Las expresiones se interpretarán por separado para cada fila (esto es muy lento para un gran número de filas) +- si `input_format_values_interpret_expressions=0` y `format_values_deduce_templates_of_expressions=1` Las expresiones en la primera, segunda y tercera filas se analizarán usando la plantilla `lower(String)` e interpretados juntos, la expresión es la cuarta fila se analizará con otra plantilla (`upper(String)`) +- si `input_format_values_interpret_expressions=1` y `format_values_deduce_templates_of_expressions=1` - igual que en el caso anterior, pero también permite la alternativa a la interpretación de expresiones por separado si no es posible deducir la plantilla. + +Habilitado de forma predeterminada. + +## Entrada\_format\_values\_accurate\_types\_of\_literals {#settings-input-format-values-accurate-types-of-literals} + +Esta configuración sólo se utiliza cuando `input_format_values_deduce_templates_of_expressions = 1`. Puede suceder que las expresiones para alguna columna tengan la misma estructura, pero contengan literales numéricos de diferentes tipos, por ejemplo + +``` sql +(..., abs(0), ...), -- UInt64 literal +(..., abs(3.141592654), ...), -- Float64 literal +(..., abs(-1), ...), -- Int64 literal +``` + +Cuando esta configuración está habilitada, ClickHouse comprobará el tipo real de literal y utilizará una plantilla de expresión del tipo correspondiente. En algunos casos, puede ralentizar significativamente la evaluación de expresiones en `Values`. +Cuando está deshabilitado, ClickHouse puede usar un tipo más general para algunos literales (por ejemplo, `Float64` o `Int64` es lugar de `UInt64` para `42`), pero puede causar problemas de desbordamiento y precisión. +Habilitado de forma predeterminada. + +## Entrada\_format\_defaults\_for\_omitted\_fields {#session-settings-input-format-defaults-for-omitted-fields} + +Al realizar `INSERT` consultas, reemplace los valores de columna de entrada omitidos con valores predeterminados de las columnas respectivas. Esta opción sólo se aplica a [JSONEachRow](../../interfaces/formats.md#jsoneachrow), [CSV](../../interfaces/formats.md#csv) y [TabSeparated](../../interfaces/formats.md#tabseparated) formato. + +!!! note "Nota" + Cuando esta opción está habilitada, los metadatos de la tabla extendida se envían del servidor al cliente. Consume recursos informáticos adicionales en el servidor y puede reducir el rendimiento. + +Valores posibles: + +- 0 — Desactivado. +- 1 — Habilitado. + +Valor predeterminado: 1. + +## input\_format\_tsv\_empty\_as\_default {#settings-input-format-tsv-empty-as-default} + +Cuando esté habilitado, reemplace los campos de entrada vacíos en TSV con valores predeterminados. Para expresiones predeterminadas complejas `input_format_defaults_for_omitted_fields` debe estar habilitado también. + +Deshabilitado de forma predeterminada. + +## input\_format\_null\_as\_default {#settings-input-format-null-as-default} + +Habilita o deshabilita el uso de valores predeterminados si los datos de entrada `NULL`, pero el tipo de datos de la columna correspondiente en no `Nullable(T)` (para formatos de entrada de texto). + +## input\_format\_skip\_unknown\_fields {#settings-input-format-skip-unknown-fields} + +Habilita o deshabilita omitir la inserción de datos adicionales. + +Al escribir datos, ClickHouse produce una excepción si los datos de entrada contienen columnas que no existen en la tabla de destino. 
Si la omisión está habilitada, ClickHouse no inserta datos adicionales y no produce una excepción. + +Formatos soportados: + +- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) +- [CSVWithNames](../../interfaces/formats.md#csvwithnames) +- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames) +- [MOKOENA](../../interfaces/formats.md#tskv) + +Valores posibles: + +- 0 — Desactivado. +- 1 — Habilitado. + +Valor predeterminado: 0. + +## Entrada\_format\_import\_nested\_json {#settings-input-format-import-nested-json} + +Habilita o deshabilita la inserción de datos JSON con objetos anidados. + +Formatos soportados: + +- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) + +Valores posibles: + +- 0 — Desactivado. +- 1 — Habilitado. + +Valor predeterminado: 0. + +Ver también: + +- [Uso de estructuras anidadas](../../interfaces/formats.md#jsoneachrow-nested) con el `JSONEachRow` formato. + +## Entrada\_format\_with\_names\_use\_header {#settings-input-format-with-names-use-header} + +Habilita o deshabilita la comprobación del orden de las columnas al insertar datos. + +Para mejorar el rendimiento de la inserción, se recomienda deshabilitar esta comprobación si está seguro de que el orden de columna de los datos de entrada es el mismo que en la tabla de destino. + +Formatos soportados: + +- [CSVWithNames](../../interfaces/formats.md#csvwithnames) +- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames) + +Valores posibles: + +- 0 — Desactivado. +- 1 — Habilitado. + +Valor predeterminado: 1. + +## Date\_time\_input\_format {#settings-date-time-input-format} + +Permite elegir un analizador de la representación de texto de fecha y hora. + +La configuración no se aplica a [Funciones de fecha y hora](../../query_language/functions/date_time_functions.md). + +Valores posibles: + +- `'best_effort'` — Permite el análisis extendido. + + ClickHouse puede analizar el básico `YYYY-MM-DD HH:MM:SS` formato y todo [Descripción](https://en.wikipedia.org/wiki/ISO_8601) formatos de fecha y hora. Por ejemplo, `'2018-06-08T01:02:03.000Z'`. + +- `'basic'` — Utilice analizador básico. + + ClickHouse puede analizar solo lo básico `YYYY-MM-DD HH:MM:SS` formato. Por ejemplo, `'2019-08-20 10:18:56'`. + +Valor predeterminado: `'basic'`. + +Ver también: + +- [Tipo de datos DateTime.](../../data_types/datetime.md) +- [Funciones para trabajar con fechas y horas.](../../query_language/functions/date_time_functions.md) + +## Por favor, introduzca su dirección de correo electrónico {#settings-join-default-strictness} + +Establece el rigor predeterminado para [Cláusulas JOIN](../../query_language/select.md#select-join). + +Valores posibles: + +- `ALL` — Si la tabla correcta tiene varias filas coincidentes, ClickHouse crea un [Producto cartesiano](https://en.wikipedia.org/wiki/Cartesian_product) de filas coincidentes. Esta es la normal `JOIN` comportamiento de SQL estándar. +- `ANY` — Si la tabla correcta tiene varias filas coincidentes, solo se une la primera encontrada. Si la tabla correcta solo tiene una fila coincidente, los resultados de `ANY` y `ALL` hijo de los mismos. +- `ASOF` — Para unir secuencias con una coincidencia incierta. +- `Empty string` — Si `ALL` o `ANY` no se especifica en la consulta, ClickHouse produce una excepción. + +Valor predeterminado: `ALL`. + +## join\_any\_take\_last\_row {#settings-join-any-take-last-row} + +Cambia el comportamiento de las operaciones de unión con `ANY` rigor. + +!!! 
warning "Atención" + Esta configuración sólo se aplica a `JOIN` operaciones con [Unir](../table_engines/join.md) mesas de motores. + +Valores posibles: + +- 0 — Si la tabla correcta tiene más de una fila coincidente, solo se une la primera encontrada. +- 1 — Si la tabla correcta tiene más de una fila coincidente, solo se une la última encontrada. + +Valor predeterminado: 0. + +Ver también: + +- [Cláusula JOIN](../../query_language/select.md#select-join) +- [Unirse al motor de tabla](../table_engines/join.md) +- [Por favor, introduzca su dirección de correo electrónico](#settings-join_default_strictness) + +## Sistema abierto. {#join-use-nulls} + +Establece el tipo de [UNIR](../../query_language/select.md) comportamiento. Al fusionar tablas, pueden aparecer celdas vacías. ClickHouse los rellena de manera diferente según esta configuración. + +Valores posibles: + +- 0 — Las celdas vacías se rellenan con el valor predeterminado del tipo de campo correspondiente. +- Uno — `JOIN` se comporta de la misma manera que en SQL estándar. El tipo del campo correspondiente se convierte en [NULO](../../data_types/nullable.md#data_type-nullable), y las celdas vacías se llenan con [NULO](../../query_language/syntax.md). + +Valor predeterminado: 0. + +## max\_block\_size {#setting-max-block-size} + +En ClickHouse, los datos se procesan mediante bloques (conjuntos de partes de columna). Los ciclos de procesamiento interno para un solo bloque son lo suficientemente eficientes, pero hay gastos notables en cada bloque. El `max_block_size` set es una recomendación para el tamaño del bloque (en un recuento de filas) para cargar desde las tablas. El tamaño del bloque no debe ser demasiado pequeño, por lo que los gastos en cada bloque aún se notan, pero no demasiado grande para que la consulta con LIMIT que se complete después del primer bloque se procese rápidamente. El objetivo es evitar consumir demasiada memoria al extraer un gran número de columnas en múltiples subprocesos y preservar al menos alguna localidad de caché. + +Valor predeterminado: 65,536. + +Bloquea el tamaño de `max_block_size` no siempre se cargan desde la tabla. Si es obvio que se deben recuperar menos datos, se procesa un bloque más pequeño. + +## preferred\_block\_size\_bytes {#preferred-block-size-bytes} + +Utilizado para el mismo propósito que `max_block_size`, pero establece el tamaño de bloque recomendado en bytes adaptándolo al número de filas en el bloque. +Sin embargo, el tamaño del bloque no puede ser más que `max_block_size` películas. +Por defecto: 1.000.000. Solo funciona cuando se lee desde los motores MergeTree. + +## merge\_tree\_min\_rows\_for\_concurrent\_read {#setting-merge-tree-min-rows-for-concurrent-read} + +Si el número de filas que se leerán de un fichero [Método de codificación de datos:](../table_engines/mergetree.md) más caliente `merge_tree_min_rows_for_concurrent_read` luego ClickHouse intenta realizar una lectura simultánea de este archivo en varios hilos. + +Valores posibles: + +- Cualquier entero positivo. + +Valor predeterminado: 163840. + +## merge\_tree\_min\_bytes\_for\_concurrent\_read {#setting-merge-tree-min-bytes-for-concurrent-read} + +Si el número de bytes a leer de un archivo de un [Método de codificación de datos:](../table_engines/mergetree.md)-La tabla del motor excede `merge_tree_min_bytes_for_concurrent_read`, entonces ClickHouse intenta leer simultáneamente este archivo en varios subprocesos. + +Valor posible: + +- Cualquier entero positivo. + +Valor predeterminado: 251658240. 
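+
+Para revisar los valores vigentes de estos umbrales en la sesión actual se puede consultar la tabla `system.settings`; un ejemplo orientativo:
+
+``` sql
+SELECT name, value, changed
+FROM system.settings
+WHERE name LIKE 'merge_tree_min_%_for_concurrent_read';
+```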
+
+## merge\_tree\_min\_rows\_for\_seek {#setting-merge-tree-min-rows-for-seek}
+
+Si la distancia entre dos bloques de datos que se leen en un archivo es menor que `merge_tree_min_rows_for_seek` filas, ClickHouse no realiza una búsqueda (seek) dentro del archivo, sino que lee los datos secuencialmente.
+
+Valores posibles:
+
+- Cualquier entero positivo.
+
+Valor predeterminado: 0.
+
+## merge\_tree\_min\_bytes\_for\_seek {#setting-merge-tree-min-bytes-for-seek}
+
+Si la distancia entre dos bloques de datos que se leen en un archivo es menor que `merge_tree_min_bytes_for_seek` bytes, ClickHouse lee secuencialmente el rango del archivo que contiene ambos bloques, evitando así búsquedas adicionales.
+
+Valores posibles:
+
+- Cualquier entero positivo.
+
+Valor predeterminado: 0.
+
+## merge\_tree\_coarse\_index\_granularity {#setting-merge-tree-coarse-index-granularity}
+
+Al buscar datos, ClickHouse comprueba las marcas de datos en el archivo de índice. Si ClickHouse encuentra que las claves requeridas están en algún rango, divide este rango en `merge_tree_coarse_index_granularity` subintervalos y busca las claves necesarias allí de forma recursiva.
+
+Valores posibles:
+
+- Cualquier entero positivo par.
+
+Valor predeterminado: 8.
+
+## merge\_tree\_max\_rows\_to\_use\_cache {#setting-merge-tree-max-rows-to-use-cache}
+
+Si ClickHouse debe leer más de `merge_tree_max_rows_to_use_cache` filas en una consulta, no utiliza la memoria caché de bloques sin comprimir.
+
+La memoria caché de bloques sin comprimir almacena datos extraídos para consultas. ClickHouse utiliza esta memoria caché para acelerar las respuestas a pequeñas consultas repetidas. Esta configuración protege la memoria caché del deterioro causado por consultas que leen una gran cantidad de datos. La configuración del servidor [uncompressed\_cache\_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) define el tamaño de la memoria caché de bloques sin comprimir.
+
+Valores posibles:
+
+- Cualquier entero positivo.
+
+Valor predeterminado: 128 ✕ 8192.
+
+## merge\_tree\_max\_bytes\_to\_use\_cache {#setting-merge-tree-max-bytes-to-use-cache}
+
+Si ClickHouse debe leer más de `merge_tree_max_bytes_to_use_cache` bytes en una consulta, no utiliza la memoria caché de bloques sin comprimir.
+
+La memoria caché de bloques sin comprimir almacena datos extraídos para consultas. ClickHouse utiliza esta memoria caché para acelerar las respuestas a pequeñas consultas repetidas. Esta configuración protege la memoria caché del deterioro causado por consultas que leen una gran cantidad de datos. La configuración del servidor [uncompressed\_cache\_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) define el tamaño de la memoria caché de bloques sin comprimir.
+
+Valor posible:
+
+- Cualquier entero positivo.
+
+Valor predeterminado: 2013265920.
+
+## min\_bytes\_to\_use\_direct\_io {#settings-min-bytes-to-use-direct-io}
+
+El volumen de datos mínimo necesario para utilizar el acceso de E/S directa al disco de almacenamiento.
+
+ClickHouse usa esta configuración al leer datos de las tablas. Si el volumen total de almacenamiento de todos los datos a leer excede `min_bytes_to_use_direct_io` bytes, ClickHouse lee los datos del disco de almacenamiento con la opción `O_DIRECT`.
+
+Valores posibles:
+
+- 0 — La E/S directa está deshabilitada.
+- Entero positivo.
+
+Valor predeterminado: 0.
+
+## log\_queries {#settings-log-queries}
+
+Configuración del registro de consultas.
+ +Las consultas enviadas a ClickHouse con esta configuración se registran de acuerdo con las reglas [query\_log](../server_settings/settings.md#server_settings-query-log) parámetro de configuración del servidor. + +Ejemplo: + +``` text +log_queries=1 +``` + +## Log\_query\_threads {#settings-log-query-threads} + +Configuración del registro de subprocesos de consulta. + +Los subprocesos de consultas ejecutados por ClickHouse con esta configuración se registran de acuerdo con las reglas en el [Sistema abierto.](../server_settings/settings.md#server_settings-query-thread-log) parámetro de configuración del servidor. + +Ejemplo: + +``` text +log_query_threads=1 +``` + +## Max\_insert\_block\_size {#settings-max-insert-block-size} + +El tamaño de los bloques a formar para su inserción en una tabla. +Esta configuración solo se aplica en los casos en que el servidor forma los bloques. +Por ejemplo, para un INSERT a través de la interfaz HTTP, el servidor analiza el formato de datos y forma bloques del tamaño especificado. +Pero al usar clickhouse-client, el cliente analiza los datos en sí, y el ‘max\_insert\_block\_size’ configuración en el servidor no afecta el tamaño de los bloques insertados. +La configuración tampoco tiene un propósito cuando se usa INSERT SELECT, ya que los datos se insertan usando los mismos bloques que se forman después de SELECT. + +Valor predeterminado: 1.048.576. + +El valor predeterminado es ligeramente más que `max_block_size`. La razón de esto se debe a que ciertos motores de mesa (`*MergeTree`) formar una parte de datos en el disco para cada bloque insertado, que es una entidad bastante grande. Similar, `*MergeTree` las tablas ordenan los datos durante la inserción y un tamaño de bloque lo suficientemente grande permiten clasificar más datos en la RAM. + +## max\_replica\_delay\_for\_distributed\_queries {#settings-max-replica-delay-for-distributed-queries} + +Deshabilita las réplicas rezagadas para consultas distribuidas. Ver [Replicación](../../operations/table_engines/replication.md). + +Establece el tiempo en segundos. Si una réplica tiene un retraso superior al valor establecido, no se utiliza esta réplica. + +Valor predeterminado: 300. + +Se utiliza al realizar `SELECT` desde una tabla distribuida que apunta a tablas replicadas. + +## max\_threads {#settings-max-threads} + +El número máximo de subprocesos de procesamiento de consultas, excluyendo subprocesos para recuperar datos de servidores ‘max\_distributed\_connections’ parámetro). + +Este parámetro se aplica a los subprocesos que realizan las mismas etapas de la canalización de procesamiento de consultas en paralelo. +Por ejemplo, al leer desde una tabla, si es posible evaluar expresiones con funciones, filtre con WHERE y preagregue para GROUP BY en paralelo usando al menos ‘max\_threads’ número de hilos, entonces ‘max\_threads’ se utilizan. + +Valor predeterminado: el número de núcleos de CPU físicos. + +Si normalmente se ejecuta menos de una consulta SELECT en un servidor a la vez, establezca este parámetro en un valor ligeramente inferior al número real de núcleos de procesador. + +Para las consultas que se completan rápidamente debido a un LIMIT, puede establecer un ‘max\_threads’. Por ejemplo, si el número necesario de entradas se encuentra en cada bloque y max\_threads = 8, entonces se recuperan 8 bloques, aunque hubiera sido suficiente leer solo uno. + +Cuanto menor sea el `max_threads` valor, menos memoria se consumen. 
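+
+Como esbozo, una forma habitual de acotar una consulta pesada concreta (los nombres de tabla y columna son solo ilustrativos):
+
+``` sql
+-- Dejar núcleos libres para las consultas interactivas limitando esta consulta a 4 hilos:
+SET max_threads = 4;
+SELECT count() FROM hits WHERE URL LIKE '%clickhouse%';
+```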
+ +## Método de codificación de datos: {#settings-max-insert-threads} + +El número máximo de subprocesos para ejecutar el `INSERT SELECT` consulta. + +Valores posibles: + +- 0 (o 1) — `INSERT SELECT` sin ejecución paralela. +- Entero positivo. Más grande que 1. + +Valor predeterminado: 0. + +Paralelo `INSERT SELECT` sólo tiene efecto si el `SELECT` parte se ejecuta en paralelo, ver [max\_threads](#settings-max_threads) configuración. +Los valores más altos conducirán a un mayor uso de memoria. + +## max\_compress\_block\_size {#max-compress-block-size} + +El tamaño máximo de bloques de datos sin comprimir antes de comprimir para escribir en una tabla. De forma predeterminada, 1.048.576 (1 MiB). Si se reduce el tamaño, la tasa de compresión se reduce significativamente, la velocidad de compresión y descompresión aumenta ligeramente debido a la localidad de la memoria caché, y se reduce el consumo de memoria. Por lo general, no hay ninguna razón para cambiar esta configuración. + +No confunda bloques para la compresión (un fragmento de memoria que consta de bytes) con bloques para el procesamiento de consultas (un conjunto de filas de una tabla). + +## Descripción del producto {#min-compress-block-size} + +Para [Método de codificación de datos:](../table_engines/mergetree.md)" tabla. Para reducir la latencia al procesar consultas, un bloque se comprime al escribir la siguiente marca si su tamaño es al menos ‘min\_compress\_block\_size’. De forma predeterminada, 65.536. + +El tamaño real del bloque, si los datos sin comprimir son menores que ‘max\_compress\_block\_size’, no es menor que este valor y no menor que el volumen de datos para una marca. + +Veamos un ejemplo. Supongamos que ‘index\_granularity’ se estableció en 8192 durante la creación de la tabla. + +Estamos escribiendo una columna de tipo UInt32 (4 bytes por valor). Al escribir 8192 filas, el total será de 32 KB de datos. Como min\_compress\_block\_size = 65,536, se formará un bloque comprimido por cada dos marcas. + +Estamos escribiendo una columna URL con el tipo String (tamaño promedio de 60 bytes por valor). Al escribir 8192 filas, el promedio será ligeramente inferior a 500 KB de datos. Como esto es más de 65,536, se formará un bloque comprimido para cada marca. En este caso, al leer datos del disco en el rango de una sola marca, los datos adicionales no se descomprimirán. + +Por lo general, no hay ninguna razón para cambiar esta configuración. + +## max\_query\_size {#settings-max-query-size} + +La parte máxima de una consulta que se puede llevar a la RAM para analizar con el analizador SQL. +La consulta INSERT también contiene datos para INSERT que es procesado por un analizador de secuencias independiente (que consume O(1) RAM), que no está incluido en esta restricción. + +Valor predeterminado: 256 KiB. + +## interactive\_delay {#interactive-delay} + +El intervalo en microsegundos para comprobar si la ejecución de la solicitud se ha cancelado y enviar el progreso. + +Valor predeterminado: 100.000 (comprueba la cancelación y envía el progreso diez veces por segundo). + +## ¿Cómo puedo hacerlo? {#connect-timeout-receive-timeout-send-timeout} + +Tiempos de espera en segundos en el socket utilizado para comunicarse con el cliente. + +Valor predeterminado: 10, 300, 300. + +## Cancel\_http\_readonly\_queries\_on\_client\_close {#cancel-http-readonly-queries-on-client-close} + +Cancela las consultas de solo lectura HTTP (por ejemplo, SELECT) cuando un cliente cierra la conexión sin esperar la respuesta. 
+ +Valor predeterminado: 0 + +## poll\_interval {#poll-interval} + +Bloquear en un bucle de espera durante el número especificado de segundos. + +Valor predeterminado: 10. + +## max\_distributed\_connections {#max-distributed-connections} + +El número máximo de conexiones simultáneas con servidores remotos para el procesamiento distribuido de una única consulta a una única tabla distribuida. Se recomienda establecer un valor no menor que el número de servidores en el clúster. + +Valor predeterminado: 1024. + +Los siguientes parámetros solo se usan al crear tablas distribuidas (y al iniciar un servidor), por lo que no hay ninguna razón para cambiarlas en tiempo de ejecución. + +## Distributed\_connections\_pool\_size {#distributed-connections-pool-size} + +El número máximo de conexiones simultáneas con servidores remotos para el procesamiento distribuido de todas las consultas a una única tabla distribuida. Se recomienda establecer un valor no menor que el número de servidores en el clúster. + +Valor predeterminado: 1024. + +## Conecte\_timeout\_with\_failover\_ms {#connect-timeout-with-failover-ms} + +El tiempo de espera en milisegundos para conectarse a un servidor remoto para un motor de tablas distribuidas ‘shard’ y ‘replica’ secciones se utilizan en la definición de clúster. +Si no tiene éxito, se realizan varios intentos para conectarse a varias réplicas. + +Valor predeterminado: 50. + +## connections\_with\_failover\_max\_tries {#connections-with-failover-max-tries} + +El número máximo de intentos de conexión con cada réplica para el motor de tablas distribuidas. + +Valor predeterminado: 3. + +## extremo {#extremes} + +Ya sea para contar valores extremos (los mínimos y máximos en columnas de un resultado de consulta). Acepta 0 o 1. De forma predeterminada, 0 (deshabilitado). +Para obtener más información, consulte la sección “Extreme values”. + +## Use\_uncompressed\_cache {#setting-use-uncompressed-cache} + +Si se debe usar una memoria caché de bloques sin comprimir. Acepta 0 o 1. De forma predeterminada, 0 (deshabilitado). +El uso de la memoria caché sin comprimir (solo para tablas de la familia MergeTree) puede reducir significativamente la latencia y aumentar el rendimiento cuando se trabaja con un gran número de consultas cortas. Habilite esta configuración para los usuarios que envían solicitudes cortas frecuentes. También preste atención al [Uncompressed\_cache\_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) parámetro de configuración (solo establecido en el archivo de configuración): el tamaño de los bloques de caché sin comprimir. De forma predeterminada, es 8 GiB. La memoria caché sin comprimir se rellena según sea necesario y los datos menos utilizados se eliminan automáticamente. + +Para consultas que leen al menos un volumen algo grande de datos (un millón de filas o más), la memoria caché sin comprimir se desactiva automáticamente para ahorrar espacio para consultas realmente pequeñas. Esto significa que puede mantener el ‘use\_uncompressed\_cache’ ajuste siempre establecido en 1. + +## Reemplazar\_running\_query {#replace-running-query} + +Cuando se utiliza la interfaz HTTP, el ‘query\_id’ parámetro puede ser pasado. Se trata de cualquier cadena que sirva como identificador de consulta. +Si una consulta del mismo usuario ‘query\_id’ que ya existe en este momento, el comportamiento depende de la ‘replace\_running\_query’ parámetro. 
+ +`0` (predeterminado) – Lanzar una excepción (no permita que la consulta se ejecute si una consulta ‘query\_id’ ya se está ejecutando). + +`1` – Cancelar la consulta anterior y empezar a ejecutar la nueva. + +El Yandex.Metrica utiliza este parámetro establecido en 1 para implementar sugerencias para las condiciones de segmentación. Después de ingresar el siguiente carácter, si la consulta anterior aún no ha terminado, debe cancelarse. + +## Nombre de la red inalámbrica (SSID): {#stream-flush-interval-ms} + +Funciona para tablas con streaming en el caso de un tiempo de espera, o cuando un subproceso genera [Max\_insert\_block\_size](#settings-max_insert_block_size) películas. + +El valor predeterminado es 7500. + +Cuanto menor sea el valor, más a menudo los datos se vacían en la tabla. Establecer el valor demasiado bajo conduce a un rendimiento deficiente. + +## load\_balancing {#settings-load-balancing} + +Especifica el algoritmo de selección de réplicas que se utiliza para el procesamiento de consultas distribuidas. + +ClickHouse admite los siguientes algoritmos para elegir réplicas: + +- [Aleatorio](#load_balancing-random) (predeterminada) +- [Nombre de host más cercano](#load_balancing-nearest_hostname) +- [En orden](#load_balancing-in_order) +- [Primero o aleatorio](#load_balancing-first_or_random) + +### Aleatorio (por defecto) {#load-balancing-random} + +``` sql +load_balancing = random +``` + +El número de errores se cuenta para cada réplica. La consulta se envía a la réplica con el menor número de errores, y si hay varios de estos, a cualquiera de ellos. +Desventajas: La proximidad del servidor no se tiene en cuenta; si las réplicas tienen datos diferentes, también obtendrá datos diferentes. + +### Nombre de host más cercano {#load-balancing-nearest-hostname} + +``` sql +load_balancing = nearest_hostname +``` + +El número de errores se cuenta para cada réplica. Cada 5 minutos, el número de errores se divide integralmente por 2. Por lo tanto, el número de errores se calcula para un tiempo reciente con suavizado exponencial. Si hay una réplica con un número mínimo de errores (es decir,errores ocurridos recientemente en las otras réplicas), la consulta se le envía. Si hay varias réplicas con el mismo número mínimo de errores, la consulta se envía a la réplica con un nombre de host que es más similar al nombre de host del servidor en el archivo de configuración (para el número de caracteres diferentes en posiciones idénticas, hasta la longitud mínima de ambos nombres de host). + +Por ejemplo, example01-01-1 y example01-01-2.yandex.ru son diferentes en una posición, mientras que example01-01-1 y example01-02-2 difieren en dos lugares. +Este método puede parecer primitivo, pero no requiere datos externos sobre la topología de red, y no compara las direcciones IP, lo que sería complicado para nuestras direcciones IPv6. + +Por lo tanto, si hay réplicas equivalentes, se prefiere la más cercana por nombre. +También podemos suponer que al enviar una consulta al mismo servidor, en ausencia de fallas, una consulta distribuida también irá a los mismos servidores. Por lo tanto, incluso si se colocan datos diferentes en las réplicas, la consulta devolverá principalmente los mismos resultados. + +### En orden {#load-balancing-in-order} + +``` sql +load_balancing = in_order +``` + +Se accede a las réplicas con el mismo número de errores en el mismo orden en que se especifican en la configuración. +Este método es apropiado cuando se sabe exactamente qué réplica es preferible. 
+ +### Primero o aleatorio {#load-balancing-first-or-random} + +``` sql +load_balancing = first_or_random +``` + +Este algoritmo elige la primera réplica del conjunto o una réplica aleatoria si la primera no está disponible. Es eficaz en configuraciones de topología de replicación cruzada, pero inútil en otras configuraciones. + +El `first_or_random` Resuelve el problema del algoritmo `in_order` algoritmo. Desventaja `in_order`, si una réplica se cae, la siguiente obtiene una carga doble mientras que las réplicas restantes manejan la cantidad habitual de tráfico. Cuando se utiliza el `first_or_random` algoritmo, la carga se distribuye uniformemente entre las réplicas que todavía están disponibles. + +## prefer\_localhost\_replica {#settings-prefer-localhost-replica} + +Habilita/deshabilita el uso preferible de la réplica localhost al procesar consultas distribuidas. + +Valores posibles: + +- 1 - ClickHouse siempre envía una consulta a la réplica localhost si existe. +- 0 — ClickHouse utiliza la estrategia de equilibrio especificada [load\_balancing](#settings-load_balancing) configuración. + +Valor predeterminado: 1. + +!!! warning "Advertencia" + Deshabilite esta configuración si usa [max\_parallel\_replicas](#settings-max_parallel_replicas). + +## totals\_mode {#totals-mode} + +Cómo calcular TOTALS cuando HAVING está presente, así como cuando max\_rows\_to\_group\_by y group\_by\_overflow\_mode = ‘any’ están presentes. +Vea la sección “WITH TOTALS modifier”. + +## totals\_auto\_threshold {#totals-auto-threshold} + +El umbral para `totals_mode = 'auto'`. +Vea la sección “WITH TOTALS modifier”. + +## max\_parallel\_replicas {#settings-max-parallel-replicas} + +El número máximo de réplicas para cada fragmento al ejecutar una consulta. +Para obtener coherencia (para obtener diferentes partes de la misma división de datos), esta opción solo funciona cuando se establece la clave de muestreo. +El retraso de réplica no está controlado. + +## compilar {#compile} + +Habilitar la compilación de consultas. De forma predeterminada, 0 (deshabilitado). + +La compilación solo se usa para parte de la canalización de procesamiento de consultas: para la primera etapa de agregación (GROUP BY). +Si se compiló esta parte de la canalización, la consulta puede ejecutarse más rápido debido a la implementación de ciclos cortos y a las llamadas de función agregadas en línea. La mejora del rendimiento máximo (hasta cuatro veces más rápido en casos excepcionales) se ve para consultas con múltiples funciones agregadas simples. Por lo general, la ganancia de rendimiento es insignificante. En casos muy raros, puede ralentizar la ejecución de la consulta. + +## min\_count\_to\_compile {#min-count-to-compile} + +¿Cuántas veces usar potencialmente un fragmento de código compilado antes de ejecutar la compilación? Por defecto, 3. +Para las pruebas, el valor se puede establecer en 0: la compilación se ejecuta de forma sincrónica y la consulta espera al final del proceso de compilación antes de continuar con la ejecución. Para todos los demás casos, use valores comenzando con 1. La compilación normalmente toma alrededor de 5-10 segundos. +Si el valor es 1 o más, la compilación se produce de forma asíncrona en un subproceso independiente. El resultado se utilizará tan pronto como esté listo, incluidas las consultas que se están ejecutando actualmente. + +Se requiere código compilado para cada combinación diferente de funciones agregadas utilizadas en la consulta y el tipo de claves en la cláusula GROUP BY. 
+Los resultados de la compilación se guardan en el directorio de compilación en forma de archivos .so. No hay ninguna restricción en el número de resultados de compilación, ya que no utilizan mucho espacio. Los resultados anteriores se usarán después de reiniciar el servidor, excepto en el caso de una actualización del servidor; en este caso, se eliminan los resultados anteriores. + +## output\_format\_json\_quote\_64bit\_integers {#session-settings-output-format-json-quote-64bit-integers} + +Si el valor es true, los enteros aparecen entre comillas cuando se usan los formatos JSON\* Int64 y UInt64 (por compatibilidad con la mayoría de las implementaciones de JavaScript); de lo contrario, los enteros se generan sin las comillas. + +## Formato\_csv\_delimiter {#settings-format-csv-delimiter} + +El carácter interpretado como un delimitador en los datos CSV. De forma predeterminada, el delimitador es `,`. + +## input\_format\_csv\_unquoted\_null\_literal\_as\_null {#settings-input-format-csv-unquoted-null-literal-as-null} + +Para el formato de entrada CSV, habilita o deshabilita el análisis de `NULL` como literal (sinónimo de `\N`). + +## output\_format\_csv\_crlf\_end\_of\_line {#settings-output-format-csv-crlf-end-of-line} + +Utilice el separador de línea de estilo DOS / Windows (CRLF) en CSV en lugar de estilo Unix (LF). + +## output\_format\_tsv\_crlf\_end\_of\_line {#settings-output-format-tsv-crlf-end-of-line} + +Utilice el separador de línea de estilo DOC / Windows (CRLF) en TSV en lugar del estilo Unix (LF). + +## insert\_quorum {#settings-insert-quorum} + +Habilita las escrituras de quórum. + +- Si `insert_quorum < 2`, las escrituras de quórum están deshabilitadas. +- Si `insert_quorum >= 2`, las escrituras de quórum están habilitadas. + +Valor predeterminado: 0. + +Quórum escribe + +`INSERT` solo tiene éxito cuando ClickHouse logra escribir correctamente datos en el `insert_quorum` de réplicas durante el `insert_quorum_timeout`. Si por alguna razón el número de réplicas con escrituras exitosas no alcanza el `insert_quorum`, la escritura se considera fallida y ClickHouse eliminará el bloque insertado de todas las réplicas donde los datos ya se han escrito. + +Todas las réplicas del quórum son consistentes, es decir, contienen datos de todas las réplicas anteriores `INSERT` consulta. El `INSERT` la secuencia está linealizada. + +Al leer los datos escritos desde el `insert_quorum` Puede utilizar el [select\_sequential\_consistency](#settings-select_sequential_consistency) opción. + +ClickHouse genera una excepción + +- Si el número de réplicas disponibles en el momento de la consulta es `insert_quorum`. +- En un intento de escribir datos cuando el bloque anterior aún no se ha insertado en el `insert_quorum` de réplicas. Esta situación puede ocurrir si el usuario intenta realizar una `INSERT` antes de la anterior con el `insert_quorum` se ha completado. + +Ver también: + +- [insert\_quorum\_timeout](#settings-insert_quorum_timeout) +- [select\_sequential\_consistency](#settings-select_sequential_consistency) + +## insert\_quorum\_timeout {#settings-insert-quorum-timeout} + +Escribir en tiempo de espera de quórum en segundos. Si el tiempo de espera ha pasado y aún no se ha realizado ninguna escritura, ClickHouse generará una excepción y el cliente debe repetir la consulta para escribir el mismo bloque en la misma réplica o en cualquier otra réplica. + +Valor predeterminado: 60 segundos. 
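+
+Un esbozo de sesión que combina ambos ajustes (la tabla replicada `events_replicated` es hipotética):
+
+``` sql
+SET insert_quorum = 2;
+SET insert_quorum_timeout = 60;
+
+-- El INSERT solo se confirma cuando al menos 2 réplicas han escrito el bloque;
+-- si el quórum no se alcanza dentro del tiempo de espera, se lanza una excepción.
+INSERT INTO events_replicated VALUES (1, '2019-01-01');
+```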
+ +Ver también: + +- [insert\_quorum](#settings-insert_quorum) +- [select\_sequential\_consistency](#settings-select_sequential_consistency) + +## select\_sequential\_consistency {#settings-select-sequential-consistency} + +Habilita o deshabilita la coherencia secuencial para `SELECT` Consulta: + +Valores posibles: + +- 0 — Desactivado. +- 1 — Habilitado. + +Valor predeterminado: 0. + +Uso + +Cuando se habilita la coherencia secuencial, ClickHouse permite al cliente ejecutar el `SELECT` consulta sólo para aquellas réplicas que contienen datos de todas las `INSERT` Consultas ejecutadas con `insert_quorum`. Si el cliente hace referencia a una réplica parcial, ClickHouse generará una excepción. La consulta SELECT no incluirá datos que aún no se hayan escrito en el quórum de réplicas. + +Ver también: + +- [insert\_quorum](#settings-insert_quorum) +- [insert\_quorum\_timeout](#settings-insert_quorum_timeout) + +## insert\_deduplicate {#settings-insert-deduplicate} + +Habilita o deshabilita la desduplicación de bloques `INSERT` (para tablas replicadas\* + +Valores posibles: + +- 0 — Desactivado. +- 1 — Habilitado. + +Valor predeterminado: 1. + +De forma predeterminada, los bloques insertados en tablas replicadas `INSERT` (consulte \[Replicación de datos\] (../ table\_engines/replication.md). + +## deduplicate\_blocks\_in\_dependent\_materialized\_views {#settings-deduplicate-blocks-in-dependent-materialized-views} + +Habilita o deshabilita la comprobación de desduplicación para las vistas materializadas que reciben datos de tablas replicadas\*. + +Valores posibles: + + 0 — Disabled. + 1 — Enabled. + +Valor predeterminado: 0. + +Uso + +De forma predeterminada, la desduplicación no se realiza para las vistas materializadas, sino que se realiza en sentido ascendente, en la tabla de origen. +Si se omite un bloque INSERTed debido a la desduplicación en la tabla de origen, no habrá inserción en las vistas materializadas adjuntas. Este comportamiento existe para permitir la inserción de datos altamente agregados en vistas materializadas, para los casos en que los bloques insertados son los mismos después de la agregación de vistas materializadas pero derivados de diferentes INSERT en la tabla de origen. +Al mismo tiempo, este comportamiento “breaks” `INSERT` empotencia. Si una `INSERT` en la mesa principal fue exitoso y `INSERT` en una vista materializada falló (por ejemplo, debido a una falla de comunicación con Zookeeper), un cliente obtendrá un error y puede volver a intentar la operación. Sin embargo, la vista materializada no recibirá la segunda inserción porque se descartará mediante deduplicación en la tabla principal (fuente). Configuración `deduplicate_blocks_in_dependent_materialized_views` permite cambiar este comportamiento. Al reintentar, una vista materializada recibirá la inserción de repetición y realizará la comprobación de desduplicación por sí misma, +ignorando el resultado de la comprobación para la tabla de origen, e insertará filas perdidas debido a la primera falla. + +## Método de codificación de datos: {#settings-max-network-bytes} + +Limita el volumen de datos (en bytes) que se recibe o se transmite a través de la red al ejecutar una consulta. Esta configuración se aplica a cada consulta individual. + +Valores posibles: + +- Entero positivo. +- 0 — El control de volumen de datos está desactivado. + +Valor predeterminado: 0. 
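+
+Por ejemplo, un esbozo para abortar cualquier consulta que mueva más de aproximadamente 1 GB por la red (el valor es solo ilustrativo):
+
+``` sql
+SET max_network_bytes = 1000000000;
+```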
+
+## max\_network\_bandwidth {#settings-max-network-bandwidth}
+
+Limita la velocidad del intercambio de datos a través de la red en bytes por segundo. Esta configuración se aplica a todas las consultas.
+
+Valores posibles:
+
+- Entero positivo.
+- 0 — El control de ancho de banda está deshabilitado.
+
+Valor predeterminado: 0.
+
+## max\_network\_bandwidth\_for\_user {#settings-max-network-bandwidth-for-user}
+
+Limita la velocidad del intercambio de datos a través de la red en bytes por segundo. Esta configuración se aplica a todas las consultas ejecutadas simultáneamente por un único usuario.
+
+Valores posibles:
+
+- Entero positivo.
+- 0 — El control de la velocidad de los datos está desactivado.
+
+Valor predeterminado: 0.
+
+## max\_network\_bandwidth\_for\_all\_users {#settings-max-network-bandwidth-for-all-users}
+
+Limita la velocidad a la que se intercambian datos a través de la red en bytes por segundo. Esta configuración se aplica a todas las consultas que se ejecutan simultáneamente en el servidor.
+
+Valores posibles:
+
+- Entero positivo.
+- 0 — El control de la velocidad de los datos está desactivado.
+
+Valor predeterminado: 0.
+
+## count\_distinct\_implementation {#settings-count-distinct-implementation}
+
+Especifica cuál de las funciones `uniq*` se debe utilizar para ejecutar la construcción [COUNT(DISTINCT …)](../../query_language/agg_functions/reference.md#agg_function-count).
+
+Valores posibles:
+
+- [uniq](../../query_language/agg_functions/reference.md#agg_function-uniq)
+- [uniqCombined](../../query_language/agg_functions/reference.md#agg_function-uniqcombined)
+- [uniqCombined64](../../query_language/agg_functions/reference.md#agg_function-uniqcombined64)
+- [uniqHLL12](../../query_language/agg_functions/reference.md#agg_function-uniqhll12)
+- [uniqExact](../../query_language/agg_functions/reference.md#agg_function-uniqexact)
+
+Valor predeterminado: `uniqExact`.
+
+## skip\_unavailable\_shards {#settings-skip-unavailable-shards}
+
+Habilita o deshabilita la omisión silenciosa de los fragmentos no disponibles.
+
+Un fragmento se considera no disponible si todas sus réplicas no están disponibles. Una réplica no está disponible en los siguientes casos:
+
+- ClickHouse no puede conectarse a la réplica por ningún motivo.
+
+    Al conectarse a una réplica, ClickHouse realiza varios intentos. Si todos estos intentos fallan, la réplica se considera no disponible.
+
+- La réplica no se puede resolver a través de DNS.
+
+    Si el nombre de host de la réplica no se puede resolver a través de DNS, puede indicar las siguientes situaciones:
+
+    - El host de la réplica no tiene registro DNS. Puede ocurrir en sistemas con DNS dinámico, por ejemplo, [Kubernetes](https://kubernetes.io), donde los nodos pueden no resolverse durante el tiempo de inactividad, y esto no es un error.
+
+    - Error de configuración. El archivo de configuración de ClickHouse contiene un nombre de host incorrecto.
+
+Valores posibles:
+
+- 1 — Omisión habilitada.
+
+    Si un fragmento no está disponible, ClickHouse devuelve un resultado basado en datos parciales y no informa de problemas de disponibilidad de nodos.
+
+- 0 — Omisión deshabilitada.
+
+    Si un fragmento no está disponible, ClickHouse produce una excepción.
+
+Valor predeterminado: 0.
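+
+Un esbozo de uso, asumiendo una tabla hipotética `distributed_hits` con motor Distributed:
+
+``` sql
+-- Devolver resultados parciales aunque algún fragmento del clúster no esté disponible:
+SET skip_unavailable_shards = 1;
+SELECT count() FROM distributed_hits;
+```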
+
+## optimize\_skip\_unused\_shards {#settings-optimize-skip-unused-shards}
+
+Habilita o deshabilita la omisión de los fragmentos no utilizados para las consultas SELECT que tienen una condición sobre la clave de fragmentación en PREWHERE/WHERE (se supone que los datos están distribuidos mediante la clave de fragmentación; de lo contrario, no hace nada).
+
+Valor predeterminado: 0
+
+## force\_optimize\_skip\_unused\_shards {#settings-force-optimize-skip-unused-shards}
+
+Habilita o deshabilita la ejecución de consultas si con [`optimize_skip_unused_shards`](#settings-optimize_skip_unused_shards) habilitado no es posible omitir los fragmentos no utilizados. Si la omisión no es posible y esta configuración está habilitada, se lanzará una excepción.
+
+Valores posibles:
+
+- 0 — Deshabilitado (no lanza excepción).
+- 1 — Deshabilita la ejecución de consultas solo si la tabla tiene una clave de fragmentación.
+- 2 — Deshabilita la ejecución de consultas independientemente de si se ha definido una clave de fragmentación para la tabla.
+
+Valor predeterminado: 0
+
+## optimize\_throw\_if\_noop {#setting-optimize-throw-if-noop}
+
+Habilita o deshabilita el lanzamiento de una excepción si una consulta [OPTIMIZE](../../query_language/misc.md#misc_operations-optimize) no realizó una fusión.
+
+De forma predeterminada, `OPTIMIZE` finaliza correctamente incluso si no hizo nada. Esta configuración le permite diferenciar estas situaciones y obtener el motivo en un mensaje de excepción.
+
+Valores posibles:
+
+- 1 — El lanzamiento de una excepción está habilitado.
+- 0 — El lanzamiento de una excepción está deshabilitado.
+
+Valor predeterminado: 0.
+
+## distributed\_replica\_error\_half\_life {#settings-distributed-replica-error-half-life}
+
+- Tipo: segundos
+- Valor predeterminado: 60 segundos
+
+Controla la rapidez con la que se ponen a cero los errores de las tablas distribuidas. Si una réplica estuvo un tiempo sin estar disponible, acumuló 5 errores y distributed\_replica\_error\_half\_life está establecido en 1 segundo, la réplica se considera normal 3 segundos después del último error.
+
+Ver también:
+
+- [Motor de tabla distribuido](../../operations/table_engines/distributed.md)
+- [distributed\_replica\_error\_cap](#settings-distributed_replica_error_cap)
+
+## distributed\_replica\_error\_cap {#settings-distributed-replica-error-cap}
+
+- Tipo: unsigned int
+- Valor predeterminado: 1000
+
+El recuento de errores de cada réplica está limitado a este valor, lo que impide que una sola réplica acumule demasiados errores.
+
+Ver también:
+
+- [Motor de tabla distribuido](../../operations/table_engines/distributed.md)
+- [distributed\_replica\_error\_half\_life](#settings-distributed_replica_error_half_life)
+
+## distributed\_directory\_monitor\_sleep\_time\_ms {#distributed-directory-monitor-sleep-time-ms}
+
+Intervalo base con el que el motor de tablas [Distribuido](../table_engines/distributed.md) envía los datos. El intervalo real crece exponencialmente en caso de errores.
+
+Valores posibles:
+
+- Un número entero positivo de milisegundos.
+
+Valor predeterminado: 100 milisegundos.
+
+## distributed\_directory\_monitor\_max\_sleep\_time\_ms {#distributed-directory-monitor-max-sleep-time-ms}
+
+Intervalo máximo con el que el motor de tablas [Distribuido](../table_engines/distributed.md) envía los datos. Limita el crecimiento exponencial del intervalo establecido en la configuración [distributed\_directory\_monitor\_sleep\_time\_ms](#distributed_directory_monitor_sleep_time_ms).
+
+Valores posibles:
+
+- Un número entero positivo de milisegundos.
+ +Valor predeterminado: 30000 milisegundos (30 segundos). + +## distributed\_directory\_monitor\_batch\_inserts {#distributed-directory-monitor-batch-inserts} + +Habilita/deshabilita el envío de datos insertados en lotes. + +Cuando el envío por lotes está habilitado, el [Distribuido](../table_engines/distributed.md) El motor de tabla intenta enviar varios archivos de datos insertados en una operación en lugar de enviarlos por separado. El envío por lotes mejora el rendimiento del clúster al utilizar mejor los recursos del servidor y de la red. + +Valores posibles: + +- 1 — Habilitado. +- 0 — Desactivado. + +Valor predeterminado: 0. + +## os\_thread\_priority {#setting-os-thread-priority} + +Establece la prioridad ([agradable](https://en.wikipedia.org/wiki/Nice_(Unix))) para subprocesos que ejecutan consultas. El programador del sistema operativo considera esta prioridad al elegir el siguiente hilo para ejecutar en cada núcleo de CPU disponible. + +!!! warning "Advertencia" + Para utilizar esta configuración, debe establecer el `CAP_SYS_NICE` capacidad. El `clickhouse-server` paquete lo configura durante la instalación. Algunos entornos virtuales no le permiten establecer `CAP_SYS_NICE` capacidad. En este caso, `clickhouse-server` muestra un mensaje al respecto al principio. + +Valores posibles: + +- Puede establecer valores en el rango `[-20, 19]`. + +Los valores más bajos significan mayor prioridad. Hilos con bajo `nice` Los valores de prioridad se ejecutan con más frecuencia que los subprocesos con valores altos. Los valores altos son preferibles para consultas no interactivas de larga ejecución porque les permite renunciar rápidamente a recursos en favor de consultas interactivas cortas cuando llegan. + +Valor predeterminado: 0. + +## query\_profiler\_real\_time\_period\_ns {#query-profiler-real-time-period-ns} + +Establece el período para un temporizador de reloj real del [perfilador de consultas](../../operations/performance/sampling_query_profiler.md). El temporizador de reloj real cuenta el tiempo del reloj de pared. + +Valores posibles: + +- Número entero positivo, en nanosegundos. + + Valores recomendados: + + - 10000000 (100 times a second) nanoseconds and less for single queries. + - 1000000000 (once a second) for cluster-wide profiling. + +- 0 para apagar el temporizador. + +Tipo: [UInt64](../../data_types/int_uint.md). + +Valor predeterminado: 1000000000 nanosegundos (una vez por segundo). + +Ver también: + +- Tabla del sistema [trace\_log](../system_tables.md#system_tables-trace_log) + +## Los resultados de la prueba {#query-profiler-cpu-time-period-ns} + +Establece el período para un temporizador de reloj de CPU [perfilador de consultas](../../operations/performance/sampling_query_profiler.md). Este temporizador solo cuenta el tiempo de CPU. + +Valores posibles: + +- Un número entero positivo de nanosegundos. + + Valores recomendados: + + - 10000000 (100 times a second) nanoseconds and more for single queries. + - 1000000000 (once a second) for cluster-wide profiling. + +- 0 para apagar el temporizador. + +Tipo: [UInt64](../../data_types/int_uint.md). + +Valor predeterminado: 1000000000 nanosegundos. + +Ver también: + +- Tabla del sistema [trace\_log](../system_tables.md#system_tables-trace_log) + +## allow\_introspection\_functions {#settings-allow-introspection-functions} + +Habilita deshabilita [funciones de introspecciones](../../query_language/functions/introspection.md) para la creación de perfiles de consultas. 
+ +Valores posibles: + +- 1 — Funciones de introspección habilitadas. +- 0 — Funciones de introspección deshabilitadas. + +Valor predeterminado: 0. + +**Ver también** + +- [Analizador de consultas de muestreo](../performance/sampling_query_profiler.md) +- Tabla del sistema [trace\_log](../system_tables.md#system_tables-trace_log) + +## input\_format\_parallel\_parsing {#input-format-parallel-parsing} + +- Tipo: bool +- Valor predeterminado: True + +Habilitar el análisis paralelo de los formatos de datos para preservar el orden. Solo se admite para los formatos TSV, TKSV, CSV y JSONEachRow. + +## También puede utilizar los siguientes métodos de envío: {#min-chunk-bytes-for-parallel-parsing} + +- Tipo: unsigned int +- Valor predeterminado: 1 MiB + +El tamaño mínimo de fragmento en bytes, que cada subproceso analizará en paralelo. + +## Sistema abierto. {#settings-output-format-avro-codec} + +Establece el códec de compresión utilizado para el archivo Avro de salida. + +Tipo: cadena + +Valores posibles: + +- `null` — Sin compresión +- `deflate` — Comprimir con Deflate (zlib) +- `snappy` — Comprimir con [Rápido](https://google.github.io/snappy/) + +Valor predeterminado: `snappy` (si está disponible) o `deflate`. + +## Sistema abierto. {#settings-output-format-avro-sync-interval} + +Establece el tamaño mínimo de datos (en bytes) entre los marcadores de sincronización para el archivo Avro de salida. + +Tipo: unsigned int + +Valores posibles: 32 (32 bytes) - 1073741824 (1 GiB) + +Valor predeterminado: 32768 (32 KiB) + +## Todos los derechos reservados. {#settings-format-avro-schema-registry-url} + +Establece la URL del Registro de esquemas confluentes para usar con [AvroConfluent](../../interfaces/formats.md#data-format-avro-confluent) Formato + +Tipo: URL + +Valor predeterminado: Vacío + +[Artículo Original](https://clickhouse.tech/docs/es/operations/settings/settings/) diff --git a/docs/es/operations/settings/settings_profiles.md b/docs/es/operations/settings/settings_profiles.md new file mode 100644 index 00000000000..056646844af --- /dev/null +++ b/docs/es/operations/settings/settings_profiles.md @@ -0,0 +1,64 @@ +# Perfiles de configuración {#settings-profiles} + +Un perfil de configuración es una colección de configuraciones agrupadas con el mismo nombre. Cada usuario de ClickHouse tiene un perfil. +Para aplicar todos los ajustes de un perfil, establezca el `profile` configuración. + +Ejemplo: + +Instale el `web` perfil. + +``` sql +SET profile = 'web' +``` + +Los perfiles de configuración se declaran en el archivo de configuración del usuario. Esto suele ser `users.xml`. + +Ejemplo: + +``` xml + + + + + + 8 + + + + + 1000000000 + 100000000000 + + 1000000 + any + + 1000000 + 1000000000 + + 100000 + 100000000 + break + + 600 + 1000000 + 15 + + 25 + 100 + 50 + + 2 + 25 + 50 + 100 + + 1 + + +``` + +El ejemplo especifica dos perfiles: `default` y `web`. El `default` tiene un propósito especial: siempre debe estar presente y se aplica al iniciar el servidor. En otras palabras, el `default` perfil contiene la configuración predeterminada. El `web` profile es un perfil regular que se puede establecer utilizando el `SET` consulta o utilizando un parámetro URL en una consulta HTTP. + +Los perfiles de configuración pueden heredar unos de otros. Para usar la herencia, indique una o varias `profile` configuraciones antes de las demás configuraciones que se enumeran en el perfil. En caso de que se defina una configuración en diferentes perfiles, se utiliza la última definida. 
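+
+Como comprobación rápida, tras aplicar un perfil se puede consultar el valor efectivo de cualquier ajuste en la tabla `system.settings`; un esbozo orientativo (el ajuste `max_rows_to_read` se usa solo a modo de ejemplo):
+
+``` sql
+SET profile = 'web';
+SELECT name, value FROM system.settings WHERE name = 'max_rows_to_read';
+```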
+ +[Artículo Original](https://clickhouse.tech/docs/es/operations/settings/settings_profiles/) diff --git a/docs/es/operations/settings/settings_users.md b/docs/es/operations/settings/settings_users.md new file mode 100644 index 00000000000..21c389332a3 --- /dev/null +++ b/docs/es/operations/settings/settings_users.md @@ -0,0 +1,141 @@ +# Configuración del usuario {#user-settings} + +El `users` sección de la `user.xml` el archivo de configuración contiene la configuración del usuario. + +Estructura del `users` apartado: + +``` xml + + + + + + + + + + + profile_name + + default + + + + + expression + + + + + + +``` + +### user\_name/contraseña {#user-namepassword} + +La contraseña se puede especificar en texto sin formato o en SHA256 (formato hexagonal). + +- Para asignar una contraseña en texto sin formato (**no se recomienda**), colóquelo en un `password` elemento. + + Por ejemplo, `qwerty`. La contraseña se puede dejar en blanco. + + + +- Para asignar una contraseña utilizando su hash SHA256, colóquela en un `password_sha256_hex` elemento. + + Por ejemplo, `65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5`. + + Ejemplo de cómo generar una contraseña desde el shell: + + PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-' + + La primera línea del resultado es la contraseña. La segunda línea es el hash SHA256 correspondiente. + + + +- Para la compatibilidad con los clientes MySQL, la contraseña se puede especificar en doble hash SHA1. Colóquelo en `password_double_sha1_hex` elemento. + + Por ejemplo, `08b4a0f1de6ad37da17359e592c8d74788a83eb0`. + + Ejemplo de cómo generar una contraseña desde el shell: + + PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-' + + La primera línea del resultado es la contraseña. La segunda línea es el hash SHA1 doble correspondiente. + +### Nombre\_usuario/redes {#user-namenetworks} + +Lista de redes desde las que el usuario puede conectarse al servidor ClickHouse. + +Cada elemento de la lista puede tener una de las siguientes formas: + +- `` — Dirección IP o máscara de red. + + Ejemplos: `213.180.204.3`, `10.0.0.1/8`, `10.0.0.1/255.255.255.0`, `2a02:6b8::3`, `2a02:6b8::3/64`, `2a02:6b8::3/ffff:ffff:ffff:ffff::`. + +- `` — Nombre de host. + + Ejemplo: `example01.host.ru`. + + Para comprobar el acceso, se realiza una consulta DNS y todas las direcciones IP devueltas se comparan con la dirección del mismo nivel. + +- `` — Expresión regular para nombres de host. + + Ejemplo, `^example\d\d-\d\d-\d\.host\.ru$` + + Para comprobar el acceso, un [Consulta de DNS PTR](https://en.wikipedia.org/wiki/Reverse_DNS_lookup) se realiza para la dirección del mismo nivel y luego se aplica la expresión regular especificada. A continuación, se realiza otra consulta DNS para los resultados de la consulta PTR y todas las direcciones recibidas se comparan con la dirección del mismo nivel. Recomendamos encarecidamente que regexp termine con $ . + +Todos los resultados de las solicitudes DNS se almacenan en caché hasta que el servidor se reinicia. + +**Ejemplos** + +Para abrir el acceso del usuario desde cualquier red, especifique: + +``` xml +::/0 +``` + +!!! warning "Advertencia" + Es inseguro abrir el acceso desde cualquier red a menos que tenga un firewall configurado correctamente o el servidor no esté conectado directamente a Internet. 
+ +Para abrir el acceso solo desde localhost, especifique: + +``` xml +::1 +127.0.0.1 +``` + +### Nombre\_usuario/perfil {#user-nameprofile} + +Puede asignar un perfil de configuración para el usuario. Los perfiles de configuración se configuran en una sección separada del `users.xml` file. Para obtener más información, consulte [Perfiles de configuración](settings_profiles.md). + +### Nombre\_usuario/cuota {#user-namequota} + +Las cuotas le permiten realizar un seguimiento o limitar el uso de recursos durante un período de tiempo. Las cuotas se configuran en el `quotas` +sección de la `users.xml` archivo de configuración. + +Puede asignar un conjunto de cuotas para el usuario. Para obtener una descripción detallada de la configuración de las cuotas, consulte [Cuota](../quotas.md#quotas). + +### nombre\_usuario/bases de datos {#user-namedatabases} + +En esta sección, puede limitar las filas devueltas por ClickHouse para `SELECT` consultas realizadas por el usuario actual, implementando así la seguridad básica a nivel de fila. + +**Ejemplo** + +La siguiente configuración obliga a que el usuario `user1` sólo puede ver las filas de `table1` como resultado de `SELECT` Consultas, donde el valor de la `id` campo es 1000. + +``` xml + + + + + id = 1000 + + + + +``` + +El `filter` puede ser cualquier expresión que resulte en un [UInt8](../../data_types/int_uint.md)-tipo de valor. Por lo general, contiene comparaciones y operadores lógicos. Filas de `database_name.table1` donde los resultados del filtro a 0 no se devuelven para este usuario. El filtrado es incompatible con `PREWHERE` operaciones y desactiva `WHERE→PREWHERE` optimización. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/settings/settings_users/) diff --git a/docs/es/operations/system_tables.md b/docs/es/operations/system_tables.md new file mode 100644 index 00000000000..b77b0c21a4a --- /dev/null +++ b/docs/es/operations/system_tables.md @@ -0,0 +1,1047 @@ +# Tablas del sistema {#system-tables} + +Las tablas del sistema se utilizan para implementar parte de la funcionalidad del sistema y para proporcionar acceso a información sobre cómo funciona el sistema. +No puede eliminar una tabla del sistema (pero puede realizar DETACH). +Las tablas del sistema no tienen archivos con datos en el disco o archivos con metadatos. El servidor crea todas las tablas del sistema cuando se inicia. +Las tablas del sistema son de solo lectura. +Están ubicados en el ‘system’ basar. + +## sistema.asynchronous\_metrics {#system-tables-asynchronous-metrics} + +Contiene métricas que se calculan periódicamente en segundo plano. Por ejemplo, la cantidad de RAM en uso. + +Columna: + +- `metric` ([Cadena](../data_types/string.md)) — Nombre métrico. +- `value` ([Float64](../data_types/float.md)) — Valor métrico. + +**Ejemplo** + +``` sql +SELECT * FROM system.asynchronous_metrics LIMIT 10 +``` + +``` text +┌─metric──────────────────────────────────┬──────value─┐ +│ jemalloc.background_thread.run_interval │ 0 │ +│ jemalloc.background_thread.num_runs │ 0 │ +│ jemalloc.background_thread.num_threads │ 0 │ +│ jemalloc.retained │ 422551552 │ +│ jemalloc.mapped │ 1682989056 │ +│ jemalloc.resident │ 1656446976 │ +│ jemalloc.metadata_thp │ 0 │ +│ jemalloc.metadata │ 10226856 │ +│ UncompressedCacheCells │ 0 │ +│ MarkCacheFiles │ 0 │ +└─────────────────────────────────────────┴────────────┘ +``` + +**Ver también** + +- [Monitoreo](monitoring.md) — Conceptos básicos de monitoreo ClickHouse. 
+- [sistema.métricas](#system_tables-metrics) - Contiene métricas calculadas al instante. +- [sistema.evento](#system_tables-events) — Contiene una serie de eventos que han ocurrido. +- [sistema.metric\_log](#system_tables-metric_log) — Contiene un historial de valores de métricas de tablas `system.metrics` , . `system.events`. + +## sistema.Cluster {#system-clusters} + +Contiene información sobre los clústeres disponibles en el archivo de configuración y los servidores que contienen. + +Columna: + +- `cluster` (String) — El nombre del clúster. +- `shard_num` (UInt32) — El número de fragmento en el clúster, a partir de 1. +- `shard_weight` (UInt32) — El peso relativo del fragmento al escribir datos. +- `replica_num` (UInt32) — El número de réplica en el fragmento, a partir de 1. +- `host_name` (String) — El nombre de host, como se especifica en la configuración. +- `host_address` (String) — La dirección IP del host obtenida de DNS. +- `port` (UInt16): el puerto que se utiliza para conectarse al servidor. +- `user` (String) — El nombre del usuario para conectarse al servidor. +- `errors_count` (UInt32): número de veces que este host no pudo alcanzar la réplica. +- `estimated_recovery_time` (UInt32): quedan segundos hasta que el recuento de errores de réplica se ponga a cero y se considere que vuelve a la normalidad. + +Tenga en cuenta que `errors_count` se actualiza una vez por consulta al clúster, pero `estimated_recovery_time` se vuelve a calcular bajo demanda. Entonces podría haber un caso distinto de cero `errors_count` y cero `estimated_recovery_time`, esa próxima consulta será cero `errors_count` e intente usar la réplica como si no tuviera errores. + +**Ver también** + +- [Motor de tabla distribuido](table_engines/distributed.md) +- [distributed\_replica\_error\_cap configuración](settings/settings.md#settings-distributed_replica_error_cap) +- [distributed\_replica\_error\_half\_life configuración](settings/settings.md#settings-distributed_replica_error_half_life) + +## sistema.columna {#system-columns} + +Contiene información sobre las columnas de todas las tablas. + +Puede utilizar esta tabla para obtener información similar a la [TABLA DE DESCRIBE](../query_language/misc.md#misc-describe-table) Consulta, pero para varias tablas a la vez. + +El `system.columns` tabla contiene las siguientes columnas (el tipo de columna se muestra entre corchetes): + +- `database` (String) — Nombre de la base de datos. +- `table` (Cadena) — Nombre de tabla. +- `name` (Cadena) — Nombre de columna. +- `type` (Cadena) — Tipo de columna. +- `default_kind` (String) — Tipo de expresión (`DEFAULT`, `MATERIALIZED`, `ALIAS`) para el valor predeterminado, o una cadena vacía si no está definida. +- `default_expression` (String) — Expresión para el valor predeterminado, o una cadena vacía si no está definida. +- `data_compressed_bytes` (UInt64): el tamaño de los datos comprimidos, en bytes. +- `data_uncompressed_bytes` (UInt64): el tamaño de los datos descomprimidos, en bytes. +- `marks_bytes` (UInt64) — El tamaño de las marcas, en bytes. +- `comment` (Cadena): comenta la columna o una cadena vacía si no está definida. +- `is_in_partition_key` (UInt8): marca que indica si la columna está en la expresión de partición. +- `is_in_sorting_key` (UInt8): marca que indica si la columna está en la expresión de clave de ordenación. +- `is_in_primary_key` (UInt8): marca que indica si la columna está en la expresión de clave principal. 
+- `is_in_sampling_key` (UInt8): marca que indica si la columna está en la expresión de clave de muestreo. + +## sistema.colaborador {#system-contributors} + +Contiene información sobre los colaboradores. Todos los constributores en orden aleatorio. El orden es aleatorio en el momento de la ejecución de la consulta. + +Columna: + +- `name` (Cadena) - Nombre del colaborador (autor) del git log. + +**Ejemplo** + +``` sql +SELECT * FROM system.contributors LIMIT 10 +``` + +``` text +┌─name─────────────┐ +│ Olga Khvostikova │ +│ Max Vetrov │ +│ LiuYangkuan │ +│ svladykin │ +│ zamulla │ +│ Šimon Podlipský │ +│ BayoNet │ +│ Ilya Khomutov │ +│ Amy Krishnevsky │ +│ Loud_Scream │ +└──────────────────┘ +``` + +Para descubrirlo en la tabla, use una consulta: + +``` sql +SELECT * FROM system.contributors WHERE name='Olga Khvostikova' +``` + +``` text +┌─name─────────────┐ +│ Olga Khvostikova │ +└──────────────────┘ +``` + +## sistema.basar {#system-databases} + +Esta tabla contiene una sola columna String llamada ‘name’ – el nombre de una base de datos. +Cada base de datos que el servidor conoce tiene una entrada correspondiente en la tabla. +Esta tabla del sistema se utiliza para implementar el `SHOW DATABASES` consulta. + +## sistema.detached\_parts {#system-tables-detached-parts} + +Contiene información sobre piezas separadas de [Método de codificación de datos:](table_engines/mergetree.md) tabla. El `reason` columna especifica por qué se separó la pieza. Para las piezas separadas por el usuario, el motivo está vacío. Tales partes se pueden unir con [ALTER TABLE ATTACH PARTITION\|PARTE](../query_language/query_language/alter/#alter_attach-partition) comando. Para obtener la descripción de otras columnas, consulte [sistema.parte](#system_tables-parts). Si el nombre de la pieza no es válido, los valores de algunas columnas pueden ser `NULL`. Tales partes se pueden eliminar con [ALTER MESA GOTA PARTE DESMONTADA](../query_language/query_language/alter/#alter_drop-detached). + +## sistema.Diccionario {#system-dictionaries} + +Contiene información sobre diccionarios externos. + +Columna: + +- `name` (Cadena) — Nombre del diccionario. +- `type` (Cadena) - Tipo de diccionario: plano, hash, caché. +- `origin` (String) — Ruta de acceso al archivo de configuración que describe el diccionario. +- `attribute.names` (Array(String)) — Matriz de nombres de atributos proporcionados por el diccionario. +- `attribute.types` (Array(String)) — Matriz correspondiente de tipos de atributos que proporciona el diccionario. +- `has_hierarchy` (UInt8) - Si el diccionario es jerárquico. +- `bytes_allocated` (UInt64) - La cantidad de RAM que usa el diccionario. +- `hit_rate` (Float64): para los diccionarios de caché, el porcentaje de usos para los que el valor estaba en la caché. +- `element_count` (UInt64) — El número de elementos almacenados en el diccionario. +- `load_factor` (Float64): el porcentaje rellenado en el diccionario (para un diccionario hash, el porcentaje rellenado en la tabla hash). +- `creation_time` (DateTime): la hora en que se creó el diccionario o se recargó correctamente por última vez. +- `last_exception` (Cadena) — Texto del error que se produce al crear o volver a cargar el diccionario si no se pudo crear el diccionario. +- `source` (String) — Texto que describe el origen de datos para el diccionario. + +Tenga en cuenta que la cantidad de memoria utilizada por el diccionario no es proporcional a la cantidad de elementos almacenados en él. 
Por lo tanto, para los diccionarios planos y en caché, todas las celdas de memoria se asignan previamente, independientemente de qué tan lleno esté realmente el diccionario. + +## sistema.evento {#system-tables-events} + +Contiene información sobre el número de eventos que se han producido en el sistema. Por ejemplo, en la tabla, puede encontrar cuántos `SELECT` las consultas se procesaron desde que se inició el servidor ClickHouse. + +Columna: + +- `event` ([Cadena](../data_types/string.md)) — Nombre del evento. +- `value` ([UInt64](../data_types/int_uint.md)) — Número de eventos ocurridos. +- `description` ([Cadena](../data_types/string.md)) — Descripción del evento. + +**Ejemplo** + +``` sql +SELECT * FROM system.events LIMIT 5 +``` + +``` text +┌─event─────────────────────────────────┬─value─┬─description────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ Query │ 12 │ Number of queries to be interpreted and potentially executed. Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries. │ +│ SelectQuery │ 8 │ Same as Query, but only for SELECT queries. │ +│ FileOpen │ 73 │ Number of files opened. │ +│ ReadBufferFromFileDescriptorRead │ 155 │ Number of reads (read/pread) from a file descriptor. Does not include sockets. │ +│ ReadBufferFromFileDescriptorReadBytes │ 9931 │ Number of bytes read from file descriptors. If the file is compressed, this will show the compressed data size. │ +└───────────────────────────────────────┴───────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +**Ver también** + +- [sistema.asynchronous\_metrics](#system_tables-asynchronous_metrics) — Contiene métricas calculadas periódicamente. +- [sistema.métricas](#system_tables-metrics) - Contiene métricas calculadas al instante. +- [sistema.metric\_log](#system_tables-metric_log) — Contiene un historial de valores de métricas de tablas `system.metrics` , . `system.events`. +- [Monitoreo](monitoring.md) — Conceptos básicos de monitoreo ClickHouse. + +## sistema.función {#system-functions} + +Contiene información sobre funciones normales y agregadas. + +Columna: + +- `name`(`String`) – El nombre de la función. +- `is_aggregate`(`UInt8`) — Si la función es agregada. + +## sistema.graphite\_retentions {#system-graphite-retentions} + +Contiene información sobre los parámetros [graphite\_rollup](server_settings/settings.md#server_settings-graphite_rollup) que se utilizan en tablas con [\*GraphiteMergeTree](table_engines/graphitemergetree.md) motor. + +Columna: + +- `config_name` (Cadena) - `graphite_rollup` nombre del parámetro. +- `regexp` (Cadena) - Un patrón para el nombre de la métrica. +- `function` (String) - El nombre de la función de agregación. +- `age` (UInt64) - La edad mínima de los datos en segundos. +- `precision` (UInt64) - Cómo definir con precisión la edad de los datos en segundos. +- `priority` (UInt16) - Prioridad de patrón. +- `is_default` (UInt8) - Si el patrón es el predeterminado. 
+- `Tables.database` (Array(String)) - Matriz de nombres de tablas de base de datos que utilizan `config_name` parámetro. +- `Tables.table` (Array(String)) - Matriz de nombres de tablas que utilizan `config_name` parámetro. + +## sistema.Fusionar {#system-merges} + +Contiene información sobre fusiones y mutaciones de piezas actualmente en proceso para tablas de la familia MergeTree. + +Columna: + +- `database` (String) — El nombre de la base de datos en la que se encuentra la tabla. +- `table` (Cadena) — Nombre de tabla. +- `elapsed` (Float64) — El tiempo transcurrido (en segundos) desde que se inició la fusión. +- `progress` (Float64) — El porcentaje de trabajo completado de 0 a 1. +- `num_parts` (UInt64) — El número de piezas que se fusionarán. +- `result_part_name` (Cadena) — El nombre de la parte que se formará como resultado de la fusión. +- `is_mutation` (UInt8) - 1 si este proceso es una mutación parte. +- `total_size_bytes_compressed` (UInt64): el tamaño total de los datos comprimidos en los fragmentos combinados. +- `total_size_marks` (UInt64) — Número total de marcas en las partes fusionadas. +- `bytes_read_uncompressed` (UInt64) — Número de bytes leídos, sin comprimir. +- `rows_read` (UInt64) — Número de filas leídas. +- `bytes_written_uncompressed` (UInt64) — Número de bytes escritos, sin comprimir. +- `rows_written` (UInt64) — Número de filas escritas. + +## sistema.métricas {#system-tables-metrics} + +Contiene métricas que pueden calcularse instantáneamente o tener un valor actual. Por ejemplo, el número de consultas procesadas simultáneamente o el retraso de réplica actual. Esta tabla está siempre actualizada. + +Columna: + +- `metric` ([Cadena](../data_types/string.md)) — Nombre métrico. +- `value` ([Int64](../data_types/int_uint.md)) — Valor métrico. +- `description` ([Cadena](../data_types/string.md)) — Descripción métrica. + +La lista de métricas admitidas que puede encontrar en el [dbms/src/Common/CurrentMetrics.cpp](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Common/CurrentMetrics.cpp) archivo fuente de ClickHouse. + +**Ejemplo** + +``` sql +SELECT * FROM system.metrics LIMIT 10 +``` + +``` text +┌─metric─────────────────────┬─value─┬─description──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ Query │ 1 │ Number of executing queries │ +│ Merge │ 0 │ Number of executing background merges │ +│ PartMutation │ 0 │ Number of mutations (ALTER DELETE/UPDATE) │ +│ ReplicatedFetch │ 0 │ Number of data parts being fetched from replicas │ +│ ReplicatedSend │ 0 │ Number of data parts being sent to replicas │ +│ ReplicatedChecks │ 0 │ Number of data parts checking for consistency │ +│ BackgroundPoolTask │ 0 │ Number of active tasks in BackgroundProcessingPool (merges, mutations, fetches, or replication queue bookkeeping) │ +│ BackgroundSchedulePoolTask │ 0 │ Number of active tasks in BackgroundSchedulePool. This pool is used for periodic ReplicatedMergeTree tasks, like cleaning old data parts, altering data parts, replica re-initialization, etc. │ +│ DiskSpaceReservedForMerge │ 0 │ Disk space reserved for currently running background merges. It is slightly more than the total size of currently merging parts. │ +│ DistributedSend │ 0 │ Number of connections to remote servers sending data that was INSERTed into Distributed tables. Both synchronous and asynchronous mode. 
│
+└────────────────────────────┴───────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+```
+
+**Ver también**
+
+- [sistema.asynchronous\_metrics](#system_tables-asynchronous_metrics) — Contiene métricas calculadas periódicamente.
+- [sistema.evento](#system_tables-events) — Contiene una serie de eventos que ocurrieron.
+- [sistema.metric\_log](#system_tables-metric_log) — Contiene un historial de valores de métricas de las tablas `system.metrics` y `system.events`.
+- [Monitoreo](monitoring.md) — Conceptos básicos de monitoreo ClickHouse.
+
+## sistema.metric\_log {#system-tables-metric-log}
+
+Contiene el historial de valores de métricas de las tablas `system.metrics` y `system.events`, que se vuelca periódicamente al disco.
+Para activar la recopilación del historial de métricas en `system.metric_log`, cree `/etc/clickhouse-server/config.d/metric_log.xml` con el siguiente contenido:
+
+``` xml
+<yandex>
+    <metric_log>
+        <database>system</database>
+        <table>metric_log</table>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+        <collect_interval_milliseconds>1000</collect_interval_milliseconds>
+    </metric_log>
+</yandex>
+``` + +**Ejemplo** + +``` sql +SELECT * FROM system.metric_log LIMIT 1 FORMAT Vertical; +``` + +``` text +Row 1: +────── +event_date: 2020-02-18 +event_time: 2020-02-18 07:15:33 +milliseconds: 554 +ProfileEvent_Query: 0 +ProfileEvent_SelectQuery: 0 +ProfileEvent_InsertQuery: 0 +ProfileEvent_FileOpen: 0 +ProfileEvent_Seek: 0 +ProfileEvent_ReadBufferFromFileDescriptorRead: 1 +ProfileEvent_ReadBufferFromFileDescriptorReadFailed: 0 +ProfileEvent_ReadBufferFromFileDescriptorReadBytes: 0 +ProfileEvent_WriteBufferFromFileDescriptorWrite: 1 +ProfileEvent_WriteBufferFromFileDescriptorWriteFailed: 0 +ProfileEvent_WriteBufferFromFileDescriptorWriteBytes: 56 +... +CurrentMetric_Query: 0 +CurrentMetric_Merge: 0 +CurrentMetric_PartMutation: 0 +CurrentMetric_ReplicatedFetch: 0 +CurrentMetric_ReplicatedSend: 0 +CurrentMetric_ReplicatedChecks: 0 +... +``` + +**Ver también** + +- [sistema.asynchronous\_metrics](#system_tables-asynchronous_metrics) — Contiene métricas calculadas periódicamente. +- [sistema.evento](#system_tables-events) — Contiene una serie de eventos que ocurrieron. +- [sistema.métricas](#system_tables-metrics) - Contiene métricas calculadas al instante. +- [Monitoreo](monitoring.md) — Conceptos básicos de monitoreo ClickHouse. + +## sistema.número {#system-numbers} + +Esta tabla contiene una única columna UInt64 llamada ‘number’ que contiene casi todos los números naturales a partir de cero. +Puede usar esta tabla para pruebas, o si necesita hacer una búsqueda de fuerza bruta. +Las lecturas de esta tabla no están paralelizadas. + +## sistema.Números\_mt {#system-numbers-mt} + +Lo mismo que ‘system.numbers’ pero las lecturas están paralelizadas. Los números se pueden devolver en cualquier orden. +Se utiliza para pruebas. + +## sistema.una {#system-one} + +Esta tabla contiene una sola fila con una ‘dummy’ Columna UInt8 que contiene el valor 0. +Esta tabla se utiliza si una consulta SELECT no especifica la cláusula FROM. +Esto es similar a la tabla DUAL que se encuentra en otros DBMS. + +## sistema.parte {#system-tables-parts} + +Contiene información sobre partes de [Método de codificación de datos:](table_engines/mergetree.md) tabla. + +Cada fila describe una parte de datos. + +Columna: + +- `partition` (Cadena) – el nombre de La partición. Para saber qué es una partición, consulte la descripción del [ALTERAR](../query_language/alter.md#query_language_queries_alter) consulta. + + Formato: + + - `YYYYMM` para la partición automática por mes. + - `any_string` al particionar manualmente. + +- `name` (`String`) – Nombre de la parte de datos. + +- `active` (`UInt8`) – Indicador que indica si la parte de datos está activa. Si un elemento de datos está activo, se utiliza en una tabla. De lo contrario, se elimina. Las partes de datos inactivas permanecen después de la fusión. + +- `marks` (`UInt64`) – El número de puntos. Para obtener el número aproximado de filas en una parte de datos, multiplique `marks` por la granularidad del índice (generalmente 8192) (esta sugerencia no funciona para la granularidad adaptativa). + +- `rows` (`UInt64`) – El número de filas. + +- `bytes_on_disk` (`UInt64`) – Tamaño total de todos los archivos de parte de datos en bytes. + +- `data_compressed_bytes` (`UInt64`) – Tamaño total de los datos comprimidos en la parte de datos. Todos los archivos auxiliares (por ejemplo, archivos con marcas) no están incluidos. + +- `data_uncompressed_bytes` (`UInt64`) – Tamaño total de los datos sin comprimir en la parte de datos. 
Todos los archivos auxiliares (por ejemplo, archivos con marcas) no están incluidos. + +- `marks_bytes` (`UInt64`) – El tamaño del archivo con marcas. + +- `modification_time` (`DateTime`) – La hora en que se modificó el directorio con la parte de datos. Esto normalmente corresponde a la hora de creación del elemento de datos.\| + +- `remove_time` (`DateTime`) – El momento en que la parte de datos quedó inactiva. + +- `refcount` (`UInt32`) – El número de lugares donde se utiliza la parte de datos. Un valor mayor que 2 indica que el elemento de datos se utiliza en consultas o fusiones. + +- `min_date` (`Date`) – El valor mínimo de la clave de fecha en la parte de datos. + +- `max_date` (`Date`) – El valor máximo de la clave de fecha en la parte de datos. + +- `min_time` (`DateTime`) – El valor mínimo de la clave de fecha y hora en la parte de datos. + +- `max_time`(`DateTime`) – El valor máximo de la clave de fecha y hora en la parte de datos. + +- `partition_id` (`String`) – ID de la partición. + +- `min_block_number` (`UInt64`) – El número mínimo de partes de datos que componen la parte actual después de la fusión. + +- `max_block_number` (`UInt64`) – El número máximo de partes de datos que componen la parte actual después de la fusión. + +- `level` (`UInt32`) – Profundidad del árbol de fusión. Cero significa que la parte actual se creó mediante inserción en lugar de fusionar otras partes. + +- `data_version` (`UInt64`) – Número que se utiliza para determinar qué mutaciones se deben aplicar a la parte de datos (mutaciones con una versión superior a `data_version`). + +- `primary_key_bytes_in_memory` (`UInt64`) – La cantidad de memoria (en bytes) utilizada por los valores de clave primaria. + +- `primary_key_bytes_in_memory_allocated` (`UInt64`) – La cantidad de memoria (en bytes) reservada para los valores de clave primaria. + +- `is_frozen` (`UInt8`) – Indicador que muestra que existe una copia de seguridad de datos de partición. 1, la copia de seguridad existe. 0, la copia de seguridad no existe. Para obtener más información, consulte [CONGELAR PARTICIÓN](../query_language/alter.md#alter_freeze-partition) + +- `database` (`String`) – Nombre de la base de datos. + +- `table` (`String`) – Nombre de la tabla. + +- `engine` (`String`) – Nombre del motor de tabla sin parámetros. + +- `path` (`String`) – Ruta absoluta a la carpeta con archivos de parte de datos. + +- `disk` (`String`) – Nombre de un disco que almacena la parte de datos. + +- `hash_of_all_files` (`String`) – [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) de archivos comprimidos. + +- `hash_of_uncompressed_files` (`String`) – [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) de archivos sin comprimir (archivos con marcas, archivo de índice, etc.). + +- `uncompressed_hash_of_compressed_files` (`String`) – [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) de datos en los archivos comprimidos como si estuvieran descomprimidos. + +- `bytes` (`UInt64`) – Alias para `bytes_on_disk`. + +- `marks_size` (`UInt64`) – Alias para `marks_bytes`. + +## sistema.part\_log {#system-tables-part-log} + +El `system.part_log` se crea sólo si el [part\_log](server_settings/settings.md#server_settings-part-log) se especifica la configuración del servidor. 
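+
+Una vez habilitada esa configuración y creada la tabla (tras la primera inserción en una tabla `MergeTree`, como se indica más abajo), puede consultarse como cualquier otra tabla del sistema. Un esbozo ilustrativo; las columnas utilizadas se describen a continuación:
+
+``` sql
+-- Últimos eventos registrados sobre partes de datos (inserciones, fusiones, etc.)
+SELECT event_time, event_type, database, table, part_name, rows, size_in_bytes
+FROM system.part_log
+ORDER BY event_time DESC
+LIMIT 10;
+```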
+ +Esta tabla contiene información sobre eventos que ocurrieron con [partes de datos](table_engines/custom_partitioning_key.md) es el [Método de codificación de datos:](table_engines/mergetree.md) tablas familiares, como agregar o fusionar datos. + +El `system.part_log` contiene las siguientes columnas: + +- `event_type` (Enum) — Tipo del evento que ocurrió con la parte de datos. Puede tener uno de los siguientes valores: + - `NEW_PART` — Inserción de una nueva parte de datos. + - `MERGE_PARTS` — Fusión de partes de datos. + - `DOWNLOAD_PART` — Descarga de una parte de datos. + - `REMOVE_PART` — Extracción o separación de una parte de datos mediante [DETACH PARTITION](../query_language/alter.md#alter_detach-partition). + - `MUTATE_PART` — Mutación de una parte de datos. + - `MOVE_PART` — Mover la parte de datos de un disco a otro. +- `event_date` (Fecha) — fecha del Evento. +- `event_time` (DateTime) — Hora del evento. +- `duration_ms` (UInt64) — Duración. +- `database` (String) — Nombre de la base de datos en la que se encuentra la parte de datos. +- `table` (String) — Nombre de la tabla en la que se encuentra la parte de datos. +- `part_name` (String) — Nombre de la parte de datos. +- `partition_id` (String) — ID de la partición en la que se insertó la parte de datos. La columna toma el ‘all’ valor si la partición es por `tuple()`. +- `rows` (UInt64): el número de filas en la parte de datos. +- `size_in_bytes` (UInt64) — Tamaño de la parte de datos en bytes. +- `merged_from` (Array(String)) - Una matriz de nombres de las partes de las que se componía la parte actual (después de la fusión). +- `bytes_uncompressed` (UInt64): tamaño de bytes sin comprimir. +- `read_rows` (UInt64): el número de filas que se leyó durante la fusión. +- `read_bytes` (UInt64): el número de bytes que se leyeron durante la fusión. +- `error` (UInt16) — El número de código del error ocurrido. +- `exception` (Cadena) — Mensaje de texto del error ocurrido. + +El `system.part_log` se crea después de la primera inserción de datos `MergeTree` tabla. + +## sistema.proceso {#system-tables-processes} + +Esta tabla del sistema se utiliza para implementar el `SHOW PROCESSLIST` consulta. + +Columna: + +- `user` (Cadena): el usuario que realizó la consulta. Tenga en cuenta que para el procesamiento distribuido, las consultas se envían a servidores remotos `default` usuario. El campo contiene el nombre de usuario para una consulta específica, no para una consulta que esta consulta inició. +- `address` (Cadena): la dirección IP desde la que se realizó la solicitud. Lo mismo para el procesamiento distribuido. Para realizar un seguimiento de dónde se hizo originalmente una consulta distribuida, mire `system.processes` en el servidor de solicitud de consulta. +- `elapsed` (Float64): el tiempo en segundos desde que se inició la ejecución de la solicitud. +- `rows_read` (UInt64): el número de filas leídas de la tabla. Para el procesamiento distribuido, en el solicitante servidor, este es el total para todos los servidores remotos. +- `bytes_read` (UInt64): el número de bytes sin comprimir leídos de la tabla. Para el procesamiento distribuido, en el solicitante servidor, este es el total para todos los servidores remotos. +- `total_rows_approx` (UInt64): la aproximación del número total de filas que se deben leer. Para el procesamiento distribuido, en el solicitante servidor, este es el total para todos los servidores remotos. Se puede actualizar durante el procesamiento de solicitudes, cuando se conozcan nuevas fuentes para procesar. 
+- `memory_usage` (UInt64): cantidad de RAM que usa la solicitud. Puede que no incluya algunos tipos de memoria dedicada. Ver el [Método de codificación de datos:](../operations/settings/query_complexity.md#settings_max_memory_usage) configuración. +- `query` (Cadena) – el texto de La consulta. Para `INSERT`, no incluye los datos para insertar. +- `query_id` (Cadena): ID de consulta, si se define. + +## sistema.text\_log {#system-tables-text-log} + +Contiene entradas de registro. El nivel de registro que va a esta tabla se puede limitar con `text_log.level` configuración del servidor. + +Columna: + +- `event_date` (`Date`) - Fecha de la entrada. +- `event_time` (`DateTime`) - Hora de la entrada. +- `microseconds` (`UInt32`) - Microsegundos de la entrada. +- `thread_name` (Cadena) — Nombre del subproceso desde el que se realizó el registro. +- `thread_id` (UInt64) - ID de subproceso del sistema operativo. +- `level` (`Enum8`) - Nivel de entrada. + - `'Fatal' = 1` + - `'Critical' = 2` + - `'Error' = 3` + - `'Warning' = 4` + - `'Notice' = 5` + - `'Information' = 6` + - `'Debug' = 7` + - `'Trace' = 8` +- `query_id` (`String`) - ID de la consulta. +- `logger_name` (`LowCardinality(String)`) - Nombre del registrador (es decir, `DDLWorker`) +- `message` (`String`) - El mensaje en sí. +- `revision` (`UInt32`) - Revisión de ClickHouse. +- `source_file` (`LowCardinality(String)`) - Archivo de origen desde el que se realizó el registro. +- `source_line` (`UInt64`) - Línea de origen desde la que se realizó el registro. + +## sistema.query\_log {#system-tables-query-log} + +Contiene información sobre la ejecución de consultas. Para cada consulta, puede ver la hora de inicio del procesamiento, la duración del procesamiento, los mensajes de error y otra información. + +!!! note "Nota" + La tabla no contiene datos de entrada para `INSERT` consulta. + +ClickHouse crea esta tabla sólo si el [query\_log](server_settings/settings.md#server_settings-query-log) se especifica el parámetro server. Este parámetro establece las reglas de registro, como el intervalo de registro o el nombre de la tabla en la que se registrarán las consultas. + +Para habilitar el registro de consultas, [Log\_queries](settings/settings.md#settings-log-queries) parámetro a 1. Para obtener más información, consulte el [Configuración](settings/settings.md) apartado. + +El `system.query_log` tabla registra dos tipos de consultas: + +1. Consultas iniciales ejecutadas directamente por el cliente. +2. Consultas secundarias iniciadas por otras consultas (para la ejecución de consultas distribuidas). Para estos tipos de consultas, la información sobre las consultas principales se muestra en el `initial_*` columna. + +Columna: + +- `type` (`Enum8`) — Tipo de evento que se produjo al ejecutar la consulta. Valor: + - `'QueryStart' = 1` — Inicio exitoso de la ejecución de la consulta. + - `'QueryFinish' = 2` — Final exitoso de la ejecución de la consulta. + - `'ExceptionBeforeStart' = 3` — Excepción antes del inicio de la ejecución de la consulta. + - `'ExceptionWhileProcessing' = 4` — Excepción durante la ejecución de la consulta. +- `event_date` (Fecha) — Fecha de inicio de la consulta. +- `event_time` (DateTime) — Hora de inicio de la consulta. +- `query_start_time` (DateTime) — Hora de inicio de la ejecución de la consulta. +- `query_duration_ms` (UInt64) — Duración de la ejecución de la consulta. +- `read_rows` (UInt64) — Número de filas leídas. +- `read_bytes` (UInt64) — Número de bytes leídos. 
+- `written_rows` (UInt64) — Para `INSERT` consultas, el número de filas escritas. Para otras consultas, el valor de la columna es 0. +- `written_bytes` (UInt64) — Para `INSERT` consultas, el número de bytes escritos. Para otras consultas, el valor de la columna es 0. +- `result_rows` (UInt64) — Número de filas en el resultado. +- `result_bytes` (UInt64) — Número de bytes en el resultado. +- `memory_usage` (UInt64) — Consumo de memoria por la consulta. +- `query` (Cadena) — Cadena de consulta. +- `exception` (String) — Mensaje de excepción. +- `stack_trace` (String) - Rastreo de pila (una lista de métodos llamados antes de que ocurriera el error). Una cadena vacía, si la consulta se completa correctamente. +- `is_initial_query` (UInt8) — Tipo de consulta. Valores posibles: + - 1 — La consulta fue iniciada por el cliente. + - 0 — La consulta fue iniciada por otra consulta para la ejecución de consultas distribuidas. +- `user` (String) — Nombre del usuario que inició la consulta actual. +- `query_id` (String) — ID de la consulta. +- `address` (IPv6): dirección IP que se utilizó para realizar la consulta. +- `port` (UInt16): el puerto de cliente que se utilizó para realizar la consulta. +- `initial_user` (String) — Nombre del usuario que ejecutó la consulta inicial (para la ejecución de consultas distribuidas). +- `initial_query_id` (String) — ID de la consulta inicial (para la ejecución de consultas distribuidas). +- `initial_address` (IPv6): dirección IP desde la que se inició la consulta principal. +- `initial_port` (UInt16): el puerto de cliente que se utilizó para realizar la consulta principal. +- `interface` (UInt8): interfaz desde la que se inició la consulta. Valores posibles: + - 1 — TCP. + - 2 — HTTP. +- `os_user` (Cadena) — Nombre de usuario del sistema operativo que ejecuta [Casa de clics-cliente](../interfaces/cli.md). +- `client_hostname` (String) — Nombre de host de la máquina cliente donde [Casa de clics-cliente](../interfaces/cli.md) o se ejecuta otro cliente TCP. +- `client_name` (Cadena) — El [Casa de clics-cliente](../interfaces/cli.md) o otro nombre de cliente TCP. +- `client_revision` (UInt32) — Revisión del [Casa de clics-cliente](../interfaces/cli.md) o otro cliente TCP. +- `client_version_major` (UInt32) — Versión principal del [Casa de clics-cliente](../interfaces/cli.md) o otro cliente TCP. +- `client_version_minor` (UInt32) — Versión menor de la [Casa de clics-cliente](../interfaces/cli.md) o otro cliente TCP. +- `client_version_patch` (UInt32) — Componente de parche del [Casa de clics-cliente](../interfaces/cli.md) o otra versión de cliente TCP. +- `http_method` (UInt8): método HTTP que inició la consulta. Valores posibles: + - 0 — La consulta se inició desde la interfaz TCP. + - Uno — `GET` se utilizó el método. + - Cómo hacer — `POST` se utilizó el método. +- `http_user_agent` (Cadena) — El `UserAgent` encabezado pasado en la solicitud HTTP. +- `quota_key` (Cadena) — El “quota key” especificado en el [cuota](quotas.md) ajuste (ver `keyed`). +- `revision` (UInt32) - Revisión de ClickHouse. +- `thread_numbers` (Array(UInt32)) — Número de subprocesos que participan en la ejecución de la consulta. +- `ProfileEvents.Names` (Array(String)) — Contadores que miden diferentes métricas. La descripción de ellos se puede encontrar en la tabla [sistema.evento](#system_tables-events) +- `ProfileEvents.Values` (Array(UInt64)) — Valores de las métricas que se enumeran en `ProfileEvents.Names` columna. 
+- `Settings.Names` (Array(String)) — Nombres de la configuración que se cambiaron cuando el cliente ejecutó la consulta. Para habilitar los cambios de registro en la configuración, `log_query_settings` parámetro a 1. +- `Settings.Values` (Array(String)) — Valores de configuración que se enumeran en el `Settings.Names` columna. + +Cada consulta crea una o dos filas en el `query_log` tabla, dependiendo del estado de la consulta: + +1. Si la ejecución de la consulta se realiza correctamente, se crean dos eventos con los tipos 1 y 2 (consulte `type` columna). +2. Si se produjo un error durante el procesamiento de la consulta, se crean dos eventos con los tipos 1 y 4. +3. Si se produjo un error antes de iniciar la consulta, se crea un solo evento con el tipo 3. + +De forma predeterminada, los registros se agregan a la tabla a intervalos de 7,5 segundos. Puede establecer este intervalo en el [query\_log](server_settings/settings.md#server_settings-query-log) configuración del servidor (consulte el `flush_interval_milliseconds` parámetro). Para vaciar los registros a la fuerza desde el búfer de memoria a la tabla, utilice `SYSTEM FLUSH LOGS` consulta. + +Cuando la tabla se elimina manualmente, se creará automáticamente sobre la marcha. Tenga en cuenta que se eliminarán todos los registros anteriores. + +!!! note "Nota" + El período de almacenamiento para los registros es ilimitado. Los registros no se eliminan automáticamente de la tabla. Debe organizar la eliminación de registros obsoletos usted mismo. + +Puede especificar una clave de partición arbitraria `system.query_log` mesa en el [query\_log](server_settings/settings.md#server_settings-query-log) configuración del servidor (consulte el `partition_by` parámetro). + +## sistema.Sistema abierto. {#system-tables-query-thread-log} + +La tabla contiene información sobre cada subproceso de ejecución de consultas. + +ClickHouse crea esta tabla sólo si el [Sistema abierto.](server_settings/settings.md#server_settings-query-thread-log) se especifica el parámetro server. Este parámetro establece las reglas de registro, como el intervalo de registro o el nombre de la tabla en la que se registrarán las consultas. + +Para habilitar el registro de consultas, [Log\_query\_threads](settings/settings.md#settings-log-query-threads) parámetro a 1. Para obtener más información, consulte el [Configuración](settings/settings.md) apartado. + +Columna: + +- `event_date` (Fecha) — la fecha en que el subproceso ha finalizado la ejecución de la consulta. +- `event_time` (DateTime) — la fecha y hora en que el subproceso ha finalizado la ejecución de la consulta. +- `query_start_time` (DateTime) — Hora de inicio de la ejecución de la consulta. +- `query_duration_ms` (UInt64) — Duración de la ejecución de la consulta. +- `read_rows` (UInt64) — Número de filas leídas. +- `read_bytes` (UInt64) — Número de bytes leídos. +- `written_rows` (UInt64) — Para `INSERT` consultas, el número de filas escritas. Para otras consultas, el valor de la columna es 0. +- `written_bytes` (UInt64) — Para `INSERT` consultas, el número de bytes escritos. Para otras consultas, el valor de la columna es 0. +- `memory_usage` (Int64) - La diferencia entre la cantidad de memoria asignada y liberada en el contexto de este hilo. +- `peak_memory_usage` (Int64) - La diferencia máxima entre la cantidad de memoria asignada y liberada en el contexto de este hilo. +- `thread_name` (String) — Nombre del hilo. +- `thread_number` (UInt32) - ID de rosca interna. 
+- `os_thread_id` (Int32) - ID de subproceso del sistema operativo. +- `master_thread_id` (UInt64) - ID inicial del sistema operativo del hilo inicial. +- `query` (Cadena) — Cadena de consulta. +- `is_initial_query` (UInt8) — Tipo de consulta. Valores posibles: + - 1 — La consulta fue iniciada por el cliente. + - 0 — La consulta fue iniciada por otra consulta para la ejecución de consultas distribuidas. +- `user` (String) — Nombre del usuario que inició la consulta actual. +- `query_id` (String) — ID de la consulta. +- `address` (IPv6): dirección IP que se utilizó para realizar la consulta. +- `port` (UInt16): el puerto de cliente que se utilizó para realizar la consulta. +- `initial_user` (String) — Nombre del usuario que ejecutó la consulta inicial (para la ejecución de consultas distribuidas). +- `initial_query_id` (String) — ID de la consulta inicial (para la ejecución de consultas distribuidas). +- `initial_address` (IPv6): dirección IP desde la que se inició la consulta principal. +- `initial_port` (UInt16): el puerto de cliente que se utilizó para realizar la consulta principal. +- `interface` (UInt8): interfaz desde la que se inició la consulta. Valores posibles: + - 1 — TCP. + - 2 — HTTP. +- `os_user` (Cadena) — Nombre de usuario del sistema operativo que ejecuta [Casa de clics-cliente](../interfaces/cli.md). +- `client_hostname` (String) — Nombre de host de la máquina cliente donde [Casa de clics-cliente](../interfaces/cli.md) o se ejecuta otro cliente TCP. +- `client_name` (Cadena) — El [Casa de clics-cliente](../interfaces/cli.md) o otro nombre de cliente TCP. +- `client_revision` (UInt32) — Revisión del [Casa de clics-cliente](../interfaces/cli.md) o otro cliente TCP. +- `client_version_major` (UInt32) — Versión principal del [Casa de clics-cliente](../interfaces/cli.md) o otro cliente TCP. +- `client_version_minor` (UInt32) — Versión menor de la [Casa de clics-cliente](../interfaces/cli.md) o otro cliente TCP. +- `client_version_patch` (UInt32) — Componente de parche del [Casa de clics-cliente](../interfaces/cli.md) o otra versión de cliente TCP. +- `http_method` (UInt8): método HTTP que inició la consulta. Valores posibles: + - 0 — La consulta se inició desde la interfaz TCP. + - Uno — `GET` se utilizó el método. + - Cómo hacer — `POST` se utilizó el método. +- `http_user_agent` (Cadena) — El `UserAgent` encabezado pasado en la solicitud HTTP. +- `quota_key` (Cadena) — El “quota key” especificado en el [cuota](quotas.md) ajuste (ver `keyed`). +- `revision` (UInt32) - Revisión de ClickHouse. +- `ProfileEvents.Names` (Array(String)) - Contadores que miden diferentes métricas para este hilo. La descripción de ellos se puede encontrar en la tabla [sistema.evento](#system_tables-events) +- `ProfileEvents.Values` (Array(UInt64)) — Valores de métricas para este subproceso que se enumeran en el `ProfileEvents.Names` columna. + +De forma predeterminada, los registros se agregan a la tabla a intervalos de 7,5 segundos. Puede establecer este intervalo en el [Sistema abierto.](server_settings/settings.md#server_settings-query-thread-log) configuración del servidor (consulte el `flush_interval_milliseconds` parámetro). Para vaciar los registros a la fuerza desde el búfer de memoria a la tabla, utilice `SYSTEM FLUSH LOGS` consulta. + +Cuando la tabla se elimina manualmente, se creará automáticamente sobre la marcha. Tenga en cuenta que se eliminarán todos los registros anteriores. + +!!! note "Nota" + El período de almacenamiento para los registros es ilimitado. 
Los registros no se eliminan automáticamente de la tabla. Debe organizar la eliminación de registros obsoletos usted mismo. + +Puede especificar una clave de partición arbitraria `system.query_thread_log` mesa en el [Sistema abierto.](server_settings/settings.md#server_settings-query-thread-log) configuración del servidor (consulte el `partition_by` parámetro). + +## sistema.trace\_log {#system-tables-trace-log} + +Contiene seguimientos de pila recopilados por el generador de perfiles de consultas de muestreo. + +ClickHouse crea esta tabla cuando el [trace\_log](server_settings/settings.md#server_settings-trace_log) se establece la sección de configuración del servidor. También el [query\_profiler\_real\_time\_period\_ns](settings/settings.md#query_profiler_real_time_period_ns) y [Los resultados de la prueba](settings/settings.md#query_profiler_cpu_time_period_ns) los ajustes deben establecerse. + +Para analizar los registros, utilice el `addressToLine`, `addressToSymbol` y `demangle` funciones de inspección. + +Columna: + +- `event_date`([Fecha](../data_types/date.md)) — Fecha del momento del muestreo. + +- `event_time`([FechaHora](../data_types/datetime.md)) — Marca de tiempo del momento de muestreo. + +- `revision`([UInt32](../data_types/int_uint.md)) — Revisión de compilación del servidor ClickHouse. + + Cuando se conecta al servidor por `clickhouse-client`, ves la cadena similar a `Connected to ClickHouse server version 19.18.1 revision 54429.`. Este campo contiene el `revision`, pero no el `version` de un servidor. + +- `timer_type`([Enum8](../data_types/enum.md)) — Tipo de temporizador: + + - `Real` representa el tiempo del reloj de pared. + - `CPU` representa el tiempo de CPU. + +- `thread_number`([UInt32](../data_types/int_uint.md)) — Identificador del subproceso. + +- `query_id`([Cadena](../data_types/string.md)) — Identificador de consulta que se puede utilizar para obtener detalles sobre una consulta que se estaba ejecutando desde el [query\_log](#system_tables-query_log) tabla del sistema. + +- `trace`([Matriz (UInt64)](../data_types/array.md)) — Rastro de apilamiento en el momento del muestreo. Cada elemento es una dirección de memoria virtual dentro del proceso del servidor ClickHouse. + +**Ejemplo** + +``` sql +SELECT * FROM system.trace_log LIMIT 1 \G +``` + +``` text +Row 1: +────── +event_date: 2019-11-15 +event_time: 2019-11-15 15:09:38 +revision: 54428 +timer_type: Real +thread_number: 48 +query_id: acc4d61f-5bd1-4a3e-bc91-2180be37c915 +trace: [94222141367858,94222152240175,94222152325351,94222152329944,94222152330796,94222151449980,94222144088167,94222151682763,94222144088167,94222151682763,94222144088167,94222144058283,94222144059248,94222091840750,94222091842302,94222091831228,94222189631488,140509950166747,140509942945935] +``` + +## sistema.Replica {#system-tables-replicas} + +Contiene información y estado de las tablas replicadas que residen en el servidor local. +Esta tabla se puede utilizar para el monitoreo. La tabla contiene una fila para cada tabla Replicated\*. 
+ +Ejemplo: + +``` sql +SELECT * +FROM system.replicas +WHERE table = 'visits' +FORMAT Vertical +``` + +``` text +Row 1: +────── +database: merge +table: visits +engine: ReplicatedCollapsingMergeTree +is_leader: 1 +can_become_leader: 1 +is_readonly: 0 +is_session_expired: 0 +future_parts: 1 +parts_to_check: 0 +zookeeper_path: /clickhouse/tables/01-06/visits +replica_name: example01-06-1.yandex.ru +replica_path: /clickhouse/tables/01-06/visits/replicas/example01-06-1.yandex.ru +columns_version: 9 +queue_size: 1 +inserts_in_queue: 0 +merges_in_queue: 1 +part_mutations_in_queue: 0 +queue_oldest_time: 2020-02-20 08:34:30 +inserts_oldest_time: 0000-00-00 00:00:00 +merges_oldest_time: 2020-02-20 08:34:30 +part_mutations_oldest_time: 0000-00-00 00:00:00 +oldest_part_to_get: +oldest_part_to_merge_to: 20200220_20284_20840_7 +oldest_part_to_mutate_to: +log_max_index: 596273 +log_pointer: 596274 +last_queue_update: 2020-02-20 08:34:32 +absolute_delay: 0 +total_replicas: 2 +active_replicas: 2 +``` + +Columna: + +- `database` (`String`) - Nombre de la base de datos +- `table` (`String`) - Nombre de la tabla +- `engine` (`String`) - Nombre del motor de tabla +- `is_leader` (`UInt8`) - Si la réplica es la líder. + Sólo una réplica a la vez puede ser el líder. El líder es responsable de seleccionar las fusiones de fondo para realizar. + Tenga en cuenta que las escrituras se pueden realizar en cualquier réplica que esté disponible y tenga una sesión en ZK, independientemente de si es un líder. +- `can_become_leader` (`UInt8`) - Si la réplica puede ser elegida como líder. +- `is_readonly` (`UInt8`) - Si la réplica está en modo de sólo lectura. + Este modo se activa si la configuración no tiene secciones con ZooKeeper, si se produce un error desconocido al reinicializar sesiones en ZooKeeper y durante la reinicialización de sesiones en ZooKeeper. +- `is_session_expired` (`UInt8`) - la sesión con ZooKeeper ha expirado. Básicamente lo mismo que `is_readonly`. +- `future_parts` (`UInt32`) - El número de partes de datos que aparecerán como resultado de INSERTs o fusiones que aún no se han realizado. +- `parts_to_check` (`UInt32`) - El número de partes de datos en la cola para la verificación. Una pieza se coloca en la cola de verificación si existe la sospecha de que podría estar dañada. +- `zookeeper_path` (`String`) - Ruta de acceso a los datos de la tabla en ZooKeeper. +- `replica_name` (`String`) - Nombre de réplica en ZooKeeper. Diferentes réplicas de la misma tabla tienen diferentes nombres. +- `replica_path` (`String`) - Ruta de acceso a los datos de réplica en ZooKeeper. Lo mismo que concatenar ‘zookeeper\_path/replicas/replica\_path’. +- `columns_version` (`Int32`) - Número de versión de la estructura de la tabla. Indica cuántas veces se realizó ALTER. Si las réplicas tienen versiones diferentes, significa que algunas réplicas aún no han realizado todas las ALTER. +- `queue_size` (`UInt32`) - Tamaño de la cola para las operaciones en espera de ser realizadas. Las operaciones incluyen insertar bloques de datos, fusiones y otras acciones. Por lo general, coincide con `future_parts`. +- `inserts_in_queue` (`UInt32`) - Número de inserciones de bloques de datos que deben realizarse. Las inserciones generalmente se replican con bastante rapidez. Si este número es grande, significa que algo anda mal. +- `merges_in_queue` (`UInt32`) - El número de fusiones en espera de hacerse. A veces las fusiones son largas, por lo que este valor puede ser mayor que cero durante mucho tiempo. 
+- `part_mutations_in_queue` (`UInt32`) - El número de mutaciones a la espera de hacerse. +- `queue_oldest_time` (`DateTime`) - Si `queue_size` mayor que 0, muestra cuándo se agregó la operación más antigua a la cola. +- `inserts_oldest_time` (`DateTime`) - Ver `queue_oldest_time` +- `merges_oldest_time` (`DateTime`) - Ver `queue_oldest_time` +- `part_mutations_oldest_time` (`DateTime`) - Ver `queue_oldest_time` + +Las siguientes 4 columnas tienen un valor distinto de cero solo cuando hay una sesión activa con ZK. + +- `log_max_index` (`UInt64`) - Número máximo de inscripción en el registro de actividad general. +- `log_pointer` (`UInt64`) - Número máximo de entrada en el registro de actividad general que la réplica copió en su cola de ejecución, más uno. Si `log_pointer` es mucho más pequeño que `log_max_index`, algo está mal. +- `last_queue_update` (`DateTime`) - Cuando la cola se actualizó la última vez. +- `absolute_delay` (`UInt64`) - ¿Qué tan grande retraso en segundos tiene la réplica actual. +- `total_replicas` (`UInt8`) - El número total de réplicas conocidas de esta tabla. +- `active_replicas` (`UInt8`) - El número de réplicas de esta tabla que tienen una sesión en ZooKeeper (es decir, el número de réplicas en funcionamiento). + +Si solicita todas las columnas, la tabla puede funcionar un poco lentamente, ya que se realizan varias lecturas de ZooKeeper para cada fila. +Si no solicita las últimas 4 columnas (log\_max\_index, log\_pointer, total\_replicas, active\_replicas), la tabla funciona rápidamente. + +Por ejemplo, puede verificar que todo funcione correctamente de esta manera: + +``` sql +SELECT + database, + table, + is_leader, + is_readonly, + is_session_expired, + future_parts, + parts_to_check, + columns_version, + queue_size, + inserts_in_queue, + merges_in_queue, + log_max_index, + log_pointer, + total_replicas, + active_replicas +FROM system.replicas +WHERE + is_readonly + OR is_session_expired + OR future_parts > 20 + OR parts_to_check > 10 + OR queue_size > 20 + OR inserts_in_queue > 10 + OR log_max_index - log_pointer > 10 + OR total_replicas < 2 + OR active_replicas < total_replicas +``` + +Si esta consulta no devuelve nada, significa que todo está bien. + +## sistema.configuración {#system-settings} + +Contiene información sobre la configuración actualmente en uso. +Es decir, se usa para ejecutar la consulta que está utilizando para leer del sistema.tabla de configuración. + +Columna: + +- `name` (Cadena) — Nombre de configuración. +- `value` (Cadena) — Valor de ajuste. +- `changed` (UInt8): si la configuración se definió explícitamente en la configuración o si se cambió explícitamente. + +Ejemplo: + +``` sql +SELECT * +FROM system.settings +WHERE changed +``` + +``` text +┌─name───────────────────┬─value───────┬─changed─┐ +│ max_threads │ 8 │ 1 │ +│ use_uncompressed_cache │ 0 │ 1 │ +│ load_balancing │ random │ 1 │ +│ max_memory_usage │ 10000000000 │ 1 │ +└────────────────────────┴─────────────┴─────────┘ +``` + +## sistema.table\_engines {#system-table-engines} + +Contiene la descripción de los motores de tablas admitidos por el servidor y su información de soporte de características. + +Esta tabla contiene las siguientes columnas (el tipo de columna se muestra entre corchetes): + +- `name` (Cadena) — El nombre del motor de tabla. +- `supports_settings` (UInt8): marca que indica si el motor de tabla admite `SETTINGS` clausula. 
+- `supports_skipping_indices` (UInt8): marca que indica si el motor de tabla admite [Índices de saltos](table_engines/mergetree/#table_engine-mergetree-data_skipping-indexes). +- `supports_ttl` (UInt8): marca que indica si el motor de tabla admite [TTL](table_engines/mergetree/#table_engine-mergetree-ttl). +- `supports_sort_order` (UInt8): marca que indica si el motor de tablas admite cláusulas `PARTITION_BY`, `PRIMARY_KEY`, `ORDER_BY` y `SAMPLE_BY`. +- `supports_replication` (UInt8): marca que indica si el motor de tabla admite [Replicación de datos](table_engines/replication/). +- `supports_duduplication` (UInt8): marca que indica si el motor de tablas admite la desduplicación de datos. + +Ejemplo: + +``` sql +SELECT * +FROM system.table_engines +WHERE name in ('Kafka', 'MergeTree', 'ReplicatedCollapsingMergeTree') +``` + +``` text +┌─name──────────────────────────┬─supports_settings─┬─supports_skipping_indices─┬─supports_sort_order─┬─supports_ttl─┬─supports_replication─┬─supports_deduplication─┐ +│ Kafka │ 1 │ 0 │ 0 │ 0 │ 0 │ 0 │ +│ MergeTree │ 1 │ 1 │ 1 │ 1 │ 0 │ 0 │ +│ ReplicatedCollapsingMergeTree │ 1 │ 1 │ 1 │ 1 │ 1 │ 1 │ +└───────────────────────────────┴───────────────────┴───────────────────────────┴─────────────────────┴──────────────┴──────────────────────┴────────────────────────┘ +``` + +**Ver también** + +- Familia MergeTree [cláusulas de consulta](table_engines/mergetree.md#mergetree-query-clauses) +- Kafka [configuración](table_engines/kafka.md#table_engine-kafka-creating-a-table) +- Unir [configuración](table_engines/join.md#join-limitations-and-settings) + +## sistema.tabla {#system-tables} + +Contiene metadatos de cada tabla que el servidor conoce. Las tablas separadas no se muestran en `system.tables`. + +Esta tabla contiene las siguientes columnas (el tipo de columna se muestra entre corchetes): + +- `database` (String) — El nombre de la base de datos en la que se encuentra la tabla. +- `name` (Cadena) — Nombre de tabla. +- `engine` (Cadena) — Nombre del motor de tabla (sin parámetros). +- `is_temporary` (UInt8): marca que indica si la tabla es temporal. +- `data_path` (String) - Ruta de acceso a los datos de la tabla en el sistema de archivos. +- `metadata_path` (String) - Ruta de acceso a los metadatos de la tabla en el sistema de archivos. +- `metadata_modification_time` (DateTime) - Hora de la última modificación de los metadatos de la tabla. +- `dependencies_database` (Array(String)) - Dependencias de base de datos. +- `dependencies_table` (Array(String)) - Dependencias de tabla ([Método de codificación de datos:](table_engines/materializedview.md) tablas basadas en la tabla real). +- `create_table_query` (String) - La consulta que se utilizó para crear la tabla. +- `engine_full` (String) - Parámetros del motor de tabla. +- `partition_key` (String) - La expresión de clave de partición especificada en la tabla. +- `sorting_key` (String) - La expresión de clave de ordenación especificada en la tabla. +- `primary_key` (String) - La expresión de clave principal especificada en la tabla. +- `sampling_key` (String) - La expresión de clave de muestreo especificada en la tabla. + +El `system.tables` se utiliza en `SHOW TABLES` implementación de consultas. + +## sistema.Zookeeper {#system-zookeeper} + +La tabla no existe si ZooKeeper no está configurado. Permite leer datos del clúster ZooKeeper definido en la configuración. +La consulta debe tener un ‘path’ condición de igualdad en la cláusula WHERE. 
Este es el camino en ZooKeeper para los niños para los que desea obtener datos. + +Consulta `SELECT * FROM system.zookeeper WHERE path = '/clickhouse'` salidas de datos para todos los niños en el `/clickhouse` Nodo. +Para generar datos para todos los nodos raíz, escriba path = ‘/’. +Si la ruta especificada en ‘path’ no existe, se lanzará una excepción. + +Columna: + +- `name` (String) — El nombre del nodo. +- `path` (String) — La ruta al nodo. +- `value` (Cadena) - el Valor de nodo. +- `dataLength` (Int32) — Tamaño del valor. +- `numChildren` (Int32) — Número de descendientes. +- `czxid` (Int64) — ID de la transacción que creó el nodo. +- `mzxid` (Int64) — ID de la transacción que cambió el nodo por última vez. +- `pzxid` (Int64) — ID de la transacción que eliminó o agregó descendientes por última vez. +- `ctime` (DateTime) — Hora de creación del nodo. +- `mtime` (DateTime) — Hora de la última modificación del nodo. +- `version` (Int32) — Versión del nodo: el número de veces que se cambió el nodo. +- `cversion` (Int32) — Número de descendientes añadidos o eliminados. +- `aversion` (Int32) — Número de cambios en la ACL. +- `ephemeralOwner` (Int64): para nodos efímeros, el ID de la sesión que posee este nodo. + +Ejemplo: + +``` sql +SELECT * +FROM system.zookeeper +WHERE path = '/clickhouse/tables/01-08/visits/replicas' +FORMAT Vertical +``` + +``` text +Row 1: +────── +name: example01-08-1.yandex.ru +value: +czxid: 932998691229 +mzxid: 932998691229 +ctime: 2015-03-27 16:49:51 +mtime: 2015-03-27 16:49:51 +version: 0 +cversion: 47 +aversion: 0 +ephemeralOwner: 0 +dataLength: 0 +numChildren: 7 +pzxid: 987021031383 +path: /clickhouse/tables/01-08/visits/replicas + +Row 2: +────── +name: example01-08-2.yandex.ru +value: +czxid: 933002738135 +mzxid: 933002738135 +ctime: 2015-03-27 16:57:01 +mtime: 2015-03-27 16:57:01 +version: 0 +cversion: 37 +aversion: 0 +ephemeralOwner: 0 +dataLength: 0 +numChildren: 7 +pzxid: 987021252247 +path: /clickhouse/tables/01-08/visits/replicas +``` + +## sistema.mutación {#system-tables-mutations} + +La tabla contiene información sobre [mutación](../query_language/alter.md#alter-mutations) de las tablas MergeTree y su progreso. Cada comando de mutación está representado por una sola fila. La tabla tiene las siguientes columnas: + +**basar**, **tabla** - El nombre de la base de datos y la tabla a la que se aplicó la mutación. + +**mutation\_id** - La identificación de la mutación. Para las tablas replicadas, estos identificadores corresponden a los nombres de znode `/mutations/` Directorio en ZooKeeper. Para las tablas no duplicadas, los ID corresponden a los nombres de archivo en el directorio de datos de la tabla. + +**comando** - La cadena de comandos de mutación (la parte de la consulta después de `ALTER TABLE [db.]table`). + +**create\_time** - Cuando este comando de mutación fue enviado para su ejecución. + +**block\_numbers.partition\_id**, **block\_numbers.número** - Una columna anidada. Para las mutaciones de tablas replicadas, contiene un registro para cada partición: el ID de partición y el número de bloque que fue adquirido por la mutación (en cada partición, solo se mutarán las partes que contienen bloques con números menores que el número de bloque adquirido por la mutación en esa partición). En tablas no replicadas, los números de bloque en todas las particiones forman una sola secuencia. Esto significa que para las mutaciones de tablas no replicadas, la columna contendrá un registro con un solo número de bloque adquirido por la mutación. 
+
+**parts\_to\_do** - El número de partes de datos que deben mutarse para que finalice la mutación.
+
+**is\_done** - ¿Está hecha la mutación? Tenga en cuenta que incluso si `parts_to_do = 0` es posible que aún no se haya realizado una mutación de una tabla replicada debido a un INSERT de larga ejecución que creará una nueva parte de datos que deberá mutarse.
+
+Si hubo problemas con la mutación de algunas partes, las siguientes columnas contienen información adicional:
+
+**latest\_failed\_part** - El nombre de la parte más reciente que no se pudo mutar.
+
+**latest\_fail\_time** - El momento del fallo de mutación de la parte más reciente.
+
+**latest\_fail\_reason** - El mensaje de excepción que causó el fallo de mutación de la parte más reciente.
+
+## sistema.discos {#system-tables-disks}
+
+Contiene información sobre los discos definidos en la [configuración del servidor](table_engines/mergetree.md#table_engine-mergetree-multiple-volumes_configure).
+
+Columna:
+
+- `name` ([Cadena](../data_types/string.md)) — Nombre de un disco en la configuración del servidor.
+- `path` ([Cadena](../data_types/string.md)) — Ruta de acceso al punto de montaje en el sistema de archivos.
+- `free_space` ([UInt64](../data_types/int_uint.md)) — Espacio libre en el disco en bytes.
+- `total_space` ([UInt64](../data_types/int_uint.md)) — Volumen del disco en bytes.
+- `keep_free_space` ([UInt64](../data_types/int_uint.md)) — Cantidad de espacio que debe permanecer libre en el disco, en bytes. Definido en el parámetro `keep_free_space_bytes` de la configuración del disco.
+
+## sistema.almacenamiento\_policies {#system-tables-storage-policies}
+
+Contiene información sobre las directivas de almacenamiento y los volúmenes definidos en la [configuración del servidor](table_engines/mergetree.md#table_engine-mergetree-multiple-volumes_configure).
+
+Columna:
+
+- `policy_name` ([Cadena](../data_types/string.md)) — Nombre de la política de almacenamiento.
+- `volume_name` ([Cadena](../data_types/string.md)) — Nombre de volumen definido en la política de almacenamiento.
+- `volume_priority` ([UInt64](../data_types/int_uint.md)) — Número de orden del volumen en la configuración.
+- `disks` ([Matriz (Cadena)](../data_types/array.md)) — Nombres de disco, definidos en la directiva de almacenamiento.
+- `max_data_part_size` ([UInt64](../data_types/int_uint.md)) — Tamaño máximo de una parte de datos que se puede almacenar en los discos del volumen (0 — sin límite).
+- `move_factor` ([Float64](../data_types/float.md)) — Relación de espacio libre en disco. Cuando la relación excede el valor del parámetro de configuración, ClickHouse comienza a mover los datos al siguiente volumen en orden.
+
+Si la directiva de almacenamiento contiene más de un volumen, la información de cada volumen se almacena en una fila individual de la tabla.
+
+[Artículo Original](https://clickhouse.tech/docs/es/operations/system_tables/)
diff --git a/docs/es/operations/table_engines/aggregatingmergetree.md b/docs/es/operations/table_engines/aggregatingmergetree.md
new file mode 100644
index 00000000000..8cd485831da
--- /dev/null
+++ b/docs/es/operations/table_engines/aggregatingmergetree.md
@@ -0,0 +1,95 @@
+# AgregaciónMergeTree {#aggregatingmergetree}
+
+El motor hereda de [MergeTree](mergetree.md#table_engines-mergetree), alterando la lógica para la fusión de partes de datos.
ClickHouse reemplaza todas las filas con la misma clave principal (o más exactamente, con la misma [clave de clasificación](mergetree.md)) con una sola fila (dentro de una parte de datos) que almacena una combinación de estados de funciones agregadas. + +Usted puede utilizar `AggregatingMergeTree` tablas para la agregación de datos incrementales, incluidas las vistas materializadas agregadas. + +El motor procesa todas las columnas con [AggregateFunction](../../data_types/nested_data_structures/aggregatefunction.md) tipo. + +Es apropiado usar `AggregatingMergeTree` si reduce el número de filas por pedidos. + +## Creación de una tabla {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE = AggregatingMergeTree() +[PARTITION BY expr] +[ORDER BY expr] +[SAMPLE BY expr] +[TTL expr] +[SETTINGS name=value, ...] +``` + +Para obtener una descripción de los parámetros de solicitud, consulte [descripción de la solicitud](../../query_language/create.md). + +**Cláusulas de consulta** + +Al crear un `AggregatingMergeTree` mesa de la misma [clausula](mergetree.md) se requieren, como al crear un `MergeTree` tabla. + +
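+
+A modo de ilustración, un boceto mínimo (los nombres de la tabla y de las columnas son hipotéticos) de una tabla `AggregatingMergeTree` con columnas de tipo `AggregateFunction`:
+
+``` sql
+CREATE TABLE visits_agg
+(
+    CounterID UInt64,
+    StartDate Date,
+    -- estados parciales de las funciones agregadas sum y uniq
+    Visits AggregateFunction(sum, Int8),
+    Users AggregateFunction(uniq, UInt64)
+)
+ENGINE = AggregatingMergeTree()
+PARTITION BY toYYYYMM(StartDate)
+ORDER BY (CounterID, StartDate);
+```
+
+Los estados se escriben con los combinadores `-State` y se leen con los combinadores `-Merge`, como se describe más adelante.
+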
+ +Método obsoleto para crear una tabla + +!!! attention "Atención" + No use este método en proyectos nuevos y, si es posible, cambie los proyectos antiguos al método descrito anteriormente. + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE [=] AggregatingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity) +``` + +Todos los parámetros tienen el mismo significado que en `MergeTree`. +
+ +## SELECCIONAR e INSERTAR {#select-and-insert} + +Para insertar datos, utilice [INSERTAR SELECCIONAR](../../query_language/insert_into.md) Consulta con funciones agregadas -State-. +Al seleccionar datos de `AggregatingMergeTree` mesa, uso `GROUP BY` cláusula y las mismas funciones agregadas que al insertar datos, pero usando `-Merge` sufijo. + +En los resultados de `SELECT` consulta, los valores de `AggregateFunction` tipo tiene representación binaria específica de la implementación para todos los formatos de salida de ClickHouse. Si volcar datos en, por ejemplo, `TabSeparated` Formato con `SELECT` Consulta entonces este volcado se puede cargar de nuevo usando `INSERT` consulta. + +## Ejemplo de una vista materializada agregada {#example-of-an-aggregated-materialized-view} + +`AggregatingMergeTree` vista materializada que mira el `test.visits` tabla: + +``` sql +CREATE MATERIALIZED VIEW test.basic +ENGINE = AggregatingMergeTree() PARTITION BY toYYYYMM(StartDate) ORDER BY (CounterID, StartDate) +AS SELECT + CounterID, + StartDate, + sumState(Sign) AS Visits, + uniqState(UserID) AS Users +FROM test.visits +GROUP BY CounterID, StartDate; +``` + +Insertar datos en el `test.visits` tabla. + +``` sql +INSERT INTO test.visits ... +``` + +Los datos se insertan tanto en la tabla como en la vista `test.basic` que realizará la agregación. + +Para obtener los datos agregados, necesitamos ejecutar una consulta como `SELECT ... GROUP BY ...` de la vista `test.basic`: + +``` sql +SELECT + StartDate, + sumMerge(Visits) AS Visits, + uniqMerge(Users) AS Users +FROM test.basic +GROUP BY StartDate +ORDER BY StartDate; +``` + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/aggregatingmergetree/) diff --git a/docs/es/operations/table_engines/buffer.md b/docs/es/operations/table_engines/buffer.md new file mode 100644 index 00000000000..a1545297ccf --- /dev/null +++ b/docs/es/operations/table_engines/buffer.md @@ -0,0 +1,64 @@ +# Búfer {#buffer} + +Almacena los datos para escribir en la memoria RAM, enjuagándolos periódicamente a otra tabla. Durante la operación de lectura, los datos se leen desde el búfer y la otra tabla simultáneamente. + +``` sql +Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes) +``` + +Parámetros del motor: + +- `database` – Nombre de la base de datos. En lugar del nombre de la base de datos, puede usar una expresión constante que devuelva una cadena. +- `table` – Tabla para eliminar los datos. +- `num_layers` – Capa de paralelismo. Físicamente, la tabla se representará como `num_layers` de búferes independientes. Valor recomendado: 16. +- `min_time`, `max_time`, `min_rows`, `max_rows`, `min_bytes`, y `max_bytes` – Condiciones para el lavado de datos del búfer. + +Los datos se vacían del búfer y se escriben en la tabla de destino si `min*` condiciones o al menos una `max*` condición se cumplen. + +- `min_time`, `max_time` – Condición para el tiempo en segundos desde el momento de la primera escritura en el búfer. +- `min_rows`, `max_rows` – Condición para el número de filas en el búfer. +- `min_bytes`, `max_bytes` – Condición para el número de bytes en el búfer. + +Durante la operación de escritura, los datos se insertan en un `num_layers` número de búferes aleatorios. O bien, si la parte de datos para insertar es lo suficientemente grande (mayor que `max_rows` o `max_bytes`), se escribe directamente en la tabla de destino, omitiendo el búfer. 
+ +Las condiciones para el lavado de los datos se calculan por separado para cada uno de los `num_layers` búfer. Por ejemplo, si `num_layers = 16` y `max_bytes = 100000000`, el consumo máximo de RAM es de 1.6 GB. + +Ejemplo: + +``` sql +CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10, 100, 10000, 1000000, 10000000, 100000000) +``` + +Creación de un ‘merge.hits\_buffer’ mesa con la misma estructura que ‘merge.hits’ y usando el motor Buffer. Al escribir en esta tabla, los datos se almacenan en la memoria RAM y ‘merge.hits’ tabla. Se crean 16 búferes. Los datos de cada uno de ellos se vacían si han pasado 100 segundos o se han escrito un millón de filas o se han escrito 100 MB de datos; o si simultáneamente han pasado 10 segundos y se han escrito 10.000 filas y 10 MB de datos. Por ejemplo, si solo se ha escrito una fila, después de 100 segundos se vaciará, pase lo que pase. Pero si se han escrito muchas filas, los datos se vaciarán antes. + +Cuando se detiene el servidor, con DROP TABLE o DETACH TABLE, los datos del búfer también se vacían a la tabla de destino. + +Puede establecer cadenas vacías entre comillas simples para la base de datos y el nombre de la tabla. Esto indica la ausencia de una tabla de destino. En este caso, cuando se alcanzan las condiciones de descarga de datos, el búfer simplemente se borra. Esto puede ser útil para mantener una ventana de datos en la memoria. + +Al leer desde una tabla de búfer, los datos se procesan tanto desde el búfer como desde la tabla de destino (si hay uno). +Tenga en cuenta que las tablas Buffer no admiten un índice. En otras palabras, los datos del búfer se analizan por completo, lo que puede ser lento para los búferes grandes. (Para los datos de una tabla subordinada, se utilizará el índice que admite.) + +Si el conjunto de columnas de la tabla Buffer no coinciden con el conjunto de columnas de una tabla subordinada, se inserta un subconjunto de columnas que existen en ambas tablas. + +Si los tipos no coinciden con una de las columnas de la tabla Búfer y una tabla subordinada, se escribe un mensaje de error en el registro del servidor y se borra el búfer. +Lo mismo sucede si la tabla subordinada no existe cuando se vacía el búfer. + +Si necesita ejecutar ALTER para una tabla subordinada y la tabla de búfer, se recomienda eliminar primero la tabla de búfer, ejecutar ALTER para la tabla subordinada y, a continuación, crear la tabla de búfer de nuevo. + +Si el servidor se reinicia de forma anormal, se pierden los datos del búfer. + +FINAL y SAMPLE no funcionan correctamente para las tablas Buffer. Estas condiciones se pasan a la tabla de destino, pero no se utilizan para procesar datos en el búfer. Si se requieren estas características, recomendamos usar solo la tabla Buffer para escribir, mientras lee desde la tabla de destino. + +Al agregar datos a un búfer, uno de los búferes está bloqueado. Esto provoca retrasos si se realiza una operación de lectura simultáneamente desde la tabla. + +Los datos que se insertan en una tabla de búfer pueden terminar en la tabla subordinada en un orden diferente y en bloques diferentes. Debido a esto, una tabla Buffer es difícil de usar para escribir en un CollapsingMergeTree correctamente. Para evitar problemas, puede establecer ‘num\_layers’ a 1. + +Si se replica la tabla de destino, se pierden algunas características esperadas de las tablas replicadas al escribir en una tabla de búfer. 
Los cambios aleatorios en el orden de las filas y los tamaños de las partes de datos hacen que la desduplicación de datos deje de funcionar, lo que significa que no es posible tener un ‘exactly once’ escribir en tablas replicadas. + +Debido a estas desventajas, solo podemos recomendar el uso de una tabla Buffer en casos raros. + +Una tabla de búfer se utiliza cuando se reciben demasiados INSERT de un gran número de servidores durante una unidad de tiempo y los datos no se pueden almacenar en búfer antes de la inserción, lo que significa que los INSERT no pueden ejecutarse lo suficientemente rápido. + +Tenga en cuenta que no tiene sentido insertar datos una fila a la vez, incluso para tablas de búfer. Esto solo producirá una velocidad de unos pocos miles de filas por segundo, mientras que la inserción de bloques de datos más grandes puede producir más de un millón de filas por segundo (consulte la sección “Performance”). + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/buffer/) diff --git a/docs/es/operations/table_engines/collapsingmergetree.md b/docs/es/operations/table_engines/collapsingmergetree.md new file mode 100644 index 00000000000..bb19ade6e13 --- /dev/null +++ b/docs/es/operations/table_engines/collapsingmergetree.md @@ -0,0 +1,302 @@ +# ColapsarMergeTree {#table-engine-collapsingmergetree} + +El motor hereda de [Método de codificación de datos:](mergetree.md) y agrega la lógica de las filas que colapsan al algoritmo de fusión de partes de datos. + +`CollapsingMergeTree` elimina de forma asincrónica (colapsa) pares de filas si todos los campos de una clave de ordenación (`ORDER BY`) son equivalentes excepto el campo particular `Sign` que puede tener `1` y `-1` valor. Las filas sin un par se mantienen. Para más detalles, consulte el [Derrumbar](#table_engine-collapsingmergetree-collapsing) sección del documento. + +El motor puede reducir significativamente el volumen de almacenamiento y aumentar la eficiencia de `SELECT` consulta como consecuencia. + +## Creación de una tabla {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE = CollapsingMergeTree(sign) +[PARTITION BY expr] +[ORDER BY expr] +[SAMPLE BY expr] +[SETTINGS name=value, ...] +``` + +Para obtener una descripción de los parámetros de consulta, consulte [descripción de la consulta](../../query_language/create.md). + +**CollapsingMergeTree Parámetros** + +- `sign` — Nombre de la columna con el tipo de fila: `1` es una “state” fila, `-1` es una “cancel” Fila. + + Tipo de datos de columna — `Int8`. + +**Cláusulas de consulta** + +Al crear un `CollapsingMergeTree` mesa, la misma [cláusulas de consulta](mergetree.md#table_engine-mergetree-creating-a-table) se requieren, como al crear un `MergeTree` tabla. + +
+
+Método obsoleto para crear una tabla
+
+!!! attention "Atención"
+    No use este método en proyectos nuevos y, si es posible, cambie los proyectos antiguos al método descrito anteriormente.
+
+``` sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
+(
+    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
+    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
+    ...
+) ENGINE [=] CollapsingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, sign)
+```
+
+Todos los parámetros excepto `sign` tienen el mismo significado que en `MergeTree`.
+
+- `sign` — Nombre de la columna con el tipo de fila: `1` — “state” fila, `-1` — “cancel” fila.
+
+    Tipo de datos de columna — `Int8`.
+
+ +## Derrumbar {#table-engine-collapsingmergetree-collapsing} + +### Datos {#data} + +Considere la situación en la que necesita guardar datos que cambian continuamente para algún objeto. Parece lógico tener una fila para un objeto y actualizarla en cualquier cambio, pero la operación de actualización es costosa y lenta para DBMS porque requiere la reescritura de los datos en el almacenamiento. Si necesita escribir datos rápidamente, la actualización no es aceptable, pero puede escribir los cambios de un objeto secuencialmente de la siguiente manera. + +Utilice la columna en particular `Sign`. Si `Sign = 1` significa que la fila es un estado de un objeto, llamémoslo “state” Fila. Si `Sign = -1` significa la cancelación del estado de un objeto con los mismos atributos, llamémoslo “cancel” Fila. + +Por ejemplo, queremos calcular cuántas páginas revisaron los usuarios en algún sitio y cuánto tiempo estuvieron allí. En algún momento escribimos la siguiente fila con el estado de la actividad del usuario: + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 5 │ 146 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` + +En algún momento después registramos el cambio de actividad del usuario y lo escribimos con las siguientes dos filas. + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 5 │ 146 │ -1 │ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` + +La primera fila cancela el estado anterior del objeto (usuario). Debe copiar los campos clave de ordenación del estado cancelado exceptuando `Sign`. + +La segunda fila contiene el estado actual. + +Como solo necesitamos el último estado de actividad del usuario, las filas + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 5 │ 146 │ 1 │ +│ 4324182021466249494 │ 5 │ 146 │ -1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` + +se puede eliminar colapsando el estado no válido (antiguo) de un objeto. `CollapsingMergeTree` hace esto mientras se fusionan las partes de datos. + +Por qué necesitamos 2 filas para cada cambio leído en el [Algoritmo](#table_engine-collapsingmergetree-collapsing-algorithm) apartado. + +**Propiedades peculiares de tal enfoque** + +1. El programa que escribe los datos debe recordar el estado de un objeto para poder cancelarlo. “Cancel” debe contener copias de los campos de clave de ordenación “state” y lo opuesto `Sign`. Aumenta el tamaño inicial de almacenamiento, pero permite escribir los datos rápidamente. +2. Las matrices de largo crecimiento en columnas reducen la eficiencia del motor debido a la carga para escribir. Los datos más sencillos, mayor será la eficiencia. +3. El `SELECT` Los resultados dependen en gran medida de la consistencia del historial de cambios de objetos. Sea preciso al preparar los datos para insertarlos. Puede obtener resultados impredecibles en datos incoherentes, por ejemplo, valores negativos para métricas no negativas, como la profundidad de la sesión. + +### Algoritmo {#table-engine-collapsingmergetree-collapsing-algorithm} + +Cuando ClickHouse combina partes de datos, cada grupo de filas consecutivas tiene la misma clave de ordenación (`ORDER BY`) se reduce a no más de dos filas, una con `Sign = 1` (“state” fila) y otro con `Sign = -1` (“cancel” fila). En otras palabras, las entradas colapsan. + +Para cada parte de datos resultante, ClickHouse guarda: + +1. 
El primero “cancel” y el último “state” si el número de “state” y “cancel” y la última fila es una “state” Fila. + +2. El último “state” fila, si hay más “state” películas que “cancel” películas. + +3. El primero “cancel” fila, si hay más “cancel” películas que “state” películas. + +4. Ninguna de las filas, en todos los demás casos. + +También cuando hay al menos 2 más “state” películas que “cancel” filas, o al menos 2 más “cancel” películas entonces “state” fila, la fusión continúa, pero ClickHouse trata esta situación como un error lógico y la registra en el registro del servidor. Este error puede producirse si se insertan los mismos datos más de una vez. + +Por lo tanto, el colapso no debe cambiar los resultados del cálculo de las estadísticas. +Los cambios colapsaron gradualmente para que al final solo quedara el último estado de casi todos los objetos. + +El `Sign` se requiere porque el algoritmo de fusión no garantiza que todas las filas con la misma clave de ordenación estarán en la misma parte de datos resultante e incluso en el mismo servidor físico. Proceso de ClickHouse `SELECT` consultas con múltiples hilos, y no puede predecir el orden de las filas en el resultado. La agregación es necesaria si hay una necesidad de obtener completamente “collapsed” datos de `CollapsingMergeTree` tabla. + +Para finalizar el colapso, escriba una consulta con `GROUP BY` cláusula y funciones agregadas que representan el signo. Por ejemplo, para calcular la cantidad, use `sum(Sign)` es lugar de `count()`. Para calcular la suma de algo, use `sum(Sign * x)` es lugar de `sum(x)` y así sucesivamente, y también añadir `HAVING sum(Sign) > 0`. + +Los agregados `count`, `sum` y `avg` podría calcularse de esta manera. El agregado `uniq` podría calcularse si un objeto tiene al menos un estado no colapsado. Los agregados `min` y `max` no se pudo calcular porque `CollapsingMergeTree` no guarda el historial de valores de los estados colapsados. + +Si necesita extraer datos sin agregación (por ejemplo, para comprobar si hay filas presentes cuyos valores más recientes coinciden con ciertas condiciones), puede utilizar el `FINAL` modificador para el `FROM` clausula. Este enfoque es significativamente menos eficiente. + +## Ejemplo de uso {#example-of-use} + +Datos de ejemplo: + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 5 │ 146 │ 1 │ +│ 4324182021466249494 │ 5 │ 146 │ -1 │ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` + +Creación de la tabla: + +``` sql +CREATE TABLE UAct +( + UserID UInt64, + PageViews UInt8, + Duration UInt8, + Sign Int8 +) +ENGINE = CollapsingMergeTree(Sign) +ORDER BY UserID +``` + +Inserción de los datos: + +``` sql +INSERT INTO UAct VALUES (4324182021466249494, 5, 146, 1) +``` + +``` sql +INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1),(4324182021466249494, 6, 185, 1) +``` + +Usamos dos `INSERT` consultas para crear dos partes de datos diferentes. Si insertamos los datos con una consulta, ClickHouse crea una parte de datos y nunca realizará ninguna fusión. 
+ +Obtener los datos: + +``` sql +SELECT * FROM UAct +``` + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 5 │ 146 │ -1 │ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 5 │ 146 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` + +¿Qué vemos y dónde está colapsando? + +Con dos `INSERT` Consultas, hemos creado 2 partes de datos. El `SELECT` la consulta se realizó en 2 hilos, y obtuvimos un orden aleatorio de filas. No se ha producido un colapso porque todavía no se había fusionado las partes de datos. ClickHouse fusiona parte de datos en un momento desconocido que no podemos predecir. + +Por lo tanto, necesitamos agregación: + +``` sql +SELECT + UserID, + sum(PageViews * Sign) AS PageViews, + sum(Duration * Sign) AS Duration +FROM UAct +GROUP BY UserID +HAVING sum(Sign) > 0 +``` + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┐ +│ 4324182021466249494 │ 6 │ 185 │ +└─────────────────────┴───────────┴──────────┘ +``` + +Si no necesitamos agregación y queremos forzar el colapso, podemos usar `FINAL` Modificador para `FROM` clausula. + +``` sql +SELECT * FROM UAct FINAL +``` + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` + +Esta forma de seleccionar los datos es muy ineficiente. No lo use para mesas grandes. + +## Ejemplo de otro enfoque {#example-of-another-approach} + +Datos de ejemplo: + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 5 │ 146 │ 1 │ +│ 4324182021466249494 │ -5 │ -146 │ -1 │ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` + +La idea es que las fusiones tengan en cuenta solo los campos clave. Y en el “Cancel” línea podemos especificar valores negativos que igualan la versión anterior de la fila al sumar sin usar la columna Sign. Para este enfoque, es necesario cambiar el tipo de datos `PageViews`,`Duration` para almacenar valores negativos de UInt8 -\> Int16. 
+ +``` sql +CREATE TABLE UAct +( + UserID UInt64, + PageViews Int16, + Duration Int16, + Sign Int8 +) +ENGINE = CollapsingMergeTree(Sign) +ORDER BY UserID +``` + +Vamos a probar el enfoque: + +``` sql +insert into UAct values(4324182021466249494, 5, 146, 1); +insert into UAct values(4324182021466249494, -5, -146, -1); +insert into UAct values(4324182021466249494, 6, 185, 1); + +select * from UAct final; // avoid using final in production (just for a test or small tables) +``` + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` + +``` sql +SELECT + UserID, + sum(PageViews) AS PageViews, + sum(Duration) AS Duration +FROM UAct +GROUP BY UserID +```text +┌──────────────UserID─┬─PageViews─┬─Duration─┐ +│ 4324182021466249494 │ 6 │ 185 │ +└─────────────────────┴───────────┴──────────┘ +``` + +``` sqk +select count() FROM UAct +``` + +``` text +┌─count()─┐ +│ 3 │ +└─────────┘ +``` + +``` sql +optimize table UAct final; + +select * FROM UAct +``` + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/collapsingmergetree/) diff --git a/docs/es/operations/table_engines/custom_partitioning_key.md b/docs/es/operations/table_engines/custom_partitioning_key.md new file mode 100644 index 00000000000..4d043716540 --- /dev/null +++ b/docs/es/operations/table_engines/custom_partitioning_key.md @@ -0,0 +1,120 @@ +# Clave de partición personalizada {#custom-partitioning-key} + +La partición está disponible para el [Método de codificación de datos:](mergetree.md) mesas familiares (incluyendo [repetición](replication.md) tabla). [Vistas materializadas](materializedview.md) basado en tablas MergeTree soporte de particionamiento, también. + +Una partición es una combinación lógica de registros en una tabla por un criterio especificado. Puede establecer una partición por un criterio arbitrario, como por mes, por día o por tipo de evento. Cada partición se almacena por separado para simplificar las manipulaciones de estos datos. Al acceder a los datos, ClickHouse utiliza el subconjunto más pequeño de particiones posible. + +La partición se especifica en el `PARTITION BY expr` cláusula cuando [Creando una tabla](mergetree.md#table_engine-mergetree-creating-a-table). La clave de partición puede ser cualquier expresión de las columnas de la tabla. Por ejemplo, para especificar la partición por mes, utilice la expresión `toYYYYMM(date_column)`: + +``` sql +CREATE TABLE visits +( + VisitDate Date, + Hour UInt8, + ClientID UUID +) +ENGINE = MergeTree() +PARTITION BY toYYYYMM(VisitDate) +ORDER BY Hour; +``` + +La clave de partición también puede ser una tupla de expresiones (similar a la [Clave primaria](mergetree.md#primary-keys-and-indexes-in-queries)). Por ejemplo: + +``` sql +ENGINE = ReplicatedCollapsingMergeTree('/clickhouse/tables/name', 'replica1', Sign) +PARTITION BY (toMonday(StartDate), EventType) +ORDER BY (CounterID, StartDate, intHash32(UserID)); +``` + +En este ejemplo, establecemos la partición por los tipos de eventos que se produjeron durante la semana actual. + +Al insertar datos nuevos en una tabla, estos datos se almacenan como una parte separada (porción) ordenada por la clave principal. En 10-15 minutos después de insertar, las partes de la misma partición se fusionan en toda la parte. + +!!! 
info "INFO" + Una combinación solo funciona para partes de datos que tienen el mismo valor para la expresión de partición. Esto significa **no debe hacer particiones demasiado granulares** (más de un millar de particiones). De lo contrario, el `SELECT` consulta funciona mal debido a un número excesivamente grande de archivos en el sistema de archivos y descriptores de archivos abiertos. + +Descripción [sistema.parte](../system_tables.md#system_tables-parts) tabla para ver las partes y particiones de la tabla. Por ejemplo, supongamos que tenemos un `visits` tabla con partición por mes. Vamos a realizar el `SELECT` Consulta para el `system.parts` tabla: + +``` sql +SELECT + partition, + name, + active +FROM system.parts +WHERE table = 'visits' +``` + +``` text +┌─partition─┬─name───────────┬─active─┐ +│ 201901 │ 201901_1_3_1 │ 0 │ +│ 201901 │ 201901_1_9_2 │ 1 │ +│ 201901 │ 201901_8_8_0 │ 0 │ +│ 201901 │ 201901_9_9_0 │ 0 │ +│ 201902 │ 201902_4_6_1 │ 1 │ +│ 201902 │ 201902_10_10_0 │ 1 │ +│ 201902 │ 201902_11_11_0 │ 1 │ +└───────────┴────────────────┴────────┘ +``` + +El `partition` columna contiene los nombres de las particiones. Hay dos particiones en este ejemplo: `201901` y `201902`. Puede utilizar este valor de columna para especificar el nombre de partición en [ALTER … PARTITION](#alter_manipulations-with-partitions) consulta. + +El `name` columna contiene los nombres de las partes de datos de partición. Puede utilizar esta columna para especificar el nombre de la pieza [ALTERAR PIEZA DE ADJUNTO](#alter_attach-partition) consulta. + +Vamos a desglosar el nombre de la primera parte: `201901_1_3_1`: + +- `201901` es el nombre de la partición. +- `1` es el número mínimo del bloque de datos. +- `3` es el número máximo del bloque de datos. +- `1` es el nivel de fragmento (la profundidad del árbol de fusión del que se forma). + +!!! info "INFO" + Las partes de las tablas de tipo antiguo tienen el nombre: `20190117_20190123_2_2_0` (fecha mínima - fecha máxima - número de bloque mínimo - número de bloque máximo - nivel). + +El `active` columna muestra el estado de la pieza. `1` está activo; `0` está inactivo. Las partes inactivas son, por ejemplo, las partes de origen que quedan después de fusionarse con una parte más grande. Las partes de datos dañadas también se indican como inactivas. + +Como puede ver en el ejemplo, hay varias partes separadas de la misma partición (por ejemplo, `201901_1_3_1` y `201901_1_9_2`). Esto significa que estas partes aún no están fusionadas. ClickHouse combina las partes insertadas de datos periódicamente, aproximadamente 15 minutos después de la inserción. Además, puede realizar una fusión no programada utilizando el [OPTIMIZAR](../../query_language/misc.md#misc_operations-optimize) consulta. Ejemplo: + +``` sql +OPTIMIZE TABLE visits PARTITION 201902; +``` + +``` text +┌─partition─┬─name───────────┬─active─┐ +│ 201901 │ 201901_1_3_1 │ 0 │ +│ 201901 │ 201901_1_9_2 │ 1 │ +│ 201901 │ 201901_8_8_0 │ 0 │ +│ 201901 │ 201901_9_9_0 │ 0 │ +│ 201902 │ 201902_4_6_1 │ 0 │ +│ 201902 │ 201902_4_11_2 │ 1 │ +│ 201902 │ 201902_10_10_0 │ 0 │ +│ 201902 │ 201902_11_11_0 │ 0 │ +└───────────┴────────────────┴────────┘ +``` + +Las partes inactivas se eliminarán aproximadamente 10 minutos después de la fusión. + +Otra forma de ver un conjunto de partes y particiones es ir al directorio de la tabla: `/var/lib/clickhouse/data///`. 
Por ejemplo: + +``` bash +/var/lib/clickhouse/data/default/visits$ ls -l +total 40 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 201901_1_3_1 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201901_1_9_2 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 15:52 201901_8_8_0 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 15:52 201901_9_9_0 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201902_10_10_0 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201902_11_11_0 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:19 201902_4_11_2 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 12:09 201902_4_6_1 +drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 detached +``` + +Carpeta ‘201901\_1\_1\_0’, ‘201901\_1\_7\_1’ y así sucesivamente son los directorios de las partes. Cada parte se relaciona con una partición correspondiente y contiene datos solo para un mes determinado (la tabla de este ejemplo tiene particiones por mes). + +El `detached` el directorio contiene partes que se separaron de la tabla utilizando el [SEPARAR](#alter_detach-partition) consulta. Las partes dañadas también se mueven a este directorio, en lugar de eliminarse. El servidor no utiliza las piezas del `detached` directorio. Puede agregar, eliminar o modificar los datos de este directorio en cualquier momento; el servidor no lo sabrá hasta que ejecute el [CONECTAR](../../query_language/alter.md#alter_attach-partition) consulta. + +Tenga en cuenta que en el servidor operativo, no puede cambiar manualmente el conjunto de piezas o sus datos en el sistema de archivos, ya que el servidor no lo sabrá. Para tablas no replicadas, puede hacerlo cuando el servidor está detenido, pero no se recomienda. Para tablas replicadas, el conjunto de piezas no se puede cambiar en ningún caso. + +ClickHouse le permite realizar operaciones con las particiones: eliminarlas, copiar de una tabla a otra o crear una copia de seguridad. Consulte la lista de todas las operaciones en la sección [Manipulaciones con particiones y piezas](../../query_language/alter.md#alter_manipulations-with-partitions). + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/custom_partitioning_key/) diff --git a/docs/es/operations/table_engines/dictionary.md b/docs/es/operations/table_engines/dictionary.md new file mode 100644 index 00000000000..5ef899d078e --- /dev/null +++ b/docs/es/operations/table_engines/dictionary.md @@ -0,0 +1,90 @@ +# Diccionario {#dictionary} + +El `Dictionary` el motor muestra el [Diccionario](../../query_language/dicts/external_dicts.md) datos como una tabla ClickHouse. + +Como ejemplo, considere un diccionario de `products` con la siguiente configuración: + +``` xml + + + products + + +
products
+ DSN=some-db-server + + + + 300 + 360 + + + + + + + product_id + + + title + String + + + + + +``` + +Consultar los datos del diccionario: + +``` sql +SELECT + name, + type, + key, + attribute.names, + attribute.types, + bytes_allocated, + element_count, + source +FROM system.dictionaries +WHERE name = 'products' +``` + +``` text +┌─name─────┬─type─┬─key────┬─attribute.names─┬─attribute.types─┬─bytes_allocated─┬─element_count─┬─source──────────┐ +│ products │ Flat │ UInt64 │ ['title'] │ ['String'] │ 23065376 │ 175032 │ ODBC: .products │ +└──────────┴──────┴────────┴─────────────────┴─────────────────┴─────────────────┴───────────────┴─────────────────┘ +``` + +Puede usar el [dictGet\*](../../query_language/functions/ext_dict_functions.md#ext_dict_functions) función para obtener los datos del diccionario en este formato. + +Esta vista no es útil cuando necesita obtener datos sin procesar o cuando `JOIN` operación. Para estos casos, puede usar el `Dictionary` motor, que muestra los datos del diccionario en una tabla. + +Sintaxis: + +``` sql +CREATE TABLE %table_name% (%fields%) engine = Dictionary(%dictionary_name%)` +``` + +Ejemplo de uso: + +``` sql +create table products (product_id UInt64, title String) Engine = Dictionary(products); +``` + + Ok + +Echa un vistazo a lo que hay en la mesa. + +``` sql +select * from products limit 1; +``` + +``` text +┌────product_id─┬─title───────────┐ +│ 152689 │ Some item │ +└───────────────┴─────────────────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/dictionary/) diff --git a/docs/es/operations/table_engines/distributed.md b/docs/es/operations/table_engines/distributed.md new file mode 100644 index 00000000000..a4b20a4e74d --- /dev/null +++ b/docs/es/operations/table_engines/distributed.md @@ -0,0 +1,147 @@ +# Distribuido {#distributed} + +**Las tablas con motor distribuido no almacenan ningún dato por sí mismas**, pero permite el procesamiento de consultas distribuidas en varios servidores. +La lectura se paralela automáticamente. Durante una lectura, se utilizan los índices de tabla en servidores remotos, si los hay. + +El motor distribuido acepta parámetros: + +- el nombre del clúster en el archivo de configuración del servidor + +- el nombre de una base de datos remota + +- el nombre de una tabla remota + +- (opcionalmente) clave de fragmentación + +- nombre de política (opcionalmente), se usará para almacenar archivos temporales para el envío asíncrono + + Ver también: + + - `insert_distributed_sync` configuración + - [Método de codificación de datos:](mergetree.md#table_engine-mergetree-multiple-volumes) para los ejemplos + +Ejemplo: + +``` sql +Distributed(logs, default, hits[, sharding_key[, policy_name]]) +``` + +Los datos se leerán desde todos los servidores ‘logs’ clúster, desde el valor predeterminado.tabla de éxitos ubicada en cada servidor del clúster. +Los datos no solo se leen sino que se procesan parcialmente en los servidores remotos (en la medida en que esto sea posible). +Por ejemplo, para una consulta con GROUP BY, los datos se agregarán en servidores remotos y los estados intermedios de las funciones agregadas se enviarán al servidor solicitante. Luego, los datos se agregarán más. + +En lugar del nombre de la base de datos, puede usar una expresión constante que devuelva una cadena. Por ejemplo: currentDatabase(). + +logs: el nombre del clúster en el archivo de configuración del servidor. 
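+
+A modo de ilustración, una tabla distribuida sobre la tabla local `default.hits` del clúster `logs` podría crearse así (el nombre `hits_all` y la clave de fragmentación `rand()` son solo suposiciones de ejemplo):
+
+``` sql
+-- La tabla distribuida toma la estructura de default.hits y no almacena datos por sí misma
+CREATE TABLE default.hits_all AS default.hits
+ENGINE = Distributed(logs, default, hits, rand());
+
+-- Las lecturas se reparten entre todos los fragmentos del clúster 'logs'
+SELECT count() FROM default.hits_all;
+```
+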
+ +Los clústeres se establecen así: + +``` xml + + + + + 1 + + false + + example01-01-1 + 9000 + + + example01-01-2 + 9000 + + + + 2 + false + + example01-02-1 + 9000 + + + example01-02-2 + 1 + 9440 + + + + +``` + +Aquí se define un clúster con el nombre ‘logs’ que consta de dos fragmentos, cada uno de los cuales contiene dos réplicas. +Los fragmentos se refieren a los servidores que contienen diferentes partes de los datos (para leer todos los datos, debe acceder a todos los fragmentos). +Las réplicas están duplicando servidores (para leer todos los datos, puede acceder a los datos en cualquiera de las réplicas). + +Los nombres de clúster no deben contener puntos. + +Los parámetros `host`, `port`, y opcionalmente `user`, `password`, `secure`, `compression` Se especifican para cada servidor: +- `host` – La dirección del servidor remoto. Puede utilizar el dominio o la dirección IPv4 o IPv6. Si especifica el dominio, el servidor realiza una solicitud DNS cuando se inicia y el resultado se almacena mientras el servidor se esté ejecutando. Si la solicitud DNS falla, el servidor no se inicia. Si cambia el registro DNS, reinicie el servidor. +- `port` – El puerto TCP para la actividad de mensajería (‘tcp\_port’ en la configuración, generalmente establecido en 9000). No lo confundas con http\_port. +- `user` – Nombre del usuario para conectarse a un servidor remoto. Valor predeterminado: predeterminado. Este usuario debe tener acceso para conectarse al servidor especificado. El acceso se configura en los usuarios.archivo xml. Para obtener más información, consulte la sección [Derechos de acceso](../../operations/access_rights.md). +- `password` – La contraseña para conectarse a un servidor remoto (no enmascarado). Valor predeterminado: cadena vacía. +- `secure` - Use ssl para la conexión, por lo general también debe definir `port` ¿Por qué? El servidor debe escuchar en Método de codificación de datos: y tener certificados correctos. +- `compression` - Utilice la compresión de datos. Valor predeterminado: true. + +Al especificar réplicas, se seleccionará una de las réplicas disponibles para cada uno de los fragmentos al leer. Puede configurar el algoritmo para el equilibrio de carga (la preferencia para qué réplica acceder) [load\_balancing](../settings/settings.md#settings-load_balancing) configuración. +Si no se establece la conexión con el servidor, habrá un intento de conectarse con un breve tiempo de espera. Si la conexión falla, se seleccionará la siguiente réplica, y así sucesivamente para todas las réplicas. Si el intento de conexión falló para todas las réplicas, el intento se repetirá de la misma manera, varias veces. +Esto funciona a favor de la resiliencia, pero no proporciona una tolerancia completa a errores: un servidor remoto podría aceptar la conexión, pero podría no funcionar o funcionar mal. + +Puede especificar solo uno de los fragmentos (en este caso, el procesamiento de consultas debe denominarse remoto, en lugar de distribuido) o hasta cualquier número de fragmentos. En cada fragmento, puede especificar entre una y cualquier número de réplicas. Puede especificar un número diferente de réplicas para cada fragmento. + +Puede especificar tantos clústeres como desee en la configuración. + +Para ver los clústeres, utilice el ‘system.clusters’ tabla. + +El motor distribuido permite trabajar con un clúster como un servidor local. 
Sin embargo, el clúster es inextensible: debe escribir su configuración en el archivo de configuración del servidor (mejor aún, para todos los servidores del clúster). + +No hay compatibilidad con las tablas distribuidas que miran a otras tablas distribuidas (excepto en los casos en que una tabla distribuida solo tiene un fragmento). Como alternativa, haga que la tabla Distribuida mire el “final” tabla. + +El motor distribuido requiere escribir clústeres en el archivo de configuración. Los clústeres del archivo de configuración se actualizan sobre la marcha, sin reiniciar el servidor. Si necesita enviar una consulta a un conjunto desconocido de fragmentos y réplicas cada vez, no necesita crear una tabla ‘remote’ función de tabla en su lugar. Vea la sección [Funciones de tabla](../../query_language/table_functions/index.md). + +Hay dos métodos para escribir datos en un clúster: + +Primero, puede definir a qué servidores escribir en qué datos y realizar la escritura directamente en cada fragmento. En otras palabras, realice INSERT en las tablas que la tabla distribuida “looks at”. Esta es la solución más flexible, ya que puede usar cualquier esquema de fragmentación, que podría ser no trivial debido a los requisitos del área temática. Esta es también la solución más óptima ya que los datos se pueden escribir en diferentes fragmentos de forma completamente independiente. + +En segundo lugar, puede realizar INSERT en una tabla distribuida. En este caso, la tabla distribuirá los datos insertados a través de los propios servidores. Para escribir en una tabla distribuida, debe tener un conjunto de claves de fragmentación (el último parámetro). Además, si solo hay un fragmento, la operación de escritura funciona sin especificar la clave de fragmentación, ya que no significa nada en este caso. + +Cada fragmento puede tener un peso definido en el archivo de configuración. Por defecto, el peso es igual a uno. Los datos se distribuyen entre fragmentos en la cantidad proporcional al peso del fragmento. Por ejemplo, si hay dos fragmentos y el primero tiene un peso de 9 mientras que el segundo tiene un peso de 10, el primero se enviará 9 / 19 partes de las filas, y el segundo se enviará 10 / 19. + +Cada fragmento puede tener el ‘internal\_replication’ parámetro definido en el archivo de configuración. + +Si este parámetro se establece en ‘true’, la operación de escritura selecciona la primera réplica en buen estado y escribe datos en ella. Utilice esta alternativa si la tabla Distribuida “looks at” tablas replicadas. En otras palabras, si la tabla donde se escribirán los datos los replicará por sí misma. + +Si se establece en ‘false’ (el valor predeterminado), los datos se escriben en todas las réplicas. En esencia, esto significa que la tabla distribuida replica los datos en sí. Esto es peor que usar tablas replicadas, porque no se verifica la consistencia de las réplicas y, con el tiempo, contendrán datos ligeramente diferentes. + +Para seleccionar el fragmento al que se envía una fila de datos, se analiza la expresión de fragmentación y su resto se toma de dividirlo por el peso total de los fragmentos. La fila se envía al fragmento que corresponde al medio intervalo de los restos de ‘prev\_weight’ Naciones ‘prev\_weights + weight’, donde ‘prev\_weights’ es el peso total de los fragmentos con el número más pequeño, y ‘weight’ es el peso de este fragmento. 
Por ejemplo, si hay dos fragmentos, y el primero tiene un peso de 9 mientras que el segundo tiene un peso de 10, la fila se enviará al primer fragmento para los restos del rango \[0, 9), y al segundo para los restos del rango \[9, 19). + +La expresión de fragmentación puede ser cualquier expresión de constantes y columnas de tabla que devuelva un entero. Por ejemplo, puede usar la expresión ‘rand()’ para la distribución aleatoria de datos, o ‘UserID’ para la distribución por el resto de dividir el ID del usuario (entonces los datos de un solo usuario residirán en un solo fragmento, lo que simplifica la ejecución de IN y JOIN por los usuarios). Si una de las columnas no se distribuye lo suficientemente uniformemente, puede envolverla en una función hash: intHash64(UserID) . + +Un simple recordatorio de la división es una solución limitada para la fragmentación y no siempre es apropiado. Funciona para volúmenes medianos y grandes de datos (docenas de servidores), pero no para volúmenes muy grandes de datos (cientos de servidores o más). En este último caso, use el esquema de fragmentación requerido por el área asunto, en lugar de usar entradas en Tablas distribuidas. + +Las consultas SELECT se envían a todos los fragmentos y funcionan independientemente de cómo se distribuyen los datos entre los fragmentos (se pueden distribuir completamente aleatoriamente). Cuando agrega un nuevo fragmento, no tiene que transferirle los datos antiguos. Puede escribir nuevos datos con un peso más pesado: los datos se distribuirán de manera ligeramente desigual, pero las consultas funcionarán correcta y eficientemente. + +Debería preocuparse por el esquema de fragmentación en los siguientes casos: + +- Se utilizan consultas que requieren unir datos (IN o JOIN) mediante una clave específica. Si esta clave fragmenta datos, puede usar IN local o JOIN en lugar de GLOBAL IN o GLOBAL JOIN, que es mucho más eficiente. +- Se usa una gran cantidad de servidores (cientos o más) con una gran cantidad de consultas pequeñas (consultas de clientes individuales: sitios web, anunciantes o socios). Para que las pequeñas consultas no afecten a todo el clúster, tiene sentido ubicar datos para un solo cliente en un solo fragmento. Alternativamente, como lo hemos hecho en Yandex.Metrica, puede configurar sharding de dos niveles: divida todo el clúster en “layers”, donde una capa puede consistir en varios fragmentos. Los datos de un único cliente se encuentran en una sola capa, pero los fragmentos se pueden agregar a una capa según sea necesario y los datos se distribuyen aleatoriamente dentro de ellos. Las tablas distribuidas se crean para cada capa y se crea una única tabla distribuida compartida para consultas globales. + +Los datos se escriben de forma asíncrona. Cuando se inserta en la tabla, el bloque de datos se acaba de escribir en el sistema de archivos local. Los datos se envían a los servidores remotos en segundo plano tan pronto como sea posible. El período de envío de datos está gestionado por el [Distributed\_directory\_monitor\_sleep\_time\_ms](../settings/settings.md#distributed_directory_monitor_sleep_time_ms) y [Distributed\_directory\_monitor\_max\_sleep\_time\_ms](../settings/settings.md#distributed_directory_monitor_max_sleep_time_ms) configuración. El `Distributed` el motor envía cada archivo con datos insertados por separado, pero puede habilitar el envío por lotes de archivos [distributed\_directory\_monitor\_batch\_inserts](../settings/settings.md#distributed_directory_monitor_batch_inserts) configuración. 
Esta configuración mejora el rendimiento del clúster al utilizar mejor los recursos de red y servidor local. Debe comprobar si los datos se envían correctamente comprobando la lista de archivos (datos en espera de ser enviados) en el directorio de la tabla: `/var/lib/clickhouse/data/database/table/`. + +Si el servidor dejó de existir o tuvo un reinicio aproximado (por ejemplo, después de un error de dispositivo) después de un INSERT en una tabla distribuida, es posible que se pierdan los datos insertados. Si se detecta un elemento de datos dañado en el directorio de la tabla, se transfiere al ‘broken’ subdirectorio y ya no se utiliza. + +Cuando la opción max\_parallel\_replicas está habilitada, el procesamiento de consultas se paralela en todas las réplicas dentro de un solo fragmento. Para obtener más información, consulte la sección [max\_parallel\_replicas](../settings/settings.md#settings-max_parallel_replicas). + +## Columnas virtuales {#virtual-columns} + +- `_shard_num` — Contiene el `shard_num` (de `system.clusters`). Tipo: [UInt32](../../data_types/int_uint.md). + +!!! note "Nota" + Ya [`remote`](../../query_language/table_functions/remote.md)/`cluster` funciones de tabla crean internamente instancia temporal del mismo motor distribuido, `_shard_num` está disponible allí también. + +**Ver también** + +- [Columnas virtuales](index.md#table_engines-virtual_columns) + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/distributed/) diff --git a/docs/es/operations/table_engines/external_data.md b/docs/es/operations/table_engines/external_data.md new file mode 100644 index 00000000000..a93c3bd3566 --- /dev/null +++ b/docs/es/operations/table_engines/external_data.md @@ -0,0 +1,61 @@ +# Datos externos para el procesamiento de consultas {#external-data-for-query-processing} + +ClickHouse permite enviar a un servidor los datos necesarios para procesar una consulta, junto con una consulta SELECT. Estos datos se colocan en una tabla temporal (consulte la sección “Temporary tables”) y se puede utilizar en la consulta (por ejemplo, en operadores IN). + +Por ejemplo, si tiene un archivo de texto con identificadores de usuario importantes, puede cargarlo en el servidor junto con una consulta que utilice la filtración de esta lista. + +Si necesita ejecutar más de una consulta con un gran volumen de datos externos, no utilice esta función. Es mejor cargar los datos a la base de datos con anticipación. + +Los datos externos se pueden cargar mediante el cliente de línea de comandos (en modo no interactivo) o mediante la interfaz HTTP. + +En el cliente de línea de comandos, puede especificar una sección de parámetros en el formato + +``` bash +--external --file=... [--name=...] [--format=...] [--types=...|--structure=...] +``` + +Puede tener varias secciones como esta, para el número de tablas que se transmiten. + +**–externo** – Marca el comienzo de una cláusula. +**–file** – Ruta al archivo con el volcado de tabla, o -, que hace referencia a stdin. +Solo se puede recuperar una sola tabla de stdin. + +Los siguientes parámetros son opcionales: **–nombre**– Nombre de la tabla. Si se omite, se utiliza \_data. +**–formato** – Formato de datos en el archivo. Si se omite, se utiliza TabSeparated. + +Se requiere uno de los siguientes parámetros:**–tipo** – Una lista de tipos de columnas separadas por comas. Por ejemplo: `UInt64,String`. Las columnas se llamarán \_1, \_2, … +**–estructura**– La estructura de la tabla en el formato`UserID UInt64`, `URL String`. 
Definir los nombres y tipos de columna. + +Los archivos especificados en ‘file’ se analizará mediante el formato especificado en ‘format’ utilizando los tipos de datos especificados en ‘types’ o ‘structure’. La mesa será cargado en el servidor y accesibles, como una tabla temporal con el nombre de ‘name’. + +Ejemplos: + +``` bash +$ echo -ne "1\n2\n3\n" | clickhouse-client --query="SELECT count() FROM test.visits WHERE TraficSourceID IN _data" --external --file=- --types=Int8 +849897 +$ cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, count() AS c FROM passwd GROUP BY shell ORDER BY c DESC" --external --file=- --name=passwd --structure='login String, unused String, uid UInt16, gid UInt16, comment String, home String, shell String' +/bin/sh 20 +/bin/false 5 +/bin/bash 4 +/usr/sbin/nologin 1 +/bin/sync 1 +``` + +Cuando se utiliza la interfaz HTTP, los datos externos se pasan en el formato multipart/form-data. Cada tabla se transmite como un archivo separado. El nombre de la tabla se toma del nombre del archivo. El ‘query\_string’ se pasa los parámetros ‘name\_format’, ‘name\_types’, y ‘name\_structure’, donde ‘name’ es el nombre de la tabla a la que corresponden estos parámetros. El significado de los parámetros es el mismo que cuando se usa el cliente de línea de comandos. + +Ejemplo: + +``` bash +$ cat /etc/passwd | sed 's/:/\t/g' > passwd.tsv + +$ curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+count()+AS+c+FROM+passwd+GROUP+BY+shell+ORDER+BY+c+DESC&passwd_structure=login+String,+unused+String,+uid+UInt16,+gid+UInt16,+comment+String,+home+String,+shell+String' +/bin/sh 20 +/bin/false 5 +/bin/bash 4 +/usr/sbin/nologin 1 +/bin/sync 1 +``` + +Para el procesamiento de consultas distribuidas, las tablas temporales se envían a todos los servidores remotos. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/external_data/) diff --git a/docs/es/operations/table_engines/file.md b/docs/es/operations/table_engines/file.md new file mode 100644 index 00000000000..c96709645d7 --- /dev/null +++ b/docs/es/operations/table_engines/file.md @@ -0,0 +1,83 @@ +# File {#table-engines-file} + +El motor de tabla de archivos mantiene los datos en un archivo en uno de los [file +Formato](../../interfaces/formats.md#formats) (TabSeparated, Native, etc.). + +Ejemplos de uso: + +- Exportación de datos de ClickHouse a archivo. +- Convertir datos de un formato a otro. +- Actualización de datos en ClickHouse mediante la edición de un archivo en un disco. + +## Uso en el servidor ClickHouse {#usage-in-clickhouse-server} + +``` sql +File(Format) +``` + +El `Format` parámetro especifica uno de los formatos de archivo disponibles. Realizar +`SELECT` consultas, el formato debe ser compatible para la entrada, y para realizar +`INSERT` consultas – para la salida. Los formatos disponibles se enumeran en el +[Formato](../../interfaces/formats.md#formats) apartado. + +ClickHouse no permite especificar la ruta del sistema de archivos para`File`. Utilizará la carpeta definida por [camino](../server_settings/settings.md) configuración en la configuración del servidor. + +Al crear una tabla usando `File(Format)` crea un subdirectorio vacío en esa carpeta. Cuando los datos se escriben en esa tabla, se colocan en `data.Format` en ese subdirectorio. 
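+
+Por ejemplo, un boceto mínimo para exportar datos a un archivo CSV (la tabla de origen `hits` es hipotética):
+
+``` sql
+-- Con la ruta por defecto, los datos quedan en /var/lib/clickhouse/data/default/hits_export/data.CSV
+CREATE TABLE hits_export (UserID UInt64, URL String) ENGINE = File(CSV);
+
+INSERT INTO hits_export SELECT UserID, URL FROM hits;
+```
+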
+ +Puede crear manualmente esta subcarpeta y archivo en el sistema de archivos del servidor y luego [CONECTAR](../../query_language/misc.md) para mostrar información con el nombre coincidente, para que pueda consultar datos desde ese archivo. + +!!! warning "Advertencia" + Tenga cuidado con esta funcionalidad, ya que ClickHouse no realiza un seguimiento de los cambios externos en dichos archivos. El resultado de las escrituras simultáneas a través de ClickHouse y fuera de ClickHouse no está definido. + +**Ejemplo:** + +**1.** Configurar el `file_engine_table` tabla: + +``` sql +CREATE TABLE file_engine_table (name String, value UInt32) ENGINE=File(TabSeparated) +``` + +Por defecto, ClickHouse creará una carpeta `/var/lib/clickhouse/data/default/file_engine_table`. + +**2.** Crear manualmente `/var/lib/clickhouse/data/default/file_engine_table/data.TabSeparated` contener: + +``` bash +$ cat data.TabSeparated +one 1 +two 2 +``` + +**3.** Consultar los datos: + +``` sql +SELECT * FROM file_engine_table +``` + +``` text +┌─name─┬─value─┐ +│ one │ 1 │ +│ two │ 2 │ +└──────┴───────┘ +``` + +## Uso es Clickhouse-local {#usage-in-clickhouse-local} + +En [Sistema abierto.](../utils/clickhouse-local.md) El motor de archivos acepta la ruta del archivo además de `Format`. Los flujos de entrada / salida predeterminados se pueden especificar utilizando nombres numéricos o legibles por humanos como `0` o `stdin`, `1` o `stdout`. +**Ejemplo:** + +``` bash +$ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin); SELECT a, b FROM table; DROP TABLE table" +``` + +## Detalles de la implementación {#details-of-implementation} + +- Multiple `SELECT` las consultas se pueden realizar simultáneamente, pero `INSERT` las consultas se esperarán entre sí. +- Apoyado la creación de nuevos archivos por `INSERT` consulta. +- Si el archivo existe, `INSERT` añadiría nuevos valores en él. +- No soportado: + - `ALTER` + - `SELECT ... SAMPLE` + - Indice + - Replicación + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/file/) diff --git a/docs/es/operations/table_engines/generate.md b/docs/es/operations/table_engines/generate.md new file mode 100644 index 00000000000..1e48d563af5 --- /dev/null +++ b/docs/es/operations/table_engines/generate.md @@ -0,0 +1,54 @@ +# GenerateRandom {#table-engines-generate} + +El motor de tabla GenerateRandom produce datos aleatorios para el esquema de tabla determinado. + +Ejemplos de uso: + +- Se usa en la prueba para poblar una tabla grande reproducible. +- Generar entrada aleatoria para pruebas de fuzzing. + +## Uso en el servidor ClickHouse {#usage-in-clickhouse-server} + +``` sql +ENGINE = GenerateRandom(random_seed, max_string_length, max_array_length) +``` + +El `max_array_length` y `max_string_length` parámetros especifican la longitud máxima de todos +columnas y cadenas de matriz correspondientemente en los datos generados. + +Generar motor de tabla sólo admite `SELECT` consulta. + +Es compatible con todos [Tipos de datos](../../data_types/index.md) que se pueden almacenar en una tabla excepto `LowCardinality` y `AggregateFunction`. 
+ +**Ejemplo:** + +**1.** Configurar el `generate_engine_table` tabla: + +``` sql +CREATE TABLE generate_engine_table (name String, value UInt32) ENGINE = GenerateRandom(1, 5, 3) +``` + +**2.** Consultar los datos: + +``` sql +SELECT * FROM generate_engine_table LIMIT 3 +``` + +``` text +┌─name─┬──────value─┐ +│ c4xJ │ 1412771199 │ +│ r │ 1791099446 │ +│ 7#$ │ 124312908 │ +└──────┴────────────┘ +``` + +## Detalles de la implementación {#details-of-implementation} + +- No soportado: + - `ALTER` + - `SELECT ... SAMPLE` + - `INSERT` + - Indice + - Replicación + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/generate/) diff --git a/docs/es/operations/table_engines/graphitemergetree.md b/docs/es/operations/table_engines/graphitemergetree.md new file mode 100644 index 00000000000..75c27347328 --- /dev/null +++ b/docs/es/operations/table_engines/graphitemergetree.md @@ -0,0 +1,167 @@ +# GraphiteMergeTree {#graphitemergetree} + +Este motor está diseñado para el adelgazamiento y la agregación / promedio (rollup) [Grafito](http://graphite.readthedocs.io/en/latest/index.html) datos. Puede ser útil para los desarrolladores que desean usar ClickHouse como almacén de datos para Graphite. + +Puede utilizar cualquier motor de tabla ClickHouse para almacenar los datos de grafito si no necesita un paquete acumulativo, pero si necesita un paquete acumulativo, use `GraphiteMergeTree`. El motor reduce el volumen de almacenamiento y aumenta la eficiencia de las consultas de Grafito. + +El motor hereda propiedades de [Método de codificación de datos:](mergetree.md). + +## Creación de una tabla {#creating-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + Path String, + Time DateTime, + Value , + Version + ... +) ENGINE = GraphiteMergeTree(config_section) +[PARTITION BY expr] +[ORDER BY expr] +[SAMPLE BY expr] +[SETTINGS name=value, ...] +``` + +Vea una descripción detallada del [CREAR TABLA](../../query_language/create.md#create-table-query) consulta. + +Una tabla para los datos de grafito debe tener las siguientes columnas para los siguientes datos: + +- Nombre métrico (sensor de grafito). Tipo de datos: `String`. + +- Tiempo de medición de la métrica. Tipo de datos: `DateTime`. + +- Valor de la métrica. Tipo de datos: cualquier numérico. + +- Versión de la métrica. Tipo de datos: cualquier numérico. + + ClickHouse guarda las filas con la versión más alta o la última escrita si las versiones son las mismas. Otras filas se eliminan durante la fusión de partes de datos. + +Los nombres de estas columnas deben establecerse en la configuración acumulativa. + +**GraphiteMergeTree parámetros** + +- `config_section` — Nombre de la sección en el archivo de configuración, donde se establecen las reglas de acumulación. + +**Cláusulas de consulta** + +Al crear un `GraphiteMergeTree` mesa, la misma [clausula](mergetree.md#table_engine-mergetree-creating-a-table) se requieren, como al crear un `MergeTree` tabla. + +
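A modo de ilustración, un boceto mínimo de una definición concreta (el nombre de la tabla es hipotético y se asume que existe una sección `graphite_rollup` en la configuración del servidor):

``` sql
CREATE TABLE graphite_data
(
    Path String,
    Time DateTime,
    Value Float64,
    Version UInt32
) ENGINE = GraphiteMergeTree('graphite_rollup')
PARTITION BY toYYYYMM(Time)
ORDER BY (Path, Time)
```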
**Método obsoleto para crear una tabla**

!!! attention "Atención"
    No use este método en proyectos nuevos y, si es posible, cambie los proyectos antiguos al método descrito anteriormente.

``` sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
(
    EventDate Date,
    Path String,
    Time DateTime,
    Value <value_type>,
    Version <version_type>
    ...
) ENGINE [=] GraphiteMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, config_section)
```

Todos los parámetros excepto `config_section` tienen el mismo significado que en `MergeTree`.

- `config_section` — Nombre de la sección en el archivo de configuración, donde se establecen las reglas de acumulación.
+ +## Configuración acumulativa {#rollup-configuration} + +La configuración del paquete acumulativo está definida por [graphite\_rollup](../server_settings/settings.md#server_settings-graphite_rollup) parámetro en la configuración del servidor. El nombre del parámetro podría ser cualquiera. Puede crear varias configuraciones y usarlas para diferentes tablas. + +Estructura de configuración Rollup: + + required-columns + patterns + +### Columnas requeridas {#required-columns} + +- `path_column_name` — El nombre de la columna que almacena el nombre de la métrica (sensor de grafito). Valor predeterminado: `Path`. +- `time_column_name` — El nombre de la columna que almacena el tiempo de medición de la métrica. Valor predeterminado: `Time`. +- `value_column_name` — El nombre de la columna que almacena el valor de la métrica a la hora establecida en `time_column_name`. Valor predeterminado: `Value`. +- `version_column_name` — El nombre de la columna que almacena la versión de la métrica. Valor predeterminado: `Timestamp`. + +### Patrón {#patterns} + +Estructura del `patterns` apartado: + +``` text +pattern + regexp + function +pattern + regexp + age + precision + ... +pattern + regexp + function + age + precision + ... +pattern + ... +default + function + age + precision + ... +``` + +!!! warning "Atención" + Los patrones deben ser estrictamente ordenados: + + 1. Patterns without `function` or `retention`. + 1. Patterns with both `function` and `retention`. + 1. Pattern `default`. + +Al procesar una fila, ClickHouse comprueba las reglas en el `pattern` apartado. Cada uno de `pattern` (incluir `default` secciones pueden contener `function` parámetro para la agregación, `retention` parámetros o ambos. Si el nombre de la métrica coincide con `regexp`, las reglas de la `pattern` sección (o secciones); de lo contrario, las reglas de la `default` sección se utilizan. + +Campos para `pattern` y `default` apartado: + +- `regexp`– Un patrón para el nombre de la métrica. +- `age` – La edad mínima de los datos en segundos. +- `precision`– Cómo definir con precisión la edad de los datos en segundos. Debe ser un divisor para 86400 (segundos en un día). +- `function` – El nombre de la función de agregación que se aplicará a los datos cuya antigüedad se encuentra dentro del intervalo `[age, age + precision]`. + +### Ejemplo de configuración {#configuration-example} + +``` xml + + Version + + click_cost + any + + 0 + 5 + + + 86400 + 60 + + + + max + + 0 + 60 + + + 3600 + 300 + + + 86400 + 3600 + + + +``` + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/graphitemergetree/) diff --git a/docs/es/operations/table_engines/hdfs.md b/docs/es/operations/table_engines/hdfs.md new file mode 100644 index 00000000000..705764e2e5c --- /dev/null +++ b/docs/es/operations/table_engines/hdfs.md @@ -0,0 +1,116 @@ +# HDFS {#table-engines-hdfs} + +Este motor proporciona integración con [Acerca de nosotros](https://en.wikipedia.org/wiki/Apache_Hadoop) permitiendo gestionar datos sobre [HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)a través de ClickHouse. Este motor es similar +Angeles [File](file.md) y [URL](url.md) motores, pero proporciona características específicas de Hadoop. + +## Uso {#usage} + +``` sql +ENGINE = HDFS(URI, format) +``` + +El `URI` El parámetro es el URI del archivo completo en HDFS. +El `format` parámetro especifica uno de los formatos de archivo disponibles. 
Realizar +`SELECT` consultas, el formato debe ser compatible para la entrada, y para realizar +`INSERT` consultas – para la salida. Los formatos disponibles se enumeran en el +[Formato](../../interfaces/formats.md#formats) apartado. +La parte de la ruta de `URI` puede contener globs. En este caso, la tabla sería de solo lectura. + +**Ejemplo:** + +**1.** Configurar el `hdfs_engine_table` tabla: + +``` sql +CREATE TABLE hdfs_engine_table (name String, value UInt32) ENGINE=HDFS('hdfs://hdfs1:9000/other_storage', 'TSV') +``` + +**2.** Llenar archivo: + +``` sql +INSERT INTO hdfs_engine_table VALUES ('one', 1), ('two', 2), ('three', 3) +``` + +**3.** Consultar los datos: + +``` sql +SELECT * FROM hdfs_engine_table LIMIT 2 +``` + +``` text +┌─name─┬─value─┐ +│ one │ 1 │ +│ two │ 2 │ +└──────┴───────┘ +``` + +## Detalles de implementación {#implementation-details} + +- Las lecturas y escrituras pueden ser paralelas +- No soportado: + - `ALTER` y `SELECT...SAMPLE` operación. + - Índices. + - Replicación. + +**Globs en el camino** + +Múltiples componentes de ruta de acceso pueden tener globs. Para ser procesado, el archivo debe existir y coincidir con todo el patrón de ruta. Listado de archivos determina durante `SELECT` (no en `CREATE` momento). + +- `*` — Sustituye cualquier número de caracteres excepto `/` incluyendo cadena vacía. +- `?` — Sustituye a cualquier carácter individual. +- `{some_string,another_string,yet_another_one}` — Sustituye cualquiera de las cadenas `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Sustituye cualquier número en el intervalo de N a M, incluidas ambas fronteras. + +Construcciones con `{}` hijo similares a la [remoto](../../query_language/table_functions/remote.md) función de la tabla. + +**Ejemplo** + +1. Supongamos que tenemos varios archivos en formato TSV con los siguientes URI en HDFS: + +- ‘hdfs://hdfs1:9000/some\_dir/some\_file\_1’ +- ‘hdfs://hdfs1:9000/some\_dir/some\_file\_2’ +- ‘hdfs://hdfs1:9000/some\_dir/some\_file\_3’ +- ‘hdfs://hdfs1:9000/another\_dir/some\_file\_1’ +- ‘hdfs://hdfs1:9000/another\_dir/some\_file\_2’ +- ‘hdfs://hdfs1:9000/another\_dir/some\_file\_3’ + +1. Hay varias maneras de hacer una tabla que consta de los seis archivos: + + + +``` sql +CREATE TABLE table_with_range (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV') +``` + +Otra forma: + +``` sql +CREATE TABLE table_with_question_mark (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_?', 'TSV') +``` + +La tabla consta de todos los archivos en ambos directorios (todos los archivos deben satisfacer el formato y el esquema descritos en la consulta): + +``` sql +CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV') +``` + +!!! warning "Advertencia" + Si la lista de archivos contiene rangos de números con ceros a la izquierda, use la construcción con llaves para cada dígito por separado o use `?`. + +**Ejemplo** + +Crear tabla con archivos llamados `file000`, `file001`, … , `file999`: + +``` sql +CREARE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV') +``` + +## Columnas virtuales {#virtual-columns} + +- `_path` — Ruta de acceso al archivo. +- `_file` — Nombre del expediente. 
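Por ejemplo, un boceto de cómo consultar estas columnas virtuales sobre la tabla `hdfs_engine_table` creada más arriba (hay que nombrarlas explícitamente, ya que `SELECT *` no las devuelve):

``` sql
-- Las columnas virtuales se piden por su nombre junto a las columnas normales
SELECT name, value, _file, _path
FROM hdfs_engine_table
```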
+ +**Ver también** + +- [Columnas virtuales](https://clickhouse.tech/docs/es/operations/table_engines/#table_engines-virtual_columns) + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/hdfs/) diff --git a/docs/es/operations/table_engines/index.md b/docs/es/operations/table_engines/index.md new file mode 100644 index 00000000000..823a7f97373 --- /dev/null +++ b/docs/es/operations/table_engines/index.md @@ -0,0 +1,77 @@ +# Motores de mesa {#table-engines} + +El motor de tabla (tipo de tabla) determina: + +- Cómo y dónde se almacenan los datos, dónde escribirlos y dónde leerlos. +- Qué consultas son compatibles y cómo. +- Acceso a datos simultáneos. +- Uso de índices, si está presente. +- Si es posible la ejecución de solicitudes multiproceso. +- Parámetros de replicación de datos. + +## Familias de motores {#engine-families} + +### Método de codificación de datos: {#mergetree} + +Los motores de mesa más universales y funcionales para tareas de alta carga. La propiedad compartida por estos motores es la inserción rápida de datos con el posterior procesamiento de datos en segundo plano. `MergeTree` Los motores familiares admiten la replicación de datos (con [Replicado\*](replication.md) versiones de motores), particionamiento y otras características no admitidas en otros motores. + +Motores en la familia: + +- [Método de codificación de datos:](mergetree.md) +- [ReplacingMergeTree](replacingmergetree.md) +- [SummingMergeTree](summingmergetree.md) +- [AgregaciónMergeTree](aggregatingmergetree.md) +- [ColapsarMergeTree](collapsingmergetree.md) +- [VersionedCollapsingMergeTree](versionedcollapsingmergetree.md) +- [GraphiteMergeTree](graphitemergetree.md) + +### Registro {#log} + +Ligero [motor](log_family.md) con funcionalidad mínima. Son los más efectivos cuando necesita escribir rápidamente muchas tablas pequeñas (hasta aproximadamente 1 millón de filas) y leerlas más tarde en su conjunto. + +Motores en la familia: + +- [TinyLog](tinylog.md) +- [StripeLog](stripelog.md) +- [Registro](log.md) + +### Motores de intergación {#intergation-engines} + +Motores para comunicarse con otros sistemas de almacenamiento y procesamiento de datos. + +Motores en la familia: + +- [Kafka](kafka.md) +- [MySQL](mysql.md) +- [ODBC](odbc.md) +- [JDBC](jdbc.md) +- [HDFS](hdfs.md) + +### Motores especiales {#special-engines} + +Motores en la familia: + +- [Distribuido](distributed.md) +- [Método de codificación de datos:](materializedview.md) +- [Diccionario](dictionary.md) +- [Fusionar](merge.md) +- [File](file.md) +- [Nulo](null.md) +- [Establecer](set.md) +- [Unir](join.md) +- [URL](url.md) +- [Vista](view.md) +- [Memoria](memory.md) +- [Búfer](buffer.md) + +## Columnas virtuales {#table-engines-virtual-columns} + +La columna virtual es un atributo de motor de tabla integral que se define en el código fuente del motor. + +No debe especificar columnas virtuales en el `CREATE TABLE` consulta y no se puede ver en `SHOW CREATE TABLE` y `DESCRIBE TABLE` resultados de la consulta. Las columnas virtuales también son de solo lectura, por lo que no puede insertar datos en columnas virtuales. + +Para seleccionar datos de una columna virtual, debe especificar su nombre en el `SELECT` consulta. `SELECT *` no devuelve valores de columnas virtuales. + +Si crea una tabla con una columna que tiene el mismo nombre que una de las columnas virtuales de la tabla, la columna virtual se vuelve inaccesible. No recomendamos hacer esto. 
Para ayudar a evitar conflictos, los nombres de columna virtual suelen tener el prefijo de un guión bajo. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/) diff --git a/docs/es/operations/table_engines/jdbc.md b/docs/es/operations/table_engines/jdbc.md new file mode 100644 index 00000000000..eacefe25701 --- /dev/null +++ b/docs/es/operations/table_engines/jdbc.md @@ -0,0 +1,83 @@ +# JDBC {#table-engine-jdbc} + +Permite que ClickHouse se conecte a bases de datos externas a través de [JDBC](https://en.wikipedia.org/wiki/Java_Database_Connectivity). + +Para implementar la conexión JDBC, ClickHouse utiliza el programa independiente [Sistema abierto.](https://github.com/alex-krash/clickhouse-jdbc-bridge) que debería ejecutarse como un demonio. + +Este motor soporta el [NULO](../../data_types/nullable.md) tipo de datos. + +## Creación de una tabla {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name +( + columns list... +) +ENGINE = JDBC(dbms_uri, external_database, external_table) +``` + +**Parámetros del motor** + +- `dbms_uri` — URI de un DBMS externo. + + Formato: `jdbc:://:/?user=&password=`. + Ejemplo para MySQL: `jdbc:mysql://localhost:3306/?user=root&password=root`. + +- `external_database` — Base de datos en un DBMS externo. + +- `external_table` — Nombre de la tabla en `external_database`. + +## Ejemplo de uso {#usage-example} + +Creación de una tabla en el servidor MySQL mediante la conexión directa con su cliente de consola: + +``` text +mysql> CREATE TABLE `test`.`test` ( + -> `int_id` INT NOT NULL AUTO_INCREMENT, + -> `int_nullable` INT NULL DEFAULT NULL, + -> `float` FLOAT NOT NULL, + -> `float_nullable` FLOAT NULL DEFAULT NULL, + -> PRIMARY KEY (`int_id`)); +Query OK, 0 rows affected (0,09 sec) + +mysql> insert into test (`int_id`, `float`) VALUES (1,2); +Query OK, 1 row affected (0,00 sec) + +mysql> select * from test; ++--------+--------------+-------+----------------+ +| int_id | int_nullable | float | float_nullable | ++--------+--------------+-------+----------------+ +| 1 | NULL | 2 | NULL | ++--------+--------------+-------+----------------+ +1 row in set (0,00 sec) +``` + +Creación de una tabla en el servidor ClickHouse y selección de datos de ella: + +``` sql +CREATE TABLE jdbc_table +( + `int_id` Int32, + `int_nullable` Nullable(Int32), + `float` Float32, + `float_nullable` Nullable(Float32) +) +ENGINE JDBC('jdbc:mysql://localhost:3306/?user=root&password=root', 'test', 'test') +``` + +``` sql +SELECT * +FROM jdbc_table +``` + +``` text +┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ +│ 1 │ ᴺᵁᴸᴸ │ 2 │ ᴺᵁᴸᴸ │ +└────────┴──────────────┴───────┴────────────────┘ +``` + +## Ver también {#see-also} + +- [Función de la tabla de JDBC](../../query_language/table_functions/jdbc.md). + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/jdbc/) diff --git a/docs/es/operations/table_engines/join.md b/docs/es/operations/table_engines/join.md new file mode 100644 index 00000000000..4a21e7f8d6c --- /dev/null +++ b/docs/es/operations/table_engines/join.md @@ -0,0 +1,104 @@ +# Unir {#join} + +Estructura de datos preparada para usar en [UNIR](../../query_language/select.md#select-join) operación. 
+ +## Creación de una tabla {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], +) ENGINE = Join(join_strictness, join_type, k1[, k2, ...]) +``` + +Vea la descripción detallada del [CREAR TABLA](../../query_language/create.md#create-table-query) consulta. + +**Parámetros del motor** + +- `join_strictness` – [ÚNETE a la rigurosidad](../../query_language/select.md#select-join-strictness). +- `join_type` – [Tipo de unión](../../query_language/select.md#select-join-types). +- `k1[, k2, ...]` – Columnas clave de la `USING` cláusula que el `JOIN` operación se hace con. + +Entrar `join_strictness` y `join_type` parámetros sin comillas, por ejemplo, `Join(ANY, LEFT, col1)`. Deben coincidir con el `JOIN` operación para la que se utilizará la tabla. Si los parámetros no coinciden, ClickHouse no produce una excepción y puede devolver datos incorrectos. + +## Uso de la tabla {#table-usage} + +### Ejemplo {#example} + +Creación de la tabla del lado izquierdo: + +``` sql +CREATE TABLE id_val(`id` UInt32, `val` UInt32) ENGINE = TinyLog +``` + +``` sql +INSERT INTO id_val VALUES (1,11)(2,12)(3,13) +``` + +Creando el lado derecho `Join` tabla: + +``` sql +CREATE TABLE id_val_join(`id` UInt32, `val` UInt8) ENGINE = Join(ANY, LEFT, id) +``` + +``` sql +INSERT INTO id_val_join VALUES (1,21)(1,22)(3,23) +``` + +Unirse a las tablas: + +``` sql +SELECT * FROM id_val ANY LEFT JOIN id_val_join USING (id) SETTINGS join_use_nulls = 1 +``` + +``` text +┌─id─┬─val─┬─id_val_join.val─┐ +│ 1 │ 11 │ 21 │ +│ 2 │ 12 │ ᴺᵁᴸᴸ │ +│ 3 │ 13 │ 23 │ +└────┴─────┴─────────────────┘ +``` + +Como alternativa, puede recuperar datos del `Join` tabla, especificando el valor de la clave de unión: + +``` sql +SELECT joinGet('id_val_join', 'val', toUInt32(1)) +``` + +``` text +┌─joinGet('id_val_join', 'val', toUInt32(1))─┐ +│ 21 │ +└────────────────────────────────────────────┘ +``` + +### Selección e inserción de datos {#selecting-and-inserting-data} + +Usted puede utilizar `INSERT` Consultas para agregar datos al `Join`-mesas de motor. Si la tabla se creó con el `ANY` estricta, se ignoran los datos de las claves duplicadas. Con el `ALL` estricta, se agregan todas las filas. + +No se puede realizar una `SELECT` consulta directamente desde la tabla. En su lugar, use uno de los siguientes métodos: + +- Coloque la mesa hacia el lado derecho en un `JOIN` clausula. +- Llame al [joinGet](../../query_language/functions/other_functions.md#joinget) función, que le permite extraer datos de la tabla de la misma manera que de un diccionario. + +### Limitaciones y ajustes {#join-limitations-and-settings} + +Al crear una tabla, se aplican los siguientes valores: + +- [Sistema abierto.](../settings/settings.md#join_use_nulls) +- [Método de codificación de datos:](../settings/query_complexity.md#settings-max_rows_in_join) +- [Método de codificación de datos:](../settings/query_complexity.md#settings-max_bytes_in_join) +- [join\_overflow\_mode](../settings/query_complexity.md#settings-join_overflow_mode) +- [join\_any\_take\_last\_row](../settings/settings.md#settings-join_any_take_last_row) + +El `Join`-las tablas del motor no se pueden utilizar adentro `GLOBAL JOIN` operación. + +El `Join`-motor permite el uso [Sistema abierto.](../settings/settings.md#join_use_nulls) ajuste en el `CREATE TABLE` instrucción. 
Y [SELECCIONAR](../../query_language/select.md) Consulta permite el uso `join_use_nulls` demasiado. Si tienes diferentes `join_use_nulls` configuración, puede obtener un error al unirse a la tabla. Depende del tipo de JOIN. Cuando se utiliza [joinGet](../../query_language/functions/other_functions.md#joinget) función, usted tiene que utilizar el mismo `join_use_nulls` ajuste en `CRATE TABLE` y `SELECT` instrucción. + +## Almacenamiento de datos {#data-storage} + +`Join` datos de la tabla siempre se encuentra en la memoria RAM. Al insertar filas en una tabla, ClickHouse escribe bloques de datos en el directorio del disco para que puedan restaurarse cuando se reinicie el servidor. + +Si el servidor se reinicia incorrectamente, el bloque de datos en el disco puede perderse o dañarse. En este caso, es posible que deba eliminar manualmente el archivo con datos dañados. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/join/) diff --git a/docs/es/operations/table_engines/kafka.md b/docs/es/operations/table_engines/kafka.md new file mode 100644 index 00000000000..68db3d1961e --- /dev/null +++ b/docs/es/operations/table_engines/kafka.md @@ -0,0 +1,169 @@ +# Kafka {#kafka} + +Este motor funciona con [Acerca de nosotros](http://kafka.apache.org/). + +Kafka te permite: + +- Publicar o suscribirse a flujos de datos. +- Organice el almacenamiento tolerante a fallos. +- Secuencias de proceso a medida que estén disponibles. + +## Creación de una tabla {#table-engine-kafka-creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE = Kafka() +SETTINGS + kafka_broker_list = 'host:port', + kafka_topic_list = 'topic1,topic2,...', + kafka_group_name = 'group_name', + kafka_format = 'data_format'[,] + [kafka_row_delimiter = 'delimiter_symbol',] + [kafka_schema = '',] + [kafka_num_consumers = N,] + [kafka_skip_broken_messages = N] +``` + +Parámetros requeridos: + +- `kafka_broker_list` – Una lista separada por comas de corredores (por ejemplo, `localhost:9092`). +- `kafka_topic_list` – Una lista de temas Kafka. +- `kafka_group_name` – Un grupo de consumidores Kafka. Los márgenes de lectura se rastrean para cada grupo por separado. Si no desea que los mensajes se dupliquen en el clúster, utilice el mismo nombre de grupo en todas partes. +- `kafka_format` – Formato de mensaje. Utiliza la misma notación que el SQL `FORMAT` función, tal como `JSONEachRow`. Para obtener más información, consulte [Formato](../../interfaces/formats.md) apartado. + +Parámetros opcionales: + +- `kafka_row_delimiter` – Carácter delimitador, que termina el mensaje. +- `kafka_schema` – Parámetro que debe utilizarse si el formato requiere una definición de esquema. Por ejemplo, [Cap'n Proto](https://capnproto.org/) requiere la ruta de acceso al archivo de esquema y el nombre de la raíz `schema.capnp:Message` objeto. +- `kafka_num_consumers` – El número de consumidores por tabla. Predeterminado: `1`. Especifique más consumidores si el rendimiento de un consumidor es insuficiente. El número total de consumidores no debe exceder el número de particiones en el tema, ya que solo se puede asignar un consumidor por partición. +- `kafka_skip_broken_messages` – Tolerancia analizador de mensajes Kafka a los mensajes incompatibles con el esquema por bloque. Predeterminado: `0`. 
Si `kafka_skip_broken_messages = N`, el motor omite *N* mensajes de Kafka que no se pueden analizar (un mensaje es igual a una fila de datos).

Ejemplos:

``` sql
  CREATE TABLE queue (
    timestamp UInt64,
    level String,
    message String
  ) ENGINE = Kafka('localhost:9092', 'topic', 'group1', 'JSONEachRow');

  SELECT * FROM queue LIMIT 5;

  CREATE TABLE queue2 (
    timestamp UInt64,
    level String,
    message String
  ) ENGINE = Kafka SETTINGS kafka_broker_list = 'localhost:9092',
                            kafka_topic_list = 'topic',
                            kafka_group_name = 'group1',
                            kafka_format = 'JSONEachRow',
                            kafka_num_consumers = 4;

  CREATE TABLE queue2 (
    timestamp UInt64,
    level String,
    message String
  ) ENGINE = Kafka('localhost:9092', 'topic', 'group1')
              SETTINGS kafka_format = 'JSONEachRow',
                       kafka_num_consumers = 4;
```
**Método obsoleto para crear una tabla**

!!! attention "Atención"
    No utilice este método en nuevos proyectos. Si es posible, cambie los proyectos antiguos al método descrito anteriormente.

``` sql
Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format
      [, kafka_row_delimiter, kafka_schema, kafka_num_consumers, kafka_skip_broken_messages])
```
+ +## Descripción {#description} + +Los mensajes entregados se realizan un seguimiento automático, por lo que cada mensaje de un grupo solo se cuenta una vez. Si desea obtener los datos dos veces, cree una copia de la tabla con otro nombre de grupo. + +Los grupos son flexibles y se sincronizan en el clúster. Por ejemplo, si tiene 10 temas y 5 copias de una tabla en un clúster, cada copia obtiene 2 temas. Si el número de copias cambia, los temas se redistribuyen automáticamente entre las copias. Lea más sobre esto en http://kafka.apache.org/intro . + +`SELECT` no es particularmente útil para leer mensajes (excepto para la depuración), ya que cada mensaje se puede leer solo una vez. Es más práctico crear subprocesos en tiempo real utilizando vistas materializadas. Para hacer esto: + +1. Use el motor para crear un consumidor de Kafka y considérelo como un flujo de datos. +2. Crea una tabla con la estructura deseada. +3. Cree una vista materializada que convierta los datos del motor y los coloque en una tabla creada previamente. + +Cuando el `MATERIALIZED VIEW` se une al motor, comienza a recopilar datos en segundo plano. Esto le permite recibir continuamente mensajes de Kafka y convertirlos al formato requerido usando `SELECT`. +Una tabla kafka puede tener tantas vistas materializadas como desee, no leen datos de la tabla kafka directamente, sino que reciben nuevos registros (en bloques), de esta manera puede escribir en varias tablas con diferentes niveles de detalle (con agrupación - agregación y sin). + +Ejemplo: + +``` sql + CREATE TABLE queue ( + timestamp UInt64, + level String, + message String + ) ENGINE = Kafka('localhost:9092', 'topic', 'group1', 'JSONEachRow'); + + CREATE TABLE daily ( + day Date, + level String, + total UInt64 + ) ENGINE = SummingMergeTree(day, (day, level), 8192); + + CREATE MATERIALIZED VIEW consumer TO daily + AS SELECT toDate(toDateTime(timestamp)) AS day, level, count() as total + FROM queue GROUP BY day, level; + + SELECT level, sum(total) FROM daily GROUP BY level; +``` + +Para mejorar el rendimiento, los mensajes recibidos se agrupan en bloques del tamaño de [Max\_insert\_block\_size](../settings/settings.md#settings-max_insert_block_size). Si el bloque no se formó dentro [Nombre de la red inalámbrica (SSID):](../settings/settings.md) milisegundos, los datos se vaciarán a la tabla independientemente de la integridad del bloque. + +Para detener la recepción de datos de tema o cambiar la lógica de conversión, desconecte la vista materializada: + +``` sql + DETACH TABLE consumer; + ATTACH MATERIALIZED VIEW consumer; +``` + +Si desea cambiar la tabla de destino utilizando `ALTER`, recomendamos deshabilitar la vista de material para evitar discrepancias entre la tabla de destino y los datos de la vista. + +## Configuración {#configuration} + +Similar a GraphiteMergeTree, el motor Kafka admite una configuración extendida utilizando el archivo de configuración ClickHouse. Hay dos claves de configuración que puede usar: global (`kafka`) y a nivel de tema (`kafka_*`). La configuración global se aplica primero y, a continuación, se aplica la configuración de nivel de tema (si existe). + +``` xml + + + cgrp + smallest + + + + + 250 + 100000 + +``` + +Para obtener una lista de posibles opciones de configuración, consulte [referencia de configuración librdkafka](https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md). Usa el guión bajo (`_`) en lugar de un punto en la configuración de ClickHouse. Por ejemplo, `check.crcs=true` será `true`. 
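Es decir, una opción de librdkafka como `check.crcs=true` se expresaría en la configuración de ClickHouse aproximadamente así (fragmento orientativo):

``` xml
<kafka>
    <!-- equivale a check.crcs=true de librdkafka -->
    <check_crcs>true</check_crcs>
</kafka>
```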
+ +## Columnas virtuales {#virtual-columns} + +- `_topic` — El tema de Kafka. +- `_key` — Clave del mensaje. +- `_offset` — Desplazamiento del mensaje. +- `_timestamp` — Marca de tiempo del mensaje. +- `_partition` — Partición de Kafka tema. + +**Ver también** + +- [Columnas virtuales](index.md#table_engines-virtual_columns) + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/kafka/) diff --git a/docs/es/operations/table_engines/log.md b/docs/es/operations/table_engines/log.md new file mode 100644 index 00000000000..8e029f45408 --- /dev/null +++ b/docs/es/operations/table_engines/log.md @@ -0,0 +1,9 @@ +# Registro {#log} + +El motor pertenece a la familia de motores de registro. Consulte las propiedades comunes de los motores de registro y sus diferencias en [Familia del motor de registro](log_family.md) artículo. + +El registro difiere de [TinyLog](tinylog.md) en que un pequeño archivo de “marks” reside con los archivos de columna. Estas marcas se escriben en cada bloque de datos y contienen compensaciones que indican dónde comenzar a leer el archivo para omitir el número especificado de filas. Esto hace posible leer datos de tabla en múltiples hilos. +Para el acceso a datos simultáneos, las operaciones de lectura se pueden realizar simultáneamente, mientras que las operaciones de escritura bloquean las lecturas entre sí. +El motor de registro no admite índices. Del mismo modo, si la escritura en una tabla falla, la tabla se rompe y la lectura de ella devuelve un error. El motor de registro es adecuado para datos temporales, tablas de escritura única y para fines de prueba o demostración. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/log/) diff --git a/docs/es/operations/table_engines/log_family.md b/docs/es/operations/table_engines/log_family.md new file mode 100644 index 00000000000..0f9d77ee062 --- /dev/null +++ b/docs/es/operations/table_engines/log_family.md @@ -0,0 +1,39 @@ +# Familia del motor de registro {#log-engine-family} + +Estos motores fueron desarrollados para escenarios en los que necesita escribir rápidamente muchas tablas pequeñas (hasta aproximadamente 1 millón de filas) y leerlas más tarde en su conjunto. + +Motores de la familia: + +- [StripeLog](stripelog.md) +- [Registro](log.md) +- [TinyLog](tinylog.md) + +## Propiedades comunes {#common-properties} + +Motor: + +- Almacenar datos en un disco. + +- Agregue datos al final del archivo al escribir. + +- Bloqueos de soporte para el acceso a datos simultáneos. + + Durante `INSERT` consultas, la tabla está bloqueada y otras consultas para leer y escribir datos esperan a que la tabla se desbloquee. Si no hay consultas de escritura de datos, se puede realizar cualquier número de consultas de lectura de datos simultáneamente. + +- No apoyo [mutación](../../query_language/alter.md#alter-mutations) operación. + +- No admite índices. + + Esto significa que `SELECT` las consultas para rangos de datos no son eficientes. + +- No escriba datos atómicamente. + + Puede obtener una tabla con datos dañados si algo rompe la operación de escritura, por ejemplo, un cierre anormal del servidor. + +## Diferencia {#differences} + +El `TinyLog` es el más simple de la familia y proporciona la funcionalidad más pobre y la eficiencia más baja. El `TinyLog` el motor no admite la lectura de datos paralelos por varios subprocesos. 
Lee datos más lentamente que otros motores de la familia que admiten lectura paralela y utiliza casi tantos descriptores como los `Log` motor porque almacena cada columna en un archivo separado. Úselo en escenarios simples de baja carga. + +El `Log` y `StripeLog` Los motores admiten lectura de datos paralela. Al leer datos, ClickHouse usa múltiples hilos. Cada subproceso procesa un bloque de datos separado. El `Log` utiliza un archivo separado para cada columna de la tabla. `StripeLog` almacena todos los datos en un archivo. Como resultado, el `StripeLog` el motor utiliza menos descriptores en el sistema operativo, pero el `Log` proporciona una mayor eficiencia al leer datos. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/log_family/) diff --git a/docs/es/operations/table_engines/materializedview.md b/docs/es/operations/table_engines/materializedview.md new file mode 100644 index 00000000000..4e12075c542 --- /dev/null +++ b/docs/es/operations/table_engines/materializedview.md @@ -0,0 +1,5 @@ +# Método de codificación de datos: {#materializedview} + +Se utiliza para implementar vistas materializadas (para obtener más información, consulte [CREAR TABLA](../../query_language/create.md)). Para almacenar datos, utiliza un motor diferente que se especificó al crear la vista. Al leer desde una tabla, solo usa este motor. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/materializedview/) diff --git a/docs/es/operations/table_engines/memory.md b/docs/es/operations/table_engines/memory.md new file mode 100644 index 00000000000..333d74ccd9b --- /dev/null +++ b/docs/es/operations/table_engines/memory.md @@ -0,0 +1,12 @@ +# Memoria {#memory} + +El motor de memoria almacena datos en RAM, en forma sin comprimir. Los datos se almacenan exactamente en la misma forma en que se reciben cuando se leen. En otras palabras, la lectura de esta tabla es completamente gratuita. +El acceso a los datos simultáneos está sincronizado. Los bloqueos son cortos: las operaciones de lectura y escritura no se bloquean entre sí. +Los índices no son compatibles. La lectura está paralelizada. +La productividad máxima (más de 10 GB/s) se alcanza en consultas simples, porque no hay lectura del disco, descomprimir o deserializar datos. (Cabe señalar que, en muchos casos, la productividad del motor MergeTree es casi tan alta.) +Al reiniciar un servidor, los datos desaparecen de la tabla y la tabla queda vacía. +Normalmente, el uso de este motor de tabla no está justificado. Sin embargo, se puede usar para pruebas y para tareas donde se requiere la velocidad máxima en un número relativamente pequeño de filas (hasta aproximadamente 100,000,000). + +El sistema utiliza el motor de memoria para tablas temporales con datos de consulta externos (consulte la sección “External data for processing a query”), y para la implementación de GLOBAL IN (véase la sección “IN operators”). + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/memory/) diff --git a/docs/es/operations/table_engines/merge.md b/docs/es/operations/table_engines/merge.md new file mode 100644 index 00000000000..33789b56570 --- /dev/null +++ b/docs/es/operations/table_engines/merge.md @@ -0,0 +1,63 @@ +# Fusionar {#merge} + +El `Merge` motor (no debe confundirse con `MergeTree`) no almacena datos en sí, pero permite leer de cualquier número de otras tablas simultáneamente. +La lectura se paralela automáticamente. No se admite la escritura en una tabla. 
Al leer, se usan los índices de las tablas que realmente se están leyendo, si existen. +El `Merge` engine acepta parámetros: el nombre de la base de datos y una expresión regular para las tablas. + +Ejemplo: + +``` sql +Merge(hits, '^WatchLog') +``` + +Los datos se leerán de las tablas en el `hits` base de datos que tienen nombres que coinciden con la expresión regular ‘`^WatchLog`’. + +En lugar del nombre de la base de datos, puede usar una expresión constante que devuelva una cadena. Por ejemplo, `currentDatabase()`. + +Expresiones regulares — [Re2](https://github.com/google/re2) (soporta un subconjunto de PCRE), sensible a mayúsculas y minúsculas. +Vea las notas sobre los símbolos de escape en expresiones regulares en el “match” apartado. + +Al seleccionar tablas para leer, el `Merge` no se seleccionará la tabla en sí, incluso si coincide con la expresión regular. Esto es para evitar bucles. +Es posible crear dos `Merge` tablas que intentarán sin cesar leer los datos de los demás, pero esta no es una buena idea. + +La forma típica de usar el `Merge` para trabajar con un gran número de `TinyLog` tablas como si con una sola tabla. + +Ejemplo 2: + +Supongamos que tiene una tabla antigua (WatchLog\_old) y decidió cambiar la partición sin mover los datos a una nueva tabla (WatchLog\_new) y necesita ver los datos de ambas tablas. + +``` sql +CREATE TABLE WatchLog_old(date Date, UserId Int64, EventType String, Cnt UInt64) +ENGINE=MergeTree(date, (UserId, EventType), 8192); +INSERT INTO WatchLog_old VALUES ('2018-01-01', 1, 'hit', 3); + +CREATE TABLE WatchLog_new(date Date, UserId Int64, EventType String, Cnt UInt64) +ENGINE=MergeTree PARTITION BY date ORDER BY (UserId, EventType) SETTINGS index_granularity=8192; +INSERT INTO WatchLog_new VALUES ('2018-01-02', 2, 'hit', 3); + +CREATE TABLE WatchLog as WatchLog_old ENGINE=Merge(currentDatabase(), '^WatchLog'); + +SELECT * +FROM WatchLog +``` + +``` text +┌───────date─┬─UserId─┬─EventType─┬─Cnt─┐ +│ 2018-01-01 │ 1 │ hit │ 3 │ +└────────────┴────────┴───────────┴─────┘ +┌───────date─┬─UserId─┬─EventType─┬─Cnt─┐ +│ 2018-01-02 │ 2 │ hit │ 3 │ +└────────────┴────────┴───────────┴─────┘ +``` + +## Columnas virtuales {#virtual-columns} + +- `_table` — Contiene el nombre de la tabla de la que se leyeron los datos. Tipo: [Cadena](../../data_types/string.md). + + Puede establecer las condiciones constantes en `_table` es el `WHERE/PREWHERE` cláusula (por ejemplo, `WHERE _table='xyz'`). En este caso, la operación de lectura se realiza sólo para las tablas donde la condición en `_table` está satisfecho, por lo que el `_table` columna actúa como un índice. + +**Ver también** + +- [Columnas virtuales](index.md#table_engines-virtual_columns) + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/merge/) diff --git a/docs/es/operations/table_engines/mergetree.md b/docs/es/operations/table_engines/mergetree.md new file mode 100644 index 00000000000..7ed2d62459d --- /dev/null +++ b/docs/es/operations/table_engines/mergetree.md @@ -0,0 +1,649 @@ +# Método de codificación de datos: {#table-engines-mergetree} + +El `MergeTree` motor y otros motores de esta familia (`*MergeTree`) son los motores de mesa ClickHouse más robustos. + +Motores en el `MergeTree` familia están diseñados para insertar una gran cantidad de datos en una tabla. Los datos se escriben rápidamente en la tabla parte por parte, luego se aplican reglas para fusionar las partes en segundo plano. 
Este método es mucho más eficiente que reescribir continuamente los datos en almacenamiento durante la inserción. + +Principales características: + +- Almacena datos ordenados por clave principal. + + Esto le permite crear un pequeño índice disperso que ayuda a encontrar datos más rápido. + +- Las particiones se pueden utilizar si [clave de partición](custom_partitioning_key.md) se especifica. + + ClickHouse admite ciertas operaciones con particiones que son más efectivas que las operaciones generales en los mismos datos con el mismo resultado. ClickHouse también corta automáticamente los datos de partición donde se especifica la clave de partición en la consulta. Esto también mejora el rendimiento de las consultas. + +- Soporte de replicación de datos. + + La familia de `ReplicatedMergeTree` proporciona la replicación de datos. Para obtener más información, consulte [Replicación de datos](replication.md). + +- Soporte de muestreo de datos. + + Si es necesario, puede establecer el método de muestreo de datos en la tabla. + +!!! info "INFO" + El [Fusionar](merge.md) el motor no pertenece al `*MergeTree` Familia. + +## Creación de una tabla {#table-engine-mergetree-creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], + ... + INDEX index_name1 expr1 TYPE type1(...) GRANULARITY value1, + INDEX index_name2 expr2 TYPE type2(...) GRANULARITY value2 +) ENGINE = MergeTree() +[PARTITION BY expr] +[ORDER BY expr] +[PRIMARY KEY expr] +[SAMPLE BY expr] +[TTL expr [DELETE|TO DISK 'xxx'|TO VOLUME 'xxx'], ...] +[SETTINGS name=value, ...] +``` + +Para obtener una descripción de los parámetros, consulte [Descripción de la consulta CREATE](../../query_language/create.md). + +!!! note "Nota" + `INDEX` es una característica experimental, ver [Índices de saltos de datos](#table_engine-mergetree-data_skipping-indexes). + +### Cláusulas de consulta {#mergetree-query-clauses} + +- `ENGINE` — Nombre y parámetros del motor. `ENGINE = MergeTree()`. El `MergeTree` el motor no tiene parámetros. + +- `PARTITION BY` — El [clave de partición](custom_partitioning_key.md). + + Para particionar por mes, utilice el `toYYYYMM(date_column)` expresión, donde `date_column` es una columna con una fecha del tipo [Fecha](../../data_types/date.md). Los nombres de partición aquí tienen el `"YYYYMM"` formato. + +- `ORDER BY` — La clave de clasificación. + + Una tupla de columnas o expresiones arbitrarias. Ejemplo: `ORDER BY (CounterID, EventDate)`. + +- `PRIMARY KEY` — La clave principal si [difiere de la clave de clasificación](mergetree.md). + + De forma predeterminada, la clave principal es la misma que la clave de ordenación (que se especifica `ORDER BY` clausula). Por lo tanto, en la mayoría de los casos no es necesario especificar un `PRIMARY KEY` clausula. + +- `SAMPLE BY` — Una expresión para el muestreo. + + Si se utiliza una expresión de muestreo, la clave principal debe contenerla. Ejemplo: `SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID))`. + +- `TTL` — Una lista de reglas que especifican la duración de almacenamiento de las filas y definen la lógica del movimiento automático de piezas [entre discos y volúmenes](#table_engine-mergetree-multiple-volumes). + + La expresión debe tener una `Date` o `DateTime` columna como resultado. 
Ejemplo: + `TTL date + INTERVAL 1 DAY` + + Tipo de regla `DELETE|TO DISK 'xxx'|TO VOLUME 'xxx'` especifica una acción que debe realizarse con la pieza si la expresión está satisfecha (alcanza la hora actual): eliminación de filas caducadas, mover una pieza (si la expresión está satisfecha para todas las filas de una pieza) al disco especificado (`TO DISK 'xxx'`) o al volumen (`TO VOLUME 'xxx'`). El tipo predeterminado de la regla es la eliminación (`DELETE`). Se puede especificar una lista de varias reglas, pero no debe haber más de una `DELETE` regla. + + Para obtener más información, consulte [TTL para columnas y tablas](#table_engine-mergetree-ttl) + +- `SETTINGS` — Parámetros adicionales que controlan el comportamiento del `MergeTree`: + + - `index_granularity` — Número máximo de filas de datos entre las marcas de un índice. Valor predeterminado: 8192. Ver [Almacenamiento de datos](#mergetree-data-storage). + - `index_granularity_bytes` — Tamaño máximo de los gránulos de datos en bytes. Valor predeterminado: 10 MB. Para restringir el tamaño del gránulo solo por el número de filas, establezca en 0 (no recomendado). Ver [Almacenamiento de datos](#mergetree-data-storage). + - `enable_mixed_granularity_parts` — Habilita o deshabilita la transición para controlar el tamaño del gránulo `index_granularity_bytes` configuración. Antes de la versión 19.11, sólo existía el `index_granularity` ajuste para restringir el tamaño del gránulo. El `index_granularity_bytes` mejora el rendimiento de ClickHouse al seleccionar datos de tablas con filas grandes (decenas y cientos de megabytes). Si tiene tablas con filas grandes, puede habilitar esta configuración para que las tablas mejoren la eficiencia de `SELECT` consulta. + - `use_minimalistic_part_header_in_zookeeper` — Método de almacenamiento de los encabezados de partes de datos en ZooKeeper. Si `use_minimalistic_part_header_in_zookeeper=1`, entonces ZooKeeper almacena menos datos. Para obtener más información, consulte [descripción del ajuste](../server_settings/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) es “Server configuration parameters”. + - `min_merge_bytes_to_use_direct_io` — El volumen mínimo de datos para la operación de fusión que se necesita para utilizar el acceso directo de E/S al disco de almacenamiento. Al fusionar partes de datos, ClickHouse calcula el volumen total de almacenamiento de todos los datos que se van a fusionar. Si el volumen excede `min_merge_bytes_to_use_direct_io` bytes, ClickHouse lee y escribe los datos en el disco de almacenamiento utilizando la interfaz de E / S directa (`O_DIRECT` opción). Si `min_merge_bytes_to_use_direct_io = 0`, entonces la E/S directa está deshabilitada. Valor predeterminado: `10 * 1024 * 1024 * 1024` byte. + + - `merge_with_ttl_timeout` — Retraso mínimo en segundos antes de repetir una fusión con TTL. Valor predeterminado: 86400 (1 día). + - `write_final_mark` — Habilita o deshabilita la escritura de la marca de índice final al final de la parte de datos (después del último byte). Valor predeterminado: 1. No lo apague. + - `merge_max_block_size` — Número máximo de filas en el bloque para operaciones de fusión. Valor predeterminado: 8192. + - `storage_policy` — Política de almacenamiento. Ver [Uso de varios dispositivos de bloque para el almacenamiento de datos](#table_engine-mergetree-multiple-volumes). 
**Ejemplo de configuración de las cláusulas**

``` sql
ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity=8192
```

En el ejemplo, configuramos la partición por mes.

También establecemos una expresión para el muestreo como un hash por el ID de usuario. Esto le permite pseudoaleatorizar los datos en la tabla para cada `CounterID` y `EventDate`. Si define una cláusula [SAMPLE](../../query_language/select.md#select-sample-clause) al seleccionar los datos, ClickHouse devolverá una muestra de datos pseudoaleatoria uniforme para un subconjunto de usuarios.

El ajuste `index_granularity` se puede omitir porque 8192 es el valor predeterminado.
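Como boceto adicional (nombres de tabla y columnas hipotéticos), una definición completa que combina estas cláusulas, incluida una regla `TTL` de eliminación:

``` sql
CREATE TABLE visitas_ejemplo
(
    CounterID UInt32,
    EventDate Date,
    UserID UInt64,
    URL String
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(EventDate)
ORDER BY (CounterID, EventDate, intHash32(UserID))
SAMPLE BY intHash32(UserID)
TTL EventDate + INTERVAL 1 MONTH DELETE
SETTINGS index_granularity = 8192
```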
**Método obsoleto para crear una tabla**

!!! attention "Atención"
    No utilice este método en nuevos proyectos. Si es posible, cambie los proyectos antiguos al método descrito anteriormente.

``` sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
(
    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
    ...
) ENGINE [=] MergeTree(date-column [, sampling_expression], (primary, key), index_granularity)
```

**Parámetros MergeTree()**

- `date-column` — El nombre de una columna de tipo [Fecha](../../data_types/date.md). ClickHouse crea automáticamente particiones por mes en función de esta columna. Los nombres de partición están en formato `"YYYYMM"`.
- `sampling_expression` — Una expresión para el muestreo.
- `(primary, key)` — Clave principal. Tipo: [Tupla()](../../data_types/tuple.md)
- `index_granularity` — La granularidad de un índice. El número de filas de datos entre “marks” de un índice. El valor 8192 es apropiado para la mayoría de las tareas.

**Ejemplo**

``` sql
MergeTree(EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID)), 8192)
```

El motor `MergeTree` se configura de la misma manera que en el ejemplo anterior para el método de configuración principal del motor.
+ +## Almacenamiento de datos {#mergetree-data-storage} + +Una tabla consta de partes de datos ordenadas por clave principal. + +Cuando se insertan datos en una tabla, se crean partes de datos separadas y cada una de ellas se ordena lexicográficamente por clave principal. Por ejemplo, si la clave principal es `(CounterID, Date)`, los datos en la parte se ordenan por `CounterID`, y dentro de cada `CounterID` es ordenado por `Date`. + +Los datos que pertenecen a diferentes particiones se separan en diferentes partes. En el fondo, ClickHouse combina partes de datos para un almacenamiento más eficiente. Las piezas que pertenecen a particiones diferentes no se fusionan. El mecanismo de combinación no garantiza que todas las filas con la misma clave principal estén en la misma parte de datos. + +Cada parte de datos se divide lógicamente en gránulos. Un gránulo es el conjunto de datos indivisibles más pequeño que ClickHouse lee al seleccionar datos. ClickHouse no divide filas o valores, por lo que cada gránulo siempre contiene un número entero de filas. La primera fila de un gránulo está marcada con el valor de la clave principal de la fila. Para cada parte de datos, ClickHouse crea un archivo de índice que almacena las marcas. Para cada columna, ya sea en la clave principal o no, ClickHouse también almacena las mismas marcas. Estas marcas le permiten encontrar datos directamente en archivos de columnas. + +El tamaño del gránulo es restringido por `index_granularity` y `index_granularity_bytes` configuración del motor de tabla. El número de filas en un gránulo se encuentra en el `[1, index_granularity]` rango, dependiendo del tamaño de las filas. El tamaño de un gránulo puede exceder `index_granularity_bytes` si el tamaño de una sola fila es mayor que el valor de la configuración. En este caso, el tamaño del gránulo es igual al tamaño de la fila. + +## Claves e índices principales en consultas {#primary-keys-and-indexes-in-queries} + +Tome el `(CounterID, Date)` clave primaria como ejemplo. En este caso, la clasificación y el índice se pueden ilustrar de la siguiente manera: + + Whole data: [-------------------------------------------------------------------------] + CounterID: [aaaaaaaaaaaaaaaaaabbbbcdeeeeeeeeeeeeefgggggggghhhhhhhhhiiiiiiiiikllllllll] + Date: [1111111222222233331233211111222222333211111112122222223111112223311122333] + Marks: | | | | | | | | | | | + a,1 a,2 a,3 b,3 e,2 e,3 g,1 h,2 i,1 i,3 l,3 + Marks numbers: 0 1 2 3 4 5 6 7 8 9 10 + +Si la consulta de datos especifica: + +- `CounterID in ('a', 'h')`, el servidor lee los datos en los rangos de marcas `[0, 3)` y `[6, 8)`. +- `CounterID IN ('a', 'h') AND Date = 3`, el servidor lee los datos en los rangos de marcas `[1, 3)` y `[7, 8)`. +- `Date = 3`, el servidor lee los datos en el rango de marcas `[1, 10]`. + +Los ejemplos anteriores muestran que siempre es más efectivo usar un índice que un análisis completo. + +Un índice disperso permite leer datos adicionales. Al leer un único rango de la clave primaria, hasta `index_granularity * 2` se pueden leer filas adicionales en cada bloque de datos. + +Los índices dispersos le permiten trabajar con una gran cantidad de filas de tabla, porque en la mayoría de los casos, dichos índices caben en la RAM de la computadora. + +ClickHouse no requiere una clave principal única. Puede insertar varias filas con la misma clave principal. + +### Selección de la clave principal {#selecting-the-primary-key} + +El número de columnas en la clave principal no está explícitamente limitado. 
Dependiendo de la estructura de datos, puede incluir más o menos columnas en la clave principal. Esto puede: + +- Mejorar el rendimiento de un índice. + + Si la clave principal es `(a, b)`, a continuación, añadir otra columna `c` mejorará el rendimiento si se cumplen las siguientes condiciones: + + - Hay consultas con una condición en la columna `c`. + - Rangos de datos largos (varias veces más `index_granularity`) con valores idénticos para `(a, b)` hijo comunes. En otras palabras, al agregar otra columna le permite omitir rangos de datos bastante largos. + +- Mejorar la compresión de datos. + + ClickHouse ordena los datos por clave principal, por lo que cuanto mayor sea la consistencia, mejor será la compresión. + +- Proporcione una lógica adicional al fusionar partes de datos en el [ColapsarMergeTree](collapsingmergetree.md#table_engine-collapsingmergetree) y [SummingMergeTree](summingmergetree.md) motor. + + En este caso tiene sentido especificar el *clave de clasificación* que es diferente de la clave principal. + +Una clave principal larga afectará negativamente al rendimiento de la inserción y al consumo de memoria, pero las columnas adicionales de la clave principal no afectarán al rendimiento de ClickHouse durante `SELECT` consulta. + +### Elegir una clave principal que difiere de la clave de ordenación {#choosing-a-primary-key-that-differs-from-the-sorting-key} + +Es posible especificar una clave principal (una expresión con valores que se escriben en el archivo de índice para cada marca) que es diferente de la clave de ordenación (una expresión para ordenar las filas en partes de datos). En este caso, la tupla de expresión de clave primaria debe ser un prefijo de la tupla de expresión de clave de ordenación. + +Esta característica es útil cuando se [SummingMergeTree](summingmergetree.md) y +[AgregaciónMergeTree](aggregatingmergetree.md) motores de mesa. En un caso común cuando se utilizan estos motores, la tabla tiene dos tipos de columnas: *cota* y *medida*. Las consultas típicas agregan valores de columnas de medida con `GROUP BY` y filtrado por dimensiones. Debido a que SummingMergeTree y AggregatingMergeTree agregan filas con el mismo valor de la clave de ordenación, es natural agregarle todas las dimensiones. Como resultado, la expresión de clave consta de una larga lista de columnas y esta lista debe actualizarse con frecuencia con las dimensiones recién agregadas. + +En este caso, tiene sentido dejar solo unas pocas columnas en la clave principal que proporcionarán análisis de rango eficientes y agregarán las columnas de dimensión restantes a la tupla de clave de clasificación. + +[ALTERAR](../../query_language/alter.md) de la clave de ordenación es una operación ligera porque cuando se agrega una nueva columna simultáneamente a la tabla y a la clave de ordenación, no es necesario cambiar las partes de datos existentes. Dado que la clave de ordenación anterior es un prefijo de la nueva clave de ordenación y no hay datos en la columna recién agregada, los datos se ordenan tanto por las claves de ordenación antiguas como por las nuevas en el momento de la modificación de la tabla. + +### Uso de índices y particiones en consultas {#use-of-indexes-and-partitions-in-queries} + +Para `SELECT` consultas, ClickHouse analiza si se puede usar un índice. 
Se puede usar un índice si el `WHERE/PREWHERE` clause tiene una expresión (como uno de los elementos de conjunción, o enteramente) que representa una operación de comparación de igualdad o desigualdad, o si tiene `IN` o `LIKE` con un prefijo fijo en columnas o expresiones que están en la clave principal o clave de partición, o en ciertas funciones parcialmente repetitivas de estas columnas, o relaciones lógicas de estas expresiones. + +Por lo tanto, es posible ejecutar rápidamente consultas en uno o varios rangos de la clave principal. En este ejemplo, las consultas serán rápidas cuando se ejecuten para una etiqueta de seguimiento específica, para una etiqueta y un intervalo de fechas específicos, para una etiqueta y una fecha específicas, para varias etiquetas con un intervalo de fechas, etc. + +Veamos el motor configurado de la siguiente manera: + + ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate) SETTINGS index_granularity=8192 + +En este caso, en consultas: + +``` sql +SELECT count() FROM table WHERE EventDate = toDate(now()) AND CounterID = 34 +SELECT count() FROM table WHERE EventDate = toDate(now()) AND (CounterID = 34 OR CounterID = 42) +SELECT count() FROM table WHERE ((EventDate >= toDate('2014-01-01') AND EventDate <= toDate('2014-01-31')) OR EventDate = toDate('2014-05-01')) AND CounterID IN (101500, 731962, 160656) AND (CounterID = 101500 OR EventDate != toDate('2014-05-01')) +``` + +ClickHouse utilizará el índice de clave principal para recortar datos incorrectos y la clave de partición mensual para recortar particiones que están en intervalos de fechas incorrectos. + +Las consultas anteriores muestran que el índice se usa incluso para expresiones complejas. La lectura de la tabla está organizada de modo que el uso del índice no puede ser más lento que un análisis completo. + +En el siguiente ejemplo, el índice no se puede usar. + +``` sql +SELECT count() FROM table WHERE CounterID = 34 OR URL LIKE '%upyachka%' +``` + +Para comprobar si ClickHouse puede usar el índice al ejecutar una consulta, use la configuración [Fecha de nacimiento](../settings/settings.md#settings-force_index_by_date) y [force\_primary\_key](../settings/settings.md). + +La clave para particionar por mes permite leer solo aquellos bloques de datos que contienen fechas del rango adecuado. En este caso, el bloque de datos puede contener datos para muchas fechas (hasta un mes). Dentro de un bloque, los datos se ordenan por clave principal, que puede no contener la fecha como la primera columna. Debido a esto, el uso de una consulta con solo una condición de fecha que no especifica el prefijo de clave principal hará que se lean más datos que para una sola fecha. + +### Uso del índice para claves primarias parcialmente monotónicas {#use-of-index-for-partially-monotonic-primary-keys} + +Considere, por ejemplo, los días del mes. Ellos forman un [monótona secuencia](https://en.wikipedia.org/wiki/Monotonic_function) durante un mes, pero no monótono durante períodos más prolongados. Esta es una secuencia parcialmente monotónica. Si un usuario crea la tabla con clave primaria parcialmente monótona, ClickHouse crea un índice disperso como de costumbre. Cuando un usuario selecciona datos de este tipo de tabla, ClickHouse analiza las condiciones de consulta. 
Si el usuario desea obtener datos entre dos marcas del índice y ambas marcas caen dentro de un mes, ClickHouse puede usar el índice en este caso particular porque puede calcular la distancia entre los parámetros de una consulta y las marcas de índice. + +ClickHouse no puede usar un índice si los valores de la clave principal en el rango de parámetros de consulta no representan una secuencia monotónica. En este caso, ClickHouse utiliza el método de análisis completo. + +ClickHouse usa esta lógica no solo para secuencias de días del mes, sino para cualquier clave principal que represente una secuencia parcialmente monotónica. + +### Índices de saltos de datos (experimental) {#table-engine-mergetree-data-skipping-indexes} + +La declaración de índice se encuentra en la sección de columnas del `CREATE` consulta. + +``` sql +INDEX index_name expr TYPE type(...) GRANULARITY granularity_value +``` + +Para tablas de la `*MergeTree` familia, se pueden especificar índices de omisión de datos. + +Estos índices agregan cierta información sobre la expresión especificada en bloques, que consisten en `granularity_value` gránulos (el tamaño del gránulo se especifica utilizando el `index_granularity` ajuste en el motor de la tabla). Entonces estos agregados se usan en `SELECT` consultas para reducir la cantidad de datos a leer desde el disco omitiendo grandes bloques de datos donde el `where` consulta no puede ser satisfecha. + +**Ejemplo** + +``` sql +CREATE TABLE table_name +( + u64 UInt64, + i32 Int32, + s String, + ... + INDEX a (u64 * i32, s) TYPE minmax GRANULARITY 3, + INDEX b (u64 * length(s)) TYPE set(1000) GRANULARITY 4 +) ENGINE = MergeTree() +... +``` + +ClickHouse puede utilizar los índices del ejemplo para reducir la cantidad de datos que se leen desde el disco en las siguientes consultas: + +``` sql +SELECT count() FROM table WHERE s < 'z' +SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 +``` + +#### Tipos de índices disponibles {#available-types-of-indices} + +- `minmax` + + Almacena los extremos de la expresión especificada (si la expresión `tuple`, entonces almacena extremos para cada elemento de `tuple`), utiliza información almacenada para omitir bloques de datos como la clave principal. + +- `set(max_rows)` + + Almacena valores únicos de la expresión especificada (no más de `max_rows` películas, `max_rows=0` medio “no limits”). Utiliza los valores para comprobar si `WHERE` expresión no es satisfactorio en un bloque de datos. + +- `ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)` + + Tiendas a [Filtro de floración](https://en.wikipedia.org/wiki/Bloom_filter) que contiene todos los ngrams de un bloque de datos. Funciona solo con cadenas. Puede ser utilizado para la optimización de `equals`, `like` y `in` expresiones. + + - `n` — tamaño del ngram, + - `size_of_bloom_filter_in_bytes` — Tamaño del filtro Bloom en bytes (puede usar valores grandes aquí, por ejemplo, 256 o 512, porque se puede comprimir bien). + - `number_of_hash_functions` — El número de funciones hash utilizadas en el filtro Bloom. + - `random_seed` — La semilla para las funciones hash de filtro Bloom. + +- `tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)` + + Lo mismo que `ngrambf_v1`, pero almacena tokens en lugar de ngrams. Los tokens son secuencias separadas por caracteres no alfanuméricos. 
+
+- `bloom_filter([false_positive])` — Almacena un [filtro de Bloom](https://en.wikipedia.org/wiki/Bloom_filter) para las columnas especificadas.
+
+    El parámetro opcional `false_positive` es la probabilidad de recibir una respuesta falsa positiva del filtro. Valores posibles: (0, 1). Valor predeterminado: 0.025.
+
+    Tipos de datos admitidos: `Int*`, `UInt*`, `Float*`, `Enum`, `Date`, `DateTime`, `String`, `FixedString`, `Array`, `LowCardinality`, `Nullable`.
+
+    Las siguientes funciones pueden usarlo: [equals](../../query_language/functions/comparison_functions.md), [notEquals](../../query_language/functions/comparison_functions.md), [in](../../query_language/functions/in_functions.md), [notIn](../../query_language/functions/in_functions.md), [has](../../query_language/functions/array_functions.md).
+
+``` sql
+INDEX sample_index (u64 * length(s)) TYPE minmax GRANULARITY 4
+INDEX sample_index2 (u64 * length(str), i32 + f64 * 100, date, str) TYPE set(100) GRANULARITY 4
+INDEX sample_index3 (lower(str), str) TYPE ngrambf_v1(3, 256, 2, 0) GRANULARITY 4
+```
+
+#### Soporte de funciones {#functions-support}
+
+Las condiciones de la cláusula `WHERE` contienen llamadas a funciones que operan con columnas. Si la columna forma parte de un índice, ClickHouse intenta usar este índice al ejecutar las funciones. ClickHouse admite diferentes subconjuntos de funciones para el uso de índices.
+
+El índice `set` se puede utilizar con todas las funciones. Los subconjuntos de funciones para los demás índices se muestran en la siguiente tabla.
+
+| Función (operador) / Índice                                                                             | Clave primaria | minmax | ngrambf\_v1 | tokenbf\_v1 | bloom\_filter |
+|---------------------------------------------------------------------------------------------------------|----------------|--------|-------------|-------------|---------------|
+| [equals (=, ==)](../../query_language/functions/comparison_functions.md#function-equals)                 | ✔              | ✔      | ✔           | ✔           | ✔             |
+| [notEquals (!=, <>)](../../query_language/functions/comparison_functions.md#function-notequals)          | ✔              | ✔      | ✔           | ✔           | ✔             |
+| [like](../../query_language/functions/string_search_functions.md#function-like)                          | ✔              | ✔      | ✔           | ✗           | ✗             |
+| [notLike](../../query_language/functions/string_search_functions.md#function-notlike)                    | ✔              | ✔      | ✔           | ✗           | ✗             |
+| [startsWith](../../query_language/functions/string_functions.md#startswith)                              | ✔              | ✔      | ✔           | ✔           | ✗             |
+| [endsWith](../../query_language/functions/string_functions.md#endswith)                                  | ✗              | ✗      | ✔           | ✔           | ✗             |
+| [multiSearchAny](../../query_language/functions/string_search_functions.md#function-multisearchany)      | ✗              | ✗      | ✔           | ✗           | ✗             |
+| [in](../../query_language/functions/in_functions.md#in-functions)                                        | ✔              | ✔      | ✔           | ✔           | ✔             |
+| [notIn](../../query_language/functions/in_functions.md#in-functions)                                     | ✔              | ✔      | ✔           | ✔           | ✔             |
+| [less (\<)](../../query_language/functions/comparison_functions.md#function-less)                        | ✔              | ✔      | ✗           | ✗           | ✗             |
+| [greater (\>)](../../query_language/functions/comparison_functions.md#function-greater)                  | ✔              | ✔      | ✗           | ✗           | ✗             |
+| [lessOrEquals (\<=)](../../query_language/functions/comparison_functions.md#function-lessorequals)       | ✔              | ✔      | ✗           | ✗           | ✗             |
+| [greaterOrEquals (\>=)](../../query_language/functions/comparison_functions.md#function-greaterorequals) | ✔              | ✔      | ✗           | ✗           | ✗             |
+| [empty](../../query_language/functions/array_functions.md#function-empty)                                | ✔              | ✔      | ✗           | ✗           | ✗             |
+| [notEmpty](../../query_language/functions/array_functions.md#function-notempty)                          | ✔              | ✔      | ✗           | ✗           | ✗             |
+| hasToken                                                                                                  | ✗              | ✗      | ✗           | ✔           | ✗             |
+
+Las funciones con un argumento constante
que es menor que el tamaño de ngram no pueden ser utilizadas por `ngrambf_v1` para la optimización de consultas. + +Los filtros Bloom pueden tener coincidencias falsas positivas, por lo que `ngrambf_v1`, `tokenbf_v1`, y `bloom_filter` los índices no se pueden usar para optimizar consultas donde se espera que el resultado de una función sea falso, por ejemplo: + +- Puede ser optimizado: + - `s LIKE '%test%'` + - `NOT s NOT LIKE '%test%'` + - `s = 1` + - `NOT s != 1` + - `startsWith(s, 'test')` +- No se puede optimizar: + - `NOT s LIKE '%test%'` + - `s NOT LIKE '%test%'` + - `NOT s = 1` + - `s != 1` + - `NOT startsWith(s, 'test')` + +## Acceso a datos simultáneos {#concurrent-data-access} + +Para el acceso simultáneo a tablas, usamos versiones múltiples. En otras palabras, cuando una tabla se lee y actualiza simultáneamente, los datos se leen de un conjunto de partes que está actualizado en el momento de la consulta. No hay cerraduras largas. Las inserciones no se interponen en el camino de las operaciones de lectura. + +La lectura de una tabla se paralela automáticamente. + +## TTL para columnas y tablas {#table-engine-mergetree-ttl} + +Determina la duración de los valores. + +El `TTL` se puede establecer para toda la tabla y para cada columna individual. TTL de nivel de tabla también puede especificar la lógica de movimiento automático de datos entre discos y volúmenes. + +Las expresiones deben evaluar [Fecha](../../data_types/date.md) o [FechaHora](../../data_types/datetime.md) tipo de datos. + +Ejemplo: + +``` sql +TTL time_column +TTL time_column + interval +``` + +Definir `interval`, utilizar [intervalo de tiempo](../../query_language/operators.md#operators-datetime) operador. + +``` sql +TTL date_time + INTERVAL 1 MONTH +TTL date_time + INTERVAL 15 HOUR +``` + +### Columna TTL {#mergetree-column-ttl} + +Cuando los valores de la columna caducan, ClickHouse los reemplaza con los valores predeterminados para el tipo de datos de columna. Si todos los valores de columna en la parte de datos caducan, ClickHouse elimina esta columna de la parte de datos en un sistema de archivos. + +El `TTL` cláusula no se puede utilizar para columnas clave. + +Ejemplos: + +Creación de una tabla con TTL + +``` sql +CREATE TABLE example_table +( + d DateTime, + a Int TTL d + INTERVAL 1 MONTH, + b Int TTL d + INTERVAL 1 MONTH, + c String +) +ENGINE = MergeTree +PARTITION BY toYYYYMM(d) +ORDER BY d; +``` + +Adición de TTL a una columna de una tabla existente + +``` sql +ALTER TABLE example_table + MODIFY COLUMN + c String TTL d + INTERVAL 1 DAY; +``` + +Modificación de TTL de la columna + +``` sql +ALTER TABLE example_table + MODIFY COLUMN + c String TTL d + INTERVAL 1 MONTH; +``` + +### Tabla TTL {#mergetree-table-ttl} + +La tabla puede tener una expresión para la eliminación de filas caducadas y varias expresiones para el movimiento automático de partes entre [discos o volúmenes](#table_engine-mergetree-multiple-volumes). Cuando las filas de la tabla caducan, ClickHouse elimina todas las filas correspondientes. Para la entidad de movimiento de piezas, todas las filas de una pieza deben cumplir los criterios de expresión de movimiento. + +``` sql +TTL expr [DELETE|TO DISK 'aaa'|TO VOLUME 'bbb'], ... +``` + +El tipo de regla TTL puede seguir cada expresión TTL. 
Afecta a una acción que debe realizarse una vez que se satisface la expresión (alcanza la hora actual): + +- `DELETE` - Eliminar filas caducadas (acción predeterminada); +- `TO DISK 'aaa'` - mover parte al disco `aaa`; +- `TO VOLUME 'bbb'` - mover parte al disco `bbb`. + +Ejemplos: + +Creación de una tabla con TTL + +``` sql +CREATE TABLE example_table +( + d DateTime, + a Int +) +ENGINE = MergeTree +PARTITION BY toYYYYMM(d) +ORDER BY d +TTL d + INTERVAL 1 MONTH [DELETE], + d + INTERVAL 1 WEEK TO VOLUME 'aaa', + d + INTERVAL 2 WEEK TO DISK 'bbb'; +``` + +Modificación de TTL de la tabla + +``` sql +ALTER TABLE example_table + MODIFY TTL d + INTERVAL 1 DAY; +``` + +**Eliminación de datos** + +Los datos con un TTL caducado se eliminan cuando ClickHouse fusiona partes de datos. + +Cuando ClickHouse ve que los datos han caducado, realiza una combinación fuera de programación. Para controlar la frecuencia de tales fusiones, puede establecer [Método de codificación de datos:](#mergetree_setting-merge_with_ttl_timeout). Si el valor es demasiado bajo, realizará muchas fusiones fuera de horario que pueden consumir muchos recursos. + +Si realiza el `SELECT` consulta entre fusiones, puede obtener datos caducados. Para evitarlo, use el [OPTIMIZAR](../../query_language/misc.md#misc_operations-optimize) Consulta antes `SELECT`. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/mergetree/) + +## Uso de varios dispositivos de bloque para el almacenamiento de datos {#table-engine-mergetree-multiple-volumes} + +### Implantación {#introduction} + +`MergeTree` Los motores de tablas familiares pueden almacenar datos en múltiples dispositivos de bloque. Por ejemplo, puede ser útil cuando los datos de una determinada tabla se dividen implícitamente en “hot” y “cold”. Los datos más recientes se solicitan regularmente, pero solo requieren una pequeña cantidad de espacio. Por el contrario, los datos históricos de cola gorda se solicitan raramente. Si hay varios discos disponibles, el “hot” los datos pueden estar ubicados en discos rápidos (por ejemplo, SSD NVMe o en memoria), mientras que “cold” datos - en los relativamente lentos (por ejemplo, HDD). + +La parte de datos es la unidad móvil mínima para `MergeTree`-mesas de motor. Los datos que pertenecen a una parte se almacenan en un disco. Las partes de datos se pueden mover entre discos en segundo plano (según la configuración del usuario) así como por medio de la [ALTERAR](../../query_language/alter.md#alter_move-partition) consulta. + +### Plazo {#terms} + +- Disco: bloquea el dispositivo montado en el sistema de archivos. +- Disco predeterminado: disco que almacena la ruta especificada en el [camino](../server_settings/settings.md#server_settings-path) configuración del servidor. +- Volumen — Conjunto ordenado de discos iguales (similar a [JBOD](https://en.wikipedia.org/wiki/Non-RAID_drive_architectures)). +- Política de almacenamiento: conjunto de volúmenes y reglas para mover datos entre ellos. + +Los nombres dados a las entidades descritas se pueden encontrar en las tablas del sistema, [sistema.almacenamiento\_policies](../system_tables.md#system_tables-storage_policies) y [sistema.Discoteca](../system_tables.md#system_tables-disks). Para aplicar una de las directivas de almacenamiento configuradas para una tabla, `storage_policy` establecimiento de `MergeTree`-mesas de la familia del motor. 
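+
+Como referencia rápida, un boceto (suponiendo que el servidor ya tiene discos y políticas configurados) de cómo inspeccionar estas entidades desde el cliente, usando las tablas del sistema mencionadas arriba:
+
+``` sql
+-- Discos conocidos por el servidor, con su ruta y espacio libre
+SELECT name, path, formatReadableSize(free_space) AS free
+FROM system.disks;
+
+-- Políticas de almacenamiento y discos que componen cada volumen
+SELECT policy_name, volume_name, disks
+FROM system.storage_policies;
+```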
+ +### Configuración {#table-engine-mergetree-multiple-volumes-configure} + +Los discos, los volúmenes y las políticas de almacenamiento deben declararse `` etiqueta ya sea en el archivo principal `config.xml` o en un archivo distinto en el `config.d` Directorio. + +Estructura de configuración: + +``` xml + + + + /mnt/fast_ssd/clickhouse + + + /mnt/hdd1/clickhouse + 10485760 + + + /mnt/hdd2/clickhouse + 10485760 + + + ... + + + ... + +``` + +Tags: + +- `` — Nombre del disco. Los nombres deben ser diferentes para todos los discos. +- `path` — ruta bajo la cual un servidor almacenará datos (`data` y `shadow` carpetas), debe terminarse con ‘/’. +- `keep_free_space_bytes` — la cantidad de espacio libre en disco que debe reservarse. + +El orden de la definición del disco no es importante. + +Marcado de configuración de directivas de almacenamiento: + +``` xml + + ... + + + + + disk_name_from_disks_configuration + 1073741824 + + + + + + + 0.2 + + + + + + + + ... + +``` + +Tags: + +- `policy_name_N` — Nombre de la póliza. Los nombres de directiva deben ser únicos. +- `volume_name_N` — Nombre del volumen. Los nombres de volumen deben ser únicos. +- `disk` — un disco dentro de un volumen. +- `max_data_part_size_bytes` — el tamaño máximo de una pieza que puede almacenarse en cualquiera de los discos del volumen. +- `move_factor` — cuando la cantidad de espacio disponible es inferior a este factor, los datos comienzan a moverse automáticamente en el siguiente volumen si los hay (por defecto, 0.1). + +Cofiguration ejemplos: + +``` xml + + ... + + + + + disk1 + disk2 + + + + + + + + fast_ssd + 1073741824 + + + disk1 + + + 0.2 + + + ... + +``` + +En un ejemplo dado, el `hdd_in_order` la política implementa el [Ronda-robin](https://en.wikipedia.org/wiki/Round-robin_scheduling) enfoque. Por lo tanto, esta política define solo un volumen (`single`), las partes de datos se almacenan en todos sus discos en orden circular. Dicha política puede ser bastante útil si hay varios discos similares montados en el sistema, pero RAID no está configurado. Tenga en cuenta que cada unidad de disco individual no es confiable y es posible que desee compensarlo con un factor de replicación de 3 o más. + +Si hay diferentes tipos de discos disponibles en el sistema, `moving_from_ssd_to_hdd` política se puede utilizar en su lugar. Volumen `hot` consta de un disco SSD (`fast_ssd`), y el tamaño máximo de una pieza que se puede almacenar en este volumen es de 1 GB. Todas las piezas con el tamaño más grande que 1GB serán almacenadas directamente en `cold` volumen, que contiene un disco duro `disk1`. +Además, una vez que el disco `fast_ssd` se llena en más del 80%, los datos se transferirán al `disk1` por un proceso en segundo plano. + +El orden de enumeración de volúmenes dentro de una directiva de almacenamiento es importante. Una vez que un volumen está sobrellenado, los datos se mueven al siguiente. El orden de la enumeración del disco también es importante porque los datos se almacenan en ellos por turnos. + +Al crear una tabla, se puede aplicarle una de las directivas de almacenamiento configuradas: + +``` sql +CREATE TABLE table_with_non_default_policy ( + EventDate Date, + OrderID UInt64, + BannerID UInt64, + SearchPhrase String +) ENGINE = MergeTree +ORDER BY (OrderID, BannerID) +PARTITION BY toYYYYMM(EventDate) +SETTINGS storage_policy = 'moving_from_ssd_to_hdd' +``` + +El `default` política de almacenamiento implica el uso de un solo volumen, que consiste en un solo disco dado en ``. 
Una vez que se crea una tabla, no se puede cambiar su política de almacenamiento. + +### Detalles {#details} + +En el caso de `MergeTree` tablas, los datos están llegando al disco de diferentes maneras: + +- Como resultado de un inserto (`INSERT` consulta). +- Durante las fusiones de fondo y [mutación](../../query_language/alter.md#alter-mutations). +- Al descargar desde otra réplica. +- Como resultado de la congelación de particiones [ALTER TABLE … CONGELAR LA PARTICIÓN](../../query_language/alter.md#alter_freeze-partition). + +En todos estos casos, excepto las mutaciones y la congelación de particiones, una pieza se almacena en un volumen y un disco de acuerdo con la política de almacenamiento dada: + +1. El primer volumen (en el orden de definición) que tiene suficiente espacio en disco para almacenar una pieza (`unreserved_space > current_part_size`) y permite almacenar partes de un tamaño determinado (`max_data_part_size_bytes > current_part_size`) se elige. +2. Dentro de este volumen, se elige ese disco que sigue al que se utilizó para almacenar el fragmento de datos anterior y que tiene espacio libre más que el tamaño de la pieza (`unreserved_space - keep_free_space_bytes > current_part_size`). + +Bajo el capó, las mutaciones y la congelación de particiones hacen uso de [enlaces duros](https://en.wikipedia.org/wiki/Hard_link). Los enlaces duros entre diferentes discos no son compatibles, por lo tanto, en tales casos las partes resultantes se almacenan en los mismos discos que los iniciales. + +En el fondo, las partes se mueven entre volúmenes en función de la cantidad de espacio libre (`move_factor` parámetro) según el orden en que se declaran los volúmenes en el archivo de configuración. +Los datos nunca se transfieren desde el último y al primero. Uno puede usar tablas del sistema [sistema.part\_log](../system_tables.md#system_tables-part-log) (campo `type = MOVE_PART`) y [sistema.parte](../system_tables.md#system_tables-parts) (campo `path` y `disk`) para monitorear movimientos de fondo. Además, la información detallada se puede encontrar en los registros del servidor. + +El usuario puede forzar el movimiento de una pieza o una partición de un volumen a otro mediante la consulta [ALTER TABLE … MUEVA PARTE\|PARTICIÓN … AL VOLUMEN\|DISCO …](../../query_language/alter.md#alter_move-partition), todas las restricciones para las operaciones en segundo plano se tienen en cuenta. La consulta inicia un movimiento por sí misma y no espera a que se completen las operaciones en segundo plano. El usuario recibirá un mensaje de error si no hay suficiente espacio libre disponible o si no se cumple alguna de las condiciones requeridas. + +Mover datos no interfiere con la replicación de datos. Por lo tanto, se pueden especificar diferentes directivas de almacenamiento para la misma tabla en diferentes réplicas. + +Después de la finalización de las fusiones y mutaciones de fondo, las partes viejas se eliminan solo después de un cierto período de tiempo (`old_parts_lifetime`). +Durante este tiempo, no se mueven a otros volúmenes o discos. Por lo tanto, hasta que las partes finalmente se eliminen, aún se tienen en cuenta para la evaluación del espacio en disco ocupado. 
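+
+Como boceto de la consulta de movimiento manual mencionada en esta sección (el nombre de la tabla, la partición, el volumen y el disco son hipotéticos):
+
+``` sql
+-- Mover toda una partición a un volumen definido en la política de almacenamiento
+ALTER TABLE example_table MOVE PARTITION 201902 TO VOLUME 'cold';
+
+-- Mover una parte concreta a un disco específico
+ALTER TABLE example_table MOVE PART 'all_1_1_0' TO DISK 'disk1';
+```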
+ +[Artículo Original](https://clickhouse.tech/docs/ru/operations/table_engines/mergetree/) diff --git a/docs/es/operations/table_engines/mysql.md b/docs/es/operations/table_engines/mysql.md new file mode 100644 index 00000000000..3ce17a02837 --- /dev/null +++ b/docs/es/operations/table_engines/mysql.md @@ -0,0 +1,98 @@ +# MySQL {#mysql} + +El motor MySQL le permite realizar `SELECT` Consultas sobre datos almacenados en un servidor MySQL remoto. + +## Creación de una tabla {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], + ... +) ENGINE = MySQL('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']); +``` + +Vea una descripción detallada del [CREAR TABLA](../../query_language/create.md#create-table-query) consulta. + +La estructura de la tabla puede diferir de la estructura de la tabla MySQL original: + +- Los nombres de columna deben ser los mismos que en la tabla MySQL original, pero puede usar solo algunas de estas columnas y en cualquier orden. +- Los tipos de columna pueden diferir de los de la tabla MySQL original. Haga clic en Casa intenta [elenco](../../query_language/functions/type_conversion_functions.md#type_conversion_function-cast) valores a los tipos de datos ClickHouse. + +**Parámetros del motor** + +- `host:port` — Dirección del servidor MySQL. + +- `database` — Nombre de base de datos remota. + +- `table` — Nombre de la tabla remota. + +- `user` — Usuario de MySQL. + +- `password` — Contraseña de usuario. + +- `replace_query` — Bandera que convierte `INSERT INTO` Consultas a `REPLACE INTO`. Si `replace_query=1`, la consulta se sustituye. + +- `on_duplicate_clause` — El `ON DUPLICATE KEY on_duplicate_clause` expresión que se añade a la `INSERT` consulta. + + Ejemplo: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, donde `on_duplicate_clause` ser `UPDATE c2 = c2 + 1`. Ver el [Documentación de MySQL](https://dev.mysql.com/doc/refman/8.0/en/insert-on-duplicate.html) para encontrar qué `on_duplicate_clause` se puede utilizar con el `ON DUPLICATE KEY` clausula. + + Especificar `on_duplicate_clause` tienes que pasar `0` Angeles `replace_query` parámetro. Si pasa simultáneamente `replace_query = 1` y `on_duplicate_clause`, ClickHouse genera una excepción. + +Simple `WHERE` cláusulas tales como `=, !=, >, >=, <, <=` se ejecutan en el servidor MySQL. + +El resto de las condiciones y el `LIMIT` La restricción de muestreo se ejecuta en ClickHouse solo después de que finalice la consulta a MySQL. 
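+
+Como ilustración de los parámetros `replace_query` y `on_duplicate_clause` descritos arriba (los nombres de host, base de datos y tabla son hipotéticos), una definición podría tener este aspecto:
+
+``` sql
+-- Con replace_query = 1, los INSERT se reenvían a MySQL como REPLACE INTO
+CREATE TABLE mysql_replace
+(
+    id UInt32,
+    counter UInt32
+)
+ENGINE = MySQL('mysql-host:3306', 'test', 'target_table', 'user', 'password', 1);
+
+-- Con replace_query = 0 y on_duplicate_clause, se añade ON DUPLICATE KEY a cada INSERT
+CREATE TABLE mysql_upsert
+(
+    id UInt32,
+    counter UInt32
+)
+ENGINE = MySQL('mysql-host:3306', 'test', 'target_table', 'user', 'password', 0, 'UPDATE counter = counter + 1');
+```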
+ +## Ejemplo de uso {#usage-example} + +Tabla en MySQL: + +``` text +mysql> CREATE TABLE `test`.`test` ( + -> `int_id` INT NOT NULL AUTO_INCREMENT, + -> `int_nullable` INT NULL DEFAULT NULL, + -> `float` FLOAT NOT NULL, + -> `float_nullable` FLOAT NULL DEFAULT NULL, + -> PRIMARY KEY (`int_id`)); +Query OK, 0 rows affected (0,09 sec) + +mysql> insert into test (`int_id`, `float`) VALUES (1,2); +Query OK, 1 row affected (0,00 sec) + +mysql> select * from test; ++--------+--------------+-------+----------------+ +| int_id | int_nullable | float | float_nullable | ++--------+--------------+-------+----------------+ +| 1 | NULL | 2 | NULL | ++--------+--------------+-------+----------------+ +1 row in set (0,00 sec) +``` + +Tabla en ClickHouse, recuperando datos de la tabla MySQL creada anteriormente: + +``` sql +CREATE TABLE mysql_table +( + `float_nullable` Nullable(Float32), + `int_id` Int32 +) +ENGINE = MySQL('localhost:3306', 'test', 'test', 'bayonet', '123') +``` + +``` sql +SELECT * FROM mysql_table +``` + +``` text +┌─float_nullable─┬─int_id─┐ +│ ᴺᵁᴸᴸ │ 1 │ +└────────────────┴────────┘ +``` + +## Ver también {#see-also} + +- [El ‘mysql’ función de la tabla](../../query_language/table_functions/mysql.md) +- [Uso de MySQL como fuente de diccionario externo](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-mysql) + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/mysql/) diff --git a/docs/es/operations/table_engines/null.md b/docs/es/operations/table_engines/null.md new file mode 100644 index 00000000000..39bec0533f5 --- /dev/null +++ b/docs/es/operations/table_engines/null.md @@ -0,0 +1,7 @@ +# Nulo {#null} + +Al escribir en una tabla Null, los datos se ignoran. Al leer desde una tabla Null, la respuesta está vacía. + +Sin embargo, puede crear una vista materializada en una tabla Null. Entonces los datos escritos en la tabla terminarán en la vista. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/null/) diff --git a/docs/es/operations/table_engines/odbc.md b/docs/es/operations/table_engines/odbc.md new file mode 100644 index 00000000000..d5bf8fc8492 --- /dev/null +++ b/docs/es/operations/table_engines/odbc.md @@ -0,0 +1,125 @@ +# ODBC {#table-engine-odbc} + +Permite que ClickHouse se conecte a bases de datos externas a través de [ODBC](https://en.wikipedia.org/wiki/Open_Database_Connectivity). + +Para implementar con seguridad conexiones ODBC, ClickHouse usa un programa separado `clickhouse-odbc-bridge`. Si el controlador ODBC se carga directamente desde `clickhouse-server`, problemas de controlador pueden bloquear el servidor ClickHouse. ClickHouse se inicia automáticamente `clickhouse-odbc-bridge` cuando se requiere. El programa de puente ODBC se instala desde el mismo paquete que el `clickhouse-server`. + +Este motor soporta el [NULO](../../data_types/nullable.md) tipo de datos. + +## Creación de una tabla {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1], + name2 [type2], + ... +) +ENGINE = ODBC(connection_settings, external_database, external_table) +``` + +Vea una descripción detallada del [CREAR TABLA](../../query_language/create.md#create-table-query) consulta. + +La estructura de la tabla puede diferir de la estructura de la tabla de origen: + +- Los nombres de columna deben ser los mismos que en la tabla de origen, pero puede usar solo algunas de estas columnas y en cualquier orden. 
+- Los tipos de columna pueden diferir de los de la tabla de origen. Haga clic en Casa intenta [elenco](../../query_language/functions/type_conversion_functions.md#type_conversion_function-cast) valores a los tipos de datos ClickHouse. + +**Parámetros del motor** + +- `connection_settings` — Nombre de la sección con ajustes de conexión en el `odbc.ini` file. +- `external_database` — Nombre de una base de datos en un DBMS externo. +- `external_table` — Nombre de una tabla en el `external_database`. + +## Ejemplo de uso {#usage-example} + +**Recuperación de datos de la instalación local de MySQL a través de ODBC** + +Este ejemplo se comprueba para Ubuntu Linux 18.04 y el servidor MySQL 5.7. + +Asegúrese de que unixODBC y MySQL Connector están instalados. + +De forma predeterminada (si se instala desde paquetes), ClickHouse comienza como usuario `clickhouse`. Por lo tanto, debe crear y configurar este usuario en el servidor MySQL. + +``` bash +$ sudo mysql +``` + +``` sql +mysql> CREATE USER 'clickhouse'@'localhost' IDENTIFIED BY 'clickhouse'; +mysql> GRANT ALL PRIVILEGES ON *.* TO 'clickhouse'@'clickhouse' WITH GRANT OPTION; +``` + +A continuación, configure la conexión en `/etc/odbc.ini`. + +``` bash +$ cat /etc/odbc.ini +[mysqlconn] +DRIVER = /usr/local/lib/libmyodbc5w.so +SERVER = 127.0.0.1 +PORT = 3306 +DATABASE = test +USERNAME = clickhouse +PASSWORD = clickhouse +``` + +Puede verificar la conexión usando el `isql` utilidad desde la instalación de unixODBC. + +``` bash +$ isql -v mysqlconn ++---------------------------------------+ +| Connected! | +| | +... +``` + +Tabla en MySQL: + +``` text +mysql> CREATE TABLE `test`.`test` ( + -> `int_id` INT NOT NULL AUTO_INCREMENT, + -> `int_nullable` INT NULL DEFAULT NULL, + -> `float` FLOAT NOT NULL, + -> `float_nullable` FLOAT NULL DEFAULT NULL, + -> PRIMARY KEY (`int_id`)); +Query OK, 0 rows affected (0,09 sec) + +mysql> insert into test (`int_id`, `float`) VALUES (1,2); +Query OK, 1 row affected (0,00 sec) + +mysql> select * from test; ++--------+--------------+-------+----------------+ +| int_id | int_nullable | float | float_nullable | ++--------+--------------+-------+----------------+ +| 1 | NULL | 2 | NULL | ++--------+--------------+-------+----------------+ +1 row in set (0,00 sec) +``` + +Tabla en ClickHouse, recuperando datos de la tabla MySQL: + +``` sql +CREATE TABLE odbc_t +( + `int_id` Int32, + `float_nullable` Nullable(Float32) +) +ENGINE = ODBC('DSN=mysqlconn', 'test', 'test') +``` + +``` sql +SELECT * FROM odbc_t +``` + +``` text +┌─int_id─┬─float_nullable─┐ +│ 1 │ ᴺᵁᴸᴸ │ +└────────┴────────────────┘ +``` + +## Ver también {#see-also} + +- [Diccionarios externos ODBC](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-odbc) +- [Tabla ODBC función](../../query_language/table_functions/odbc.md) + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/odbc/) diff --git a/docs/es/operations/table_engines/replacingmergetree.md b/docs/es/operations/table_engines/replacingmergetree.md new file mode 100644 index 00000000000..7b59f08223f --- /dev/null +++ b/docs/es/operations/table_engines/replacingmergetree.md @@ -0,0 +1,62 @@ +# ReplacingMergeTree {#replacingmergetree} + +El motor difiere de [Método de codificación de datos:](mergetree.md#table_engines-mergetree) en que elimina las entradas duplicadas con el mismo valor de clave principal (o más exactamente, con el mismo [clave de clasificación](mergetree.md) de valor). 
+
+La desduplicación de datos solo se produce durante una fusión. Las fusiones ocurren en segundo plano en un momento desconocido, por lo que no puede planificar en función de ellas. Algunos de los datos pueden permanecer sin procesar. Aunque puede ejecutar una fusión no programada utilizando la consulta `OPTIMIZE`, no cuente con ello, porque la consulta `OPTIMIZE` leerá y escribirá una gran cantidad de datos.
+
+Así, `ReplacingMergeTree` es adecuado para eliminar datos duplicados en segundo plano con el fin de ahorrar espacio, pero no garantiza la ausencia de duplicados.
+
+## Creación de una tabla {#creating-a-table}
+
+``` sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
+(
+    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
+    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
+    ...
+) ENGINE = ReplacingMergeTree([ver])
+[PARTITION BY expr]
+[ORDER BY expr]
+[PRIMARY KEY expr]
+[SAMPLE BY expr]
+[SETTINGS name=value, ...]
+```
+
+Para obtener una descripción de los parámetros de la consulta, consulte la [descripción de la consulta](../../query_language/create.md).
+
+**Parámetros de ReplacingMergeTree**
+
+- `ver` — columna con la versión. Tipo `UInt*`, `Date` o `DateTime`. Parámetro opcional.
+
+    Al fusionar, `ReplacingMergeTree` deja solo una de todas las filas con la misma clave primaria:
+
+    - La última de la selección, si `ver` no está establecido.
+    - La que tenga la versión máxima, si `ver` está indicado.
+
+**Cláusulas de consulta**
+
+Al crear una tabla `ReplacingMergeTree` se requieren las mismas [cláusulas](mergetree.md) que al crear una tabla `MergeTree`.
+
+ +Método obsoleto para crear una tabla + +!!! attention "Atención" + No use este método en proyectos nuevos y, si es posible, cambie los proyectos antiguos al método descrito anteriormente. + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE [=] ReplacingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, [ver]) +``` + +Todos los parámetros excepto `ver` el mismo significado que en `MergeTree`. + +- `ver` - columna con la versión. Parámetro opcional. Para una descripción, vea el texto anterior. + +
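+
+Un ejemplo de uso mínimo (la tabla y los datos son hipotéticos) que muestra cómo la columna `ver` decide qué fila se conserva tras la fusión:
+
+``` sql
+CREATE TABLE example_dedup
+(
+    key UInt32,
+    value String,
+    version UInt32
+)
+ENGINE = ReplacingMergeTree(version)
+ORDER BY key;
+
+INSERT INTO example_dedup VALUES (1, 'first', 1);
+INSERT INTO example_dedup VALUES (1, 'second', 2);
+
+-- Fuerza una fusión no programada; tras ella queda solo la fila con la versión máxima
+OPTIMIZE TABLE example_dedup FINAL;
+
+SELECT * FROM example_dedup; -- devuelve (1, 'second', 2)
+```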
+ +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/replacingmergetree/) diff --git a/docs/es/operations/table_engines/replication.md b/docs/es/operations/table_engines/replication.md new file mode 100644 index 00000000000..116c5b0745b --- /dev/null +++ b/docs/es/operations/table_engines/replication.md @@ -0,0 +1,211 @@ +# Replicación de datos {#table-engines-replication} + +La replicación solo se admite para tablas de la familia MergeTree: + +- ReplicatedMergeTree +- ReplicatedSummingMergeTree +- ReplicatedReplacingMergeTree +- ReplicatedAggregatingMergeTree +- ReplicatedCollapsingMergeTree +- ReplicatedVersionedCollapsingMergetree +- ReplicatedGraphiteMergeTree + +La replicación funciona a nivel de una tabla individual, no de todo el servidor. Un servidor puede almacenar tablas replicadas y no replicadas al mismo tiempo. + +La replicación no depende de la fragmentación. Cada fragmento tiene su propia replicación independiente. + +Datos comprimidos para `INSERT` y `ALTER` se replica (para obtener más información, consulte la documentación para [ALTERAR](../../query_language/alter.md#query_language_queries_alter)). + +`CREATE`, `DROP`, `ATTACH`, `DETACH` y `RENAME` las consultas se ejecutan en un único servidor y no se replican: + +- El `CREATE TABLE` query crea una nueva tabla replicable en el servidor donde se ejecuta la consulta. Si esta tabla ya existe en otros servidores, agrega una nueva réplica. +- El `DROP TABLE` query elimina la réplica ubicada en el servidor donde se ejecuta la consulta. +- El `RENAME` query cambia el nombre de la tabla en una de las réplicas. En otras palabras, las tablas replicadas pueden tener diferentes nombres en diferentes réplicas. + +Uso de ClickHouse [Apache ZooKeeper](https://zookeeper.apache.org) para almacenar metainformación de réplicas. Utilice ZooKeeper versión 3.4.5 o posterior. + +Para utilizar la replicación, establezca los parámetros [Zookeeper](../server_settings/settings.md#server-settings_zookeeper) sección de configuración del servidor. + +!!! attention "Atención" + No descuides la configuración de seguridad. ClickHouse soporta el `digest` [Esquema de ACL](https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) del subsistema de seguridad ZooKeeper. + +Ejemplo de configuración de las direcciones del clúster ZooKeeper: + +``` xml + + + example1 + 2181 + + + example2 + 2181 + + + example3 + 2181 + + +``` + +Puede especificar cualquier clúster ZooKeeper existente y el sistema utilizará un directorio en él para sus propios datos (el directorio se especifica al crear una tabla replicable). + +Si ZooKeeper no está establecido en el archivo de configuración, no puede crear tablas replicadas y las tablas replicadas existentes serán de solo lectura. + +ZooKeeper no se utiliza en `SELECT` consultas porque la replicación no afecta al rendimiento de `SELECT` y las consultas se ejecutan tan rápido como lo hacen para las tablas no replicadas. Al consultar tablas replicadas distribuidas, el comportamiento de ClickHouse se controla mediante la configuración [max\_replica\_delay\_for\_distributed\_queries](../settings/settings.md#settings-max_replica_delay_for_distributed_queries) y [fallback\_to\_stale\_replicas\_for\_distributed\_queries](../settings/settings.md#settings-fallback_to_stale_replicas_for_distributed_queries). + +Para cada `INSERT` consulta, aproximadamente diez entradas se agregan a ZooKeeper a través de varias transacciones. 
(Para ser más precisos, esto es para cada bloque de datos insertado; una consulta INSERT contiene un bloque o un bloque por `max_insert_block_size = 1048576` filas.) Esto conduce a latencias ligeramente más largas para `INSERT` en comparación con las tablas no replicadas. Pero si sigue las recomendaciones para insertar datos en lotes de no más de uno `INSERT` por segundo, no crea ningún problema. Todo el clúster ClickHouse utilizado para coordinar un clúster ZooKeeper tiene un total de varios cientos `INSERTs` por segundo. El rendimiento en las inserciones de datos (el número de filas por segundo) es tan alto como para los datos no replicados. + +Para clústeres muy grandes, puede usar diferentes clústeres de ZooKeeper para diferentes fragmentos. Sin embargo, esto no ha demostrado ser necesario en el Yandex.Clúster Metrica (aproximadamente 300 servidores). + +La replicación es asíncrona y multi-master. `INSERT` consultas (así como `ALTER`) se puede enviar a cualquier servidor disponible. Los datos se insertan en el servidor donde se ejecuta la consulta y, a continuación, se copian a los demás servidores. Debido a que es asincrónico, los datos insertados recientemente aparecen en las otras réplicas con cierta latencia. Si parte de las réplicas no está disponible, los datos se escriben cuando estén disponibles. Si hay una réplica disponible, la latencia es la cantidad de tiempo que tarda en transferir el bloque de datos comprimidos a través de la red. + +De forma predeterminada, una consulta INSERT espera la confirmación de la escritura de los datos de una sola réplica. Si los datos fue correctamente escrito a sólo una réplica y el servidor con esta réplica deja de existir, los datos almacenados se perderán. Para habilitar la confirmación de las escrituras de datos de varias réplicas, utilice `insert_quorum` opción. + +Cada bloque de datos se escribe atómicamente. La consulta INSERT se divide en bloques hasta `max_insert_block_size = 1048576` películas. En otras palabras, si el `INSERT` consulta tiene menos de 1048576 filas, se hace atómicamente. + +Los bloques de datos se deduplican. Para varias escrituras del mismo bloque de datos (bloques de datos del mismo tamaño que contienen las mismas filas en el mismo orden), el bloque solo se escribe una vez. La razón de esto es en caso de errores de red cuando la aplicación cliente no sabe si los datos se escribieron en la base de datos, por lo que `INSERT` consulta simplemente se puede repetir. No importa a qué réplica se enviaron los INSERT con datos idénticos. `INSERTs` son idempotentes. Los parámetros de desduplicación son controlados por [merge\_tree](../server_settings/settings.md#server_settings-merge_tree) configuración del servidor. + +Durante la replicación, sólo los datos de origen que se van a insertar se transfieren a través de la red. La transformación de datos adicional (fusión) se coordina y se realiza en todas las réplicas de la misma manera. Esto minimiza el uso de la red, lo que significa que la replicación funciona bien cuando las réplicas residen en centros de datos diferentes. (Tenga en cuenta que la duplicación de datos en diferentes centros de datos es el objetivo principal de la replicación.) + +Puede tener cualquier número de réplicas de los mismos datos. El Yandex.Metrica utiliza doble replicación en producción. Cada servidor utiliza RAID-5 o RAID-6, y RAID-10 en algunos casos. Esta es una solución relativamente confiable y conveniente. 
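+
+Como boceto de la opción `insert_quorum` mencionada más arriba (el nombre de la tabla es hipotético), la confirmación por varias réplicas puede exigirse a nivel de sesión:
+
+``` sql
+-- Exigir que al menos 2 réplicas confirmen cada escritura antes de responder al cliente
+SET insert_quorum = 2;
+
+INSERT INTO replicated_table VALUES (1, 'a'), (2, 'b');
+```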
+ +El sistema supervisa la sincronicidad de los datos en las réplicas y puede recuperarse después de un fallo. La conmutación por error es automática (para pequeñas diferencias en los datos) o semiautomática (cuando los datos difieren demasiado, lo que puede indicar un error de configuración). + +## Creación de tablas replicadas {#creating-replicated-tables} + +El `Replicated` prefijo se agrega al nombre del motor de tabla. Por ejemplo:`ReplicatedMergeTree`. + +**Replicated\*MergeTree parámetros** + +- `zoo_path` — El camino a la mesa en ZooKeeper. +- `replica_name` — El nombre de la réplica en ZooKeeper. + +Ejemplo: + +``` sql +CREATE TABLE table_name +( + EventDate DateTime, + CounterID UInt32, + UserID UInt32 +) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/table_name', '{replica}') +PARTITION BY toYYYYMM(EventDate) +ORDER BY (CounterID, EventDate, intHash32(UserID)) +SAMPLE BY intHash32(UserID) +``` + +
+ +Ejemplo en sintaxis obsoleta + +``` sql +CREATE TABLE table_name +( + EventDate DateTime, + CounterID UInt32, + UserID UInt32 +) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/table_name', '{replica}', EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID), EventTime), 8192) +``` + +
+ +Como muestra el ejemplo, estos parámetros pueden contener sustituciones entre llaves. Los valores sustituidos se toman de la ‘macros’ sección del archivo de configuración. Ejemplo: + +``` xml + + 05 + 02 + example05-02-1.yandex.ru + +``` + +La ruta de acceso a la tabla en ZooKeeper debe ser única para cada tabla replicada. Las tablas en diferentes fragmentos deben tener rutas diferentes. +En este caso, la ruta consta de las siguientes partes: + +`/clickhouse/tables/` es el prefijo común. Recomendamos usar exactamente este. + +`{layer}-{shard}` es el identificador de fragmento. En este ejemplo consta de dos partes, ya que el Yandex.Metrica clúster utiliza sharding de dos niveles. Para la mayoría de las tareas, puede dejar solo la sustitución {shard}, que se expandirá al identificador de fragmento. + +`table_name` es el nombre del nodo de la tabla en ZooKeeper. Es una buena idea hacerlo igual que el nombre de la tabla. Se define explícitamente, porque a diferencia del nombre de la tabla, no cambia después de una consulta RENAME. +*SUGERENCIA*: podría agregar un nombre de base de datos delante de `table_name` También. Nivel de Cifrado WEP `db_name.table_name` + +El nombre de réplica identifica diferentes réplicas de la misma tabla. Puede usar el nombre del servidor para esto, como en el ejemplo. El nombre solo tiene que ser único dentro de cada fragmento. + +Puede definir los parámetros explícitamente en lugar de utilizar sustituciones. Esto podría ser conveniente para probar y para configurar clústeres pequeños. Sin embargo, no puede usar consultas DDL distribuidas (`ON CLUSTER` en este caso. + +Cuando se trabaja con clústeres grandes, se recomienda utilizar sustituciones porque reducen la probabilidad de error. + +Ejecute el `CREATE TABLE` consulta en cada réplica. Esta consulta crea una nueva tabla replicada o agrega una nueva réplica a una existente. + +Si agrega una nueva réplica después de que la tabla ya contenga algunos datos en otras réplicas, los datos se copiarán de las otras réplicas a la nueva después de ejecutar la consulta. En otras palabras, la nueva réplica se sincroniza con las demás. + +Para eliminar una réplica, ejecute `DROP TABLE`. Sin embargo, solo se elimina una réplica, la que reside en el servidor donde se ejecuta la consulta. + +## Recuperación después de fallos {#recovery-after-failures} + +Si ZooKeeper no está disponible cuando se inicia un servidor, las tablas replicadas cambian al modo de solo lectura. El sistema intenta conectarse periódicamente a ZooKeeper. + +Si ZooKeeper no está disponible durante un `INSERT`, o se produce un error al interactuar con ZooKeeper, se produce una excepción. + +Después de conectarse a ZooKeeper, el sistema comprueba si el conjunto de datos en el sistema de archivos local coincide con el conjunto de datos esperado (ZooKeeper almacena esta información). Si hay incoherencias menores, el sistema las resuelve sincronizando datos con las réplicas. + +Si el sistema detecta partes de datos rotas (con un tamaño incorrecto de archivos) o partes no reconocidas (partes escritas en el sistema de archivos pero no grabadas en ZooKeeper), las mueve al `detached` subdirectorio (no se eliminan). Las piezas que faltan se copian de las réplicas. + +Tenga en cuenta que ClickHouse no realiza ninguna acción destructiva, como eliminar automáticamente una gran cantidad de datos. + +Cuando el servidor se inicia (o establece una nueva sesión con ZooKeeper), solo verifica la cantidad y el tamaño de todos los archivos. 
Si los tamaños de los archivos coinciden pero los bytes se han cambiado en algún punto intermedio, esto no se detecta inmediatamente, sino solo cuando se intenta leer los datos `SELECT` consulta. La consulta produce una excepción sobre una suma de comprobación no coincidente o el tamaño de un bloque comprimido. En este caso, las partes de datos se agregan a la cola de verificación y se copian de las réplicas si es necesario. + +Si el conjunto local de datos difiere demasiado del esperado, se activa un mecanismo de seguridad. El servidor ingresa esto en el registro y se niega a iniciarse. La razón de esto es que este caso puede indicar un error de configuración, como si una réplica en un fragmento se configurara accidentalmente como una réplica en un fragmento diferente. Sin embargo, los umbrales para este mecanismo se establecen bastante bajos, y esta situación puede ocurrir durante la recuperación de falla normal. En este caso, los datos se restauran semiautomáticamente, mediante “pushing a button”. + +Para iniciar la recuperación, cree el nodo `/path_to_table/replica_name/flags/force_restore_data` es ZooKeeper con cualquier contenido, o ejecute el comando para restaurar todas las tablas replicadas: + +``` bash +sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data +``` + +A continuación, reinicie el servidor. Al iniciar, el servidor elimina estos indicadores e inicia la recuperación. + +## Recuperación después de la pérdida completa de datos {#recovery-after-complete-data-loss} + +Si todos los datos y metadatos desaparecieron de uno de los servidores, siga estos pasos para la recuperación: + +1. Instale ClickHouse en el servidor. Defina correctamente las sustituciones en el archivo de configuración que contiene el identificador de fragmento y las réplicas, si las usa. +2. Si tenía tablas no duplicadas que deben duplicarse manualmente en los servidores, copie sus datos desde una réplica (en el directorio `/var/lib/clickhouse/data/db_name/table_name/`). +3. Copiar definiciones de tablas ubicadas en `/var/lib/clickhouse/metadata/` de una réplica. Si un identificador de fragmento o réplica se define explícitamente en las definiciones de tabla, corríjalo para que corresponda a esta réplica. (Como alternativa, inicie el servidor y `ATTACH TABLE` consultas que deberían haber estado en el .sql archivos en `/var/lib/clickhouse/metadata/`.) +4. Para iniciar la recuperación, cree el nodo ZooKeeper `/path_to_table/replica_name/flags/force_restore_data` con cualquier contenido o ejecute el comando para restaurar todas las tablas replicadas: `sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data` + +Luego inicie el servidor (reinicie, si ya se está ejecutando). Los datos se descargarán de las réplicas. + +Una opción de recuperación alternativa es eliminar información sobre la réplica perdida de ZooKeeper (`/path_to_table/replica_name`), luego vuelva a crear la réplica como se describe en “[Creación de tablas replicadas](#creating-replicated-tables)”. + +No hay restricción en el ancho de banda de la red durante la recuperación. Tenga esto en cuenta si está restaurando muchas réplicas a la vez. + +## La conversión de MergeTree a ReplicatedMergeTree {#converting-from-mergetree-to-replicatedmergetree} + +Usamos el término `MergeTree` para referirse a todos los motores de mesa en el `MergeTree family`, lo mismo que para `ReplicatedMergeTree`. + +Si usted tenía un `MergeTree` tabla replicada manualmente, puede convertirla en una tabla replicada. 
Es posible que tenga que hacer esto si ya ha recopilado una gran cantidad de datos `MergeTree` y ahora desea habilitar la replicación. + +Si los datos difieren en varias réplicas, primero sincronícelos o elimínelos en todas las réplicas, excepto en una. + +Cambie el nombre de la tabla MergeTree existente y, a continuación, cree un `ReplicatedMergeTree` mesa con el antiguo nombre. +Mueva los datos de la tabla antigua a la `detached` subdirectorio dentro del directorio con los nuevos datos de la tabla (`/var/lib/clickhouse/data/db_name/table_name/`). +Luego ejecuta `ALTER TABLE ATTACH PARTITION` en una de las réplicas para agregar estas partes de datos al conjunto de trabajo. + +## La conversión de ReplicatedMergeTree a MergeTree {#converting-from-replicatedmergetree-to-mergetree} + +Cree una tabla MergeTree con un nombre diferente. Mueva todos los datos del directorio con el `ReplicatedMergeTree` datos de la tabla al directorio de datos de la nueva tabla. A continuación, elimine el `ReplicatedMergeTree` y reinicie el servidor. + +Si desea deshacerse de un `ReplicatedMergeTree` sin iniciar el servidor: + +- Eliminar el correspondiente `.sql` archivo en el directorio de metadatos (`/var/lib/clickhouse/metadata/`). +- Eliminar la ruta correspondiente en ZooKeeper (`/path_to_table/replica_name`). + +Después de esto, puede iniciar el servidor, crear un `MergeTree` tabla, mueva los datos a su directorio y, a continuación, reinicie el servidor. + +## Recuperación cuando los metadatos del clúster ZooKeeper se pierden o se dañan {#recovery-when-metadata-in-the-zookeeper-cluster-is-lost-or-damaged} + +Si los datos de ZooKeeper se perdieron o se dañaron, puede guardar los datos moviéndolos a una tabla no duplicada como se describió anteriormente. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/replication/) diff --git a/docs/es/operations/table_engines/set.md b/docs/es/operations/table_engines/set.md new file mode 100644 index 00000000000..c56ab0ab96a --- /dev/null +++ b/docs/es/operations/table_engines/set.md @@ -0,0 +1,12 @@ +# Establecer {#set} + +Un conjunto de datos que siempre está en la memoria RAM. Está diseñado para su uso en el lado derecho del operador IN (consulte la sección “IN operators”). + +Puede usar INSERT para insertar datos en la tabla. Se agregarán nuevos elementos al conjunto de datos, mientras que los duplicados se ignorarán. +Pero no puede realizar SELECT desde la tabla. La única forma de recuperar datos es usándolos en la mitad derecha del operador IN. + +Los datos siempre se encuentran en la memoria RAM. Para INSERT, los bloques de datos insertados también se escriben en el directorio de tablas en el disco. Al iniciar el servidor, estos datos se cargan en la RAM. En otras palabras, después de reiniciar, los datos permanecen en su lugar. + +Para un reinicio aproximado del servidor, el bloque de datos en el disco puede perderse o dañarse. En este último caso, es posible que deba eliminar manualmente el archivo con datos dañados. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/set/) diff --git a/docs/es/operations/table_engines/stripelog.md b/docs/es/operations/table_engines/stripelog.md new file mode 100644 index 00000000000..77625db3aca --- /dev/null +++ b/docs/es/operations/table_engines/stripelog.md @@ -0,0 +1,88 @@ +# StripeLog {#stripelog} + +Este motor pertenece a la familia de motores de registro. 
Consulte las propiedades comunes de los motores de registro y sus diferencias en [Familia del motor de registro](log_family.md) artículo. + +Utilice este motor en escenarios en los que necesite escribir muchas tablas con una pequeña cantidad de datos (menos de 1 millón de filas). + +## Creación de una tabla {#table-engines-stripelog-creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + column1_name [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + column2_name [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE = StripeLog +``` + +Vea la descripción detallada del [CREAR TABLA](../../query_language/create.md#create-table-query) consulta. + +## Escribir los datos {#table-engines-stripelog-writing-the-data} + +El `StripeLog` el motor almacena todas las columnas en un archivo. Para cada `INSERT` ClickHouse agrega el bloque de datos al final de un archivo de tabla, escribiendo columnas una por una. + +Para cada tabla, ClickHouse escribe los archivos: + +- `data.bin` — Archivo de datos. +- `index.mrk` — Archivo con marcas. Las marcas contienen compensaciones para cada columna de cada bloque de datos insertado. + +El `StripeLog` el motor no soporta el `ALTER UPDATE` y `ALTER DELETE` operación. + +## Lectura de los datos {#table-engines-stripelog-reading-the-data} + +El archivo con marcas permite ClickHouse paralelizar la lectura de datos. Esto significa que un `SELECT` consulta devuelve filas en un orden impredecible. Descripción `ORDER BY` cláusula para ordenar filas. + +## Ejemplo de uso {#table-engines-stripelog-example-of-use} + +Creación de una tabla: + +``` sql +CREATE TABLE stripe_log_table +( + timestamp DateTime, + message_type String, + message String +) +ENGINE = StripeLog +``` + +Inserte datos: + +``` sql +INSERT INTO stripe_log_table VALUES (now(),'REGULAR','The first regular message') +INSERT INTO stripe_log_table VALUES (now(),'REGULAR','The second regular message'),(now(),'WARNING','The first warning message') +``` + +Se utilizaron dos `INSERT` consultas para crear dos bloques de datos dentro del `data.bin` file. + +ClickHouse usa múltiples subprocesos al seleccionar datos. Cada subproceso lee un bloque de datos separado y devuelve las filas resultantes de forma independiente a medida que termina. Como resultado, el orden de los bloques de filas en la salida no coincide con el orden de los mismos bloques en la entrada en la mayoría de los casos. 
Por ejemplo: + +``` sql +SELECT * FROM stripe_log_table +``` + +``` text +┌───────────timestamp─┬─message_type─┬─message────────────────────┐ +│ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ +│ 2019-01-18 14:34:53 │ WARNING │ The first warning message │ +└─────────────────────┴──────────────┴────────────────────────────┘ +┌───────────timestamp─┬─message_type─┬─message───────────────────┐ +│ 2019-01-18 14:23:43 │ REGULAR │ The first regular message │ +└─────────────────────┴──────────────┴───────────────────────────┘ +``` + +Ordenación de los resultados (orden ascendente por defecto): + +``` sql +SELECT * FROM stripe_log_table ORDER BY timestamp +``` + +``` text +┌───────────timestamp─┬─message_type─┬─message────────────────────┐ +│ 2019-01-18 14:23:43 │ REGULAR │ The first regular message │ +│ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ +│ 2019-01-18 14:34:53 │ WARNING │ The first warning message │ +└─────────────────────┴──────────────┴────────────────────────────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/stripelog/) diff --git a/docs/es/operations/table_engines/summingmergetree.md b/docs/es/operations/table_engines/summingmergetree.md new file mode 100644 index 00000000000..2b2fedfe602 --- /dev/null +++ b/docs/es/operations/table_engines/summingmergetree.md @@ -0,0 +1,134 @@ +# SummingMergeTree {#summingmergetree} + +El motor hereda de [Método de codificación de datos:](mergetree.md#table_engines-mergetree). La diferencia es que al fusionar partes de datos para `SummingMergeTree` ClickHouse reemplaza todas las filas con la misma clave primaria (o más exactamente, con la misma [clave de clasificación](mergetree.md)) con una fila que contiene valores resumidos para las columnas con el tipo de datos numérico. Si la clave de ordenación está compuesta de manera que un solo valor de clave corresponde a un gran número de filas, esto reduce significativamente el volumen de almacenamiento y acelera la selección de datos. + +Recomendamos usar el motor junto con `MergeTree`. Almacenar datos completos en `MergeTree` mesa, y el uso `SummingMergeTree` para el almacenamiento de datos agregados, por ejemplo, al preparar informes. Tal enfoque evitará que pierda datos valiosos debido a una clave primaria compuesta incorrectamente. + +## Creación de una tabla {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE = SummingMergeTree([columns]) +[PARTITION BY expr] +[ORDER BY expr] +[SAMPLE BY expr] +[SETTINGS name=value, ...] +``` + +Para obtener una descripción de los parámetros de solicitud, consulte [descripción de la solicitud](../../query_language/create.md). + +**Parámetros de SummingMergeTree** + +- `columns` - una tupla con los nombres de las columnas donde se resumirán los valores. Parámetro opcional. + Las columnas deben ser de tipo numérico y no deben estar en la clave principal. + + Si `columns` no especificado, ClickHouse resume los valores de todas las columnas con un tipo de datos numérico que no están en la clave principal. + +**Cláusulas de consulta** + +Al crear un `SummingMergeTree` mesa de la misma [clausula](mergetree.md) se requieren, como al crear un `MergeTree` tabla. + +
+ +Método obsoleto para crear una tabla + +!!! attention "Atención" + No use este método en proyectos nuevos y, si es posible, cambie los proyectos antiguos al método descrito anteriormente. + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE [=] SummingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, [columns]) +``` + +Todos los parámetros excepto `columns` el mismo significado que en `MergeTree`. + +- `columns` — tupla con nombres de valores de columnas de los cuales se resumirán. Parámetro opcional. Para una descripción, vea el texto anterior. + +
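+
+Volviendo a la sintaxis actual, el siguiente esquema mínimo (con nombres de tabla y columnas hipotéticos, solo a modo de ilustración) muestra cómo indicar explícitamente la tupla `columns`: únicamente `hits` y `bytes` se suman al fusionar, mientras que `date` y `site_id` forman la clave de ordenación.
+
+``` sql
+CREATE TABLE daily_totals
+(
+    date Date,
+    site_id UInt32,
+    hits UInt64,
+    bytes UInt64
+)
+ENGINE = SummingMergeTree((hits, bytes))
+PARTITION BY toYYYYMM(date)
+ORDER BY (date, site_id)
+```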
+ +## Ejemplo de uso {#usage-example} + +Considere la siguiente tabla: + +``` sql +CREATE TABLE summtt +( + key UInt32, + value UInt32 +) +ENGINE = SummingMergeTree() +ORDER BY key +``` + +Inserte datos: + +``` sql +INSERT INTO summtt Values(1,1),(1,2),(2,1) +``` + +ClickHouse puede sumar todas las filas no completamente ([ver abajo](#data-processing)), entonces usamos una función agregada `sum` y `GROUP BY` cláusula en la consulta. + +``` sql +SELECT key, sum(value) FROM summtt GROUP BY key +``` + +``` text +┌─key─┬─sum(value)─┐ +│ 2 │ 1 │ +│ 1 │ 3 │ +└─────┴────────────┘ +``` + +## Procesamiento de datos {#data-processing} + +Cuando los datos se insertan en una tabla, se guardan tal cual. Clickhouse fusiona las partes insertadas de datos periódicamente y esto es cuando las filas con la misma clave principal se suman y se reemplazan con una para cada parte resultante de los datos. + +ClickHouse puede fusionar las partes de datos para que las diferentes partes resultantes del cat de datos consistan en filas con la misma clave principal, es decir, la suma estará incompleta. Pues (`SELECT`) una función agregada [resumir()](../../query_language/agg_functions/reference.md#agg_function-sum) y `GROUP BY` cláusula se debe utilizar en una consulta como se describe en el ejemplo anterior. + +### Reglas comunes para la suma {#common-rules-for-summation} + +Se resumen los valores de las columnas con el tipo de datos numérico. El conjunto de columnas está definido por el parámetro `columns`. + +Si los valores eran 0 en todas las columnas para la suma, se elimina la fila. + +Si la columna no está en la clave principal y no se resume, se selecciona un valor arbitrario entre los existentes. + +Los valores no se resumen para las columnas de la clave principal. + +### La suma en las columnas AggregateFunction {#the-summation-in-the-aggregatefunction-columns} + +Para columnas de [Tipo AggregateFunction](../../data_types/nested_data_structures/aggregatefunction.md) ClickHouse se comporta como [AgregaciónMergeTree](aggregatingmergetree.md) agregación del motor según la función. + +### Estructuras anidadas {#nested-structures} + +La tabla puede tener estructuras de datos anidadas que se procesan de una manera especial. + +Si el nombre de una tabla anidada termina con `Map` y contiene al menos dos columnas que cumplen los siguientes criterios: + +- la primera columna es numérica `(*Int*, Date, DateTime)` O una cadena `(String, FixedString)`, vamos a llamarlo `key`, +- las otras columnas son aritméticas `(*Int*, Float32/64)`, vamos a llamarlo `(values...)`, + +entonces esta tabla anidada se interpreta como una asignación de `key => (values...)`, y al fusionar sus filas, los elementos de dos conjuntos de datos se fusionan por `key` con una suma de los correspondientes `(values...)`. + +Ejemplos: + +``` text +[(1, 100)] + [(2, 150)] -> [(1, 100), (2, 150)] +[(1, 100)] + [(1, 150)] -> [(1, 250)] +[(1, 100)] + [(1, 150), (2, 150)] -> [(1, 250), (2, 150)] +[(1, 100), (2, 150)] + [(1, -100)] -> [(2, 150)] +``` + +Al solicitar datos, utilice el [SumMap (clave, valor)](../../query_language/agg_functions/reference.md) función para la agregación de `Map`. + +Para la estructura de datos anidados, no necesita especificar sus columnas en la tupla de columnas para la suma. 
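+
+Como esquema orientativo (los nombres de la tabla y de las columnas son hipotéticos), una estructura anidada que cumple estos criterios y su consulta con `sumMap` podrían tener este aspecto:
+
+``` sql
+CREATE TABLE nested_sums
+(
+    key UInt32,
+    StatsMap Nested(
+        metric_id UInt16,
+        value UInt64
+    )
+)
+ENGINE = SummingMergeTree()
+ORDER BY key;
+
+-- Al solicitar los datos, los mapas se agregan por clave
+SELECT key, sumMap(StatsMap.metric_id, StatsMap.value)
+FROM nested_sums
+GROUP BY key;
+```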
+ +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/summingmergetree/) diff --git a/docs/es/operations/table_engines/tinylog.md b/docs/es/operations/table_engines/tinylog.md new file mode 100644 index 00000000000..a563d7a523e --- /dev/null +++ b/docs/es/operations/table_engines/tinylog.md @@ -0,0 +1,9 @@ +# TinyLog {#tinylog} + +El motor pertenece a la familia de motores de registro. Ver [Familia del motor de registro](log_family.md) para las propiedades comunes de los motores de registro y sus diferencias. + +Este motor de tablas se usa normalmente con el método write-once: escribir datos una vez, luego leerlos tantas veces como sea necesario. Por ejemplo, puede usar `TinyLog`-type tablas para datos intermedios que se procesan en pequeños lotes. Tenga en cuenta que el almacenamiento de datos en un gran número de tablas pequeñas es ineficiente. + +Las consultas se ejecutan en una sola secuencia. En otras palabras, este motor está diseñado para tablas relativamente pequeñas (hasta aproximadamente 1,000,000 filas). Tiene sentido utilizar este motor de tablas si tiene muchas tablas pequeñas, ya que es más simple que el [Registro](log.md) motor (menos archivos necesitan ser abiertos. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/tinylog/) diff --git a/docs/es/operations/table_engines/url.md b/docs/es/operations/table_engines/url.md new file mode 100644 index 00000000000..6202bb869df --- /dev/null +++ b/docs/es/operations/table_engines/url.md @@ -0,0 +1,75 @@ +# Nombre de la red inalámbrica (SSID):) {#table-engines-url} + +Administra datos en un servidor HTTP/HTTPS remoto. Este motor es similar +Angeles [File](file.md) motor. + +## Uso del motor en el servidor ClickHouse {#using-the-engine-in-the-clickhouse-server} + +El `format` debe ser uno que ClickHouse pueda usar en +`SELECT` consultas y, si es necesario, en `INSERTs`. Para obtener la lista completa de formatos admitidos, consulte +[Formato](../../interfaces/formats.md#formats). + +El `URL` debe ajustarse a la estructura de un localizador uniforme de recursos. La dirección URL especificada debe apuntar a un servidor +que utiliza HTTP o HTTPS. Esto no requiere ningún +encabezados adicionales para obtener una respuesta del servidor. + +`INSERT` y `SELECT` las consultas se transforman en `POST` y `GET` Peticiones, +Respectivamente. Para el procesamiento `POST` solicitudes, el servidor remoto debe admitir +[Codificación de transferencia fragmentada](https://en.wikipedia.org/wiki/Chunked_transfer_encoding). + +Puede limitar el número máximo de saltos de redirección HTTP GET utilizando el [Nombre de la red inalámbrica (SSID):](../settings/settings.md#setting-max_http_get_redirects) configuración. 
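+
+Por ejemplo, un ajuste a nivel de sesión podría ser el siguiente (el valor concreto es solo ilustrativo):
+
+``` sql
+SET max_http_get_redirects = 10
+```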
+ +**Ejemplo:** + +**1.** Crear un `url_engine_table` tabla en el servidor : + +``` sql +CREATE TABLE url_engine_table (word String, value UInt64) +ENGINE=URL('http://127.0.0.1:12345/', CSV) +``` + +**2.** Cree un servidor HTTP básico utilizando las herramientas estándar de Python 3 y +comenzarlo: + +``` python3 +from http.server import BaseHTTPRequestHandler, HTTPServer + +class CSVHTTPServer(BaseHTTPRequestHandler): + def do_GET(self): + self.send_response(200) + self.send_header('Content-type', 'text/csv') + self.end_headers() + + self.wfile.write(bytes('Hello,1\nWorld,2\n', "utf-8")) + +if __name__ == "__main__": + server_address = ('127.0.0.1', 12345) + HTTPServer(server_address, CSVHTTPServer).serve_forever() +``` + +``` bash +$ python3 server.py +``` + +**3.** Solicitar datos: + +``` sql +SELECT * FROM url_engine_table +``` + +``` text +┌─word──┬─value─┐ +│ Hello │ 1 │ +│ World │ 2 │ +└───────┴───────┘ +``` + +## Detalles de la implementación {#details-of-implementation} + +- Las lecturas y escrituras pueden ser paralelas +- No soportado: + - `ALTER` y `SELECT...SAMPLE` operación. + - Índices. + - Replicación. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/url/) diff --git a/docs/es/operations/table_engines/versionedcollapsingmergetree.md b/docs/es/operations/table_engines/versionedcollapsingmergetree.md new file mode 100644 index 00000000000..76451b1db47 --- /dev/null +++ b/docs/es/operations/table_engines/versionedcollapsingmergetree.md @@ -0,0 +1,231 @@ +# VersionedCollapsingMergeTree {#versionedcollapsingmergetree} + +Este motor: + +- Permite la escritura rápida de estados de objetos que cambian continuamente. +- Elimina los estados de objetos antiguos en segundo plano. Esto reduce significativamente el volumen de almacenamiento. + +Vea la sección [Derrumbar](#table_engines_versionedcollapsingmergetree) para más detalles. + +El motor hereda de [Método de codificación de datos:](mergetree.md#table_engines-mergetree) y agrega la lógica para colapsar filas al algoritmo para fusionar partes de datos. `VersionedCollapsingMergeTree` tiene el mismo propósito que [ColapsarMergeTree](collapsingmergetree.md) pero usa un algoritmo de colapso diferente que permite insertar los datos en cualquier orden con múltiples hilos. En particular, el `Version` columna ayuda a contraer las filas correctamente, incluso si se insertan en el orden incorrecto. En contraste, `CollapsingMergeTree` sólo permite la inserción estrictamente consecutiva. + +## Creación de una tabla {#creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE = VersionedCollapsingMergeTree(sign, version) +[PARTITION BY expr] +[ORDER BY expr] +[SAMPLE BY expr] +[SETTINGS name=value, ...] +``` + +Para obtener una descripción de los parámetros de consulta, consulte [descripción de la consulta](../../query_language/create.md). + +**Parámetros del motor** + +``` sql +VersionedCollapsingMergeTree(sign, version) +``` + +- `sign` — Nombre de la columna con el tipo de fila: `1` es una “state” fila, `-1` es una “cancel” Fila. + + El tipo de datos de columna debe ser `Int8`. + +- `version` — Nombre de la columna con la versión del estado del objeto. + + El tipo de datos de columna debe ser `UInt*`. 
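+
+A modo de ilustración, una tabla de este tipo (con nombres de columnas hipotéticos) podría declararse así; más abajo, en el ejemplo de uso, se muestra una variante sin `PARTITION BY`:
+
+``` sql
+CREATE TABLE user_activity
+(
+    EventDate Date,
+    UserID UInt64,
+    PageViews UInt8,
+    Duration UInt8,
+    Sign Int8,
+    Version UInt8
+)
+ENGINE = VersionedCollapsingMergeTree(Sign, Version)
+PARTITION BY toYYYYMM(EventDate)
+ORDER BY UserID
+```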
+ +**Cláusulas de consulta** + +Al crear un `VersionedCollapsingMergeTree` mesa, la misma [clausula](mergetree.md) se requieren como al crear un `MergeTree` tabla. + +
+ +Método obsoleto para crear una tabla + +!!! attention "Atención" + No utilice este método en nuevos proyectos. Si es posible, cambie los proyectos antiguos al método descrito anteriormente. + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE [=] VersionedCollapsingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, sign, version) +``` + +Todos los parámetros excepto `sign` y `version` el mismo significado que en `MergeTree`. + +- `sign` — Nombre de la columna con el tipo de fila: `1` es una “state” fila, `-1` es una “cancel” Fila. + + Tipo de datos de columna — `Int8`. + +- `version` — Nombre de la columna con la versión del estado del objeto. + + El tipo de datos de columna debe ser `UInt*`. + +
+ +## Derrumbar {#table-engines-versionedcollapsingmergetree} + +### Datos {#data} + +Considere una situación en la que necesite guardar datos que cambien continuamente para algún objeto. Es razonable tener una fila para un objeto y actualizar la fila siempre que haya cambios. Sin embargo, la operación de actualización es costosa y lenta para un DBMS porque requiere volver a escribir los datos en el almacenamiento. La actualización no es aceptable si necesita escribir datos rápidamente, pero puede escribir los cambios en un objeto secuencialmente de la siguiente manera. + +Descripción `Sign` columna al escribir la fila. Si `Sign = 1` significa que la fila es un estado de un objeto (llamémoslo el “state” fila). Si `Sign = -1` indica la cancelación del estado de un objeto con los mismos atributos (llamémoslo el “cancel” fila). También use el `Version` columna, que debe identificar cada estado de un objeto con un número separado. + +Por ejemplo, queremos calcular cuántas páginas visitaron los usuarios en algún sitio y cuánto tiempo estuvieron allí. En algún momento escribimos la siguiente fila con el estado de la actividad del usuario: + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ +│ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | +└─────────────────────┴───────────┴──────────┴──────┴─────────┘ +``` + +En algún momento después registramos el cambio de actividad del usuario y lo escribimos con las siguientes dos filas. + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ +│ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | +│ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 | +└─────────────────────┴───────────┴──────────┴──────┴─────────┘ +``` + +La primera fila cancela el estado anterior del objeto (usuario). Debe copiar todos los campos del estado cancelado excepto `Sign`. + +La segunda fila contiene el estado actual. + +Debido a que solo necesitamos el último estado de actividad del usuario, las filas + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ +│ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | +│ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | +└─────────────────────┴───────────┴──────────┴──────┴─────────┘ +``` + +se puede eliminar, colapsando el estado no válido (antiguo) del objeto. `VersionedCollapsingMergeTree` hace esto mientras fusiona las partes de datos. + +Para averiguar por qué necesitamos dos filas para cada cambio, vea [Algoritmo](#table_engines-versionedcollapsingmergetree-algorithm). + +**Notas sobre el uso** + +1. El programa que escribe los datos debe recordar el estado de un objeto para cancelarlo. El “cancel” Cadena debe ser una copia de la “state” con lo opuesto `Sign`. Esto aumenta el tamaño inicial de almacenamiento, pero permite escribir los datos rápidamente. +2. Las matrices de largo crecimiento en columnas reducen la eficiencia del motor debido a la carga para escribir. Cuanto más sencillos sean los datos, mejor será la eficiencia. +3. `SELECT` Los resultados dependen en gran medida de la coherencia del historial de cambios de objetos. Sea preciso al preparar los datos para insertarlos. Puede obtener resultados impredecibles con datos incoherentes, como valores negativos para métricas no negativas, como la profundidad de la sesión. + +### Algoritmo {#table-engines-versionedcollapsingmergetree-algorithm} + +Cuando ClickHouse combina partes de datos, elimina cada par de filas que tienen la misma clave principal y versión y diferentes `Sign`. El orden de las filas no importa. 
+ +Cuando ClickHouse inserta datos, ordena filas por la clave principal. Si el `Version` la columna no está en la clave principal, ClickHouse la agrega a la clave principal implícitamente como el último campo y la usa para ordenar. + +## Selección de datos {#selecting-data} + +ClickHouse no garantiza que todas las filas con la misma clave principal estarán en la misma parte de datos resultante o incluso en el mismo servidor físico. Esto es cierto tanto para escribir los datos como para la posterior fusión de las partes de datos. Además, ClickHouse procesa `SELECT` consultas con múltiples subprocesos, y no puede predecir el orden de las filas en el resultado. Esto significa que la agregación es necesaria si hay una necesidad de obtener completamente “collapsed” datos de un `VersionedCollapsingMergeTree` tabla. + +Para finalizar el colapso, escriba una consulta con un `GROUP BY` cláusula y funciones agregadas que representan el signo. Por ejemplo, para calcular la cantidad, use `sum(Sign)` es lugar de `count()`. Para calcular la suma de algo, use `sum(Sign * x)` es lugar de `sum(x)` y agregar `HAVING sum(Sign) > 0`. + +Los agregados `count`, `sum` y `avg` se puede calcular de esta manera. El agregado `uniq` se puede calcular si un objeto tiene al menos un estado no colapsado. Los agregados `min` y `max` no se puede calcular porque `VersionedCollapsingMergeTree` no guarda el historial de valores de estados colapsados. + +Si necesita extraer los datos con “collapsing” pero sin agregación (por ejemplo, para verificar si hay filas presentes cuyos valores más nuevos coinciden con ciertas condiciones), puede usar el `FINAL` modificador para el `FROM` clausula. Este enfoque es ineficiente y no debe usarse con tablas grandes. + +## Ejemplo de uso {#example-of-use} + +Datos de ejemplo: + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ +│ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | +│ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | +│ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 | +└─────────────────────┴───────────┴──────────┴──────┴─────────┘ +``` + +Creación de la tabla: + +``` sql +CREATE TABLE UAct +( + UserID UInt64, + PageViews UInt8, + Duration UInt8, + Sign Int8, + Version UInt8 +) +ENGINE = VersionedCollapsingMergeTree(Sign, Version) +ORDER BY UserID +``` + +Insertar los datos: + +``` sql +INSERT INTO UAct VALUES (4324182021466249494, 5, 146, 1, 1) +``` + +``` sql +INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1, 1),(4324182021466249494, 6, 185, 1, 2) +``` + +Usamos dos `INSERT` consultas para crear dos partes de datos diferentes. Si insertamos los datos con una sola consulta, ClickHouse crea una parte de datos y nunca realizará ninguna fusión. + +Obtener los datos: + +``` sql +SELECT * FROM UAct +``` + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ +│ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 │ +└─────────────────────┴───────────┴──────────┴──────┴─────────┘ +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ +│ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 │ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 │ +└─────────────────────┴───────────┴──────────┴──────┴─────────┘ +``` + +¿Qué vemos aquí y dónde están las partes colapsadas? +Creamos dos partes de datos usando dos `INSERT` consulta. El `SELECT` la consulta se realizó en dos subprocesos, y el resultado es un orden aleatorio de filas. +No se produjo el colapso porque las partes de datos aún no se han fusionado. 
ClickHouse fusiona partes de datos en un punto desconocido en el tiempo que no podemos predecir. + +Es por eso que necesitamos agregación: + +``` sql +SELECT + UserID, + sum(PageViews * Sign) AS PageViews, + sum(Duration * Sign) AS Duration, + Version +FROM UAct +GROUP BY UserID, Version +HAVING sum(Sign) > 0 +``` + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Version─┐ +│ 4324182021466249494 │ 6 │ 185 │ 2 │ +└─────────────────────┴───────────┴──────────┴─────────┘ +``` + +Si no necesitamos agregación y queremos forzar el colapso, podemos usar el `FINAL` modificador para el `FROM` clausula. + +``` sql +SELECT * FROM UAct FINAL +``` + +``` text +┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ +│ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 │ +└─────────────────────┴───────────┴──────────┴──────┴─────────┘ +``` + +Esta es una forma muy ineficiente de seleccionar datos. No lo use para mesas grandes. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/versionedcollapsingmergetree/) diff --git a/docs/es/operations/table_engines/view.md b/docs/es/operations/table_engines/view.md new file mode 100644 index 00000000000..63ddfa50f45 --- /dev/null +++ b/docs/es/operations/table_engines/view.md @@ -0,0 +1,5 @@ +# Vista {#table-engines-view} + +Se utiliza para implementar vistas (para obtener más información, consulte `CREATE VIEW query`). No almacena datos, pero solo almacena los datos especificados `SELECT` consulta. Al leer desde una tabla, ejecuta esta consulta (y elimina todas las columnas innecesarias de la consulta). + +[Artículo Original](https://clickhouse.tech/docs/es/operations/table_engines/view/) diff --git a/docs/es/operations/tips.md b/docs/es/operations/tips.md new file mode 100644 index 00000000000..3913a6b1909 --- /dev/null +++ b/docs/es/operations/tips.md @@ -0,0 +1,244 @@ +# Recomendaciones de uso {#usage-recommendations} + +## CPU Scaling Governor {#cpu-scaling-governor} + +Utilice siempre el `performance` gobernador de escala. El `on-demand` regulador de escala funciona mucho peor con una demanda constante. + +``` bash +$ echo 'performance' | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor +``` + +## Limitaciones de la CPU {#cpu-limitations} + +Los procesadores pueden sobrecalentarse. Utilizar `dmesg` para ver si la velocidad de reloj de la CPU era limitada debido al sobrecalentamiento. +La restricción también se puede establecer externamente en el nivel del centro de datos. Usted puede utilizar `turbostat` para controlarlo bajo una carga. + +## RAM {#ram} + +Para pequeñas cantidades de datos (hasta ~200 GB comprimidos), es mejor usar tanta memoria como el volumen de datos. +Para grandes cantidades de datos y al procesar consultas interactivas (en línea), debe usar una cantidad razonable de RAM (128 GB o más) para que el subconjunto de datos en caliente quepa en la memoria caché de páginas. +Incluso para volúmenes de datos de ~ 50 TB por servidor, el uso de 128 GB de RAM mejora significativamente el rendimiento de las consultas en comparación con 64 GB. + +No deshabilite el sobrecompromiso. Valor `cat /proc/sys/vm/overcommit_memory` Ser debe 0 la 1. Ejecutar + +``` bash +$ echo 0 | sudo tee /proc/sys/vm/overcommit_memory +``` + +## Páginas enormes {#huge-pages} + +Siempre deshabilite las páginas enormes transparentes. Interfiere con los asignadores de memoria, lo que conduce a una degradación significativa del rendimiento. 
+ +``` bash +$ echo 'never' | sudo tee /sys/kernel/mm/transparent_hugepage/enabled +``` + +Utilizar `perf top` para ver el tiempo pasado en el kernel para la administración de memoria. +Las páginas enormes permanentes tampoco necesitan ser asignadas. + +## Subsistema de almacenamiento {#storage-subsystem} + +Si su presupuesto le permite usar SSD, utilizar SSD. +Si no, use HDD. Los discos duros SATA 7200 RPM servirán. + +Dar preferencia a una gran cantidad de servidores con discos duros locales sobre un número menor de servidores con estantes de discos conectados. +Pero para almacenar archivos con consultas raras, los estantes funcionarán. + +## RAID {#raid} + +Al usar HDD, puede combinar su RAID-10, RAID-5, RAID-6 o RAID-50. +Para Linux, el software RAID es mejor (con `mdadm`). No recomendamos usar LVM. +Al crear RAID-10, seleccione el `far` diseño. +Si su presupuesto lo permite, elija RAID-10. + +Si tiene más de 4 discos, utilice RAID-6 (preferido) o RAID-50, en lugar de RAID-5. +Cuando use RAID-5, RAID-6 o RAID-50, siempre aumente stripe\_cache\_size, ya que el valor predeterminado generalmente no es la mejor opción. + +``` bash +$ echo 4096 | sudo tee /sys/block/md2/md/stripe_cache_size +``` + +Calcule el número exacto a partir del número de dispositivos y el tamaño del bloque, utilizando la fórmula: `2 * num_devices * chunk_size_in_bytes / 4096`. + +Un tamaño de bloque de 1024 KB es suficiente para todas las configuraciones RAID. +Nunca ajuste el tamaño del bloque demasiado pequeño o demasiado grande. + +Puede usar RAID-0 en SSD. +Independientemente del uso de RAID, utilice siempre la replicación para la seguridad de los datos. + +Habilite NCQ con una cola larga. Para HDD, elija el programador CFQ, y para SSD, elija noop. No reduzca el ‘readahead’ configuración. +Para HDD, habilite la memoria caché de escritura. + +## Sistema de archivos {#file-system} + +Ext4 es la opción más confiable. Establecer las opciones de montaje `noatime, nobarrier`. +XFS también es adecuado, pero no ha sido probado tan a fondo con ClickHouse. +La mayoría de los otros sistemas de archivos también deberían funcionar bien. Los sistemas de archivos con asignación retrasada funcionan mejor. + +## Núcleo de Linux {#linux-kernel} + +No utilice un kernel de Linux obsoleto. + +## Rojo {#network} + +Si está utilizando IPv6, aumente el tamaño de la caché de ruta. +El kernel de Linux anterior a 3.2 tenía una multitud de problemas con la implementación de IPv6. + +Utilice al menos una red de 10 GB, si es posible. 1 Gb también funcionará, pero será mucho peor para parchear réplicas con decenas de terabytes de datos, o para procesar consultas distribuidas con una gran cantidad de datos intermedios. + +## ZooKeeper {#zookeeper} + +Probablemente ya esté utilizando ZooKeeper para otros fines. Puede utilizar la misma instalación de ZooKeeper, si aún no está sobrecargado. + +Lo mejor es utilizar una nueva versión de ZooKeeper – 3.4.9 o posterior. La versión en distribuciones estables de Linux puede estar desactualizada. + +Nunca debe usar scripts escritos manualmente para transferir datos entre diferentes clústeres de ZooKeeper, ya que el resultado será incorrecto para los nodos secuenciales. Nunca utilice el “zkcopy” utilidad por la misma razón: https://github.com/ksprojects/zkcopy/issues/15 + +Si desea dividir un clúster ZooKeeper existente en dos, la forma correcta es aumentar el número de sus réplicas y, a continuación, volver a configurarlo como dos clústeres independientes. 
+ +No ejecute ZooKeeper en los mismos servidores que ClickHouse. Porque ZooKeeper es muy sensible a la latencia y ClickHouse puede utilizar todos los recursos del sistema disponibles. + +Con la configuración predeterminada, ZooKeeper es una bomba de tiempo: + +> El servidor ZooKeeper no eliminará archivos de instantáneas y registros antiguos cuando utilice la configuración predeterminada (consulte autopurge), y esto es responsabilidad del operador. + +Esta bomba debe ser desactivada. + +La configuración ZooKeeper (3.5.1) a continuación se usa en Yandex.Entorno de producción de Métrica al 20 de mayo de 2017: + +zoológico.Cómo: + +``` bash +# http://hadoop.apache.org/zookeeper/docs/current/zookeeperAdmin.html + +# The number of milliseconds of each tick +tickTime=2000 +# The number of ticks that the initial +# synchronization phase can take +initLimit=30000 +# The number of ticks that can pass between +# sending a request and getting an acknowledgement +syncLimit=10 + +maxClientCnxns=2000 + +maxSessionTimeout=60000000 +# the directory where the snapshot is stored. +dataDir=/opt/zookeeper/{{ '{{' }} cluster['name'] {{ '}}' }}/data +# Place the dataLogDir to a separate physical disc for better performance +dataLogDir=/opt/zookeeper/{{ '{{' }} cluster['name'] {{ '}}' }}/logs + +autopurge.snapRetainCount=10 +autopurge.purgeInterval=1 + + +# To avoid seeks ZooKeeper allocates space in the transaction log file in +# blocks of preAllocSize kilobytes. The default block size is 64M. One reason +# for changing the size of the blocks is to reduce the block size if snapshots +# are taken more often. (Also, see snapCount). +preAllocSize=131072 + +# Clients can submit requests faster than ZooKeeper can process them, +# especially if there are a lot of clients. To prevent ZooKeeper from running +# out of memory due to queued requests, ZooKeeper will throttle clients so that +# there is no more than globalOutstandingLimit outstanding requests in the +# system. The default limit is 1,000.ZooKeeper logs transactions to a +# transaction log. After snapCount transactions are written to a log file a +# snapshot is started and a new transaction log file is started. The default +# snapCount is 10,000. +snapCount=3000000 + +# If this option is defined, requests will be will logged to a trace file named +# traceFile.year.month.day. +#traceFile= + +# Leader accepts client connections. Default value is "yes". The leader machine +# coordinates updates. For higher update throughput at thes slight expense of +# read throughput the leader can be configured to not accept clients and focus +# on coordination. +leaderServes=yes + +standaloneEnabled=false +dynamicConfigFile=/etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/zoo.cfg.dynamic +``` + +Versión Java: + +``` text +Java(TM) SE Runtime Environment (build 1.8.0_25-b17) +Java HotSpot(TM) 64-Bit Server VM (build 25.25-b02, mixed mode) +``` + +Parámetros de JVM: + +``` bash +NAME=zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} +ZOOCFGDIR=/etc/$NAME/conf + +# TODO this is really ugly +# How to find out, which jars are needed? 
+# seems, that log4j requires the log4j.properties file to be in the classpath +CLASSPATH="$ZOOCFGDIR:/usr/build/classes:/usr/build/lib/*.jar:/usr/share/zookeeper/zookeeper-3.5.1-metrika.jar:/usr/share/zookeeper/slf4j-log4j12-1.7.5.jar:/usr/share/zookeeper/slf4j-api-1.7.5.jar:/usr/share/zookeeper/servlet-api-2.5-20081211.jar:/usr/share/zookeeper/netty-3.7.0.Final.jar:/usr/share/zookeeper/log4j-1.2.16.jar:/usr/share/zookeeper/jline-2.11.jar:/usr/share/zookeeper/jetty-util-6.1.26.jar:/usr/share/zookeeper/jetty-6.1.26.jar:/usr/share/zookeeper/javacc.jar:/usr/share/zookeeper/jackson-mapper-asl-1.9.11.jar:/usr/share/zookeeper/jackson-core-asl-1.9.11.jar:/usr/share/zookeeper/commons-cli-1.2.jar:/usr/src/java/lib/*.jar:/usr/etc/zookeeper" + +ZOOCFG="$ZOOCFGDIR/zoo.cfg" +ZOO_LOG_DIR=/var/log/$NAME +USER=zookeeper +GROUP=zookeeper +PIDDIR=/var/run/$NAME +PIDFILE=$PIDDIR/$NAME.pid +SCRIPTNAME=/etc/init.d/$NAME +JAVA=/usr/bin/java +ZOOMAIN="org.apache.zookeeper.server.quorum.QuorumPeerMain" +ZOO_LOG4J_PROP="INFO,ROLLINGFILE" +JMXLOCALONLY=false +JAVA_OPTS="-Xms{{ '{{' }} cluster.get('xms','128M') {{ '}}' }} \ + -Xmx{{ '{{' }} cluster.get('xmx','1G') {{ '}}' }} \ + -Xloggc:/var/log/$NAME/zookeeper-gc.log \ + -XX:+UseGCLogFileRotation \ + -XX:NumberOfGCLogFiles=16 \ + -XX:GCLogFileSize=16M \ + -verbose:gc \ + -XX:+PrintGCTimeStamps \ + -XX:+PrintGCDateStamps \ + -XX:+PrintGCDetails + -XX:+PrintTenuringDistribution \ + -XX:+PrintGCApplicationStoppedTime \ + -XX:+PrintGCApplicationConcurrentTime \ + -XX:+PrintSafepointStatistics \ + -XX:+UseParNewGC \ + -XX:+UseConcMarkSweepGC \ +-XX:+CMSParallelRemarkEnabled" +``` + +Sal init: + +``` text +description "zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} centralized coordination service" + +start on runlevel [2345] +stop on runlevel [!2345] + +respawn + +limit nofile 8192 8192 + +pre-start script + [ -r "/etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/environment" ] || exit 0 + . /etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/environment + [ -d $ZOO_LOG_DIR ] || mkdir -p $ZOO_LOG_DIR + chown $USER:$GROUP $ZOO_LOG_DIR +end script + +script + . /etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/environment + [ -r /etc/default/zookeeper ] && . /etc/default/zookeeper + if [ -z "$JMXDISABLE" ]; then + JAVA_OPTS="$JAVA_OPTS -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.local.only=$JMXLOCALONLY" + fi + exec start-stop-daemon --start -c $USER --exec $JAVA --name zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} \ + -- -cp $CLASSPATH $JAVA_OPTS -Dzookeeper.log.dir=${ZOO_LOG_DIR} \ + -Dzookeeper.root.logger=${ZOO_LOG4J_PROP} $ZOOMAIN $ZOOCFG +end script +``` + +[Artículo Original](https://clickhouse.tech/docs/es/operations/tips/) diff --git a/docs/es/operations/troubleshooting.md b/docs/es/operations/troubleshooting.md new file mode 100644 index 00000000000..4996ee2bebc --- /dev/null +++ b/docs/es/operations/troubleshooting.md @@ -0,0 +1,139 @@ +# Solución de problemas {#troubleshooting} + +- [Instalación](#troubleshooting-installation-errors) +- [Conexión al servidor](#troubleshooting-accepts-no-connections) +- [Procesamiento de consultas](#troubleshooting-does-not-process-queries) +- [Eficiencia del procesamiento de consultas](#troubleshooting-too-slow) + +## Instalación {#troubleshooting-installation-errors} + +### No puede obtener paquetes Deb del repositorio ClickHouse con apt-get {#you-cannot-get-deb-packages-from-clickhouse-repository-with-apt-get} + +- Compruebe la configuración del firewall. 
+- Si no puede acceder al repositorio por cualquier motivo, descargue los paquetes como se describe en el [Primeros pasos](../getting_started/index.md) artículo e instálelos manualmente usando el `sudo dpkg -i ` comando. También necesitará el `tzdata` paquete. + +## Conexión al servidor {#troubleshooting-accepts-no-connections} + +Posibles problemas: + +- El servidor no se está ejecutando. +- Parámetros de configuración inesperados o incorrectos. + +### El servidor no se está ejecutando {#server-is-not-running} + +**Compruebe si el servidor está ejecutado** + +Comando: + +``` bash +$ sudo service clickhouse-server status +``` + +Si el servidor no se está ejecutando, inícielo con el comando: + +``` bash +$ sudo service clickhouse-server start +``` + +**Comprobar registros** + +El registro principal de `clickhouse-server` está en `/var/log/clickhouse-server/clickhouse-server.log` predeterminada. + +Si el servidor se inició correctamente, debería ver las cadenas: + +- ` Application: starting up.` — Servidor iniciado. +- ` Application: Ready for connections.` — El servidor se está ejecutando y listo para las conexiones. + +Si `clickhouse-server` error de inicio con un error de configuración, debería ver el `` cadena con una descripción de error. Por ejemplo: + +``` text +2019.01.11 15:23:25.549505 [ 45 ] {} ExternalDictionaries: Failed reloading 'event2id' external dictionary: Poco::Exception. Code: 1000, e.code() = 111, e.displayText() = Connection refused, e.what() = Connection refused +``` + +Si no ve un error al final del archivo, revise todo el archivo a partir de la cadena: + +``` text + Application: starting up. +``` + +Si intenta iniciar una segunda instancia de `clickhouse-server` en el servidor, verá el siguiente registro: + +``` text +2019.01.11 15:25:11.151730 [ 1 ] {} : Starting ClickHouse 19.1.0 with revision 54413 +2019.01.11 15:25:11.154578 [ 1 ] {} Application: starting up +2019.01.11 15:25:11.156361 [ 1 ] {} StatusFile: Status file ./status already exists - unclean restart. Contents: +PID: 8510 +Started at: 2019-01-11 15:24:23 +Revision: 54413 + +2019.01.11 15:25:11.156673 [ 1 ] {} Application: DB::Exception: Cannot lock file ./status. Another server instance in same directory is already running. +2019.01.11 15:25:11.156682 [ 1 ] {} Application: shutting down +2019.01.11 15:25:11.156686 [ 1 ] {} Application: Uninitializing subsystem: Logging Subsystem +2019.01.11 15:25:11.156716 [ 2 ] {} BaseDaemon: Stop SignalListener thread +``` + +**Ver sistema.d registros** + +Si no encuentra ninguna información útil en `clickhouse-server` registros o no hay ningún registro, puede ver `system.d` comando el comando: + +``` bash +$ sudo journalctl -u clickhouse-server +``` + +**Iniciar clickhouse-server en modo interactivo** + +``` bash +$ sudo -u clickhouse /usr/bin/clickhouse-server --config-file /etc/clickhouse-server/config.xml +``` + +Este comando inicia el servidor como una aplicación interactiva con parámetros estándar del script de inicio automático. En este modo `clickhouse-server` imprime todos los mensajes de eventos en la consola. + +### Parámetros de configuración {#configuration-parameters} + +Comprobar: + +- Configuración de Docker. + + Si ejecuta ClickHouse en Docker en una red IPv6, asegúrese de que `network=host` se establece. + +- Configuración del punto final. + + Comprobar [listen\_host](server_settings/settings.md#server_settings-listen_host) y [Tcp\_port](server_settings/settings.md#server_settings-tcp_port) configuración. 
+ + El servidor ClickHouse acepta conexiones localhost solo de forma predeterminada. + +- Configuración del protocolo HTTP. + + Compruebe la configuración del protocolo para la API HTTP. + +- Configuración de conexión segura. + + Comprobar: + + - El [Tcp\_port\_secure](server_settings/settings.md#server_settings-tcp_port_secure) configuración. + - Ajustes para [Sertificados SSL](server_settings/settings.md#server_settings-openssl). + + Utilice los parámetros adecuados mientras se conecta. Por ejemplo, utilice el `port_secure` parámetro con `clickhouse_client`. + +- Configuración del usuario. + + Es posible que esté utilizando el nombre de usuario o la contraseña incorrectos. + +## Procesamiento de consultas {#troubleshooting-does-not-process-queries} + +Si ClickHouse no puede procesar la consulta, envía una descripción de error al cliente. En el `clickhouse-client` obtienes una descripción del error en la consola. Si está utilizando la interfaz HTTP, ClickHouse envía la descripción del error en el cuerpo de la respuesta. Por ejemplo: + +``` bash +$ curl 'http://localhost:8123/' --data-binary "SELECT a" +Code: 47, e.displayText() = DB::Exception: Unknown identifier: a. Note that there are no tables (FROM clause) in your query, context: required_names: 'a' source_tables: table_aliases: private_aliases: column_aliases: public_columns: 'a' masked_columns: array_join_columns: source_columns: , e.what() = DB::Exception +``` + +Si empiezas `clickhouse-client` con el `stack-trace` parámetro, ClickHouse devuelve el seguimiento de la pila del servidor con la descripción de un error. + +Es posible que vea un mensaje sobre una conexión rota. En este caso, puede repetir la consulta. Si la conexión se rompe cada vez que realiza la consulta, compruebe si hay errores en los registros del servidor. + +## Eficiencia del procesamiento de consultas {#troubleshooting-too-slow} + +Si ve que ClickHouse funciona demasiado lentamente, debe perfilar la carga en los recursos del servidor y la red para sus consultas. + +Puede utilizar la utilidad clickhouse-benchmark para crear perfiles de consultas. Muestra el número de consultas procesadas por segundo, el número de filas procesadas por segundo y percentiles de tiempos de procesamiento de consultas. diff --git a/docs/es/operations/update.md b/docs/es/operations/update.md new file mode 100644 index 00000000000..d03f6069426 --- /dev/null +++ b/docs/es/operations/update.md @@ -0,0 +1,13 @@ +# Actualización de ClickHouse {#clickhouse-update} + +Si se instaló ClickHouse desde paquetes deb, ejecute los siguientes comandos en el servidor: + +``` bash +$ sudo apt-get update +$ sudo apt-get install clickhouse-client clickhouse-server +$ sudo service clickhouse-server restart +``` + +Si ha instalado ClickHouse utilizando algo distinto de los paquetes deb recomendados, utilice el método de actualización adecuado. + +ClickHouse no admite una actualización distribuida. La operación debe realizarse consecutivamente en cada servidor separado. No actualice todos los servidores de un clúster simultáneamente, o el clúster no estará disponible durante algún tiempo. diff --git a/docs/es/operations/utils/clickhouse-benchmark.md b/docs/es/operations/utils/clickhouse-benchmark.md new file mode 100644 index 00000000000..f63dc41c67e --- /dev/null +++ b/docs/es/operations/utils/clickhouse-benchmark.md @@ -0,0 +1,149 @@ +# Sistema abierto. {#clickhouse-benchmark} + +Se conecta a un servidor ClickHouse y envía repetidamente las consultas especificadas. 
+ +Sintaxis: + +``` bash +$ echo "single query" | clickhouse-benchmark [keys] +``` + +o + +``` bash +$ clickhouse-benchmark [keys] <<< "single query" +``` + +Si desea enviar un conjunto de consultas, cree un archivo de texto y coloque cada consulta en la cadena individual de este archivo. Por ejemplo: + +``` sql +SELECT * FROM system.numbers LIMIT 10000000 +SELECT 1 +``` + +Luego pase este archivo a una entrada estándar de `clickhouse-benchmark`. + +``` bash +clickhouse-benchmark [keys] < queries_file +``` + +## Claves {#clickhouse-benchmark-keys} + +- `-c N`, `--concurrency=N` — Número de consultas que `clickhouse-benchmark` se envía simultáneamente. Valor predeterminado: 1. +- `-d N`, `--delay=N` — Intervalo en segundos entre informes intermedios (ajuste 0 para deshabilitar informes). Valor predeterminado: 1. +- `-h WORD`, `--host=WORD` — Host del servidor. Valor predeterminado: `localhost`. Para el [modo de comparación](#clickhouse-benchmark-comparison-mode) puedes usar múltiples `-h` claves. +- `-p N`, `--port=N` — Puerto del servidor. Valor predeterminado: 9000. Para el [modo de comparación](#clickhouse-benchmark-comparison-mode) puedes usar múltiples `-p` claves. +- `-i N`, `--iterations=N` — Número Total de consultas. Valor predeterminado: 0. +- `-r`, `--randomize` - Orden aleatorio de ejecución de consultas si hay más de una consulta de entrada. +- `-s`, `--secure` — Usando la conexión TLS. +- `-t N`, `--timelimit=N` — Límite de tiempo en segundos. `clickhouse-benchmark` detiene el envío de consultas cuando se alcanza el límite de tiempo especificado. Valor predeterminado: 0 (límite de tiempo desactivado). +- `--confidence=N` — Nivel de confianza para la prueba T. Valores posibles: 0 (80%), 1 (90%), 2 (95%), 3 (98%), 4 (99%), 5 (99.5%). Valor predeterminado: 5. En el [modo de comparación](#clickhouse-benchmark-comparison-mode) `clickhouse-benchmark` realiza el [Examen t independiente de dos muestras para estudiantes](https://en.wikipedia.org/wiki/Student%27s_t-test#Independent_two-sample_t-test) para determinar si las dos distribuciones no son diferentes con el nivel de confianza seleccionado. +- `--cumulative` — Impresión de datos acumulativos en lugar de datos por intervalo. +- `--database=DATABASE_NAME` — Nombre de base de datos ClickHouse. Valor predeterminado: `default`. +- `--json=FILEPATH` — Salida JSON. Cuando se establece la clave, `clickhouse-benchmark` " resultados de la búsqueda relacionados +- `--user=USERNAME` — Nombre de usuario de ClickHouse. Valor predeterminado: `default`. +- `--password=PSWD` — Contraseña de usuario de ClickHouse. Valor predeterminado: cadena vacía. +- `--stacktrace` — Pila trazas de salida. Cuando se establece la clave, `clickhouse-bencmark` las salidas acumulan rastros de excepciones. +- `--stage=WORD` - Etapa de procesamiento de consultas en el servidor. ClickHouse detiene el procesamiento de consultas y devuelve la respuesta a `clickhouse-benchmark` en la etapa especificada. Valores posibles: `complete`, `fetch_columns`, `with_mergeable_state`. Valor predeterminado: `complete`. +- `--help` — Muestra el mensaje de ayuda. + +Si desea aplicar alguna [configuración](../../operations/settings/index.md) para consultas, páselas como una clave `--= SETTING_VALUE`. Por ejemplo, `--max_memory_usage=1048576`. + +## Salida {#clickhouse-benchmark-output} + +Predeterminada, `clickhouse-benchmark` Informes para cada `--delay` intervalo. + +Ejemplo del informe: + +``` text +Queries executed: 10. 
+ +localhost:9000, queries 10, QPS: 6.772, RPS: 67904487.440, MiB/s: 518.070, result RPS: 67721584.984, result MiB/s: 516.675. + +0.000% 0.145 sec. +10.000% 0.146 sec. +20.000% 0.146 sec. +30.000% 0.146 sec. +40.000% 0.147 sec. +50.000% 0.148 sec. +60.000% 0.148 sec. +70.000% 0.148 sec. +80.000% 0.149 sec. +90.000% 0.150 sec. +95.000% 0.150 sec. +99.000% 0.150 sec. +99.900% 0.150 sec. +99.990% 0.150 sec. +``` + +En el informe puedes encontrar: + +- Número de consultas en el `Queries executed:` campo. + +- Cadena de estado que contiene (en orden): + + - Punto final del servidor ClickHouse. + - Número de consultas procesadas. + - QPS: QPS: ¿Cuántas consultas realizó el servidor por segundo durante un período `--delay` argumento. + - RPS: ¿Cuántas filas lee el servidor por segundo durante un período `--delay` argumento. + - MiB/s: ¿Cuántos mebibytes servidor leído por segundo durante un período especificado en el `--delay` argumento. + - resultado RPS: ¿Cuántas filas colocadas por el servidor al resultado de una consulta por segundo durante un período `--delay` argumento. + - resultado MiB/s. ¿Cuántos mebibytes colocados por el servidor al resultado de una consulta por segundo durante un período especificado en el `--delay` argumento. + +- Percentiles de tiempo de ejecución de consultas. + +## Modo de comparación {#clickhouse-benchmark-comparison-mode} + +`clickhouse-benchmark` puede comparar el rendimiento de dos servidores ClickHouse en ejecución. + +Para utilizar el modo de comparación, especifique los puntos finales de ambos servidores `--host`, `--port` claves. Las claves coinciden entre sí por posición en la lista de argumentos, la primera `--host` se empareja con la primera `--port` y así sucesivamente. `clickhouse-benchmark` establece conexiones a ambos servidores, luego envía consultas. Cada consulta dirigida a un servidor seleccionado al azar. Los resultados se muestran para cada servidor por separado. + +## Ejemplo {#clickhouse-benchmark-example} + +``` bash +$ echo "SELECT * FROM system.numbers LIMIT 10000000 OFFSET 10000000" | clickhouse-benchmark -i 10 +``` + +``` text +Loaded 1 queries. + +Queries executed: 6. + +localhost:9000, queries 6, QPS: 6.153, RPS: 123398340.957, MiB/s: 941.455, result RPS: 61532982.200, result MiB/s: 469.459. + +0.000% 0.159 sec. +10.000% 0.159 sec. +20.000% 0.159 sec. +30.000% 0.160 sec. +40.000% 0.160 sec. +50.000% 0.162 sec. +60.000% 0.164 sec. +70.000% 0.165 sec. +80.000% 0.166 sec. +90.000% 0.166 sec. +95.000% 0.167 sec. +99.000% 0.167 sec. +99.900% 0.167 sec. +99.990% 0.167 sec. + + + +Queries executed: 10. + +localhost:9000, queries 10, QPS: 6.082, RPS: 121959604.568, MiB/s: 930.478, result RPS: 60815551.642, result MiB/s: 463.986. + +0.000% 0.159 sec. +10.000% 0.159 sec. +20.000% 0.160 sec. +30.000% 0.163 sec. +40.000% 0.164 sec. +50.000% 0.165 sec. +60.000% 0.166 sec. +70.000% 0.166 sec. +80.000% 0.167 sec. +90.000% 0.167 sec. +95.000% 0.170 sec. +99.000% 0.172 sec. +99.900% 0.172 sec. +99.990% 0.172 sec. +``` diff --git a/docs/es/operations/utils/clickhouse-copier.md b/docs/es/operations/utils/clickhouse-copier.md new file mode 100644 index 00000000000..c398301998a --- /dev/null +++ b/docs/es/operations/utils/clickhouse-copier.md @@ -0,0 +1,169 @@ +# Método de codificación de datos: {#clickhouse-copier} + +Copia datos de las tablas de un clúster en tablas de otro (o del mismo) clúster. + +Puede ejecutar varios `clickhouse-copier` instancias en diferentes servidores para realizar el mismo trabajo. 
ZooKeeper se utiliza para sincronizar los procesos. + +Después de comenzar, `clickhouse-copier`: + +- Se conecta a ZooKeeper y recibe: + + - Copia de trabajos. + - El estado de los trabajos de copia. + +- Realiza los trabajos. + + Cada proceso en ejecución elige el “closest” el fragmento del clúster de origen y copia los datos en el clúster de destino, reafirmando los datos si es necesario. + +`clickhouse-copier` realiza un seguimiento de los cambios en ZooKeeper y los aplica sobre la marcha. + +Para reducir el tráfico de red, recomendamos ejecutar `clickhouse-copier` en el mismo servidor donde se encuentran los datos de origen. + +## Ejecución de clickhouse-copiadora {#running-clickhouse-copier} + +La utilidad debe ejecutarse manualmente: + +``` bash +$ clickhouse-copier copier --daemon --config zookeeper.xml --task-path /task/path --base-dir /path/to/dir +``` + +Parámetros: + +- `daemon` — Empezar `clickhouse-copier` en modo demonio. +- `config` — El camino hacia el `zookeeper.xml` con los parámetros para la conexión a ZooKeeper. +- `task-path` — La ruta al nodo ZooKeeper. Este nodo se utiliza para la sincronización `clickhouse-copier` procesos y tareas de almacenamiento. Las tareas se almacenan en `$task-path/description`. +- `task-file` - Ruta opcional al archivo con la configuración de la tarea para la carga inicial a ZooKeeper. +- `task-upload-force` — Fuerza de carga `task-file` incluso si el nodo ya existe. +- `base-dir` — La ruta a los registros y archivos auxiliares. Cuando comienza, `clickhouse-copier` crear `clickhouse-copier_YYYYMMHHSS_` subdirectorios en `$base-dir`. Si se omite este parámetro, los directorios se crean en el directorio donde `clickhouse-copier` se puso en marcha. + +## Formato de zookeeper.XML {#format-of-zookeeper-xml} + +``` xml + + + trace + 100M + 3 + + + + + 127.0.0.1 + 2181 + + + +``` + +## Configuración de tareas de copia {#configuration-of-copying-tasks} + +``` xml + + + + + + false + + 127.0.0.1 + 9000 + + + ... + + + + ... + + + + + 2 + + + + 1 + + + + + 0 + + + + + 3 + + 1 + + + + + + + + source_cluster + test + hits + + + destination_cluster + test + hits2 + + + + ENGINE=ReplicatedMergeTree('/clickhouse/tables/{cluster}/{shard}/hits2', '{replica}') + PARTITION BY toMonday(date) + ORDER BY (CounterID, EventDate) + + + + jumpConsistentHash(intHash64(UserID), 2) + + + CounterID != 0 + + + + '2018-02-26' + '2018-03-05' + ... + + + + + + ... + + ... + + +``` + +`clickhouse-copier` seguimiento de los cambios en `/task/path/description` y los aplica sobre la marcha. Por ejemplo, si cambia el valor de `max_workers`, el número de procesos que ejecutan tareas también cambiará. + +[Artículo Original](https://clickhouse.tech/docs/es/operations/utils/clickhouse-copier/) diff --git a/docs/es/operations/utils/clickhouse-local.md b/docs/es/operations/utils/clickhouse-local.md new file mode 100644 index 00000000000..0a4cc067170 --- /dev/null +++ b/docs/es/operations/utils/clickhouse-local.md @@ -0,0 +1,74 @@ +# Sistema abierto. {#clickhouse-local} + +El `clickhouse-local` El programa le permite realizar un procesamiento rápido en archivos locales, sin tener que implementar y configurar el servidor ClickHouse. + +Acepta datos que representan tablas y las consulta usando [Nombre de la red inalámbrica (SSID):](../../query_language/index.md). + +`clickhouse-local` utiliza el mismo núcleo que el servidor ClickHouse, por lo que es compatible con la mayoría de las características y el mismo conjunto de formatos y motores de tabla. 
+ +Predeterminada `clickhouse-local` no tiene acceso a los datos en el mismo host, pero admite la carga de la configuración del servidor `--config-file` argumento. + +!!! warning "Advertencia" + No se recomienda cargar la configuración del servidor de producción en `clickhouse-local` Porque los datos pueden dañarse en caso de error humano. + +## Uso {#usage} + +Uso básico: + +``` bash +$ clickhouse-local --structure "table_structure" --input-format "format_of_incoming_data" -q "query" +``` + +Argumento: + +- `-S`, `--structure` — estructura de tabla para los datos de entrada. +- `-if`, `--input-format` — formato de entrada, `TSV` predeterminada. +- `-f`, `--file` — ruta de acceso a los datos, `stdin` predeterminada. +- `-q` `--query` — consultas para ejecutar con `;` como delimitador. +- `-N`, `--table` — nombre de la tabla dónde colocar los datos de salida, `table` predeterminada. +- `-of`, `--format`, `--output-format` — formato de salida, `TSV` predeterminada. +- `--stacktrace` — si volcar la salida de depuración en caso de excepción. +- `--verbose` — más detalles sobre la ejecución de la consulta. +- `-s` — desactivar `stderr` Tala. +- `--config-file` — ruta al archivo de configuración en el mismo formato que para el servidor ClickHouse, por defecto la configuración vacía. +- `--help` — argumentos referencias para `clickhouse-local`. + +También hay argumentos para cada variable de configuración de ClickHouse que se usan más comúnmente en lugar de `--config-file`. + +## Ejemplos {#examples} + +``` bash +$ echo -e "1,2\n3,4" | clickhouse-local -S "a Int64, b Int64" -if "CSV" -q "SELECT * FROM table" +Read 2 rows, 32.00 B in 0.000 sec., 5182 rows/sec., 80.97 KiB/sec. +1 2 +3 4 +``` + +El ejemplo anterior es el mismo que: + +``` bash +$ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin); SELECT a, b FROM table; DROP TABLE table" +Read 2 rows, 32.00 B in 0.000 sec., 4987 rows/sec., 77.93 KiB/sec. +1 2 +3 4 +``` + +Ahora vamos a usuario de memoria de salida para cada usuario de Unix: + +``` bash +$ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' | clickhouse-local -S "user String, mem Float64" -q "SELECT user, round(sum(mem), 2) as memTotal FROM table GROUP BY user ORDER BY memTotal DESC FORMAT Pretty" +``` + +``` text +Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec. +┏━━━━━━━━━━┳━━━━━━━━━━┓ +┃ user ┃ memTotal ┃ +┡━━━━━━━━━━╇━━━━━━━━━━┩ +│ bayonet │ 113.5 │ +├──────────┼──────────┤ +│ root │ 8.8 │ +├──────────┼──────────┤ +... +``` + +[Artículo Original](https://clickhouse.tech/docs/es/operations/utils/clickhouse-local/) diff --git a/docs/es/operations/utils/index.md b/docs/es/operations/utils/index.md new file mode 100644 index 00000000000..b7bee1def34 --- /dev/null +++ b/docs/es/operations/utils/index.md @@ -0,0 +1,7 @@ +# Utilidad ClickHouse {#clickhouse-utility} + +- [Sistema abierto.](clickhouse-local.md) — Permite ejecutar consultas SQL en datos sin detener el servidor ClickHouse, similar a cómo `awk` hace esto. +- [Método de codificación de datos:](clickhouse-copier.md) — Copia (y vuelve a dividir) los datos de un clúster a otro. +- [Sistema abierto.](clickhouse-benchmark.md) — Carga el servidor con las consultas y configuraciones personalizadas. 
+ +[Artículo Original](https://clickhouse.tech/docs/es/operations/utils/) diff --git a/docs/es/query_language/agg_functions/combinators.md b/docs/es/query_language/agg_functions/combinators.md new file mode 100644 index 00000000000..766195b6c4e --- /dev/null +++ b/docs/es/query_language/agg_functions/combinators.md @@ -0,0 +1,159 @@ +# Combinadores de funciones agregadas {#aggregate-functions-combinators} + +El nombre de una función agregada puede tener un sufijo anexado. Esto cambia la forma en que funciona la función de agregado. + +## Nuestros servicios {#agg-functions-combinator-if} + +El sufijo -If se puede anexar al nombre de cualquier función agregada. En este caso, la función de agregado acepta un argumento adicional: una condición (tipo Uint8). La función de agregado procesa solo las filas que desencadenan la condición. Si la condición no se desencadenó ni una sola vez, devuelve un valor predeterminado (normalmente ceros o cadenas vacías). + +Ejemplos: `sumIf(column, cond)`, `countIf(cond)`, `avgIf(x, cond)`, `quantilesTimingIf(level1, level2)(x, cond)`, `argMinIf(arg, val, cond)` y así sucesivamente. + +Con las funciones de agregado condicional, puede calcular agregados para varias condiciones a la vez, sin utilizar subconsultas y `JOIN`Por ejemplo, en Yandex.Metrica, las funciones de agregado condicional se utilizan para implementar la funcionalidad de comparación de segmentos. + +## -Matriz {#agg-functions-combinator-array} + +El sufijo -Array se puede agregar a cualquier función agregada. En este caso, la función de agregado toma argumentos del ‘Array(T)’ tipo (arrays) en lugar de ‘T’ argumentos de tipo. Si la función de agregado acepta varios argumentos, deben ser matrices de igual longitud. Al procesar matrices, la función de agregado funciona como la función de agregado original en todos los elementos de la matriz. + +Ejemplo 1: `sumArray(arr)` - Totales de todos los elementos de todos ‘arr’ matriz. En este ejemplo, podría haber sido escrito más simplemente: `sum(arraySum(arr))`. + +Ejemplo 2: `uniqArray(arr)` – Cuenta el número de elementos únicos ‘arr’ matriz. Esto podría hacerse de una manera más fácil: `uniq(arrayJoin(arr))` pero no siempre es posible añadir ‘arrayJoin’ a una consulta. + +-Si y -Array se pueden combinar. Obstante, ‘Array’ debe venir primero, entonces ‘If’. Ejemplos: `uniqArrayIf(arr, cond)`, `quantilesTimingArrayIf(level1, level2)(arr, cond)`. Debido a este pedido, el ‘cond’ el argumento no será una matriz. + +## -Estado {#agg-functions-combinator-state} + +Si aplica este combinador, la función de agregado no devuelve el valor resultante (como el número de valores únicos [uniq](reference.md#agg_function-uniq) función), pero un estado intermedio de la agregación (para `uniq`, esta es la tabla hash para calcular el número de valores únicos). Este es un `AggregateFunction(...)` que puede ser utilizado para su posterior procesamiento o almacenado en una tabla para terminar de agregar más tarde. + +Para trabajar con estos estados, use: + +- [AgregaciónMergeTree](../../operations/table_engines/aggregatingmergetree.md) motor de mesa. +- [finalizeAggregation](../functions/other_functions.md#function-finalizeaggregation) función. +- [runningAccumulate](../functions/other_functions.md#function-runningaccumulate) función. +- [-Fusionar](#aggregate_functions_combinators_merge) combinador. +- [-MergeState](#aggregate_functions_combinators_mergestate) combinador. 
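+
+Como esquema mínimo (con nombres de tabla y columnas hipotéticos), el flujo típico de estos estados con `uniqState`, una tabla `AggregatingMergeTree` y el combinador `-Merge` descrito a continuación sería:
+
+``` sql
+CREATE TABLE daily_uniq
+(
+    day Date,
+    uniq_state AggregateFunction(uniq, UInt64)
+)
+ENGINE = AggregatingMergeTree()
+ORDER BY day;
+
+-- Se guarda el estado intermedio de la agregación, no el valor final
+INSERT INTO daily_uniq
+SELECT toDate('2020-01-01') AS day, uniqState(number)
+FROM numbers(1000)
+GROUP BY day;
+
+-- -Merge combina los estados y devuelve el valor final
+SELECT day, uniqMerge(uniq_state) AS uniq_total
+FROM daily_uniq
+GROUP BY day;
+```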
+ +## -Fusionar {#aggregate-functions-combinators-merge} + +Si aplica este combinador, la función de agregado toma el estado de agregación intermedio como argumento, combina los estados para finalizar la agregación y devuelve el valor resultante. + +## -MergeState {#aggregate-functions-combinators-mergestate} + +Combina los estados de agregación intermedios de la misma manera que el combinador -Merge. Sin embargo, no devuelve el valor resultante, sino un estado de agregación intermedio, similar al combinador -State. + +## -ForEach {#agg-functions-combinator-foreach} + +Convierte una función de agregado para tablas en una función de agregado para matrices que agrega los elementos de matriz correspondientes y devuelve una matriz de resultados. Por ejemplo, `sumForEach` para las matrices `[1, 2]`, `[3, 4, 5]`y`[6, 7]`devuelve el resultado `[10, 13, 5]` después de agregar los elementos de la matriz correspondientes. + +## -OPor defecto {#agg-functions-combinator-ordefault} + +Rellena el valor predeterminado del tipo devuelto de la función de agregado si no hay nada que agregar. + +``` sql +SELECT avg(number), avgOrDefault(number) FROM numbers(0) +``` + +``` text +┌─avg(number)─┬─avgOrDefault(number)─┐ +│ nan │ 0 │ +└─────────────┴──────────────────────┘ +``` + +## -OrNull {#agg-functions-combinator-ornull} + +Llenar `null` si no hay nada que agregar. La columna de retorno será anulable. + +``` sql +SELECT avg(number), avgOrNull(number) FROM numbers(0) +``` + +``` text +┌─avg(number)─┬─avgOrNull(number)─┐ +│ nan │ ᴺᵁᴸᴸ │ +└─────────────┴───────────────────┘ +``` + +-OrDefault y -OrNull se pueden combinar con otros combinadores. Es útil cuando la función de agregado no acepta la entrada vacía. + +``` sql +SELECT avgOrNullIf(x, x > 10) +FROM +( + SELECT toDecimal32(1.23, 2) AS x +) +``` + +``` text +┌─avgOrNullIf(x, greater(x, 10))─┐ +│ ᴺᵁᴸᴸ │ +└────────────────────────────────┘ +``` + +## -Remuestrear {#agg-functions-combinator-resample} + +Permite dividir los datos en grupos y, a continuación, agregar por separado los datos de esos grupos. Los grupos se crean dividiendo los valores de una columna en intervalos. + +``` sql +Resample(start, end, step)(, resampling_key) +``` + +**Parámetros** + +- `start` — Valor inicial de todo el intervalo requerido para `resampling_key` valor. +- `stop` — valor final de todo el intervalo requerido para `resampling_key` valor. Todo el intervalo no incluye el `stop` valor `[start, stop)`. +- `step` — Paso para separar todo el intervalo en subintervalos. El `aggFunction` se ejecuta sobre cada uno de esos subintervalos de forma independiente. +- `resampling_key` — Columna cuyos valores se utilizan para separar los datos en intervalos. +- `aggFunction_params` — `aggFunction` parámetros. + +**Valores devueltos** + +- Matriz de `aggFunction` resultados para cada subintervalo. + +**Ejemplo** + +Considere el `people` con los siguientes datos: + +``` text +┌─name───┬─age─┬─wage─┐ +│ John │ 16 │ 10 │ +│ Alice │ 30 │ 15 │ +│ Mary │ 35 │ 8 │ +│ Evelyn │ 48 │ 11.5 │ +│ David │ 62 │ 9.9 │ +│ Brian │ 60 │ 16 │ +└────────┴─────┴──────┘ +``` + +Vamos a obtener los nombres de las personas cuya edad se encuentra en los intervalos de `[30,60)` y `[60,75)`. Como usamos la representación entera para la edad, obtenemos edades en el `[30, 59]` y `[60,74]` intervalo. + +Para agregar nombres en una matriz, usamos el [Método de codificación de datos:](reference.md#agg_function-grouparray) función de agregado. Se necesita un argumento. En nuestro caso, es el `name` columna. 
La función `groupArrayResample` debe utilizar la columna `age` para agregar los nombres por edad. Para definir los intervalos requeridos, pasamos los argumentos `30, 75, 30` a la función `groupArrayResample`.
+
+``` sql
+SELECT groupArrayResample(30, 75, 30)(name, age) FROM people
+```
+
+``` text
+┌─groupArrayResample(30, 75, 30)(name, age)─────┐
+│ [['Alice','Mary','Evelyn'],['David','Brian']] │
+└───────────────────────────────────────────────┘
+```
+
+Considere los resultados.
+
+`John` está fuera de la muestra porque es demasiado joven. Las demás personas se distribuyen de acuerdo con los intervalos de edad especificados.
+
+Ahora vamos a contar el número total de personas y su salario promedio en los intervalos de edad especificados.
+
+``` sql
+SELECT
+    countResample(30, 75, 30)(name, age) AS amount,
+    avgResample(30, 75, 30)(wage, age) AS avg_wage
+FROM people
+```
+
+``` text
+┌─amount─┬─avg_wage──────────────────┐
+│ [3,2]  │ [11.5,12.949999809265137] │
+└────────┴───────────────────────────┘
+```
+
+[Artículo Original](https://clickhouse.tech/docs/es/query_language/agg_functions/combinators/)
diff --git a/docs/es/query_language/agg_functions/index.md b/docs/es/query_language/agg_functions/index.md
new file mode 100644
index 00000000000..f230aa00439
--- /dev/null
+++ b/docs/es/query_language/agg_functions/index.md
@@ -0,0 +1,54 @@
+# Funciones agregadas {#aggregate-functions}
+
+Las funciones agregadas funcionan de la manera [habitual](http://www.sql-tutorial.com/sql-aggregate-functions-sql-tutorial) que esperan los expertos en bases de datos.
+
+ClickHouse también admite:
+
+- [Funciones agregadas paramétricas](parametric_functions.md#aggregate_functions_parametric), que aceptan otros parámetros además de las columnas.
+- [Combinadores](combinators.md#aggregate_functions_combinators), que cambian el comportamiento de las funciones agregadas.
+
+## Procesamiento de NULL {#null-processing}
+
+Durante la agregación, todos los `NULL` se omiten.
+
+**Ejemplos:**
+
+Considere esta tabla:
+
+``` text
+┌─x─┬────y─┐
+│ 1 │    2 │
+│ 2 │ ᴺᵁᴸᴸ │
+│ 3 │    2 │
+│ 3 │    3 │
+│ 3 │ ᴺᵁᴸᴸ │
+└───┴──────┘
+```
+
+Supongamos que necesita sumar los valores de la columna `y`:
+
+``` sql
+SELECT sum(y) FROM t_null_big
+```
+
+``` text
+┌─sum(y)─┐
+│      7 │
+└────────┘
+```
+
+La función `sum` interpreta `NULL` como `0`. En particular, esto significa que si la función recibe como entrada una selección donde todos los valores son `NULL`, el resultado será `0`, no `NULL`.
+
+Ahora puede usar la función `groupArray` para crear una matriz a partir de la columna `y`:
+
+``` sql
+SELECT groupArray(y) FROM t_null_big
+```
+
+``` text
+┌─groupArray(y)─┐
+│ [2,2,3]       │
+└───────────────┘
+```
+
+`groupArray` no incluye `NULL` en la matriz resultante.
+
+[Artículo Original](https://clickhouse.tech/docs/es/query_language/agg_functions/)
diff --git a/docs/es/query_language/agg_functions/parametric_functions.md b/docs/es/query_language/agg_functions/parametric_functions.md
new file mode 100644
index 00000000000..fc661ff0293
--- /dev/null
+++ b/docs/es/query_language/agg_functions/parametric_functions.md
@@ -0,0 +1,490 @@
+# Funciones agregadas paramétricas {#aggregate-functions-parametric}
+
+Algunas funciones agregadas pueden aceptar no solo columnas como argumentos (utilizadas para la compresión), sino también un conjunto de parámetros: constantes para la inicialización. La sintaxis consiste en dos pares de paréntesis en lugar de uno. El primero es para los parámetros y el segundo para los argumentos.
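+
+Por ejemplo, la función paramétrica `quantiles` recibe los niveles de cuantil como parámetros (primer par de paréntesis) y la columna a agregar como argumento (segundo par). Un esquema mínimo, asumiendo una tabla hipotética `requests` con una columna `duration_ms`:
+
+``` sql
+-- parámetros: niveles de cuantil; argumento: la columna (nombres hipotéticos)
+SELECT quantiles(0.5, 0.9, 0.99)(duration_ms) FROM requests
+```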
+ +## histograma {#histogram} + +Calcula un histograma adaptativo. No garantiza resultados precisos. + +``` sql +histogram(number_of_bins)(values) +``` + +Las funciones utiliza [Un algoritmo de árbol de decisión paralelo de transmisión](http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf). Los bordes de los contenedores de histograma se ajustan a medida que los nuevos datos entran en una función. En caso común, los anchos de los contenedores no son iguales. + +**Parámetros** + +`number_of_bins` — Límite superior para el número de ubicaciones en el histograma. La función calcula automáticamente el número de contenedores. Intenta alcanzar el número especificado de ubicaciones, pero si falla, utiliza menos ubicaciones. +`values` — [Expresion](../syntax.md#syntax-expressions) resultando en valores de entrada. + +**Valores devueltos** + +- [Matriz](../../data_types/array.md) de [Tuples](../../data_types/tuple.md) del siguiente formato: + + [(lower_1, upper_1, height_1), ... (lower_N, upper_N, height_N)] + + - `lower` — Límite inferior del contenedor. + - `upper` — Límite superior del contenedor. + - `height` — Altura calculada del contenedor. + +**Ejemplo** + +``` sql +SELECT histogram(5)(number + 1) +FROM ( + SELECT * + FROM system.numbers + LIMIT 20 +) +``` + +``` text +┌─histogram(5)(plus(number, 1))───────────────────────────────────────────┐ +│ [(1,4.5,4),(4.5,8.5,4),(8.5,12.75,4.125),(12.75,17,4.625),(17,20,3.25)] │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +Puede visualizar un histograma con el [Bar](../functions/other_functions.md#function-bar) función, por ejemplo: + +``` sql +WITH histogram(5)(rand() % 100) AS hist +SELECT + arrayJoin(hist).3 AS height, + bar(height, 0, 6, 5) AS bar +FROM +( + SELECT * + FROM system.numbers + LIMIT 20 +) +``` + +``` text +┌─height─┬─bar───┐ +│ 2.125 │ █▋ │ +│ 3.25 │ ██▌ │ +│ 5.625 │ ████▏ │ +│ 5.625 │ ████▏ │ +│ 3.375 │ ██▌ │ +└────────┴───────┘ +``` + +En este caso, debe recordar que no conoce los bordes del contenedor del histograma. + +## Por ejemplo, esta función es la siguiente:, …) {#function-sequencematch} + +Comprueba si la secuencia contiene una cadena de eventos que coincida con el patrón. + +``` sql +sequenceMatch(pattern)(timestamp, cond1, cond2, ...) +``` + +!!! warning "Advertencia" + Los eventos que ocurren en el mismo segundo pueden estar en la secuencia en un orden indefinido que afecta el resultado. + +**Parámetros** + +- `pattern` — Patrón de cadena. Ver [Sintaxis de patrón](#sequence-function-pattern-syntax). + +- `timestamp` — Columna que se considera que contiene datos de tiempo. Los tipos de datos típicos son `Date` y `DateTime`. También puede utilizar cualquiera de los [UInt](../../data_types/int_uint.md) tipos de datos. + +- `cond1`, `cond2` — Condiciones que describen la cadena de eventos. Tipo de datos: `UInt8`. Puede pasar hasta 32 argumentos de condición. La función sólo tiene en cuenta los eventos descritos en estas condiciones. Si la secuencia contiene datos que no se describen en una condición, la función los salta. + +**Valores devueltos** + +- 1, si el patrón coincide. +- 0, si el patrón no coincide. + +Tipo: `UInt8`. + + +**Sintaxis de patrón** + +- `(?N)` — Hace coincidir el argumento de condición en la posición `N`. Las condiciones están numeradas en el `[1, 32]` Gama. Por ejemplo, `(?1)` coincide con el argumento pasado al `cond1` parámetro. + +- `.*` — Coincide con cualquier número de eventos. 
No necesita argumentos condicionales para hacer coincidir este elemento del patrón. + +- `(?t operator value)` — Establece el tiempo en segundos que debe separar dos eventos. Por ejemplo, patrón `(?1)(?t>1800)(?2)` coincide con los eventos que ocurren a más de 1800 segundos el uno del otro. Un número arbitrario de cualquier evento puede estar entre estos eventos. Puede usar el `>=`, `>`, `<`, `<=` operador. + +**Ejemplos** + +Considere los datos en el `t` tabla: + +``` text +┌─time─┬─number─┐ +│ 1 │ 1 │ +│ 2 │ 3 │ +│ 3 │ 2 │ +└──────┴────────┘ +``` + +Realizar la consulta: + +``` sql +SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2) FROM t +``` + +``` text +┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2))─┐ +│ 1 │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +La función encontró la cadena de eventos donde el número 2 sigue al número 1. Se saltó el número 3 entre ellos, porque el número no se describe como un evento. Si queremos tener en cuenta este número al buscar la cadena de eventos dada en el ejemplo, debemos establecer una condición para ello. + +``` sql +SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 3) FROM t +``` + +``` text +┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2), equals(number, 3))─┐ +│ 0 │ +└──────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +En este caso, la función no pudo encontrar la cadena de eventos que coincidiera con el patrón, porque el evento para el número 3 ocurrió entre 1 y 2. Si en el mismo caso comprobamos la condición para el número 4, la secuencia coincidiría con el patrón. + +``` sql +SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM t +``` + +``` text +┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2), equals(number, 4))─┐ +│ 1 │ +└──────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +**Ver también** + +- [sequenceCount](#function-sequencecount) + +## Por ejemplo, una secuencia de tiempo, …) {#function-sequencecount} + +Cuenta el número de cadenas de eventos que coinciden con el patrón. La función busca cadenas de eventos que no se superponen. Comienza a buscar la siguiente cadena después de que se haga coincidir la cadena actual. + +!!! warning "Advertencia" + Los eventos que ocurren en el mismo segundo pueden estar en la secuencia en un orden indefinido que afecta el resultado. + +``` sql +sequenceCount(pattern)(timestamp, cond1, cond2, ...) +``` + +**Parámetros** + +- `pattern` — Patrón de cadena. Ver [Sintaxis de patrón](#sequence-function-pattern-syntax). + +- `timestamp` — Columna que se considera que contiene datos de tiempo. Los tipos de datos típicos son `Date` y `DateTime`. También puede utilizar cualquiera de los [UInt](../../data_types/int_uint.md) tipos de datos. + +- `cond1`, `cond2` — Condiciones que describen la cadena de eventos. Tipo de datos: `UInt8`. Puede pasar hasta 32 argumentos de condición. La función sólo tiene en cuenta los eventos descritos en estas condiciones. Si la secuencia contiene datos que no se describen en una condición, la función los salta. + +**Valores devueltos** + +- Número de cadenas de eventos no superpuestas que coinciden. + +Tipo: `UInt64`. 
+ +**Ejemplo** + +Considere los datos en el `t` tabla: + +``` text +┌─time─┬─number─┐ +│ 1 │ 1 │ +│ 2 │ 3 │ +│ 3 │ 2 │ +│ 4 │ 1 │ +│ 5 │ 3 │ +│ 6 │ 2 │ +└──────┴────────┘ +``` + +Cuente cuántas veces ocurre el número 2 después del número 1 con cualquier cantidad de otros números entre ellos: + +``` sql +SELECT sequenceCount('(?1).*(?2)')(time, number = 1, number = 2) FROM t +``` + +``` text +┌─sequenceCount('(?1).*(?2)')(time, equals(number, 1), equals(number, 2))─┐ +│ 2 │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +**Ver también** + +- [sequenceMatch](#function-sequencematch) + +## ventanaEmbudo {#windowfunnel} + +Busca cadenas de eventos en una ventana de tiempo deslizante y calcula el número máximo de eventos que ocurrieron desde la cadena. + +La función funciona de acuerdo con el algoritmo: + +- La función busca datos que desencadenan la primera condición en la cadena y establece el contador de eventos en 1. Este es el momento en que comienza la ventana deslizante. + +- Si los eventos de la cadena ocurren secuencialmente dentro de la ventana, el contador se incrementa. Si se interrumpe la secuencia de eventos, el contador no se incrementa. + +- Si los datos tienen varias cadenas de eventos en diferentes puntos de finalización, la función solo generará el tamaño de la cadena más larga. + +**Sintaxis** + +``` sql +windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN) +``` + +**Parámetros** + +- `window` — Longitud de la ventana corredera en segundos. +- `mode` - Es un argumento opcional. + - `'strict'` - Cuando el `'strict'` se establece, windowFunnel() aplica condiciones solo para los valores únicos. +- `timestamp` — Nombre de la columna que contiene la marca de tiempo. Tipos de datos admitidos: [Fecha](../../data_types/date.md), [FechaHora](../../data_types/datetime.md#data_type-datetime) y otros tipos de enteros sin signo (tenga en cuenta que aunque timestamp admite el `UInt64` tipo, su valor no puede exceder el máximo Int64, que es 2 ^ 63 - 1). +- `cond` — Condiciones o datos que describan la cadena de eventos. [UInt8](../../data_types/int_uint.md). + +**Valor devuelto** + +El número máximo de condiciones desencadenadas consecutivas de la cadena dentro de la ventana de tiempo deslizante. +Se analizan todas las cadenas en la selección. + +Tipo: `Integer`. + +**Ejemplo** + +Determine si un período de tiempo establecido es suficiente para que el usuario seleccione un teléfono y lo compre dos veces en la tienda en línea. + +Establezca la siguiente cadena de eventos: + +1. El usuario inició sesión en su cuenta en la tienda (`eventID = 1003`). +2. El usuario busca un teléfono (`eventID = 1007, product = 'phone'`). +3. El usuario realizó un pedido (`eventID = 1009`). +4. El usuario volvió a realizar el pedido (`eventID = 1010`). 
+ +Tabla de entrada: + +``` text +┌─event_date─┬─user_id─┬───────────timestamp─┬─eventID─┬─product─┐ +│ 2019-01-28 │ 1 │ 2019-01-29 10:00:00 │ 1003 │ phone │ +└────────────┴─────────┴─────────────────────┴─────────┴─────────┘ +┌─event_date─┬─user_id─┬───────────timestamp─┬─eventID─┬─product─┐ +│ 2019-01-31 │ 1 │ 2019-01-31 09:00:00 │ 1007 │ phone │ +└────────────┴─────────┴─────────────────────┴─────────┴─────────┘ +┌─event_date─┬─user_id─┬───────────timestamp─┬─eventID─┬─product─┐ +│ 2019-01-30 │ 1 │ 2019-01-30 08:00:00 │ 1009 │ phone │ +└────────────┴─────────┴─────────────────────┴─────────┴─────────┘ +┌─event_date─┬─user_id─┬───────────timestamp─┬─eventID─┬─product─┐ +│ 2019-02-01 │ 1 │ 2019-02-01 08:00:00 │ 1010 │ phone │ +└────────────┴─────────┴─────────────────────┴─────────┴─────────┘ +``` + +Averigüe hasta qué punto el usuario `user_id` podría atravesar la cadena en un período de enero a febrero de 2019. + +Consulta: + +``` sql +SELECT + level, + count() AS c +FROM +( + SELECT + user_id, + windowFunnel(6048000000000000)(timestamp, eventID = 1003, eventID = 1009, eventID = 1007, eventID = 1010) AS level + FROM trend + WHERE (event_date >= '2019-01-01') AND (event_date <= '2019-02-02') + GROUP BY user_id +) +GROUP BY level +ORDER BY level ASC +``` + +Resultado: + +``` text +┌─level─┬─c─┐ +│ 4 │ 1 │ +└───────┴───┘ +``` + +## retención {#retention} + +La función toma como argumentos un conjunto de condiciones de 1 a 32 argumentos de tipo `UInt8` que indican si se cumplió una determinada condición para el evento. +Cualquier condición se puede especificar como un argumento (como en [DONDE](../../query_language/select.md#select-where)). + +Las condiciones, excepto la primera, se aplican en pares: el resultado del segundo será verdadero si el primero y el segundo son verdaderos, del tercero si el primero y el fird son verdaderos, etc. + +**Sintaxis** + +``` sql +retention(cond1, cond2, ..., cond32); +``` + +**Parámetros** + +- `cond` — una expresión que devuelve un `UInt8` resultado (1 o 0). + +**Valor devuelto** + +La matriz de 1 o 0. + +- 1 — se cumplió la condición para el evento. +- 0 - condición no se cumplió para el evento. + +Tipo: `UInt8`. + +**Ejemplo** + +Consideremos un ejemplo de cálculo de la `retention` función para determinar el tráfico del sitio. + +**1.** Сrear una tabla para ilustrar un ejemplo. 
+ +``` sql +CREATE TABLE retention_test(date Date, uid Int32) ENGINE = Memory; + +INSERT INTO retention_test SELECT '2020-01-01', number FROM numbers(5); +INSERT INTO retention_test SELECT '2020-01-02', number FROM numbers(10); +INSERT INTO retention_test SELECT '2020-01-03', number FROM numbers(15); +``` + +Tabla de entrada: + +Consulta: + +``` sql +SELECT * FROM retention_test +``` + +Resultado: + +``` text +┌───────date─┬─uid─┐ +│ 2020-01-01 │ 0 │ +│ 2020-01-01 │ 1 │ +│ 2020-01-01 │ 2 │ +│ 2020-01-01 │ 3 │ +│ 2020-01-01 │ 4 │ +└────────────┴─────┘ +┌───────date─┬─uid─┐ +│ 2020-01-02 │ 0 │ +│ 2020-01-02 │ 1 │ +│ 2020-01-02 │ 2 │ +│ 2020-01-02 │ 3 │ +│ 2020-01-02 │ 4 │ +│ 2020-01-02 │ 5 │ +│ 2020-01-02 │ 6 │ +│ 2020-01-02 │ 7 │ +│ 2020-01-02 │ 8 │ +│ 2020-01-02 │ 9 │ +└────────────┴─────┘ +┌───────date─┬─uid─┐ +│ 2020-01-03 │ 0 │ +│ 2020-01-03 │ 1 │ +│ 2020-01-03 │ 2 │ +│ 2020-01-03 │ 3 │ +│ 2020-01-03 │ 4 │ +│ 2020-01-03 │ 5 │ +│ 2020-01-03 │ 6 │ +│ 2020-01-03 │ 7 │ +│ 2020-01-03 │ 8 │ +│ 2020-01-03 │ 9 │ +│ 2020-01-03 │ 10 │ +│ 2020-01-03 │ 11 │ +│ 2020-01-03 │ 12 │ +│ 2020-01-03 │ 13 │ +│ 2020-01-03 │ 14 │ +└────────────┴─────┘ +``` + +**2.** Agrupar usuarios por ID único `uid` utilizando el `retention` función. + +Consulta: + +``` sql +SELECT + uid, + retention(date = '2020-01-01', date = '2020-01-02', date = '2020-01-03') AS r +FROM retention_test +WHERE date IN ('2020-01-01', '2020-01-02', '2020-01-03') +GROUP BY uid +ORDER BY uid ASC +``` + +Resultado: + +``` text +┌─uid─┬─r───────┐ +│ 0 │ [1,1,1] │ +│ 1 │ [1,1,1] │ +│ 2 │ [1,1,1] │ +│ 3 │ [1,1,1] │ +│ 4 │ [1,1,1] │ +│ 5 │ [0,0,0] │ +│ 6 │ [0,0,0] │ +│ 7 │ [0,0,0] │ +│ 8 │ [0,0,0] │ +│ 9 │ [0,0,0] │ +│ 10 │ [0,0,0] │ +│ 11 │ [0,0,0] │ +│ 12 │ [0,0,0] │ +│ 13 │ [0,0,0] │ +│ 14 │ [0,0,0] │ +└─────┴─────────┘ +``` + +**3.** Calcule el número total de visitas al sitio por día. + +Consulta: + +``` sql +SELECT + sum(r[1]) AS r1, + sum(r[2]) AS r2, + sum(r[3]) AS r3 +FROM +( + SELECT + uid, + retention(date = '2020-01-01', date = '2020-01-02', date = '2020-01-03') AS r + FROM retention_test + WHERE date IN ('2020-01-01', '2020-01-02', '2020-01-03') + GROUP BY uid +) +``` + +Resultado: + +``` text +┌─r1─┬─r2─┬─r3─┐ +│ 5 │ 5 │ 5 │ +└────┴────┴────┘ +``` + +Donde: + +- `r1`- el número de visitantes únicos que visitaron el sitio durante 2020-01-01 (la `cond1` condición). +- `r2`- el número de visitantes únicos que visitaron el sitio durante un período de tiempo específico entre 2020-01-01 y 2020-01-02 (`cond1` y `cond2` condición). +- `r3`- el número de visitantes únicos que visitaron el sitio durante un período de tiempo específico entre 2020-01-01 y 2020-01-03 (`cond1` y `cond3` condición). + +## UniqUpTo (N) (x) {#uniquptonx} + +Calcula el número de diferentes valores de argumento si es menor o igual a N. Si el número de diferentes valores de argumento es mayor que N, devuelve N + 1. + +Recomendado para usar con Ns pequeños, hasta 10. El valor máximo de N es 100. + +Para el estado de una función agregada, utiliza la cantidad de memoria igual a 1 + N \* el tamaño de un valor de bytes. +Para las cadenas, almacena un hash no criptográfico de 8 bytes. Es decir, el cálculo se aproxima a las cadenas. + +La función también funciona para varios argumentos. + +Funciona lo más rápido posible, excepto en los casos en que se usa un valor N grande y el número de valores únicos es ligeramente menor que N. + +Ejemplo de uso: + +``` text +Problem: Generate a report that shows only keywords that produced at least 5 unique users. 
+Solution: Write in the GROUP BY query SearchPhrase HAVING uniqUpTo(4)(UserID) >= 5 +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/agg_functions/parametric_functions/) + +## Por ejemplo, en el caso de que el usuario no tenga ningún problema.) {#summapfilteredkeys-to-keepkeys-values} + +El mismo comportamiento que [sumMap](reference.md#agg_functions-summap) excepto que una matriz de claves se pasa como un parámetro. Esto puede ser especialmente útil cuando se trabaja con una alta cardinalidad de claves. diff --git a/docs/es/query_language/alter.md b/docs/es/query_language/alter.md new file mode 100644 index 00000000000..e0868445d07 --- /dev/null +++ b/docs/es/query_language/alter.md @@ -0,0 +1,498 @@ +## ALTERAR {#query-language-queries-alter} + +El `ALTER` consulta sólo se admite para `*MergeTree` mesas, así como `Merge`y`Distributed`. La consulta tiene varias variaciones. + +### Manipulaciones de columna {#column-manipulations} + +Cambiar la estructura de la tabla. + +``` sql +ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|CLEAR|COMMENT|MODIFY COLUMN ... +``` + +En la consulta, especifique una lista de una o más acciones separadas por comas. +Cada acción es una operación en una columna. + +Se admiten las siguientes acciones: + +- [AÑADIR COLUMNA](#alter_add-column) — Agrega una nueva columna a la tabla. +- [COLUMNA DE GOTA](#alter_drop-column) — Elimina la columna. +- [Sistema abierto.](#alter_clear-column) — Restablece los valores de las columnas. +- [COLUMNA DE COMENTARIOS](#alter_comment-column) — Agrega un comentario de texto a la columna. +- [COLUMNA MODIFICAR](#alter_modify-column) — Cambia el tipo de columna, la expresión predeterminada y el TTL. + +Estas acciones se describen en detalle a continuación. + +#### AÑADIR COLUMNA {#alter-add-column} + +``` sql +ADD COLUMN [IF NOT EXISTS] name [type] [default_expr] [codec] [AFTER name_after] +``` + +Agrega una nueva columna a la tabla con el `name`, `type`, [`codec`](create.md#codecs) y `default_expr` (ver la sección [Expresiones predeterminadas](create.md#create-default-values)). + +Si el `IF NOT EXISTS` se incluye una cláusula, la consulta no devolverá un error si la columna ya existe. Si especifica `AFTER name_after` (el nombre de otra columna), la columna se agrega después de la especificada en la lista de columnas de tabla. De lo contrario, la columna se agrega al final de la tabla. Tenga en cuenta que no hay forma de agregar una columna al principio de una tabla. Para una cadena de acciones, `name_after` puede ser el nombre de una columna que se agrega en una de las acciones anteriores. + +Agregar una columna solo cambia la estructura de la tabla, sin realizar ninguna acción con datos. Los datos no aparecen en el disco después de `ALTER`. Si faltan los datos para una columna al leer de la tabla, se rellena con valores predeterminados (realizando la expresión predeterminada si hay una, o usando ceros o cadenas vacías). La columna aparece en el disco después de fusionar partes de datos (consulte [Método de codificación de datos:](../operations/table_engines/mergetree.md)). + +Este enfoque nos permite completar el `ALTER` consulta al instante, sin aumentar el volumen de datos antiguos. + +Ejemplo: + +``` sql +ALTER TABLE visits ADD COLUMN browser String AFTER user_id +``` + +#### COLUMNA DE GOTA {#alter-drop-column} + +``` sql +DROP COLUMN [IF EXISTS] name +``` + +Elimina la columna con el nombre `name`. 
Si el `IF EXISTS` Si se especifica una cláusula, la consulta no devolverá un error si la columna no existe. + +Elimina datos del sistema de archivos. Dado que esto elimina archivos completos, la consulta se completa casi al instante. + +Ejemplo: + +``` sql +ALTER TABLE visits DROP COLUMN browser +``` + +#### Sistema abierto. {#alter-clear-column} + +``` sql +CLEAR COLUMN [IF EXISTS] name IN PARTITION partition_name +``` + +Restablece todos los datos de una columna para una partición especificada. Obtenga más información sobre cómo configurar el nombre de la partición en la sección [Cómo especificar la expresión de partición](#alter-how-to-specify-part-expr). + +Si el `IF EXISTS` Si se especifica una cláusula, la consulta no devolverá un error si la columna no existe. + +Ejemplo: + +``` sql +ALTER TABLE visits CLEAR COLUMN browser IN PARTITION tuple() +``` + +#### COLUMNA DE COMENTARIOS {#alter-comment-column} + +``` sql +COMMENT COLUMN [IF EXISTS] name 'comment' +``` + +Agrega un comentario a la columna. Si el `IF EXISTS` Si se especifica una cláusula, la consulta no devolverá un error si la columna no existe. + +Cada columna puede tener un comentario. Si ya existe un comentario para la columna, un nuevo comentario sobrescribe el comentario anterior. + +Los comentarios se almacenan en el `comment_expression` columna devuelta por el [TABLA DE DESCRIBE](misc.md#misc-describe-table) consulta. + +Ejemplo: + +``` sql +ALTER TABLE visits COMMENT COLUMN browser 'The table shows the browser used for accessing the site.' +``` + +#### COLUMNA MODIFICAR {#alter-modify-column} + +``` sql +MODIFY COLUMN [IF EXISTS] name [type] [default_expr] [TTL] +``` + +Esta consulta cambia el `name` propiedades de la columna: + +- Tipo + +- Expresión predeterminada + +- TTL + + For examples of columns TTL modifying, see [Column TTL](../operations/table_engines/mergetree.md#mergetree-column-ttl). + +Si el `IF EXISTS` Si se especifica una cláusula, la consulta no devolverá un error si la columna no existe. + +Al cambiar el tipo, los valores se convierten como si [ToType](functions/type_conversion_functions.md) se les aplicaron funciones. Si solo se cambia la expresión predeterminada, la consulta no hace nada complejo y se completa casi al instante. + +Ejemplo: + +``` sql +ALTER TABLE visits MODIFY COLUMN browser Array(String) +``` + +Cambiar el tipo de columna es la única acción compleja: cambia el contenido de los archivos con datos. Para mesas grandes, esto puede llevar mucho tiempo. + +Hay varias etapas de procesamiento: + +- Preparación de archivos temporales (nuevos) con datos modificados. +- Cambiar el nombre de los archivos antiguos. +- Cambiar el nombre de los archivos temporales (nuevos) a los nombres antiguos. +- Eliminar los archivos antiguos. + +Solo la primera etapa lleva tiempo. Si hay un error en esta etapa, los datos no se cambian. +Si hay un error durante una de las etapas sucesivas, los datos se pueden restaurar manualmente. La excepción es si los archivos antiguos se eliminaron del sistema de archivos, pero los datos de los nuevos archivos no se escribieron en el disco y se perdieron. + +El `ALTER` se replica la consulta para cambiar columnas. Las instrucciones se guardan en ZooKeeper, luego cada réplica las aplica. Todo `ALTER` las consultas se ejecutan en el mismo orden. La consulta espera a que se completen las acciones adecuadas en las otras réplicas. 
Sin embargo, una consulta para cambiar columnas en una tabla replicada se puede interrumpir y todas las acciones se realizarán de forma asincrónica. + +#### Limitaciones de consulta ALTER {#alter-query-limitations} + +El `ALTER` query le permite crear y eliminar elementos separados (columnas) en estructuras de datos anidadas, pero no en estructuras de datos anidadas completas. Para agregar una estructura de datos anidada, puede agregar columnas con un nombre como `name.nested_name` y el tipo `Array(T)`. Una estructura de datos anidada es equivalente a varias columnas de matriz con un nombre que tiene el mismo prefijo antes del punto. + +No hay soporte para eliminar columnas en la clave principal o la clave de muestreo (columnas que se utilizan en el `ENGINE` expresión). Solo es posible cambiar el tipo de las columnas que se incluyen en la clave principal si este cambio no provoca que se modifiquen los datos (por ejemplo, puede agregar valores a un Enum o cambiar un tipo de `DateTime` Naciones `UInt32`). + +Si el `ALTER` la consulta no es suficiente para realizar los cambios en la tabla que necesita, puede crear una nueva tabla, copiar los datos [INSERTAR SELECCIONAR](insert_into.md#insert_query_insert-select) Consulta, luego cambie las tablas usando el [Renombrar](misc.md#misc_operations-rename) Consulta y elimina la tabla anterior. Puede usar el [Método de codificación de datos:](../operations/utils/clickhouse-copier.md) como una alternativa a la `INSERT SELECT` consulta. + +El `ALTER` query bloquea todas las lecturas y escrituras para la tabla. En otras palabras, si un largo `SELECT` se está ejecutando en el momento de la `ALTER` Consulta, el `ALTER` la consulta esperará a que se complete. Al mismo tiempo, todas las consultas nuevas a la misma tabla esperarán `ALTER` se está ejecutando. + +Para tablas que no almacenan datos por sí mismas (como `Merge` y `Distributed`), `ALTER` simplemente cambia la estructura de la tabla, y no cambia la estructura de las tablas subordinadas. Por ejemplo, cuando se ejecuta ALTER para un `Distributed` mesa, también tendrá que ejecutar `ALTER` para las tablas en todos los servidores remotos. + +### Manipulaciones con expresiones clave {#manipulations-with-key-expressions} + +Se admite el siguiente comando: + +``` sql +MODIFY ORDER BY new_expression +``` + +Solo funciona para tablas en el [`MergeTree`](../operations/table_engines/mergetree.md) familia (incluyendo +[repetición](../operations/table_engines/replication.md) tabla). El comando cambia el +[clave de clasificación](../operations/table_engines/mergetree.md) de la mesa +Naciones `new_expression` (una expresión o una tupla de expresiones). La clave principal sigue siendo la misma. + +El comando es liviano en el sentido de que solo cambia los metadatos. Para mantener la propiedad esa parte de datos +las filas están ordenadas por la expresión de clave de ordenación, no puede agregar expresiones que contengan columnas existentes +a la clave de ordenación (sólo las columnas añadidas `ADD COLUMN` comando en el mismo `ALTER` consulta). + +### Manipulaciones con índices de saltos de datos {#manipulations-with-data-skipping-indices} + +Solo funciona para tablas en el [`*MergeTree`](../operations/table_engines/mergetree.md) familia (incluyendo +[repetición](../operations/table_engines/replication.md) tabla). 
Las siguientes operaciones +están disponibles: + +- `ALTER TABLE [db].name ADD INDEX name expression TYPE type GRANULARITY value AFTER name [AFTER name2]` - Agrega la descripción del índice a los metadatos de las tablas. + +- `ALTER TABLE [db].name DROP INDEX name` - Elimina la descripción del índice de los metadatos de las tablas y elimina los archivos de índice del disco. + +Estos comandos son livianos en el sentido de que solo cambian los metadatos o eliminan archivos. +Además, se replican (sincronizando metadatos de índices a través de ZooKeeper). + +### Manipulaciones con restricciones {#manipulations-with-constraints} + +Ver más en [limitación](create.md#constraints) + +Las restricciones se pueden agregar o eliminar utilizando la siguiente sintaxis: + +``` sql +ALTER TABLE [db].name ADD CONSTRAINT constraint_name CHECK expression; +ALTER TABLE [db].name DROP CONSTRAINT constraint_name; +``` + +Las consultas agregarán o eliminarán metadatos sobre restricciones de la tabla para que se procesen inmediatamente. + +Comprobación de restricciones *no se ejecutará* en los datos existentes si se agregaron. + +Todos los cambios en las tablas replicadas se transmiten a ZooKeeper, por lo que se aplicarán en otras réplicas. + +### Manipulaciones con particiones y piezas {#alter-manipulations-with-partitions} + +Las siguientes operaciones con [partición](../operations/table_engines/custom_partitioning_key.md) están disponibles: + +- [DETACH PARTITION](#alter_detach-partition) – Mueve una partición a la `detached` Directorio y olvidarlo. +- [PARTICIÓN DE CAÍDA](#alter_drop-partition) – Elimina una partición. +- [ADJUNTA PARTE\|PARTICIÓN](#alter_attach-partition) – Añade una pieza o partición desde el `detached` directorio a la tabla. +- [REEMPLAZAR LA PARTICIÓN](#alter_replace-partition) - Copia la partición de datos de una tabla a otra. +- [ADJUNTA PARTICIÓN DE](#alter_attach-partition-from) – Copia la partición de datos de una tabla a otra y añade. +- [REEMPLAZAR LA PARTICIÓN](#alter_replace-partition) - Copia la partición de datos de una tabla a otra y reemplaza. +- [MUEVA LA PARTICIÓN A LA MESA](#alter_move_to_table-partition) (\#alter\_move\_to\_table-partition) - Mover la partición de datos de una tabla a otra. +- [COLUMNA CLARA EN PARTICIPACIÓN](#alter_clear-column-partition) - Restablece el valor de una columna especificada en una partición. +- [ÍNDICE CLARO EN PARTICIPACIÓN](#alter_clear-index-partition) - Restablece el índice secundario especificado en una partición. +- [CONGELAR PARTICIÓN](#alter_freeze-partition) – Crea una copia de seguridad de una partición. +- [PARTICIÓN FETCH](#alter_fetch-partition) – Descarga una partición de otro servidor. +- [PARTICIÓN DE MOVIMIENTO\|PARTE](#alter_move-partition) – Mover partición / parte de datos a otro disco o volumen. + + + +#### DETACH PARTITION {\#alter\_detach-partition} {#detach-partition-alter-detach-partition} + +``` sql +ALTER TABLE table_name DETACH PARTITION partition_expr +``` + +Mueve todos los datos de la partición especificada `detached` directorio. El servidor se olvida de la partición de datos separada como si no existiera. El servidor no sabrá acerca de estos datos hasta que [CONECTAR](#alter_attach-partition) consulta. + +Ejemplo: + +``` sql +ALTER TABLE visits DETACH PARTITION 201901 +``` + +Lea cómo configurar la expresión de partición en una sección [Cómo especificar la expresión de partición](#alter-how-to-specify-part-expr). 
+ +Después de ejecutar la consulta, puede hacer lo que quiera con los datos en el `detached` directorio — eliminarlo del sistema de archivos, o simplemente dejarlo. + +Esta consulta se replica – mueve los datos a la `detached` directorio en todas las réplicas. Tenga en cuenta que solo puede ejecutar esta consulta en una réplica de líder. Para averiguar si una réplica es un líder, realice `SELECT` Consulta a la [sistema.Replica](../operations/system_tables.md#system_tables-replicas) tabla. Alternativamente, es más fácil hacer un `DETACH` consulta en todas las réplicas: todas las réplicas producen una excepción, excepto la réplica líder. + +#### PARTICIÓN DE CAÍDA {#alter-drop-partition} + +``` sql +ALTER TABLE table_name DROP PARTITION partition_expr +``` + +Elimina la partición especificada de la tabla. Esta consulta etiqueta la partición como inactiva y elimina los datos por completo, aproximadamente en 10 minutos. + +Lea cómo configurar la expresión de partición en una sección [Cómo especificar la expresión de partición](#alter-how-to-specify-part-expr). + +La consulta se replica: elimina los datos de todas las réplicas. + +#### CAÍDA DE DESPRENDIMIENTO DE LA PARTICIÓN\|PARTE {#alter-drop-detached} + +``` sql +ALTER TABLE table_name DROP DETACHED PARTITION|PART partition_expr +``` + +Quita la parte especificada o todas las partes de la partición especificada de `detached`. +Más información sobre cómo establecer la expresión de partición en una sección [Cómo especificar la expresión de partición](#alter-how-to-specify-part-expr). + +#### ADJUNTA PARTICIÓN\|PARTE {#alter-attach-partition} + +``` sql +ALTER TABLE table_name ATTACH PARTITION|PART partition_expr +``` + +Agrega datos a la tabla desde el `detached` directorio. Es posible agregar datos para una partición completa o para una parte separada. Ejemplos: + +``` sql +ALTER TABLE visits ATTACH PARTITION 201901; +ALTER TABLE visits ATTACH PART 201901_2_2_0; +``` + +Más información sobre cómo establecer la expresión de partición en una sección [Cómo especificar la expresión de partición](#alter-how-to-specify-part-expr). + +Esta consulta se replica. El iniciador de réplica comprueba si hay datos en el `detached` directorio. Si existen datos, la consulta comprueba su integridad. Si todo es correcto, la consulta agrega los datos a la tabla. Todas las demás réplicas descargan los datos del iniciador de réplica. + +Entonces puedes poner datos en el `detached` en una réplica, y utilice el directorio `ALTER ... ATTACH` consulta para agregarlo a la tabla en todas las réplicas. + +#### ADJUNTA PARTICIÓN DE {#alter-attach-partition-from} + +``` sql +ALTER TABLE table2 ATTACH PARTITION partition_expr FROM table1 +``` + +Esta consulta copia la partición de datos `table1` Naciones `table2` añade datos a los que existen en el `table2`. Tenga en cuenta que los datos no se eliminarán de `table1`. + +Para que la consulta se ejecute correctamente, se deben cumplir las siguientes condiciones: + +- Ambas tablas deben tener la misma estructura. +- Ambas tablas deben tener la misma clave de partición. + +#### REEMPLAZAR LA PARTICIÓN {#alter-replace-partition} + +``` sql +ALTER TABLE table2 REPLACE PARTITION partition_expr FROM table1 +``` + +Esta consulta copia la partición de datos `table1` Naciones `table2` y reemplaza la partición existente en el `table2`. Tenga en cuenta que los datos no se eliminarán de `table1`. 
+ +Para que la consulta se ejecute correctamente, se deben cumplir las siguientes condiciones: + +- Ambas tablas deben tener la misma estructura. +- Ambas tablas deben tener la misma clave de partición. + +#### MUEVA LA PARTICIÓN A LA MESA {#alter-move-to-table-partition} + +``` sql +ALTER TABLE table_source MOVE PARTITION partition_expr TO TABLE table_dest +``` + +Esta consulta mueve la partición de datos `table_source` Naciones `table_dest` con la eliminación de los datos de `table_source`. + +Para que la consulta se ejecute correctamente, se deben cumplir las siguientes condiciones: + +- Ambas tablas deben tener la misma estructura. +- Ambas tablas deben tener la misma clave de partición. +- Ambas tablas deben ser de la misma familia de motores. (replicado o no replicado) +- Ambas tablas deben tener la misma política de almacenamiento. + +#### COLUMNA CLARA EN PARTICIPACIÓN {#alter-clear-column-partition} + +``` sql +ALTER TABLE table_name CLEAR COLUMN column_name IN PARTITION partition_expr +``` + +Restablece todos los valores de la columna especificada en una partición. Si el `DEFAULT` cláusula se determinó al crear una tabla, esta consulta establece el valor de columna en un valor predeterminado especificado. + +Ejemplo: + +``` sql +ALTER TABLE visits CLEAR COLUMN hour in PARTITION 201902 +``` + +#### CONGELAR PARTICIÓN {#alter-freeze-partition} + +``` sql +ALTER TABLE table_name FREEZE [PARTITION partition_expr] +``` + +Esta consulta crea una copia de seguridad local de una partición especificada. Si el `PARTITION` se omite la cláusula, la consulta crea la copia de seguridad de todas las particiones a la vez. + +!!! note "Nota" + Todo el proceso de copia de seguridad se realiza sin detener el servidor. + +Tenga en cuenta que para las tablas de estilo antiguo puede especificar el prefijo del nombre de la partición (por ejemplo, ‘2019’) - entonces la consulta crea la copia de seguridad para todas las particiones correspondientes. Lea cómo configurar la expresión de partición en una sección [Cómo especificar la expresión de partición](#alter-how-to-specify-part-expr). + +En el momento de la ejecución, para una instantánea de datos, la consulta crea vínculos rígidos a los datos de una tabla. Los enlaces duros se colocan en el directorio `/var/lib/clickhouse/shadow/N/...`, donde: + +- `/var/lib/clickhouse/` es el directorio ClickHouse de trabajo especificado en la configuración. +- `N` es el número incremental de la copia de seguridad. + +!!! note "Nota" + Si usted estados unidos [un conjunto de discos para el almacenamiento de datos en una tabla](../operations/table_engines/mergetree.md#table_engine-mergetree-multiple-volumes) el `shadow/N` directorio aparece en cada disco, almacenando partes de datos que coinciden con el `PARTITION` expresión. + +La misma estructura de directorios se crea dentro de la copia de seguridad que dentro `/var/lib/clickhouse/`. La consulta realiza ‘chmod’ para todos los archivos, prohibiendo escribir en ellos. + +Después de crear la copia de seguridad, puede copiar los datos desde `/var/lib/clickhouse/shadow/` al servidor remoto y, a continuación, elimínelo del servidor local. Tenga en cuenta que el `ALTER t FREEZE PARTITION` consulta no se réplica. Crea una copia de seguridad local solo en el servidor local. + +La consulta crea una copia de seguridad casi instantáneamente (pero primero espera a que las consultas actuales a la tabla correspondiente terminen de ejecutarse). + +`ALTER TABLE t FREEZE PARTITION` copia solo los datos, no los metadatos de la tabla. 
Para hacer una copia de seguridad de los metadatos de la tabla, copie el archivo `/var/lib/clickhouse/metadata/database/table.sql` + +Para restaurar los datos de una copia de seguridad, haga lo siguiente: + +1. Cree la tabla si no existe. Para ver la consulta, utilice el .archivo sql (reemplazar `ATTACH` es ella con `CREATE`). +2. Copie los datos de la `data/database/table/` Directorio dentro de la copia de seguridad a la `/var/lib/clickhouse/data/database/table/detached/` Directorio. +3. Ejecutar `ALTER TABLE t ATTACH PARTITION` Consultas para agregar los datos a una tabla. + +La restauración desde una copia de seguridad no requiere detener el servidor. + +Para obtener más información sobre las copias de seguridad y la restauración de datos, consulte [Copia de seguridad de datos](../operations/backup.md) apartado. + +#### ÍNDICE CLARO EN PARTICIPACIÓN {#alter-clear-index-partition} + +``` sql +ALTER TABLE table_name CLEAR INDEX index_name IN PARTITION partition_expr +``` + +La consulta funciona de forma similar a `CLEAR COLUMN`, pero restablece un índice en lugar de una columna de datos. + +#### PARTICIÓN FETCH {#alter-fetch-partition} + +``` sql +ALTER TABLE table_name FETCH PARTITION partition_expr FROM 'path-in-zookeeper' +``` + +Descarga una partición desde otro servidor. Esta consulta solo funciona para las tablas replicadas. + +La consulta hace lo siguiente: + +1. Descarga la partición del fragmento especificado. En ‘path-in-zookeeper’ debe especificar una ruta al fragmento en ZooKeeper. +2. Luego, la consulta coloca los datos descargados en el `detached` Directorio de la `table_name` tabla. Descripción [ADJUNTA PARTICIÓN\|PARTE](#alter_attach-partition) consulta para agregar los datos a la tabla. + +Por ejemplo: + +``` sql +ALTER TABLE users FETCH PARTITION 201902 FROM '/clickhouse/tables/01-01/visits'; +ALTER TABLE users ATTACH PARTITION 201902; +``` + +Tenga en cuenta que: + +- El `ALTER ... FETCH PARTITION` la consulta no está replicada. Coloca la partición en el `detached` sólo en el servidor local. +- El `ALTER TABLE ... ATTACH` consulta se replica. Agrega los datos a todas las réplicas. Los datos se agregan a una de las réplicas desde el `detached` directorio, y para los demás - de réplicas vecinas. + +Antes de descargar, el sistema verifica si la partición existe y la estructura de la tabla coincide. La réplica más adecuada se selecciona automáticamente de las réplicas en buen estado. + +Aunque se llama a la consulta `ALTER TABLE`, no cambia la estructura de la tabla y no cambiar inmediatamente los datos disponibles en la tabla. + +#### PARTICIÓN DE MOVIMIENTO\|PARTE {#alter-move-partition} + +Mueve particiones o partes de datos a otro volumen o disco para `MergeTree`-mesas de motor. Ver [Uso de varios dispositivos de bloque para el almacenamiento de datos](../operations/table_engines/mergetree.md#table_engine-mergetree-multiple-volumes). + +``` sql +ALTER TABLE table_name MOVE PARTITION|PART partition_expr TO DISK|VOLUME 'disk_name' +``` + +El `ALTER TABLE t MOVE` Consulta: + +- No replicado, porque diferentes réplicas pueden tener diferentes directivas de almacenamiento. +- Devuelve un error si el disco o volumen especificado no está configurado. La consulta también devuelve un error si no se pueden aplicar las condiciones de movimiento de datos especificadas en la directiva de almacenamiento. 
+- Puede devolver un error en el caso, cuando los datos que se moverán ya se mueven por un proceso en segundo plano, concurrente `ALTER TABLE t MOVE` consulta o como resultado de la fusión de datos de fondo. Un usuario no debe realizar ninguna acción adicional en este caso. + +Ejemplo: + +``` sql +ALTER TABLE hits MOVE PART '20190301_14343_16206_438' TO VOLUME 'slow' +ALTER TABLE hits MOVE PARTITION '2019-09-01' TO DISK 'fast_ssd' +``` + +#### Cómo establecer la expresión de partición {#alter-how-to-specify-part-expr} + +Puede especificar la expresión de partición en `ALTER ... PARTITION` de diferentes maneras: + +- Como valor de la `partition` columna de la `system.parts` tabla. Por ejemplo, `ALTER TABLE visits DETACH PARTITION 201901`. +- Como la expresión de la columna de la tabla. Se admiten constantes y expresiones constantes. Por ejemplo, `ALTER TABLE visits DETACH PARTITION toYYYYMM(toDate('2019-01-25'))`. +- Usando el ID de partición. El ID de partición es un identificador de cadena de la partición (legible por humanos, si es posible) que se usa como nombres de particiones en el sistema de archivos y en ZooKeeper. El ID de partición debe especificarse en el `PARTITION ID` cláusula, entre comillas simples. Por ejemplo, `ALTER TABLE visits DETACH PARTITION ID '201901'`. +- En el [ALTERAR PIEZA DE ADJUNTO](#alter_attach-partition) y [PARTE DESMONTADA DE GOTA](#alter_drop-detached) Consulta, para especificar el nombre de una parte, utilice un literal de cadena con un valor `name` columna de la [sistema.detached\_parts](../operations/system_tables.md#system_tables-detached_parts) tabla. Por ejemplo, `ALTER TABLE visits ATTACH PART '201901_1_1_0'`. + +El uso de comillas al especificar la partición depende del tipo de expresión de partición. Por ejemplo, para el `String` Tipo, debe especificar su nombre entre comillas (`'`). Para el `Date` y `Int*` tipos no se necesitan comillas. + +Para las tablas de estilo antiguo, puede especificar la partición como un número `201901` O una cadena `'201901'`. La sintaxis para las tablas de nuevo estilo es más estricta con los tipos (similar al analizador para el formato de entrada VALUES). + +Todas las reglas anteriores también son ciertas para el [OPTIMIZAR](misc.md#misc_operations-optimize) consulta. Si necesita especificar la única partición al optimizar una tabla no particionada, establezca la expresión `PARTITION tuple()`. Por ejemplo: + +``` sql +OPTIMIZE TABLE table_not_partitioned PARTITION tuple() FINAL; +``` + +Los ejemplos de `ALTER ... PARTITION` las consultas se demuestran en las pruebas [`00502_custom_partitioning_local`](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00502_custom_partitioning_local.sql) y [`00502_custom_partitioning_replicated_zookeeper`](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00502_custom_partitioning_replicated_zookeeper.sql). + +### Manipulaciones con Tabla TTL {#manipulations-with-table-ttl} + +Usted puede cambiar [Tabla TTL](../operations/table_engines/mergetree.md#mergetree-table-ttl) con una solicitud del siguiente formulario: + +``` sql +ALTER TABLE table-name MODIFY TTL ttl-expression +``` + +### Sincronicidad de las consultas ALTER {#synchronicity-of-alter-queries} + +Para tablas no replicables, todas `ALTER` las consultas se realizan de forma sincrónica. Para las tablas replicables, la consulta solo agrega instrucciones para las acciones apropiadas para `ZooKeeper`, y las acciones mismas se realizan tan pronto como sea posible. 
Sin embargo, la consulta puede esperar a que estas acciones se completen en todas las réplicas. + +Para `ALTER ... ATTACH|DETACH|DROP` Consultas, puede utilizar el `replication_alter_partitions_sync` configuración para configurar la espera. +Valores posibles: `0` – no espere; `1` – sólo esperar a su propia ejecución (por defecto); `2` – esperar a todos. + +### Mutación {#alter-mutations} + +Las mutaciones son una variante de consulta ALTER que permite cambiar o eliminar filas en una tabla. En contraste con el estándar `UPDATE` y `DELETE` consultas destinadas a cambios de datos puntuales, las mutaciones están destinadas a operaciones pesadas que cambian muchas filas en una tabla. Apoyado para el `MergeTree` familia de motores de mesa, incluidos los motores con soporte de replicación. + +Las tablas existentes están listas para las mutaciones tal como están (no es necesaria la conversión), pero después de que la primera mutación se aplica a una tabla, su formato de metadatos se vuelve incompatible con las versiones anteriores del servidor y volver a una versión anterior se vuelve imposible. + +Comandos disponibles actualmente: + +``` sql +ALTER TABLE [db.]table DELETE WHERE filter_expr +``` + +El `filter_expr` debe ser de tipo `UInt8`. La consulta elimina las filas de la tabla para la que esta expresión toma un valor distinto de cero. + +``` sql +ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] WHERE filter_expr +``` + +El `filter_expr` debe ser de tipo `UInt8`. Esta consulta actualiza los valores de las columnas especificadas a los valores de las expresiones correspondientes `filter_expr` toma un valor distinto de cero. Los valores se convierten al tipo de columna utilizando el `CAST` operador. No se admite la actualización de columnas que se utilizan en el cálculo de la clave principal o de partición. + +``` sql +ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name +``` + +La consulta reconstruye el índice secundario `name` en la partición `partition_name`. + +Una consulta puede contener varios comandos separados por comas. + +Para las tablas \*MergeTree, las mutaciones se ejecutan reescribiendo partes de datos completas. No hay atomicidad - las partes se sustituyen por partes mutadas tan pronto como están listas y una `SELECT` La consulta que comenzó a ejecutarse durante una mutación verá datos de partes que ya han sido mutadas junto con datos de partes que aún no han sido mutadas. + +Las mutaciones están totalmente ordenadas por su orden de creación y se aplican a cada parte en ese orden. Las mutaciones también se ordenan parcialmente con INSERTs: los datos que se insertaron en la tabla antes de que se enviara la mutación se mutarán y los datos que se insertaron después de eso no se mutarán. Tenga en cuenta que las mutaciones no bloquean INSERTs de ninguna manera. + +Una consulta de mutación regresa inmediatamente después de agregar la entrada de mutación (en el caso de tablas replicadas a ZooKeeper, para tablas no replicadas, al sistema de archivos). La mutación en sí se ejecuta de forma asíncrona utilizando la configuración del perfil del sistema. Para realizar un seguimiento del progreso de las mutaciones, puede usar el [`system.mutations`](../operations/system_tables.md#system_tables-mutations) tabla. Una mutación que se envió correctamente continuará ejecutándose incluso si se reinician los servidores ClickHouse. 
No hay forma de revertir la mutación una vez que se presenta, pero si la mutación está atascada por alguna razón, puede cancelarse con el [`KILL MUTATION`](misc.md#kill-mutation) consulta. + +Las entradas de mutaciones terminadas no se eliminan de inmediato (el número de entradas conservadas viene determinado por el `finished_mutations_to_keep` parámetro del motor de almacenamiento). Las entradas de mutación más antiguas se eliminan. + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/alter/) diff --git a/docs/es/query_language/create.md b/docs/es/query_language/create.md new file mode 100644 index 00000000000..91d6e845ef2 --- /dev/null +++ b/docs/es/query_language/create.md @@ -0,0 +1,302 @@ +# CREAR Consultas {#create-queries} + +## CREAR BASE DE DATOS {#query-language-create-database} + +Crea una base de datos. + +``` sql +CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] [ENGINE = engine(...)] +``` + +### Clausula {#clauses} + +- `IF NOT EXISTS` + + If the `db_name` database already exists, then ClickHouse doesn't create a new database and: + + - Doesn't throw an exception if clause is specified. + - Throws an exception if clause isn't specified. + +- `ON CLUSTER` + + ClickHouse creates the `db_name` database on all the servers of a specified cluster. + +- `ENGINE` + + - [MySQL](../database_engines/mysql.md) + + Allows you to retrieve data from the remote MySQL server. + + By default, ClickHouse uses its own [database engine](../database_engines/index.md). + +## CREAR TABLA {#create-table-query} + +El `CREATE TABLE` puede tener varias formas. + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec] [TTL expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [compression_codec] [TTL expr2], + ... +) ENGINE = engine +``` + +Crea una tabla llamada ‘name’ es el ‘db’ base de datos o la base de datos actual si ‘db’ no está establecida, con la estructura especificada entre paréntesis y ‘engine’ motor. +La estructura de la tabla es una lista de descripciones de columnas. Si los índices son compatibles con el motor, se indican como parámetros para el motor de tablas. + +Una descripción de columna es `name type` en el caso más simple. Ejemplo: `RegionID UInt32`. +Las expresiones también se pueden definir para los valores predeterminados (ver más abajo). + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name AS [db2.]name2 [ENGINE = engine] +``` + +Crea una tabla con la misma estructura que otra tabla. Puede especificar un motor diferente para la tabla. Si no se especifica el motor, se utilizará el mismo motor que para el `db2.name2` tabla. + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name AS table_function() +``` + +Crea una tabla con la estructura y los datos [función de la tabla](table_functions/index.md). + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = engine AS SELECT ... +``` + +Crea una tabla con una estructura como el resultado de la `SELECT` Consulta, con el ‘engine’ motor, y lo llena con datos de SELECT. + +En todos los casos, si `IF NOT EXISTS` Si se especifica la tabla, la consulta no devolverá un error si la tabla ya existe. En este caso, la consulta no hará nada. + +Puede haber otras cláusulas después del `ENGINE` cláusula en la consulta. Consulte la documentación detallada sobre cómo crear tablas en las descripciones de [motores de mesa](../operations/table_engines/index.md#table_engines). 
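+
+Como referencia, un esquema mínimo (con nombres de tabla y columnas hipotéticos) de la primera forma, usando un motor de la familia MergeTree:
+
+``` sql
+-- nombres de tabla y columnas hipotéticos, solo a modo de ilustración
+CREATE TABLE IF NOT EXISTS hits_example
+(
+    EventDate Date,
+    UserID UInt64,
+    URL String
+) ENGINE = MergeTree()
+ORDER BY (EventDate, UserID)
+```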
+ +### Valores predeterminados {#create-default-values} + +La descripción de la columna puede especificar una expresión para un valor predeterminado, de una de las siguientes maneras:`DEFAULT expr`, `MATERIALIZED expr`, `ALIAS expr`. +Ejemplo: `URLDomain String DEFAULT domain(URL)`. + +Si no se define una expresión para el valor predeterminado, los valores predeterminados se establecerán en ceros para números, cadenas vacías para cadenas, matrices vacías para matrices y `0000-00-00` para fechas o `0000-00-00 00:00:00` para las fechas con el tiempo. Los NULL no son compatibles. + +Si se define la expresión predeterminada, el tipo de columna es opcional. Si no hay un tipo definido explícitamente, se utiliza el tipo de expresión predeterminado. Ejemplo: `EventDate DEFAULT toDate(EventTime)` – el ‘Date’ tipo será utilizado para el ‘EventDate’ columna. + +Si el tipo de datos y la expresión predeterminada se definen explícitamente, esta expresión se convertirá al tipo especificado utilizando funciones de conversión de tipos. Ejemplo: `Hits UInt32 DEFAULT 0` significa lo mismo que `Hits UInt32 DEFAULT toUInt32(0)`. + +Las expresiones predeterminadas se pueden definir como una expresión arbitraria de las constantes y columnas de la tabla. Al crear y cambiar la estructura de la tabla, comprueba que las expresiones no contengan bucles. Para INSERT, comprueba que las expresiones se puedan resolver, que se hayan pasado todas las columnas a partir de las que se pueden calcular. + +`DEFAULT expr` + +Valor predeterminado Normal. Si la consulta INSERT no especifica la columna correspondiente, se completará calculando la expresión correspondiente. + +`MATERIALIZED expr` + +Expresión materializada. Dicha columna no se puede especificar para INSERT, porque siempre se calcula. +Para un INSERT sin una lista de columnas, estas columnas no se consideran. +Además, esta columna no se sustituye cuando se utiliza un asterisco en una consulta SELECT. Esto es para preservar el invariante que el volcado obtuvo usando `SELECT *` se puede volver a insertar en la tabla usando INSERT sin especificar la lista de columnas. + +`ALIAS expr` + +Sinónimo. Dicha columna no se almacena en la tabla en absoluto. +Sus valores no se pueden insertar en una tabla y no se sustituyen cuando se utiliza un asterisco en una consulta SELECT. +Se puede usar en SELECT si el alias se expande durante el análisis de consultas. + +Cuando se utiliza la consulta ALTER para agregar nuevas columnas, no se escriben datos antiguos para estas columnas. En su lugar, al leer datos antiguos que no tienen valores para las nuevas columnas, las expresiones se calculan sobre la marcha de forma predeterminada. Sin embargo, si la ejecución de las expresiones requiere diferentes columnas que no están indicadas en la consulta, estas columnas se leerán adicionalmente, pero solo para los bloques de datos que lo necesitan. + +Si agrega una nueva columna a una tabla pero luego cambia su expresión predeterminada, los valores utilizados para los datos antiguos cambiarán (para los datos donde los valores no se almacenaron en el disco). Tenga en cuenta que cuando se ejecutan combinaciones en segundo plano, los datos de las columnas que faltan en una de las partes de combinación se escriben en la parte combinada. + +No es posible establecer valores predeterminados para elementos en estructuras de datos anidadas. 
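+
+Un esquema ilustrativo (con nombres de tabla y columnas hipotéticos) que combina los tres tipos de expresiones predeterminadas descritos arriba:
+
+``` sql
+-- nombres hipotéticos, solo para ilustrar DEFAULT, MATERIALIZED y ALIAS
+CREATE TABLE events_example
+(
+    EventTime DateTime,
+    EventDate Date DEFAULT toDate(EventTime),
+    URL String,
+    Domain String MATERIALIZED domain(URL),
+    Address String ALIAS URL
+) ENGINE = MergeTree()
+ORDER BY (EventDate, EventTime)
+```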
+ +### Limitación {#constraints} + +Junto con las descripciones de columnas, se podrían definir restricciones: + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec] [TTL expr1], + ... + CONSTRAINT constraint_name_1 CHECK boolean_expr_1, + ... +) ENGINE = engine +``` + +`boolean_expr_1` podría por cualquier expresión booleana. Si se definen restricciones para la tabla, cada una de ellas se verificará para cada fila en `INSERT` consulta. Si no se cumple alguna restricción, el servidor generará una excepción con el nombre de la restricción y la expresión de comprobación. + +Agregar una gran cantidad de restricciones puede afectar negativamente el rendimiento de grandes `INSERT` consulta. + +### Expresión TTL {#ttl-expression} + +Define el tiempo de almacenamiento de los valores. Solo se puede especificar para tablas de la familia MergeTree. Para la descripción detallada, ver [TTL para columnas y tablas](../operations/table_engines/mergetree.md#table_engine-mergetree-ttl). + +### Códecs de compresión de columna {#codecs} + +De forma predeterminada, ClickHouse aplica el `lz4` método de compresión. Para `MergeTree`- familia de motor puede cambiar el método de compresión predeterminado en el [compresión](../operations/server_settings/settings.md#server-settings-compression) sección de una configuración de servidor. También puede definir el método de compresión para cada columna `CREATE TABLE` consulta. + +``` sql +CREATE TABLE codec_example +( + dt Date CODEC(ZSTD), + ts DateTime CODEC(LZ4HC), + float_value Float32 CODEC(NONE), + double_value Float64 CODEC(LZ4HC(9)) + value Float32 CODEC(Delta, ZSTD) +) +ENGINE = +... +``` + +Si se especifica un códec, el códec predeterminado no se aplica. Los códecs se pueden combinar en una tubería, por ejemplo, `CODEC(Delta, ZSTD)`. Para seleccionar la mejor combinación de códecs para su proyecto, pase puntos de referencia similares a los descritos en Altinity [Nuevas codificaciones para mejorar la eficiencia de ClickHouse](https://www.altinity.com/blog/2019/7/new-encodings-to-improve-clickhouse) artículo. + +!!! warning "Advertencia" + No puede descomprimir archivos de base de datos ClickHouse con utilidades externas como `lz4`. En su lugar, use el especial [Compresor de clickhouse](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/programs/compressor) utilidad. + +La compresión es compatible con los siguientes motores de tablas: + +- [Método de codificación de datos:](../operations/table_engines/mergetree.md) familia. Admite códecs de compresión de columnas y selecciona el método de compresión predeterminado mediante [compresión](../operations/server_settings/settings.md#server-settings-compression) configuración. +- [Registro](../operations/table_engines/log_family.md) Familia. Utiliza el `lz4` método de compresión por defecto y soporta códecs de compresión de columna. +- [Establecer](../operations/table_engines/set.md). Solo admite la compresión predeterminada. +- [Unir](../operations/table_engines/join.md). Solo admite la compresión predeterminada. + +ClickHouse admite códecs de propósito común y códecs especializados. + +#### Especializados Codecs {#create-query-specialized-codecs} + +Estos códecs están diseñados para hacer que la compresión sea más efectiva mediante el uso de características específicas de los datos. Algunos de estos códecs no comprimen los propios datos. 
En su lugar, preparan los datos para un códec de propósito común, que lo comprime mejor que sin esta preparación. + +Códecs Especializados: + +- `Delta(delta_bytes)` — Enfoque de compresión en el que los valores brutos se sustituyen por la diferencia de dos valores vecinos, excepto el primer valor que permanece sin cambios. Hasta `delta_bytes` se utilizan para almacenar valores delta, por lo que `delta_bytes` es el tamaño máximo de los valores brutos. Posible `delta_bytes` valores: 1, 2, 4, 8. El valor predeterminado para `delta_bytes` ser `sizeof(type)` si es igual a 1, 2, 4 u 8. En todos los demás casos, es 1. +- `DoubleDelta` — Calcula delta de deltas y lo escribe en forma binaria compacta. Las tasas de compresión óptimas se logran para secuencias monotónicas con una zancada constante, como los datos de series de tiempo. Se puede utilizar con cualquier tipo de ancho fijo. Implementa el algoritmo utilizado en Gorilla TSDB, extendiéndolo para admitir tipos de 64 bits. Utiliza 1 bit adicional para deltas de 32 bytes: prefijos de 5 bits en lugar de prefijos de 4 bits. Para obtener información adicional, consulte Compresión de sellos de tiempo en [Gorila: Una base de datos de series temporales rápida, escalable y en memoria](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). +- `Gorilla` - Calcula XOR entre el valor actual y el anterior y lo escribe en forma binaria compacta. Eficiente al almacenar una serie de valores de coma flotante que cambian lentamente, porque la mejor tasa de compresión se logra cuando los valores vecinos son binarios iguales. Implementa el algoritmo utilizado en Gorilla TSDB, extendiéndolo para admitir tipos de 64 bits. Para obtener información adicional, consulte Comprimir valores en [Gorila: Una base de datos de series temporales rápida, escalable y en memoria](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). +- `T64` — Enfoque de compresión que recorta bits altos no utilizados de valores en tipos de datos enteros (incluidos `Enum`, `Date` y `DateTime`). En cada paso de su algoritmo, el códec toma un bloque de 64 valores, los coloca en una matriz de 64x64 bits, lo transpone, recorta los bits de valores no utilizados y devuelve el resto como una secuencia. Los bits no utilizados son los bits, que no difieren entre los valores máximo y mínimo en toda la parte de datos para la que se utiliza la compresión. + +`DoubleDelta` y `Gorilla` códecs se utilizan en Gorilla TSDB como los componentes de su algoritmo de compresión. El enfoque de gorila es efectivo en escenarios en los que hay una secuencia de valores que cambian lentamente con sus marcas de tiempo. Las marcas de tiempo se comprimen efectivamente por el `DoubleDelta` códec, y los valores son efectivamente comprimidos por el `Gorilla` códec. Por ejemplo, para obtener una tabla almacenada efectivamente, puede crearla en la siguiente configuración: + +``` sql +CREATE TABLE codec_example +( + timestamp DateTime CODEC(DoubleDelta), + slow_values Float32 CODEC(Gorilla) +) +ENGINE = MergeTree() +``` + +#### Propósito común codecs {#create-query-common-purpose-codecs} + +Códecs: + +- `NONE` — Sin compresión. +- `LZ4` — Lossless [algoritmo de compresión de datos](https://github.com/lz4/lz4) utilizado por defecto. Aplica compresión rápida LZ4. +- `LZ4HC[(level)]` — Algoritmo LZ4 HC (alta compresión) con nivel configurable. Nivel predeterminado: 9. Configuración `level <= 0` aplica el nivel predeterminado. Niveles posibles: \[1, 12\]. Rango de nivel recomendado: \[4, 9\]. 
+- `ZSTD[(level)]` — [Algoritmo de compresión ZSTD](https://en.wikipedia.org/wiki/Zstandard) con configurable `level`. Niveles posibles: \[1, 22\]. Valor predeterminado: 1. + +Los altos niveles de compresión son útiles para escenarios asimétricos, como comprimir una vez, descomprimir repetidamente. Los niveles más altos significan una mejor compresión y un mayor uso de la CPU. + +## Tablas temporales {#temporary-tables} + +ClickHouse admite tablas temporales que tienen las siguientes características: + +- Las tablas temporales desaparecen cuando finaliza la sesión, incluso si se pierde la conexión. +- Una tabla temporal solo utiliza el motor de memoria. +- No se puede especificar la base de datos para una tabla temporal. Se crea fuera de las bases de datos. +- Imposible crear una tabla temporal con consulta DDL distribuida en todos los servidores de clúster (mediante `ON CLUSTER`): esta tabla sólo existe en la sesión actual. +- Si una tabla temporal tiene el mismo nombre que otra y una consulta especifica el nombre de la tabla sin especificar la base de datos, se utilizará la tabla temporal. +- Para el procesamiento de consultas distribuidas, las tablas temporales utilizadas en una consulta se pasan a servidores remotos. + +Para crear una tabla temporal, utilice la siguiente sintaxis: + +``` sql +CREATE TEMPORARY TABLE [IF NOT EXISTS] table_name +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) +``` + +En la mayoría de los casos, las tablas temporales no se crean manualmente, sino cuando se utilizan datos externos para una consulta o para `(GLOBAL) IN`. Para obtener más información, consulte las secciones correspondientes + +Es posible utilizar tablas con [MOTOR = Memoria](../operations/table_engines/memory.md) es lugar de tablas temporales. + +## Consultas DDL distribuidas (cláusula ON CLUSTER) {#distributed-ddl-queries-on-cluster-clause} + +El `CREATE`, `DROP`, `ALTER`, y `RENAME` las consultas admiten la ejecución distribuida en un clúster. +Por ejemplo, la siguiente consulta crea el `all_hits` `Distributed` la tabla en cada host `cluster`: + +``` sql +CREATE TABLE IF NOT EXISTS all_hits ON CLUSTER cluster (p Date, i Int32) ENGINE = Distributed(cluster, default, hits) +``` + +Para ejecutar estas consultas correctamente, cada host debe tener la misma definición de clúster (para simplificar la sincronización de configuraciones, puede usar sustituciones de ZooKeeper). También deben conectarse a los servidores ZooKeeper. +La versión local de la consulta finalmente se implementará en cada host del clúster, incluso si algunos hosts no están disponibles actualmente. El orden para ejecutar consultas dentro de un único host está garantizado. + +## CREAR VISTA {#create-view} + +``` sql +CREATE [MATERIALIZED] VIEW [IF NOT EXISTS] [db.]table_name [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ... +``` + +Crea una vista. Hay dos tipos de vistas: normal y MATERIALIZADO. + +Las vistas normales no almacenan ningún dato, sino que solo realizan una lectura desde otra tabla. En otras palabras, una vista normal no es más que una consulta guardada. Al leer desde una vista, esta consulta guardada se utiliza como una subconsulta en la cláusula FROM. + +Como ejemplo, suponga que ha creado una vista: + +``` sql +CREATE VIEW view AS SELECT ... +``` + +y escribió una consulta: + +``` sql +SELECT a, b, c FROM view +``` + +Esta consulta es totalmente equivalente a usar la subconsulta: + +``` sql +SELECT a, b, c FROM (SELECT ...) 
+``` + +Las vistas materializadas almacenan datos transformados por la consulta SELECT correspondiente. + +Al crear una vista materializada sin `TO [db].[table]`, debe especificar ENGINE – el motor de tabla para almacenar datos. + +Al crear una vista materializada con `TO [db].[table]` usted no debe usar `POPULATE`. + +Una vista materializada se organiza de la siguiente manera: al insertar datos en la tabla especificada en SELECT, parte de los datos insertados se convierte mediante esta consulta SELECT y el resultado se inserta en la vista. + +Si especifica POPULATE, los datos de tabla existentes se insertan en la vista al crearlos, como si `CREATE TABLE ... AS SELECT ...` . De lo contrario, la consulta solo contiene los datos insertados en la tabla después de crear la vista. No recomendamos usar POPULATE, ya que los datos insertados en la tabla durante la creación de la vista no se insertarán en ella. + +Naciones `SELECT` Consulta puede contener `DISTINCT`, `GROUP BY`, `ORDER BY`, `LIMIT`… Tenga en cuenta que las conversiones correspondientes se realizan de forma independiente en cada bloque de datos insertados. Por ejemplo, si `GROUP BY` se establece, los datos se agregan durante la inserción, pero solo dentro de un solo paquete de datos insertados. Los datos no se agregarán más. La excepción es cuando se utiliza un ENGINE que realiza de forma independiente la agregación de datos, como `SummingMergeTree`. + +La ejecución de `ALTER` las consultas sobre vistas materializadas no se han desarrollado completamente, por lo que podrían ser inconvenientes. Si la vista materializada utiliza la construcción `TO [db.]name` puede `DETACH` la vista, ejecutar `ALTER` para la tabla de destino, y luego `ATTACH` el previamente separado (`DETACH`) vista. + +Las vistas tienen el mismo aspecto que las tablas normales. Por ejemplo, se enumeran en el resultado de la `SHOW TABLES` consulta. + +No hay una consulta independiente para eliminar vistas. Para eliminar una vista, utilice `DROP TABLE`. + +## CREAR DICCIONARIO {#create-dictionary-query} + +``` sql +CREATE DICTIONARY [IF NOT EXISTS] [db.]dictionary_name +( + key1 type1 [DEFAULT|EXPRESSION expr1] [HIERARCHICAL|INJECTIVE|IS_OBJECT_ID], + key2 type2 [DEFAULT|EXPRESSION expr2] [HIERARCHICAL|INJECTIVE|IS_OBJECT_ID], + attr1 type2 [DEFAULT|EXPRESSION expr3], + attr2 type2 [DEFAULT|EXPRESSION expr4] +) +PRIMARY KEY key1, key2 +SOURCE(SOURCE_NAME([param1 value1 ... paramN valueN])) +LAYOUT(LAYOUT_NAME([param_name param_value])) +LIFETIME([MIN val1] MAX val2) +``` + +Crear [diccionario externo](dicts/external_dicts.md) con dado [estructura](dicts/external_dicts_dict_structure.md), [fuente](dicts/external_dicts_dict_sources.md), [diseño](dicts/external_dicts_dict_layout.md) y [vida](dicts/external_dicts_dict_lifetime.md). + +La estructura del diccionario externo consta de atributos. Los atributos de diccionario se especifican de manera similar a las columnas de la tabla. La única propiedad de atributo requerida es su tipo, todas las demás propiedades pueden tener valores predeterminados. + +Dependiendo del diccionario [diseño](dicts/external_dicts_dict_layout.md) se pueden especificar uno o más atributos como claves de diccionario. + +Para obtener más información, consulte [Diccionarios externos](dicts/external_dicts.md) apartado. 
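+
+Como esbozo orientativo (el diccionario `paises_dict`, su origen y sus columnas son hipotéticos), una definición concreta podría ser:
+
+``` sql
+-- Esbozo ilustrativo: el diccionario y la tabla de origen son hipotéticos.
+CREATE DICTIONARY IF NOT EXISTS paises_dict
+(
+    id UInt64,
+    nombre String DEFAULT '',
+    poblacion UInt64 DEFAULT 0
+)
+PRIMARY KEY id
+SOURCE(CLICKHOUSE(host 'localhost' port 9000 user 'default' password '' db 'default' table 'paises'))
+LAYOUT(HASHED())
+LIFETIME(MIN 300 MAX 360)
+```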
+ +[Artículo Original](https://clickhouse.tech/docs/es/query_language/create/) diff --git a/docs/es/query_language/dicts/external_dicts.md b/docs/es/query_language/dicts/external_dicts.md new file mode 100644 index 00000000000..28e408b57e2 --- /dev/null +++ b/docs/es/query_language/dicts/external_dicts.md @@ -0,0 +1,49 @@ +# Diccionarios externos {#dicts-external-dicts} + +Puede agregar sus propios diccionarios de varias fuentes de datos. El origen de datos de un diccionario puede ser un archivo ejecutable o de texto local, un recurso HTTP u otro DBMS. Para obtener más información, consulte “[Fuentes para diccionarios externos](external_dicts_dict_sources.md)”. + +Haga clic en Casa: + +- Almacena total o parcialmente los diccionarios en RAM. +- Actualiza periódicamente los diccionarios y carga dinámicamente los valores que faltan. En otras palabras, los diccionarios se pueden cargar dinámicamente. +- Permite crear diccionarios externos con archivos xml o [Consultas DDL](../create.md#create-dictionary-query). + +La configuración de diccionarios externos se puede ubicar en uno o más archivos xml. La ruta de acceso a la configuración se especifica en el [Diccionarios\_config](../../operations/server_settings/settings.md#server_settings-dictionaries_config) parámetro. + +Los diccionarios se pueden cargar en el inicio del servidor o en el primer uso, dependiendo de la [Diccionarios\_lazy\_load](../../operations/server_settings/settings.md#server_settings-dictionaries_lazy_load) configuración. + +El archivo de configuración del diccionario tiene el siguiente formato: + +``` xml + + An optional element with any content. Ignored by the ClickHouse server. + + + /etc/metrika.xml + + + + + + + + +``` + +Usted puede [configurar](external_dicts_dict.md) cualquier número de diccionarios en el mismo archivo. + +[Consultas DDL para diccionarios](../create.md#create-dictionary-query) no requiere ningún registro adicional en la configuración del servidor. Permiten trabajar con diccionarios como entidades de primera clase, como tablas o vistas. + +!!! attention "Atención" + Puede convertir valores para un diccionario pequeño describiéndolo en un `SELECT` Consulta (ver el [Ciudad](../functions/other_functions.md) función). Esta funcionalidad no está relacionada con diccionarios externos. 
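+
+Una vez cargado, un diccionario externo se consulta habitualmente con las funciones `dictGet*`; por ejemplo (el nombre del diccionario y sus atributos son hipotéticos):
+
+``` sql
+-- Esbozo ilustrativo: 'paises_dict' y sus atributos son hipotéticos.
+SELECT
+    dictGetString('paises_dict', 'nombre', toUInt64(42)) AS nombre,
+    dictGetUInt64('paises_dict', 'poblacion', toUInt64(42)) AS poblacion
+```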
+
+## Ver también {#ext-dicts-see-also}
+
+- [Configuración de un diccionario externo](external_dicts_dict.md)
+- [Almacenamiento de diccionarios en la memoria](external_dicts_dict_layout.md)
+- [Actualizaciones del diccionario](external_dicts_dict_lifetime.md)
+- [Fuentes de diccionarios externos](external_dicts_dict_sources.md)
+- [Clave y campos del diccionario](external_dicts_dict_structure.md)
+- [Funciones para trabajar con diccionarios externos](../functions/ext_dict_functions.md)
+
+[Artículo Original](https://clickhouse.tech/docs/es/query_language/dicts/external_dicts/)
diff --git a/docs/es/query_language/dicts/external_dicts_dict.md b/docs/es/query_language/dicts/external_dicts_dict.md
new file mode 100644
index 00000000000..1614a78fb6b
--- /dev/null
+++ b/docs/es/query_language/dicts/external_dicts_dict.md
@@ -0,0 +1,46 @@
+# Configuración de un diccionario externo {#dicts-external-dicts-dict}
+
+Si el diccionario se configura usando un archivo xml, la configuración del diccionario tiene la siguiente estructura:
+
+``` xml
+<dictionary>
+    <name>dict_name</name>
+
+    <structure>
+      <!-- Complex key configuration -->
+    </structure>
+
+    <source>
+      <!-- Source configuration -->
+    </source>
+
+    <layout>
+      <!-- Memory layout configuration -->
+    </layout>
+
+    <lifetime>
+      <!-- Lifetime of dictionary in memory -->
+    </lifetime>
+</dictionary>
+```
+
+La [Consulta DDL](../create.md#create-dictionary-query) correspondiente tiene la siguiente estructura:
+
+``` sql
+CREATE DICTIONARY dict_name
+(
+    ... -- attributes
+)
+PRIMARY KEY ... -- complex or single key configuration
+SOURCE(...) -- Source configuration
+LAYOUT(...) -- Memory layout configuration
+LIFETIME(...) -- Lifetime of dictionary in memory
+```
+
+- `name` – El identificador que se puede utilizar para acceder al diccionario. Utilice los caracteres `[a-zA-Z0-9_\-]`.
+- [fuente](external_dicts_dict_sources.md) — Fuente del diccionario.
+- [diseño](external_dicts_dict_layout.md) — Diseño del diccionario en la memoria.
+- [estructura](external_dicts_dict_structure.md) — Estructura del diccionario. Una clave y los atributos que se pueden recuperar con esa clave.
+- [vida](external_dicts_dict_lifetime.md) — Frecuencia de las actualizaciones del diccionario.
+
+[Artículo Original](https://clickhouse.tech/docs/es/query_language/dicts/external_dicts_dict/)
diff --git a/docs/es/query_language/dicts/external_dicts_dict_hierarchical.md b/docs/es/query_language/dicts/external_dicts_dict_hierarchical.md
new file mode 100644
index 00000000000..095a3cb2b78
--- /dev/null
+++ b/docs/es/query_language/dicts/external_dicts_dict_hierarchical.md
@@ -0,0 +1,63 @@
+# Diccionarios jerárquicos {#hierarchical-dictionaries}
+
+ClickHouse soporta diccionarios jerárquicos con una [clave numérica](external_dicts_dict_structure.md#ext_dict-numeric-key).
+
+Considere la siguiente estructura jerárquica:
+
+``` text
+0 (Common parent)
+│
+├── 1 (Russia)
+│   │
+│   └── 2 (Moscow)
+│       │
+│       └── 3 (Center)
+│
+└── 4 (Great Britain)
+    │
+    └── 5 (London)
+```
+
+Esta jerarquía se puede expresar como la siguiente tabla de diccionario.
+
+| region\_id | parent\_region | region\_name |
+|------------|----------------|--------------|
+| 1          | 0              | Rusia        |
+| 2          | 1              | Moscú        |
+| 3          | 2              | Centro       |
+| 4          | 0              | Gran Bretaña |
+| 5          | 4              | Londres      |
+
+Esta tabla contiene una columna `parent_region` con la clave del padre más cercano del elemento.
+
+ClickHouse soporta la propiedad [hierarchical](external_dicts_dict_structure.md#hierarchical-dict-attr) para los atributos de un [diccionario externo](index.md). Esta propiedad le permite configurar un diccionario jerárquico similar al descrito anteriormente.
+
+La función [dictGetHierarchy](../functions/ext_dict_functions.md#dictgethierarchy) le permite obtener la cadena de padres de un elemento.
+
+Para nuestro ejemplo, la estructura del diccionario puede ser la siguiente:
+
+``` xml
+<dictionary>
+    <structure>
+        <id>
+            <name>region_id</name>
+        </id>
+
+        <attribute>
+            <name>parent_region</name>
+            <type>UInt64</type>
+            <null_value>0</null_value>
+            <hierarchical>true</hierarchical>
+        </attribute>
+
+        <attribute>
+            <name>region_name</name>
+            <type>String</type>
+            <null_value></null_value>
+        </attribute>
+
+    </structure>
+</dictionary>
+```
+
+[Artículo Original](https://clickhouse.tech/docs/es/query_language/dicts/external_dicts_dict_hierarchical/)
diff --git a/docs/es/query_language/dicts/external_dicts_dict_layout.md b/docs/es/query_language/dicts/external_dicts_dict_layout.md
new file mode 100644
index 00000000000..fffabbf76e8
--- /dev/null
+++ b/docs/es/query_language/dicts/external_dicts_dict_layout.md
@@ -0,0 +1,366 @@
+# Almacenamiento de diccionarios en la memoria {#dicts-external-dicts-dict-layout}
+
+Hay diversas formas de almacenar diccionarios en la memoria.
+
+Recomendamos [flat](#flat), [hashed](#dicts-external_dicts_dict_layout-hashed) y [complex\_key\_hashed](#complex-key-hashed), que proporcionan una velocidad de procesamiento óptima.
+
+No se recomienda el almacenamiento en caché debido al rendimiento potencialmente bajo y a la dificultad de seleccionar los parámetros óptimos. Lea más en la sección “[cache](#cache)”.
+
+Hay varias formas de mejorar el rendimiento del diccionario:
+
+- Llame a la función para trabajar con el diccionario después de `GROUP BY`.
+- Marque los atributos que se van a extraer como inyectivos. Un atributo se denomina inyectivo si diferentes valores de atributo corresponden a claves diferentes. Entonces, cuando `GROUP BY` utiliza una función que obtiene un valor de atributo mediante la clave, esta función se saca automáticamente de `GROUP BY`.
+
+ClickHouse genera una excepción para los errores con diccionarios. Ejemplos de errores:
+
+- No se pudo cargar el diccionario al que se accede.
+- Error al consultar un diccionario `cached`.
+
+Puede ver la lista de diccionarios externos y sus estados en la tabla `system.dictionaries`.
+
+La configuración se ve así:
+
+``` xml
+<yandex>
+    <dictionary>
+        ...
+        <layout>
+            <layout_type>
+                <!-- layout settings -->
+            </layout_type>
+        </layout>
+        ...
+    </dictionary>
+</yandex>
+```
+
+[Consulta DDL](../create.md#create-dictionary-query) correspondiente:
+
+``` sql
+CREATE DICTIONARY (...)
+...
+LAYOUT(LAYOUT_TYPE(param value)) -- layout settings
+...
+```
+
+## Maneras de almacenar diccionarios en la memoria {#ways-to-store-dictionaries-in-memory}
+
+- [flat](#flat)
+- [hashed](#dicts-external_dicts_dict_layout-hashed)
+- [sparse\_hashed](#dicts-external_dicts_dict_layout-sparse_hashed)
+- [cache](#cache)
+- [range\_hashed](#range-hashed)
+- [complex\_key\_hashed](#complex-key-hashed)
+- [complex\_key\_cache](#complex-key-cache)
+- [ip\_trie](#ip-trie)
+
+### flat {#flat}
+
+El diccionario se almacena completamente en memoria en forma de matrices planas. La cantidad de memoria utilizada es proporcional al tamaño de la clave más grande (en el espacio utilizado).
+
+La clave del diccionario es de tipo `UInt64` y su valor está limitado a 500.000. Si al crear el diccionario se descubre una clave mayor, ClickHouse genera una excepción y no crea el diccionario.
+
+Se admiten todos los tipos de fuentes. Al actualizar, los datos (de un archivo o de una tabla) se leen en su totalidad.
+
+Este método proporciona el mejor rendimiento entre todos los métodos disponibles para almacenar el diccionario.
+ +Ejemplo de configuración: + +``` xml + + + +``` + +o + +``` sql +LAYOUT(FLAT()) +``` + +### Hashed {#dicts-external-dicts-dict-layout-hashed} + +El diccionario está completamente almacenado en la memoria en forma de una tabla hash. El diccionario puede contener cualquier número de elementos con cualquier identificador En la práctica, el número de claves puede alcanzar decenas de millones de elementos. + +Se admiten todos los tipos de fuentes. Al actualizar, los datos (de un archivo o de una tabla) se leen en su totalidad. + +Ejemplo de configuración: + +``` xml + + + +``` + +o + +``` sql +LAYOUT(HASHED()) +``` + +### Sistema abierto. {#dicts-external-dicts-dict-layout-sparse-hashed} + +Similar a `hashed`, pero usa menos memoria a favor más uso de CPU. + +Ejemplo de configuración: + +``` xml + + + +``` + +``` sql +LAYOUT(SPARSE_HASHED()) +``` + +### Método de codificación de datos: {#complex-key-hashed} + +Este tipo de almacenamiento es para su uso con material compuesto [claves](external_dicts_dict_structure.md). Similar a `hashed`. + +Ejemplo de configuración: + +``` xml + + + +``` + +``` sql +LAYOUT(COMPLEX_KEY_HASHED()) +``` + +### range\_hashed {#range-hashed} + +El diccionario se almacena en la memoria en forma de una tabla hash con una matriz ordenada de rangos y sus valores correspondientes. + +Este método de almacenamiento funciona de la misma manera que hash y permite el uso de intervalos de fecha / hora (tipo numérico arbitrario) además de la clave. + +Ejemplo: La tabla contiene descuentos para cada anunciante en el formato: + +``` text ++---------------|---------------------|-------------------|--------+ +| advertiser id | discount start date | discount end date | amount | ++===============+=====================+===================+========+ +| 123 | 2015-01-01 | 2015-01-15 | 0.15 | ++---------------|---------------------|-------------------|--------+ +| 123 | 2015-01-16 | 2015-01-31 | 0.25 | ++---------------|---------------------|-------------------|--------+ +| 456 | 2015-01-01 | 2015-01-15 | 0.05 | ++---------------|---------------------|-------------------|--------+ +``` + +Para utilizar un ejemplo para intervalos de fechas, defina el `range_min` y `range_max` elementos en el [estructura](external_dicts_dict_structure.md). Estos elementos deben contener elementos `name` y`type` (si `type` no se especifica, se utilizará el tipo predeterminado - Fecha). `type` puede ser de cualquier tipo numérico (Fecha / DateTime / UInt64 / Int32 / otros). + +Ejemplo: + +``` xml + + + Id + + + first + Date + + + last + Date + + ... +``` + +o + +``` sql +CREATE DICTIONARY somedict ( + id UInt64, + first Date, + last Date +) +PRIMARY KEY id +LAYOUT(RANGE_HASHED()) +RANGE(MIN first MAX last) +``` + +Para trabajar con estos diccionarios, debe pasar un argumento adicional al `dictGetT` función, para la que se selecciona un rango: + +``` sql +dictGetT('dict_name', 'attr_name', id, date) +``` + +Esta función devuelve el valor para el `id`s y el intervalo de fechas que incluye la fecha pasada. + +Detalles del algoritmo: + +- Si el `id` no se encuentra o no se encuentra un rango para el `id` devuelve el valor predeterminado para el diccionario. +- Si hay rangos superpuestos, puede usar cualquiera. +- Si el delimitador de rango es `NULL` o una fecha no válida (como 1900-01-01 o 2039-01-01), el rango se deja abierto. La gama puede estar abierta en ambos lados. + +Ejemplo de configuración: + +``` xml + + + + ... 
+ + + + + + + + Abcdef + + + StartTimeStamp + UInt64 + + + EndTimeStamp + UInt64 + + + XXXType + String + + + + + + +``` + +o + +``` sql +CREATE DICTIONARY somedict( + Abcdef UInt64, + StartTimeStamp UInt64, + EndTimeStamp UInt64, + XXXType String DEFAULT '' +) +PRIMARY KEY Abcdef +RANGE(MIN StartTimeStamp MAX EndTimeStamp) +``` + +### cache {#cache} + +El diccionario se almacena en una memoria caché que tiene un número fijo de celdas. Estas celdas contienen elementos de uso frecuente. + +Al buscar un diccionario, primero se busca en la memoria caché. Para cada bloque de datos, todas las claves que no se encuentran en la memoria caché o están desactualizadas se solicitan desde el origen utilizando `SELECT attrs... FROM db.table WHERE id IN (k1, k2, ...)`. Los datos recibidos se escriben en la memoria caché. + +Para los diccionarios de caché, la caducidad [vida](external_dicts_dict_lifetime.md) de datos en la memoria caché se puede establecer. Si más tiempo que `lifetime` ha pasado desde que se cargaron los datos en una celda, el valor de la celda no se usa y se vuelve a solicitar la próxima vez que se deba usar. +Esta es la menos efectiva de todas las formas de almacenar diccionarios. La velocidad de la memoria caché depende en gran medida de la configuración correcta y del escenario de uso. Un diccionario de tipo de caché funciona bien solo cuando las tasas de aciertos son lo suficientemente altas (recomendado 99% y superior). Puede ver la tasa de aciertos promedio en el `system.dictionaries` tabla. + +Para mejorar el rendimiento de la caché, utilice una subconsulta con `LIMIT`, y llame a la función con el diccionario externamente. + +Apoyar [fuente](external_dicts_dict_sources.md): MySQL, ClickHouse, ejecutable, HTTP. + +Ejemplo de configuración: + +``` xml + + + + 1000000000 + + +``` + +o + +``` sql +LAYOUT(CACHE(SIZE_IN_CELLS 1000000000)) +``` + +Establezca un tamaño de caché lo suficientemente grande. Necesitas experimentar para seleccionar el número de celdas: + +1. Establecer algún valor. +2. Ejecute consultas hasta que la memoria caché esté completamente llena. +3. Evalúe el consumo de memoria utilizando el `system.dictionaries` tabla. +4. Aumente o disminuya el número de celdas hasta que se alcance el consumo de memoria requerido. + +!!! warning "Advertencia" + No use ClickHouse como fuente, ya que es lento procesar consultas con lecturas aleatorias. + +### complejo\_key\_cache {#complex-key-cache} + +Este tipo de almacenamiento es para su uso con material compuesto [claves](external_dicts_dict_structure.md). Similar a `cache`. + +### Método de codificación de datos: {#ip-trie} + +Este tipo de almacenamiento sirve para asignar prefijos de red (direcciones IP) a metadatos como ASN. + +Ejemplo: La tabla contiene prefijos de red y su correspondiente número AS y código de país: + +``` text + +-----------------|-------|--------+ + | prefix | asn | cca2 | + +=================+=======+========+ + | 202.79.32.0/20 | 17501 | NP | + +-----------------|-------|--------+ + | 2620:0:870::/48 | 3856 | US | + +-----------------|-------|--------+ + | 2a02:6b8:1::/48 | 13238 | RU | + +-----------------|-------|--------+ + | 2001:db8::/32 | 65536 | ZZ | + +-----------------|-------|--------+ +``` + +Cuando se utiliza este tipo de diseño, la estructura debe tener una clave compuesta. + +Ejemplo: + +``` xml + + + + prefix + String + + + + asn + UInt32 + + + + cca2 + String + ?? + + ... +``` + +o + +``` sql +CREATE DICTIONARY somedict ( + prefix String, + asn UInt32, + cca2 String DEFAULT '??' 
+) +PRIMARY KEY prefix +``` + +La clave debe tener solo un atributo de tipo String que contenga un prefijo IP permitido. Todavía no se admiten otros tipos. + +Para consultas, debe utilizar las mismas funciones (`dictGetT` con una tupla) como para diccionarios con claves compuestas: + +``` sql +dictGetT('dict_name', 'attr_name', tuple(ip)) +``` + +La función toma cualquiera `UInt32` para IPv4, o `FixedString(16)` para IPv6: + +``` sql +dictGetString('prefix', 'asn', tuple(IPv6StringToNum('2001:db8::1'))) +``` + +Todavía no se admiten otros tipos. La función devuelve el atributo para el prefijo que corresponde a esta dirección IP. Si hay prefijos superpuestos, se devuelve el más específico. + +Los datos se almacenan en un `trie`. Debe encajar completamente en la RAM. + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/dicts/external_dicts_dict_layout/) diff --git a/docs/es/query_language/dicts/external_dicts_dict_lifetime.md b/docs/es/query_language/dicts/external_dicts_dict_lifetime.md new file mode 100644 index 00000000000..cbea58f8dd4 --- /dev/null +++ b/docs/es/query_language/dicts/external_dicts_dict_lifetime.md @@ -0,0 +1,79 @@ +# Actualizaciones del diccionario {#dictionary-updates} + +ClickHouse actualiza periódicamente los diccionarios. El intervalo de actualización para los diccionarios completamente descargados y el intervalo de invalidación para los diccionarios almacenados en caché se `` etiqueta en segundos. + +Las actualizaciones del diccionario (aparte de la carga para el primer uso) no bloquean las consultas. Durante las actualizaciones, se utiliza la versión anterior de un diccionario. Si se produce un error durante una actualización, el error se escribe en el registro del servidor y las consultas continúan utilizando la versión anterior de los diccionarios. + +Ejemplo de configuración: + +``` xml + + ... + 300 + ... + +``` + +``` sql +CREATE DICTIONARY (...) +... +LIFETIME(300) +... +``` + +Configuración `0` (`LIFETIME(0)`) impide que los diccionarios se actualicen. + +Puede establecer un intervalo de tiempo para las actualizaciones, y ClickHouse elegirá un tiempo uniformemente aleatorio dentro de este rango. Esto es necesario para distribuir la carga en la fuente del diccionario cuando se actualiza en una gran cantidad de servidores. + +Ejemplo de configuración: + +``` xml + + ... + + 300 + 360 + + ... + +``` + +o + +``` sql +LIFETIME(MIN 300 MAX 360) +``` + +Al actualizar los diccionarios, el servidor ClickHouse aplica una lógica diferente según el tipo de [fuente](external_dicts_dict_sources.md): + +- Para un archivo de texto, comprueba el tiempo de modificación. Si la hora difiere de la hora previamente grabada, el diccionario se actualiza. +- Para las tablas MyISAM, el tiempo de modificación se comprueba utilizando un `SHOW TABLE STATUS` consulta. +- Los diccionarios de otras fuentes se actualizan cada vez de forma predeterminada. + +Para fuentes MySQL (InnoDB), ODBC y ClickHouse, puede configurar una consulta que actualizará los diccionarios solo si realmente cambiaron, en lugar de cada vez. Para ello, siga estos pasos: + +- La tabla del diccionario debe tener un campo que siempre cambie cuando se actualizan los datos de origen. +- La configuración del origen debe especificar una consulta que recupere el campo de cambio. El servidor ClickHouse interpreta el resultado de la consulta como una fila, y si esta fila ha cambiado en relación con su estado anterior, el diccionario se actualiza. 
Especifique la consulta en el `` en la configuración de la [fuente](external_dicts_dict_sources.md). + +Ejemplo de configuración: + +``` xml + + ... + + ... + SELECT update_time FROM dictionary_source where id = 1 + + ... + +``` + +o + +``` sql +... +SOURCE(ODBC(... invalidate_query 'SELECT update_time FROM dictionary_source where id = 1')) +... +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/dicts/external_dicts_dict_lifetime/) diff --git a/docs/es/query_language/dicts/external_dicts_dict_sources.md b/docs/es/query_language/dicts/external_dicts_dict_sources.md new file mode 100644 index 00000000000..3507dbff580 --- /dev/null +++ b/docs/es/query_language/dicts/external_dicts_dict_sources.md @@ -0,0 +1,601 @@ +# Fuentes de diccionarios externos {#dicts-external-dicts-dict-sources} + +Un diccionario externo se puede conectar desde muchas fuentes diferentes. + +Si el diccionario se configura usando xml-file, la configuración se ve así: + +``` xml + + + ... + + + + + + ... + + ... + +``` + +En caso de [Consulta DDL](../create.md#create-dictionary-query), la configuración igual parecerá: + +``` sql +CREATE DICTIONARY dict_name (...) +... +SOURCE(SOURCE_TYPE(param1 val1 ... paramN valN)) -- Source configuration +... +``` + +El origen está configurado en el `source` apartado. + +Tipos de fuentes (`source_type`): + +- [Archivo Local](#dicts-external_dicts_dict_sources-local_file) +- [Archivo ejecutable](#dicts-external_dicts_dict_sources-executable) +- [HTTP(s))](#dicts-external_dicts_dict_sources-http) +- DBMS + - [ODBC](#dicts-external_dicts_dict_sources-odbc) + - [MySQL](#dicts-external_dicts_dict_sources-mysql) + - [Haga clic en Casa](#dicts-external_dicts_dict_sources-clickhouse) + - [MongoDB](#dicts-external_dicts_dict_sources-mongodb) + - [Redis](#dicts-external_dicts_dict_sources-redis) + +## Archivo Local {#dicts-external-dicts-dict-sources-local-file} + +Ejemplo de configuración: + +``` xml + + + /opt/dictionaries/os.tsv + TabSeparated + + +``` + +o + +``` sql +SOURCE(FILE(path '/opt/dictionaries/os.tsv' format 'TabSeparated')) +``` + +Configuración de campos: + +- `path` – The absolute path to the file. +- `format` – The file format. All the formats described in “[Formato](../../interfaces/formats.md#formats)” son compatibles. + +## Archivo ejecutable {#dicts-external-dicts-dict-sources-executable} + +Trabajar con archivos ejecutables depende de [cómo se almacena el diccionario en la memoria](external_dicts_dict_layout.md). Si el diccionario se almacena usando `cache` y `complex_key_cache`, ClickHouse requests the necessary keys by sending a request to the executable file’s STDIN. Otherwise, ClickHouse starts executable file and treats its output as dictionary data. + +Ejemplo de configuración: + +``` xml + + + cat /opt/dictionaries/os.tsv + TabSeparated + + +``` + +o + +``` sql +SOURCE(EXECUTABLE(command 'cat /opt/dictionaries/os.tsv' format 'TabSeparated')) +``` + +Configuración de campos: + +- `command` – The absolute path to the executable file, or the file name (if the program directory is written to `PATH`). +- `format` – The file format. All the formats described in “[Formato](../../interfaces/formats.md#formats)” son compatibles. + +## HTTP(s)) {#dicts-external-dicts-dict-sources-http} + +Trabajar con un servidor HTTP depende de [cómo se almacena el diccionario en la memoria](external_dicts_dict_layout.md). 
Si el diccionario se almacena usando `cache` y `complex_key_cache`, ClickHouse solicita las claves necesarias enviando una solicitud a través del `POST` método. + +Ejemplo de configuración: + +``` xml + + + http://[::1]/os.tsv + TabSeparated + + user + password + + +
+ API-KEY + key +
+
+
+ +``` + +o + +``` sql +SOURCE(HTTP( + url 'http://[::1]/os.tsv' + format 'TabSeparated' + credentials(user 'user' password 'password') + headers(header(name 'API-KEY' value 'key')) +)) +``` + +Para que ClickHouse tenga acceso a un recurso HTTPS, debe [configurar openSSL](../../operations/server_settings/settings.md#server_settings-openssl) en la configuración del servidor. + +Configuración de campos: + +- `url` – The source URL. +- `format` – The file format. All the formats described in “[Formato](../../interfaces/formats.md#formats)” son compatibles. +- `credentials` – Basic HTTP authentication. Optional parameter. + - `user` – Username required for the authentication. + - `password` – Password required for the authentication. +- `headers` – All custom HTTP headers entries used for the HTTP request. Optional parameter. + - `header` – Single HTTP header entry. + - `name` – Identifiant name used for the header send on the request. + - `value` – Value set for a specific identifiant name. + +## ODBC {#dicts-external-dicts-dict-sources-odbc} + +Puede utilizar este método para conectar cualquier base de datos que tenga un controlador ODBC. + +Ejemplo de configuración: + +``` xml + + + DatabaseName + ShemaName.TableName
+ DSN=some_parameters + SQL_QUERY +
+ +``` + +o + +``` sql +SOURCE(ODBC( + db 'DatabaseName' + table 'SchemaName.TableName' + connection_string 'DSN=some_parameters' + invalidate_query 'SQL_QUERY' +)) +``` + +Configuración de campos: + +- `db` – Name of the database. Omit it if the database name is set in the `` parámetros. +- `table` – Name of the table and schema if exists. +- `connection_string` – Connection string. +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Actualización de diccionarios](external_dicts_dict_lifetime.md). + +ClickHouse receives quoting symbols from ODBC-driver and quote all settings in queries to driver, so it’s necessary to set table name accordingly to table name case in database. + +Si tiene problemas con las codificaciones al utilizar Oracle, consulte el [Preguntas frecuentes](../../faq/general.md#oracle-odbc-encodings) artículo. + +### Vulnerabilidad conocida de la funcionalidad del diccionario ODBC {#known-vulnerability-of-the-odbc-dictionary-functionality} + +!!! attention "Atención" + Cuando se conecta a la base de datos a través del parámetro de conexión del controlador ODBC `Servername` puede ser sustituido. En este caso los valores de `USERNAME` y `PASSWORD` de `odbc.ini` se envían al servidor remoto y pueden verse comprometidos. + +**Ejemplo de uso inseguro** + +Let’s configure unixODBC for PostgreSQL. Content of `/etc/odbc.ini`: + +``` text +[gregtest] +Driver = /usr/lib/psqlodbca.so +Servername = localhost +PORT = 5432 +DATABASE = test_db +#OPTION = 3 +USERNAME = test +PASSWORD = test +``` + +Si luego realiza una consulta como + +``` sql +SELECT * FROM odbc('DSN=gregtest;Servername=some-server.com', 'test_db'); +``` + +El controlador ODBC enviará valores de `USERNAME` y `PASSWORD` de `odbc.ini` a `some-server.com`. + +### Ejemplo de conexión de PostgreSQL {#example-of-connecting-postgresql} + +Sistema operativo Ubuntu. + +Instalación de unixODBC y el controlador ODBC para PostgreSQL: + +``` bash +$ sudo apt-get install -y unixodbc odbcinst odbc-postgresql +``` + +Configuración `/etc/odbc.ini` (o `~/.odbc.ini`): + +``` text + [DEFAULT] + Driver = myconnection + + [myconnection] + Description = PostgreSQL connection to my_db + Driver = PostgreSQL Unicode + Database = my_db + Servername = 127.0.0.1 + UserName = username + Password = password + Port = 5432 + Protocol = 9.3 + ReadOnly = No + RowVersioning = No + ShowSystemTables = No + ConnSettings = +``` + +La configuración del diccionario en ClickHouse: + +``` xml + + + table_name + + + + + DSN=myconnection + postgresql_table
+
+ + + 300 + 360 + + + + + + + id + + + some_column + UInt64 + 0 + + +
+
+``` + +o + +``` sql +CREATE DICTIONARY table_name ( + id UInt64, + some_column UInt64 DEFAULT 0 +) +PRIMARY KEY id +SOURCE(ODBC(connection_string 'DSN=myconnection' table 'postgresql_table')) +LAYOUT(HASHED()) +LIFETIME(MIN 300 MAX 360) +``` + +Es posible que tenga que editar `odbc.ini` para especificar la ruta completa a la biblioteca con el controlador `DRIVER=/usr/local/lib/psqlodbcw.so`. + +### Ejemplo de conexión de MS SQL Server {#example-of-connecting-ms-sql-server} + +Sistema operativo Ubuntu. + +Instalación del controlador: : + +``` bash +$ sudo apt-get install tdsodbc freetds-bin sqsh +``` + +Configuración del controlador: + +``` bash + $ cat /etc/freetds/freetds.conf + ... + + [MSSQL] + host = 192.168.56.101 + port = 1433 + tds version = 7.0 + client charset = UTF-8 + + $ cat /etc/odbcinst.ini + ... + + [FreeTDS] + Description = FreeTDS + Driver = /usr/lib/x86_64-linux-gnu/odbc/libtdsodbc.so + Setup = /usr/lib/x86_64-linux-gnu/odbc/libtdsS.so + FileUsage = 1 + UsageCount = 5 + + $ cat ~/.odbc.ini + ... + + [MSSQL] + Description = FreeTDS + Driver = FreeTDS + Servername = MSSQL + Database = test + UID = test + PWD = test + Port = 1433 +``` + +Configuración del diccionario en ClickHouse: + +``` xml + + + test + + + dict
+ DSN=MSSQL;UID=test;PWD=test +
+ + + + 300 + 360 + + + + + + + + + k + + + s + String + + + +
+
+``` + +o + +``` sql +CREATE DICTIONARY test ( + k UInt64, + s String DEFAULT '' +) +PRIMARY KEY k +SOURCE(ODBC(table 'dict' connection_string 'DSN=MSSQL;UID=test;PWD=test')) +LAYOUT(FLAT()) +LIFETIME(MIN 300 MAX 360) +``` + +## DBMS {#dbms} + +### MySQL {#dicts-external-dicts-dict-sources-mysql} + +Ejemplo de configuración: + +``` xml + + + 3306 + clickhouse + qwerty + + example01-1 + 1 + + + example01-2 + 1 + + db_name + table_name
+ id=10 + SQL_QUERY +
+ +``` + +o + +``` sql +SOURCE(MYSQL( + port 3306 + user 'clickhouse' + password 'qwerty' + replica(host 'example01-1' priority 1) + replica(host 'example01-2' priority 1) + db 'db_name' + table 'table_name' + where 'id=10' + invalidate_query 'SQL_QUERY' +)) +``` + +Configuración de campos: + +- `port` – The port on the MySQL server. You can specify it for all replicas, or for each one individually (inside ``). + +- `user` – Name of the MySQL user. You can specify it for all replicas, or for each one individually (inside ``). + +- `password` – Password of the MySQL user. You can specify it for all replicas, or for each one individually (inside ``). + +- `replica` – Section of replica configurations. There can be multiple sections. + + - `replica/host` – The MySQL host. + - `replica/priority` – The replica priority. When attempting to connect, ClickHouse traverses the replicas in order of priority. The lower the number, the higher the priority. + +- `db` – Name of the database. + +- `table` – Name of the table. + +- `where` – The selection criteria. The syntax for conditions is the same as for `WHERE` cláusula en MySQL, por ejemplo, `id > 10 AND id < 20`. Parámetro opcional. + +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Actualización de diccionarios](external_dicts_dict_lifetime.md). + +MySQL se puede conectar en un host local a través de sockets. Para hacer esto, establezca `host` y `socket`. + +Ejemplo de configuración: + +``` xml + + + localhost + /path/to/socket/file.sock + clickhouse + qwerty + db_name + table_name
+ id=10 + SQL_QUERY +
+ +``` + +o + +``` sql +SOURCE(MYSQL( + host 'localhost' + socket '/path/to/socket/file.sock' + user 'clickhouse' + password 'qwerty' + db 'db_name' + table 'table_name' + where 'id=10' + invalidate_query 'SQL_QUERY' +)) +``` + +### Haga clic en Casa {#dicts-external-dicts-dict-sources-clickhouse} + +Ejemplo de configuración: + +``` xml + + + example01-01-1 + 9000 + default + + default + ids
+ id=10 +
+ +``` + +o + +``` sql +SOURCE(CLICKHOUSE( + host 'example01-01-1' + port 9000 + user 'default' + password '' + db 'default' + table 'ids' + where 'id=10' +)) +``` + +Configuración de campos: + +- `host` – The ClickHouse host. If it is a local host, the query is processed without any network activity. To improve fault tolerance, you can create a [Distribuido](../../operations/table_engines/distributed.md) tabla e ingrésela en configuraciones posteriores. +- `port` – The port on the ClickHouse server. +- `user` – Name of the ClickHouse user. +- `password` – Password of the ClickHouse user. +- `db` – Name of the database. +- `table` – Name of the table. +- `where` – The selection criteria. May be omitted. +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Actualización de diccionarios](external_dicts_dict_lifetime.md). + +### MongoDB {#dicts-external-dicts-dict-sources-mongodb} + +Ejemplo de configuración: + +``` xml + + + localhost + 27017 + + + test + dictionary_source + + +``` + +o + +``` sql +SOURCE(MONGO( + host 'localhost' + port 27017 + user '' + password '' + db 'test' + collection 'dictionary_source' +)) +``` + +Configuración de campos: + +- `host` – The MongoDB host. +- `port` – The port on the MongoDB server. +- `user` – Name of the MongoDB user. +- `password` – Password of the MongoDB user. +- `db` – Name of the database. +- `collection` – Name of the collection. + +### Redis {#dicts-external-dicts-dict-sources-redis} + +Ejemplo de configuración: + +``` xml + + + localhost + 6379 + simple + 0 + + +``` + +o + +``` sql +SOURCE(REDIS( + host 'localhost' + port 6379 + storage_type 'simple' + db_index 0 +)) +``` + +Configuración de campos: + +- `host` – The Redis host. +- `port` – The port on the Redis server. +- `storage_type` – The structure of internal Redis storage using for work with keys. `simple` es para fuentes simples y para fuentes de clave única hash, `hash_map` es para fuentes hash con dos teclas. Los orígenes a distancia y los orígenes de caché con clave compleja no son compatibles. Puede omitirse, el valor predeterminado es `simple`. +- `db_index` – The specific numeric index of Redis logical database. May be omitted, default value is 0. + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/dicts/external_dicts_dict_sources/) diff --git a/docs/es/query_language/dicts/external_dicts_dict_structure.md b/docs/es/query_language/dicts/external_dicts_dict_structure.md new file mode 100644 index 00000000000..855973f6dbb --- /dev/null +++ b/docs/es/query_language/dicts/external_dicts_dict_structure.md @@ -0,0 +1,168 @@ +# Clave y campos del diccionario {#dictionary-key-and-fields} + +El `` cláusula describe la clave del diccionario y los campos disponibles para consultas. + +Descripción XML: + +``` xml + + + + Id + + + + + + + ... + + + +``` + +Los atributos se describen en los elementos: + +- `` — [Columna clave](external_dicts_dict_structure.md#ext_dict_structure-key). +- `` — [Columna de datos](external_dicts_dict_structure.md#ext_dict_structure-attributes). Puede haber un número múltiple de atributos. + +Consulta DDL: + +``` sql +CREATE DICTIONARY dict_name ( + Id UInt64, + -- attributes +) +PRIMARY KEY Id +... +``` + +Los atributos se describen en el cuerpo de la consulta: + +- `PRIMARY KEY` — [Columna clave](external_dicts_dict_structure.md#ext_dict_structure-key) +- `AttrName AttrType` — [Columna de datos](external_dicts_dict_structure.md#ext_dict_structure-attributes). 
Puede haber un número múltiple de atributos. + +## Clave {#ext-dict-structure-key} + +ClickHouse admite los siguientes tipos de claves: + +- Tecla numérica. `UInt64`. Definido en el `` etiqueta o usando `PRIMARY KEY` palabra clave. +- Clave compuesta. Conjunto de valores de diferentes tipos. Definido en la etiqueta `` o `PRIMARY KEY` palabra clave. + +Una estructura xml puede contener `` o ``. La consulta DDL debe contener `PRIMARY KEY`. + +!!! warning "Advertencia" + No debe describir la clave como un atributo. + +### Tecla numérica {#ext-dict-numeric-key} + +Tipo: `UInt64`. + +Ejemplo de configuración: + +``` xml + + Id + +``` + +Campos de configuración: + +- `name` – The name of the column with keys. + +Para consulta DDL: + +``` sql +CREATE DICTIONARY ( + Id UInt64, + ... +) +PRIMARY KEY Id +... +``` + +- `PRIMARY KEY` – The name of the column with keys. + +### Clave compuesta {#composite-key} + +La clave puede ser un `tuple` de cualquier tipo de campo. El [diseño](external_dicts_dict_layout.md) es este caso debe ser `complex_key_hashed` o `complex_key_cache`. + +!!! tip "Consejo" + Una clave compuesta puede consistir en un solo elemento. Esto hace posible usar una cadena como clave, por ejemplo. + +La estructura clave se establece en el elemento ``. Los campos clave se especifican en el mismo formato que el diccionario [Atributo](external_dicts_dict_structure.md). Ejemplo: + +``` xml + + + + field1 + String + + + field2 + UInt32 + + ... + +... +``` + +o + +``` sql +CREATE DICTIONARY ( + field1 String, + field2 String + ... +) +PRIMARY KEY field1, field2 +... +``` + +Para una consulta al `dictGet*` función, una tupla se pasa como la clave. Ejemplo: `dictGetString('dict_name', 'attr_name', tuple('string for field1', num_for_field2))`. + +## Atributo {#ext-dict-structure-attributes} + +Ejemplo de configuración: + +``` xml + + ... + + Name + ClickHouseDataType + + rand64() + true + true + true + + +``` + +o + +``` sql +CREATE DICTIONARY somename ( + Name ClickHouseDataType DEFAULT '' EXPRESSION rand64() HIERARCHICAL INJECTIVE IS_OBJECT_ID +) +``` + +Campos de configuración: + +| Etiqueta | Descripción | Requerir | +|------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------| +| `name` | Nombre de columna. | Sí | +| `type` | Tipo de datos ClickHouse.
ClickHouse intenta convertir el valor del diccionario al tipo de datos especificado. Por ejemplo, para MySQL, el campo podría ser `TEXT`, `VARCHAR` o `BLOB` en la tabla de origen de MySQL, pero se puede cargar como `String` en ClickHouse.<br/>[NULL](../../data_types/nullable.md) no es compatible. | Sí |
+| `null_value` | Valor predeterminado para un elemento no existente.<br/>En el ejemplo, es una cadena vacía. No se puede utilizar `NULL` en este campo. | Sí |
+| `expression` | [Expresión](../syntax.md#syntax-expressions) que ClickHouse ejecuta sobre el valor.<br/>La expresión puede ser un nombre de columna en la base de datos SQL remota. Por lo tanto, puede usarla para crear un alias para la columna remota.<br/><br/>Valor predeterminado: sin expresión. | No |
+| `hierarchical` | Si es `true`, el atributo contiene el valor de la clave del padre para la clave actual. Ver [Diccionarios jerárquicos](external_dicts_dict_hierarchical.md).<br/><br/>Valor predeterminado: `false`. | No |
+| `injective` | Indicador que muestra si la correspondencia `id -> attribute` es [inyectiva](https://en.wikipedia.org/wiki/Injective_function).<br/>Si es `true`, ClickHouse puede colocar automáticamente después de la cláusula `GROUP BY` las solicitudes a los diccionarios con inyección. Por lo general, esto reduce significativamente la cantidad de dichas solicitudes.<br/><br/>Valor predeterminado: `false`. | No |
+| `is_object_id` | Indicador que muestra si la consulta se ejecuta para un documento MongoDB mediante `ObjectID`.<br/><br/>
Valor predeterminado: `false`. | No | + +## Ver también {#see-also} + +- [Funciones para trabajar con diccionarios externos](../functions/ext_dict_functions.md). + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/dicts/external_dicts_dict_structure/) diff --git a/docs/es/query_language/dicts/index.md b/docs/es/query_language/dicts/index.md new file mode 100644 index 00000000000..77e6f0db41c --- /dev/null +++ b/docs/es/query_language/dicts/index.md @@ -0,0 +1,14 @@ +# Diccionario {#dictionaries} + +Un diccionario es un mapeo (`key -> attributes`) que es conveniente para varios tipos de listas de referencia. + +ClickHouse admite funciones especiales para trabajar con diccionarios que se pueden usar en consultas. Es más fácil y más eficiente usar diccionarios con funciones que un `JOIN` con tablas de referencia. + +[NULO](../syntax.md#null) no se pueden almacenar en un diccionario. + +Soporte ClickHouse: + +- [Diccionarios incorporados](internal_dicts.md#internal_dicts) con una específica [conjunto de funciones](../functions/ym_dict_functions.md). +- [Diccionarios complementarios (externos)](external_dicts.md) con un [conjunto de funciones](../functions/ext_dict_functions.md). + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/dicts/) diff --git a/docs/es/query_language/dicts/internal_dicts.md b/docs/es/query_language/dicts/internal_dicts.md new file mode 100644 index 00000000000..1d1b0afd670 --- /dev/null +++ b/docs/es/query_language/dicts/internal_dicts.md @@ -0,0 +1,48 @@ +# Diccionarios internos {#internal-dicts} + +ClickHouse contiene una función integrada para trabajar con una geobase. + +Esto le permite: + +- Utilice el ID de una región para obtener su nombre en el idioma deseado. +- Utilice el ID de una región para obtener el ID de una ciudad, área, distrito federal, país o continente. +- Compruebe si una región es parte de otra región. +- Obtener una cadena de regiones principales. + +Todas las funciones de apoyo “translocality,” la capacidad de utilizar simultáneamente diferentes perspectivas sobre la propiedad de la región. Para obtener más información, consulte la sección “Functions for working with Yandex.Metrica dictionaries”. + +Los diccionarios internos están deshabilitados en el paquete predeterminado. +Para habilitarlos, descomente los parámetros `path_to_regions_hierarchy_file` y `path_to_regions_names_files` en el archivo de configuración del servidor. + +La geobase se carga desde archivos de texto. + +Coloque el `regions_hierarchy*.txt` archivos en el `path_to_regions_hierarchy_file` directorio. Este parámetro de configuración debe contener la ruta `regions_hierarchy.txt` archivo (la jerarquía regional predeterminada), y los otros archivos (`regions_hierarchy_ua.txt`) debe estar ubicado en el mismo directorio. + +Ponga el `regions_names_*.txt` archivos en el `path_to_regions_names_files` Directorio. + +También puede crear estos archivos usted mismo. El formato de archivo es el siguiente: + +`regions_hierarchy*.txt`: TabSeparated (sin encabezado), columnas: + +- ID de la región (`UInt32`) +- ID de región padre (`UInt32`) +- tipo de región (`UInt8`): 1 - continente, 3 - país, 4 - distrito federal, 5 - región, 6 - ciudad; otros tipos no tienen valores +- población (`UInt32`) — optional column + +`regions_names_*.txt`: TabSeparated (sin encabezado), columnas: + +- ID de la región (`UInt32`) +- nombre de la región (`String`) — Can't contain tabs or line feeds, even escaped ones. + +Una matriz plana se usa para almacenar en RAM. 
Por esta razón, los ID no deberían ser más de un millón. + +Los diccionarios se pueden actualizar sin reiniciar el servidor. Sin embargo, el conjunto de diccionarios disponibles no se actualiza. +Para las actualizaciones, se comprueban los tiempos de modificación de archivos. Si un archivo ha cambiado, el diccionario se actualiza. +El intervalo para comprobar si hay cambios se configura en el `builtin_dictionaries_reload_interval` parámetro. +Las actualizaciones del diccionario (aparte de la carga al primer uso) no bloquean las consultas. Durante las actualizaciones, las consultas utilizan las versiones anteriores de los diccionarios. Si se produce un error durante una actualización, el error se escribe en el registro del servidor y las consultas continúan utilizando la versión anterior de los diccionarios. + +Recomendamos actualizar periódicamente los diccionarios con la geobase. Durante una actualización, genere nuevos archivos y escríbalos en una ubicación separada. Cuando todo esté listo, cambie el nombre a los archivos utilizados por el servidor. + +También hay funciones para trabajar con identificadores de sistema operativo y Yandex.Motores de búsqueda Metrica, pero no deben ser utilizados. + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/dicts/internal_dicts/) diff --git a/docs/es/query_language/functions/arithmetic_functions.md b/docs/es/query_language/functions/arithmetic_functions.md new file mode 100644 index 00000000000..a41e8014967 --- /dev/null +++ b/docs/es/query_language/functions/arithmetic_functions.md @@ -0,0 +1,80 @@ +# Funciones aritméticas {#arithmetic-functions} + +Para todas las funciones aritméticas, el tipo de resultado se calcula como el tipo de número más pequeño en el que encaja el resultado, si existe dicho tipo. El mínimo se toma simultáneamente en función del número de bits, si está firmado y si flota. Si no hay suficientes bits, se toma el tipo de bit más alto. + +Ejemplo: + +``` sql +SELECT toTypeName(0), toTypeName(0 + 0), toTypeName(0 + 0 + 0), toTypeName(0 + 0 + 0 + 0) +``` + +``` text +┌─toTypeName(0)─┬─toTypeName(plus(0, 0))─┬─toTypeName(plus(plus(0, 0), 0))─┬─toTypeName(plus(plus(plus(0, 0), 0), 0))─┐ +│ UInt8 │ UInt16 │ UInt32 │ UInt64 │ +└───────────────┴────────────────────────┴─────────────────────────────────┴──────────────────────────────────────────┘ +``` + +Las funciones aritméticas funcionan para cualquier par de tipos de UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32 o Float64. + +El desbordamiento se produce de la misma manera que en C ++. + +## más (a, b), a + b operador {#plusa-b-a-b-operator} + +Calcula la suma de los números. +También puede agregar números enteros con una fecha o fecha y hora. En el caso de una fecha, agregar un entero significa agregar el número correspondiente de días. Para una fecha con hora, significa agregar el número correspondiente de segundos. + +## menos(a, b), a - b operador {#minusa-b-a-b-operator} + +Calcula la diferencia. El resultado siempre está firmado. + +También puede calcular números enteros a partir de una fecha o fecha con la hora. La idea es la misma – ver arriba para ‘plus’. + +## ¿Cómo puedo hacerlo? {#multiplya-b-a-b-operator} + +Calcula el producto de los números. + +## divide (a, segundo), operador a / segundo {#dividea-b-a-b-operator} + +Calcula el cociente de los números. El tipo de resultado es siempre un tipo de punto flotante. +No es una división entera. Para la división de enteros, use el ‘intDiv’ función. 
+Al dividir por cero obtienes ‘inf’, ‘-inf’, o ‘nan’.
+
+## intDiv(a, b) {#intdiva-b}
+
+Calcula el cociente de los números. Divide en enteros, redondeando hacia abajo (por el valor absoluto).
+Se produce una excepción al dividir por cero o al dividir el número negativo mínimo por menos uno.
+
+## intDivOrZero(a, b) {#intdivorzeroa-b}
+
+Difiere de ‘intDiv’ en que devuelve cero al dividir por cero o al dividir el número negativo mínimo por menos uno.
+
+## modulo(a, b), operador a % b {#moduloa-b-a-b-operator}
+
+Calcula el resto después de la división.
+Si los argumentos son números de coma flotante, se convierten previamente en enteros eliminando la parte decimal.
+El resto se toma en el mismo sentido que en C++. La división truncada se usa para los números negativos.
+Se produce una excepción al dividir por cero o al dividir el número negativo mínimo por menos uno.
+
+## moduloOrZero(a, b) {#moduloorzeroa-b}
+
+Difiere de ‘modulo’ en que devuelve cero cuando el divisor es cero.
+
+## negate(a), operador -a {#negatea-a-operator}
+
+Calcula un número con el signo invertido. El resultado siempre es con signo.
+
+## abs(a) {#arithm-func-abs}
+
+Calcula el valor absoluto del número (a). Es decir, si a \< 0, devuelve -a. Para los tipos sin signo no hace nada. Para los tipos de enteros con signo, devuelve un número sin signo.
+
+## gcd(a, b) {#gcda-b}
+
+Devuelve el máximo común divisor de los números.
+Se produce una excepción al dividir por cero o al dividir el número negativo mínimo por menos uno.
+
+## lcm(a, b) {#lcma-b}
+
+Devuelve el mínimo común múltiplo de los números.
+Se produce una excepción al dividir por cero o al dividir el número negativo mínimo por menos uno.
+
+[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/arithmetic_functions/) diff --git a/docs/es/query_language/functions/array_join.md b/docs/es/query_language/functions/array_join.md new file mode 100644 index 00000000000..998c52be4a9 --- /dev/null +++ b/docs/es/query_language/functions/array_join.md @@ -0,0 +1,30 @@ +# Función arrayJoin {#functions-arrayjoin}
+
+Esta es una función muy inusual.
+
+Las funciones normales no cambian un conjunto de filas, sino que simplemente cambian los valores de cada fila (map).
+Las funciones agregadas comprimen un conjunto de filas (fold o reduce).
+La función ‘arrayJoin’ toma cada fila y genera un conjunto de filas (unfold).
+
+Esta función toma una matriz como argumento y propaga la fila de origen a varias filas, una por cada elemento de la matriz.
+Todos los valores de las columnas simplemente se copian, excepto los de la columna a la que se aplica esta función, que se reemplazan por el valor de matriz correspondiente.
+
+Una consulta puede usar varias funciones `arrayJoin`. En ese caso, la transformación se realiza varias veces.
+
+Tenga en cuenta la sintaxis ARRAY JOIN de la consulta SELECT, que proporciona posibilidades más amplias.
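+
+A modo de comparación, un esquema mínimo (con una matriz literal en una subconsulta, elegida solo como ejemplo) que obtiene el mismo resultado usando la cláusula ARRAY JOIN en lugar de la función:
+
+``` sql
+SELECT dst, src
+FROM
+(
+    SELECT [1, 2, 3] AS src
+)
+ARRAY JOIN src AS dst -- despliega la matriz src en una fila por elemento, igual que arrayJoin(src)
+```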
+ +Ejemplo: + +``` sql +SELECT arrayJoin([1, 2, 3] AS src) AS dst, 'Hello', src +``` + +``` text +┌─dst─┬─\'Hello\'─┬─src─────┐ +│ 1 │ Hello │ [1,2,3] │ +│ 2 │ Hello │ [1,2,3] │ +│ 3 │ Hello │ [1,2,3] │ +└─────┴───────────┴─────────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/array_join/) diff --git a/docs/es/query_language/functions/bit_functions.md b/docs/es/query_language/functions/bit_functions.md new file mode 100644 index 00000000000..d74c89c3633 --- /dev/null +++ b/docs/es/query_language/functions/bit_functions.md @@ -0,0 +1,208 @@ +# Bit funciones {#bit-functions} + +Las funciones de bits funcionan para cualquier par de tipos de UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32 o Float64. + +El tipo de resultado es un entero con bits iguales a los bits máximos de sus argumentos. Si al menos uno de los argumentos está firmado, el resultado es un número firmado. Si un argumento es un número de coma flotante, se convierte en Int64. + +## PocoY(a, b) {#bitanda-b} + +## bitOr (a, b) {#bitora-b} + +## ¿Por qué?) {#bitxora-b} + +## ¿Por qué?) {#bitnota} + +## ¿Cómo puedo hacerlo?) {#bitshiftlefta-b} + +## ¿Cómo puedo hacerlo?) {#bitshiftrighta-b} + +## ¿Cómo puedo hacerlo?) {#bitrotatelefta-b} + +## ¿Cómo puedo hacerlo?) {#bitrotaterighta-b} + +## bitTest {#bittest} + +Toma cualquier entero y lo convierte en [Forma binaria](https://en.wikipedia.org/wiki/Binary_number) devuelve el valor de un bit en la posición especificada. La cuenta atrás comienza desde 0 de derecha a izquierda. + +**Sintaxis** + +``` sql +SELECT bitTest(number, index) +``` + +**Parámetros** + +- `number` – número entero. +- `index` – posición de bit. + +**Valores devueltos** + +Devuelve un valor de bit en la posición especificada. + +Tipo: `UInt8`. + +**Ejemplo** + +Por ejemplo, el número 43 en el sistema numérico base-2 (binario) es 101011. + +Consulta: + +``` sql +SELECT bitTest(43, 1) +``` + +Resultado: + +``` text +┌─bitTest(43, 1)─┐ +│ 1 │ +└────────────────┘ +``` + +Otro ejemplo: + +Consulta: + +``` sql +SELECT bitTest(43, 2) +``` + +Resultado: + +``` text +┌─bitTest(43, 2)─┐ +│ 0 │ +└────────────────┘ +``` + +## bitTestAll {#bittestall} + +Devuelve el resultado de [conjunción lógica](https://en.wikipedia.org/wiki/Logical_conjunction) (Operador AND) de todos los bits en posiciones dadas. La cuenta atrás comienza desde 0 de derecha a izquierda. + +La conjucción para operaciones bit a bit: + +Nivel de Cifrado WEP + +Nivel de Cifrado WEP + +Nivel de Cifrado WEP + +Nivel de Cifrado WEP + +**Sintaxis** + +``` sql +SELECT bitTestAll(number, index1, index2, index3, index4, ...) +``` + +**Parámetros** + +- `number` – número entero. +- `index1`, `index2`, `index3`, `index4` – posiciones de bit. Por ejemplo, para un conjunto de posiciones (`index1`, `index2`, `index3`, `index4`) es verdadero si y solo si todas sus posiciones son verdaderas (`index1` ⋀ `index2`, ⋀ `index3` ⋀ `index4`). + +**Valores devueltos** + +Devuelve el resultado de la conjunción lógica. + +Tipo: `UInt8`. + +**Ejemplo** + +Por ejemplo, el número 43 en el sistema numérico base-2 (binario) es 101011. 
+ +Consulta: + +``` sql +SELECT bitTestAll(43, 0, 1, 3, 5) +``` + +Resultado: + +``` text +┌─bitTestAll(43, 0, 1, 3, 5)─┐ +│ 1 │ +└────────────────────────────┘ +``` + +Otro ejemplo: + +Consulta: + +``` sql +SELECT bitTestAll(43, 0, 1, 3, 5, 2) +``` + +Resultado: + +``` text +┌─bitTestAll(43, 0, 1, 3, 5, 2)─┐ +│ 0 │ +└───────────────────────────────┘ +``` + +## bitTestAny {#bittestany} + +Devuelve el resultado de [disyunción lógica](https://en.wikipedia.org/wiki/Logical_disjunction) (O operador) de todos los bits en posiciones dadas. La cuenta atrás comienza desde 0 de derecha a izquierda. + +La disyunción para las operaciones bit a bit: + +Nivel de Cifrado WEP + +Nivel de Cifrado WEP + +Nivel de Cifrado WEP + +Nivel de Cifrado WEP + +**Sintaxis** + +``` sql +SELECT bitTestAny(number, index1, index2, index3, index4, ...) +``` + +**Parámetros** + +- `number` – número entero. +- `index1`, `index2`, `index3`, `index4` – posiciones de bit. + +**Valores devueltos** + +Devuelve el resultado de disjuction lógico. + +Tipo: `UInt8`. + +**Ejemplo** + +Por ejemplo, el número 43 en el sistema numérico base-2 (binario) es 101011. + +Consulta: + +``` sql +SELECT bitTestAny(43, 0, 2) +``` + +Resultado: + +``` text +┌─bitTestAny(43, 0, 2)─┐ +│ 1 │ +└──────────────────────┘ +``` + +Otro ejemplo: + +Consulta: + +``` sql +SELECT bitTestAny(43, 4, 2) +``` + +Resultado: + +``` text +┌─bitTestAny(43, 4, 2)─┐ +│ 0 │ +└──────────────────────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/bit_functions/) diff --git a/docs/es/query_language/functions/bitmap_functions.md b/docs/es/query_language/functions/bitmap_functions.md new file mode 100644 index 00000000000..ae6e714ac05 --- /dev/null +++ b/docs/es/query_language/functions/bitmap_functions.md @@ -0,0 +1,489 @@ +# Funciones de mapa de bits {#bitmap-functions} + +Las funciones de mapa de bits funcionan para dos mapas de bits Cálculo del valor del objeto, es devolver un nuevo mapa de bits o cardinalidad mientras se usa el cálculo de la fórmula, como y, o, xor, y no, etc. + +Hay 2 tipos de métodos de construcción para Bitmap Object. Uno debe ser construido por la función de agregación groupBitmap con -State , el otro debe ser construido por Array Object . También es para convertir objeto de mapa de bits a objeto de matriz. + +RoaringBitmap se envuelve en una estructura de datos, mientras que el almacenamiento real de los objetos de mapa de bits. Cuando la cardinalidad es menor o igual que 32, utiliza Set objet. Cuando la cardinalidad es mayor que 32, utiliza el objeto RoaringBitmap. Es por eso que el almacenamiento del conjunto de baja cardinalidad es más rápido. + +Para obtener más información sobre RoaringBitmap, consulte: [CRoaring](https://github.com/RoaringBitmap/CRoaring). + +## bitmapBuild {#bitmap-functions-bitmapbuild} + +Construya un mapa de bits a partir de una matriz de enteros sin signo. + +``` sql +bitmapBuild(array) +``` + +**Parámetros** + +- `array` – matriz entera sin signo. + +**Ejemplo** + +``` sql +SELECT bitmapBuild([1, 2, 3, 4, 5]) AS res, toTypeName(res) +``` + +``` text +┌─res─┬─toTypeName(bitmapBuild([1, 2, 3, 4, 5]))─────┐ +│  │ AggregateFunction(groupBitmap, UInt8) │ +└─────┴──────────────────────────────────────────────┘ +``` + +## bitmapToArray {#bitmaptoarray} + +Convertir mapa de bits a matriz entera. + +``` sql +bitmapToArray(bitmap) +``` + +**Parámetros** + +- `bitmap` – objeto de mapa de bits. 
+ +**Ejemplo** + +``` sql +SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res +``` + +``` text +┌─res─────────┐ +│ [1,2,3,4,5] │ +└─────────────┘ +``` + +## bitmapSubsetInRange {#bitmap-functions-bitmapsubsetinrange} + +Devuelve el subconjunto en el rango especificado (no incluye range\_end). + +``` sql +bitmapSubsetInRange(bitmap, range_start, range_end) +``` + +**Parámetros** + +- `bitmap` – [Objeto de mapa de bits](#bitmap_functions-bitmapbuild). +- `range_start` – punto de inicio de la gama. Tipo: [UInt32](../../data_types/int_uint.md). +- `range_end` – punto final de rango(excluido). Tipo: [UInt32](../../data_types/int_uint.md). + +**Ejemplo** + +``` sql +SELECT bitmapToArray(bitmapSubsetInRange(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res +``` + +``` text +┌─res───────────────┐ +│ [30,31,32,33,100] │ +└───────────────────┘ +``` + +## bitmapSubsetLimit {#bitmapsubsetlimit} + +Crea un subconjunto de mapa de bits con n elementos tomados entre `range_start` y `cardinality_limit`. + +**Sintaxis** + +``` sql +bitmapSubsetLimit(bitmap, range_start, cardinality_limit) +``` + +**Parámetros** + +- `bitmap` – [Objeto de mapa de bits](#bitmap_functions-bitmapbuild). +- `range_start` – El punto de partida del subconjunto. Tipo: [UInt32](../../data_types/int_uint.md). +- `cardinality_limit` – El subconjunto cardinalidad límite superior. Tipo: [UInt32](../../data_types/int_uint.md). + +**Valor devuelto** + +Subconjunto. + +Tipo: `Bitmap object`. + +**Ejemplo** + +Consulta: + +``` sql +SELECT bitmapToArray(bitmapSubsetLimit(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res +``` + +Resultado: + +``` text +┌─res───────────────────────┐ +│ [30,31,32,33,100,200,500] │ +└───────────────────────────┘ +``` + +## bitmapContains {#bitmap-functions-bitmapcontains} + +Comprueba si el mapa de bits contiene un elemento. + +``` sql +bitmapContains(haystack, needle) +``` + +**Parámetros** + +- `haystack` – [Objeto de mapa de bits](#bitmap_functions-bitmapbuild), donde la función busca. +- `needle` – Valor que busca la función. Tipo: [UInt32](../../data_types/int_uint.md). + +**Valores devueltos** + +- 0 — Si `haystack` no contiene `needle`. +- 1 — Si `haystack` contener `needle`. + +Tipo: `UInt8`. + +**Ejemplo** + +``` sql +SELECT bitmapContains(bitmapBuild([1,5,7,9]), toUInt32(9)) AS res +``` + +``` text +┌─res─┐ +│ 1 │ +└─────┘ +``` + +## bitmapHasAny {#bitmaphasany} + +Comprueba si dos mapas de bits tienen intersección por algunos elementos. + +``` sql +bitmapHasAny(bitmap1, bitmap2) +``` + +Si está seguro de que `bitmap2` Esta es la página de desambiguación en 24symbols. [bitmapContains](#bitmap_functions-bitmapcontains) función. Funciona de manera más eficiente. + +**Parámetros** + +- `bitmap*` – objeto de mapa de bits. + +**Valores de retorno** + +- `1` si `bitmap1` y `bitmap2` tienen un elemento similar al menos. +- `0`, de lo contrario. + +**Ejemplo** + +``` sql +SELECT bitmapHasAny(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res +``` + +``` text +┌─res─┐ +│ 1 │ +└─────┘ +``` + +## bitmapHasAll {#bitmaphasall} + +Análogo a `hasAll(array, array)` devuelve 1 si el primer mapa de bits contiene todos los elementos del segundo, 0 de lo contrario. +Si el segundo argumento es un mapa de bits vacío, devuelve 1. 
+ +``` sql +bitmapHasAll(bitmap,bitmap) +``` + +**Parámetros** + +- `bitmap` – objeto de mapa de bits. + +**Ejemplo** + +``` sql +SELECT bitmapHasAll(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res +``` + +``` text +┌─res─┐ +│ 0 │ +└─────┘ +``` + +## bitmapCardinalidad {#bitmapcardinality} + +Vuelva a ejecutar la cardinalidad del mapa de bits de tipo UInt64. + +``` sql +bitmapCardinality(bitmap) +``` + +**Parámetros** + +- `bitmap` – objeto de mapa de bits. + +**Ejemplo** + +``` sql +SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res +``` + +``` text +┌─res─┐ +│ 5 │ +└─────┘ +``` + +## bitmapMin {#bitmapmin} + +Vuelva a ejecutar el valor más pequeño de tipo UInt64 en el conjunto, UINT32\_MAX si el conjunto está vacío. + + bitmapMin(bitmap) + +**Parámetros** + +- `bitmap` – objeto de mapa de bits. + +**Ejemplo** + +``` sql +SELECT bitmapMin(bitmapBuild([1, 2, 3, 4, 5])) AS res +``` + + ┌─res─┐ + │ 1 │ + └─────┘ + +## bitmapMax {#bitmapmax} + +Vuelva a ejecutar el mayor valor de tipo UInt64 en el conjunto, 0 si el conjunto está vacío. + + bitmapMax(bitmap) + +**Parámetros** + +- `bitmap` – objeto de mapa de bits. + +**Ejemplo** + +``` sql +SELECT bitmapMax(bitmapBuild([1, 2, 3, 4, 5])) AS res +``` + + ┌─res─┐ + │ 5 │ + └─────┘ + +## bitmapTransform {#bitmaptransform} + +Transformar una matriz de valores en un mapa de bits a otra matriz de valores, el resultado es un nuevo mapa de bits. + + bitmapTransform(bitmap, from_array, to_array) + +**Parámetros** + +- `bitmap` – objeto de mapa de bits. +- `from_array` – matriz UInt32. Para idx en el rango \[0, from\_array .size()), si el mapa de bits contiene from\_array\[idx\] , luego reemplácelo con to\_array\[idx\] . Tenga en cuenta que el resultado depende del orden de la matriz si hay elementos comunes entre from\_array y to\_array . +- `to_array` – Matriz UInt32, su tamaño será el mismo que from\_array. + +**Ejemplo** + +``` sql +SELECT bitmapToArray(bitmapTransform(bitmapBuild([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), cast([5,999,2] as Array(UInt32)), cast([2,888,20] as Array(UInt32)))) AS res +``` + + ┌─res───────────────────┐ + │ [1,3,4,6,7,8,9,10,20] │ + └───────────────────────┘ + +## bitmapAnd {#bitmapand} + +Dos mapa de bits y cálculo, el resultado es un nuevo mapa de bits. + +``` sql +bitmapAnd(bitmap,bitmap) +``` + +**Parámetros** + +- `bitmap` – objeto de mapa de bits. + +**Ejemplo** + +``` sql +SELECT bitmapToArray(bitmapAnd(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res +``` + +``` text +┌─res─┐ +│ [3] │ +└─────┘ +``` + +## bitmapO {#bitmapor} + +Dos mapa de bits o cálculo, el resultado es un nuevo mapa de bits. + +``` sql +bitmapOr(bitmap,bitmap) +``` + +**Parámetros** + +- `bitmap` – objeto de mapa de bits. + +**Ejemplo** + +``` sql +SELECT bitmapToArray(bitmapOr(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res +``` + +``` text +┌─res─────────┐ +│ [1,2,3,4,5] │ +└─────────────┘ +``` + +## bitmapXor {#bitmapxor} + +Dos bitmap xor cálculo, el resultado es un nuevo mapa de bits. + +``` sql +bitmapXor(bitmap,bitmap) +``` + +**Parámetros** + +- `bitmap` – objeto de mapa de bits. + +**Ejemplo** + +``` sql +SELECT bitmapToArray(bitmapXor(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res +``` + +``` text +┌─res───────┐ +│ [1,2,4,5] │ +└───────────┘ +``` + +## bitmapAndnot {#bitmapandnot} + +Dos mapas de bits y no cálculo, el resultado es un nuevo mapa de bits. + +``` sql +bitmapAndnot(bitmap,bitmap) +``` + +**Parámetros** + +- `bitmap` – objeto de mapa de bits. 
+ +**Ejemplo** + +``` sql +SELECT bitmapToArray(bitmapAndnot(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res +``` + +``` text +┌─res───┐ +│ [1,2] │ +└───────┘ +``` + +## Bienvenido a WordPress. {#bitmapandcardinality} + +Dos mapa de bits y cálculo, devuelven cardinalidad de tipo UInt64. + +``` sql +bitmapAndCardinality(bitmap,bitmap) +``` + +**Parámetros** + +- `bitmap` – objeto de mapa de bits. + +**Ejemplo** + +``` sql +SELECT bitmapAndCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; +``` + +``` text +┌─res─┐ +│ 1 │ +└─────┘ +``` + +## bitmapOrCardinalidad {#bitmaporcardinality} + +Dos mapa de bits o cálculo, retorno cardinalidad de tipo UInt64. + +``` sql +bitmapOrCardinality(bitmap,bitmap) +``` + +**Parámetros** + +- `bitmap` – objeto de mapa de bits. + +**Ejemplo** + +``` sql +SELECT bitmapOrCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; +``` + +``` text +┌─res─┐ +│ 5 │ +└─────┘ +``` + +## bitmapXorCardinalidad {#bitmapxorcardinality} + +Dos bitmap xor cálculo, retorno cardinalidad de tipo UInt64. + +``` sql +bitmapXorCardinality(bitmap,bitmap) +``` + +**Parámetros** + +- `bitmap` – objeto de mapa de bits. + +**Ejemplo** + +``` sql +SELECT bitmapXorCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; +``` + +``` text +┌─res─┐ +│ 4 │ +└─────┘ +``` + +## Por favor, introduzca su dirección de correo electrónico {#bitmapandnotcardinality} + +Dos mapas de bits yno cálculo, devuelve cardinalidad de tipo UInt64. + +``` sql +bitmapAndnotCardinality(bitmap,bitmap) +``` + +**Parámetros** + +- `bitmap` – objeto de mapa de bits. + +**Ejemplo** + +``` sql +SELECT bitmapAndnotCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; +``` + +``` text +┌─res─┐ +│ 2 │ +└─────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/bitmap_functions/) diff --git a/docs/es/query_language/functions/comparison_functions.md b/docs/es/query_language/functions/comparison_functions.md new file mode 100644 index 00000000000..c64cd659bb9 --- /dev/null +++ b/docs/es/query_language/functions/comparison_functions.md @@ -0,0 +1,32 @@ +# Funciones de comparación {#comparison-functions} + +Las funciones de comparación siempre devuelven 0 o 1 (Uint8). + +Se pueden comparar los siguientes tipos: + +- número +- cuerdas y cuerdas fijas +- Fechas +- fechas con tiempos + +dentro de cada grupo, pero no entre diferentes grupos. + +Por ejemplo, no puede comparar una fecha con una cadena. Debe usar una función para convertir la cadena a una fecha, o viceversa. + +Las cadenas se comparan por bytes. Una cadena más corta es más pequeña que todas las cadenas que comienzan con ella y que contienen al menos un carácter más. + +Nota. Hasta la versión 1.1.54134, los números firmados y sin firmar se comparaban de la misma manera que en C ++. En otras palabras, podría obtener un resultado incorrecto en casos como SELECT 9223372036854775807 \> -1 . Este comportamiento cambió en la versión 1.1.54134 y ahora es matemáticamente correcto. + +## Por ejemplo: {#function-equals} + +## notEquals, un ! 
operador = b y a `<>` b {#function-notequals} + +## menos, `< operator` {#function-less} + +## alcalde, `> operator` {#function-greater} + +## lessOrEquals, `<= operator` {#function-lessorequals} + +## mayorOrEquals, `>= operator` {#function-greaterorequals} + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/comparison_functions/) diff --git a/docs/es/query_language/functions/conditional_functions.md b/docs/es/query_language/functions/conditional_functions.md new file mode 100644 index 00000000000..efed413827b --- /dev/null +++ b/docs/es/query_language/functions/conditional_functions.md @@ -0,0 +1,200 @@ +# Funciones condicionales {#conditional-functions} + +## si {#if} + +Controla la bifurcación condicional. A diferencia de la mayoría de los sistemas, ClickHouse siempre evalúa ambas expresiones `then` y `else`. + +**Sintaxis** + +``` sql +SELECT if(cond, then, else) +``` + +Si la condición `cond` evalúa a un valor distinto de cero, devuelve el resultado de la expresión `then` y el resultado de la expresión `else` si está presente, se omite. Si el `cond` es cero o `NULL` el resultado de la `then` expresión se omite y el resultado de la `else` expresión, si está presente, se devuelve. + +**Parámetros** + +- `cond` – La condición para la evaluación que puede ser cero o no. El tipo es UInt8, Nullable(UInt8) o NULL. +- `then` - La expresión que se va a devolver si se cumple la condición. +- `else` - La expresión a devolver si no se cumple la condición.- + +**Valores devueltos** + +La función se ejecuta `then` y `else` expresiones y devuelve su resultado, dependiendo de si la condición `cond` terminó siendo cero o no. + +**Ejemplo** + +Consulta: + +``` sql +SELECT if(1, plus(2, 2), plus(2, 6)) +``` + +Resultado: + +``` text +┌─plus(2, 2)─┐ +│ 4 │ +└────────────┘ +``` + +Consulta: + +``` sql +SELECT if(0, plus(2, 2), plus(2, 6)) +``` + +Resultado: + +``` text +┌─plus(2, 6)─┐ +│ 8 │ +└────────────┘ +``` + +- `then` y `else` debe tener el tipo común más bajo. + +**Ejemplo:** + +Toma esto `LEFT_RIGHT` tabla: + +``` sql +SELECT * +FROM LEFT_RIGHT + +┌─left─┬─right─┐ +│ ᴺᵁᴸᴸ │ 4 │ +│ 1 │ 3 │ +│ 2 │ 2 │ +│ 3 │ 1 │ +│ 4 │ ᴺᵁᴸᴸ │ +└──────┴───────┘ +``` + +La siguiente consulta compara `left` y `right` valor: + +``` sql +SELECT + left, + right, + if(left < right, 'left is smaller than right', 'right is greater or equal than left') AS is_smaller +FROM LEFT_RIGHT +WHERE isNotNull(left) AND isNotNull(right) + +┌─left─┬─right─┬─is_smaller──────────────────────────┐ +│ 1 │ 3 │ left is smaller than right │ +│ 2 │ 2 │ right is greater or equal than left │ +│ 3 │ 1 │ right is greater or equal than left │ +└──────┴───────┴─────────────────────────────────────┘ +``` + +Nota: `NULL` los valores no se utilizan en este ejemplo, compruebe [Valores NULL en condicionales](#null-values-in-conditionals) apartado. + +## Operador ternario {#ternary-operator} + +Funciona igual que `if` función. + +Sintaxis: `cond ? then : else` + +Devoluciones `then` si el `cond` evalúa que es verdadero (mayor que cero), de lo contrario devuelve `else`. + +- `cond` debe ser de tipo de `UInt8`, y `then` y `else` debe tener el tipo común más bajo. + +- `then` y `else` puede ser `NULL` + +**Ver también** + +- [ifNotFinite](other_functions.md#ifnotfinite). + +## MultiIf {#multiif} + +Le permite escribir el [CASO](../operators.md#operator_case) operador más compacto en la consulta. 
+ +Sintaxis: `multiIf(cond_1, then_1, cond_2, then_2, ..., else)` + +**Parámetros:** + +- `cond_N` — La condición para que la función regrese `then_N`. +- `then_N` — El resultado de la función cuando se ejecuta. +- `else` — El resultado de la función si no se cumple ninguna de las condiciones. + +La función acepta `2N+1` parámetros. + +**Valores devueltos** + +La función devuelve uno de los valores `then_N` o `else` dependiendo de las condiciones `cond_N`. + +**Ejemplo** + +De nuevo usando `LEFT_RIGHT` tabla. + +``` sql +SELECT + left, + right, + multiIf(left < right, 'left is smaller', left > right, 'left is greater', left = right, 'Both equal', 'Null value') AS result +FROM LEFT_RIGHT + +┌─left─┬─right─┬─result──────────┐ +│ ᴺᵁᴸᴸ │ 4 │ Null value │ +│ 1 │ 3 │ left is smaller │ +│ 2 │ 2 │ Both equal │ +│ 3 │ 1 │ left is greater │ +│ 4 │ ᴺᵁᴸᴸ │ Null value │ +└──────┴───────┴─────────────────┘ +``` + +## Usar resultados condicionales directamente {#using-conditional-results-directly} + +Los condicionales siempre dan como resultado `0`, `1` o `NULL`. Entonces puedes usar resultados condicionales directamente como este: + +``` sql +SELECT left < right AS is_small +FROM LEFT_RIGHT + +┌─is_small─┐ +│ ᴺᵁᴸᴸ │ +│ 1 │ +│ 0 │ +│ 0 │ +│ ᴺᵁᴸᴸ │ +└──────────┘ +``` + +## Valores NULL en condicionales {#null-values-in-conditionals} + +Cuando `NULL` están involucrados en condicionales, el resultado también será `NULL`. + +``` sql +SELECT + NULL < 1, + 2 < NULL, + NULL < NULL, + NULL = NULL + +┌─less(NULL, 1)─┬─less(2, NULL)─┬─less(NULL, NULL)─┬─equals(NULL, NULL)─┐ +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ +└───────────────┴───────────────┴──────────────────┴────────────────────┘ +``` + +Por lo tanto, debe construir sus consultas cuidadosamente si los tipos son `Nullable`. + +El siguiente ejemplo demuestra esto al no agregar la condición equals a `multiIf`. + +``` sql +SELECT + left, + right, + multiIf(left < right, 'left is smaller', left > right, 'right is smaller', 'Both equal') AS faulty_result +FROM LEFT_RIGHT + +┌─left─┬─right─┬─faulty_result────┐ +│ ᴺᵁᴸᴸ │ 4 │ Both equal │ +│ 1 │ 3 │ left is smaller │ +│ 2 │ 2 │ Both equal │ +│ 3 │ 1 │ right is smaller │ +│ 4 │ ᴺᵁᴸᴸ │ Both equal │ +└──────┴───────┴──────────────────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/conditional_functions/) diff --git a/docs/es/query_language/functions/date_time_functions.md b/docs/es/query_language/functions/date_time_functions.md new file mode 100644 index 00000000000..a9e651b9e3a --- /dev/null +++ b/docs/es/query_language/functions/date_time_functions.md @@ -0,0 +1,443 @@ +# Funciones para trabajar con fechas y horas {#functions-for-working-with-dates-and-times} + +Soporte para zonas horarias + +Todas las funciones para trabajar con la fecha y la hora que tienen un uso lógico para la zona horaria pueden aceptar un segundo argumento de zona horaria opcional. Ejemplo: Asia/Ekaterimburgo. En este caso, utilizan la zona horaria especificada en lugar de la local (predeterminada). + +``` sql +SELECT + toDateTime('2016-06-15 23:00:00') AS time, + toDate(time) AS date_local, + toDate(time, 'Asia/Yekaterinburg') AS date_yekat, + toString(time, 'US/Samoa') AS time_samoa +``` + +``` text +┌────────────────time─┬─date_local─┬─date_yekat─┬─time_samoa──────────┐ +│ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-16 │ 2016-06-15 09:00:00 │ +└─────────────────────┴────────────┴────────────┴─────────────────────┘ +``` + +Solo se admiten las zonas horarias que difieren de UTC por un número entero de horas. 
+
+## toTimeZone {#totimezone}
+
+Convierte una fecha o fecha con hora a la zona horaria especificada.
+
+## toYear {#toyear}
+
+Convierte una fecha o fecha con hora en un número UInt16 que contiene el número de año (AD).
+
+## toQuarter {#toquarter}
+
+Convierte una fecha o fecha con hora en un número UInt8 que contiene el número de trimestre.
+
+## toMonth {#tomonth}
+
+Convierte una fecha o fecha con hora en un número UInt8 que contiene el número de mes (1-12).
+
+## toDayOfYear {#todayofyear}
+
+Convierte una fecha o fecha con hora en un número UInt16 que contiene el número del día del año (1-366).
+
+## toDayOfMonth {#todayofmonth}
+
+Convierte una fecha o fecha con hora en un número UInt8 que contiene el número del día del mes (1-31).
+
+## toDayOfWeek {#todayofweek}
+
+Convierte una fecha o fecha con hora en un número UInt8 que contiene el número del día de la semana (el lunes es 1 y el domingo es 7).
+
+## toHour {#tohour}
+
+Convierte una fecha con hora en un número UInt8 que contiene el número de la hora en formato de 24 horas (0-23).
+Esta función supone que, si los relojes se adelantan, es una hora y ocurre a las 2 a. m., y que, si se atrasan, es una hora y ocurre a las 3 a. m. (lo cual no siempre es cierto; incluso en Moscú los relojes se han cambiado dos veces en momentos distintos).
+
+## toMinute {#tominute}
+
+Convierte una fecha con hora en un número UInt8 que contiene el número del minuto de la hora (0-59).
+
+## toSecond {#tosecond}
+
+Convierte una fecha con hora en un número UInt8 que contiene el número del segundo en el minuto (0-59).
+Los segundos intercalares no se tienen en cuenta.
+
+## toUnixTimestamp {#to-unix-timestamp}
+
+Para el argumento DateTime: convierte el valor a su representación numérica interna (Unix Timestamp).
+Para el argumento String: analiza la fecha y hora de la cadena según la zona horaria (segundo argumento opcional; de forma predeterminada se usa la zona horaria del servidor) y devuelve la marca de tiempo de Unix correspondiente.
+Para el argumento Date: el comportamiento no está especificado.
+
+**Sintaxis**
+
+``` sql
+toUnixTimestamp(datetime)
+toUnixTimestamp(str, [timezone])
+```
+
+**Valor devuelto**
+
+- Devuelve la marca de tiempo de Unix.
+
+Tipo: `UInt32`.
+
+**Ejemplo**
+
+Consulta:
+
+``` sql
+SELECT toUnixTimestamp('2017-11-05 08:07:47', 'Asia/Tokyo') AS unix_timestamp
+```
+
+Resultado:
+
+``` text
+┌─unix_timestamp─┐
+│     1509836867 │
+└────────────────┘
+```
+
+## toStartOfYear {#tostartofyear}
+
+Redondea una fecha o fecha con hora hasta el primer día del año.
+Devuelve la fecha.
+
+## toStartOfISOYear {#tostartofisoyear}
+
+Redondea una fecha o fecha con hora hasta el primer día del año ISO.
+Devuelve la fecha.
+
+## toStartOfQuarter {#tostartofquarter}
+
+Redondea una fecha o fecha con hora hasta el primer día del trimestre.
+El primer día del trimestre es el 1 de enero, el 1 de abril, el 1 de julio o el 1 de octubre.
+Devuelve la fecha.
+
+## toStartOfMonth {#tostartofmonth}
+
+Redondea una fecha o fecha con hora hasta el primer día del mes.
+Devuelve la fecha.
+
+!!! attention "Atención"
+    El comportamiento al analizar fechas incorrectas es específico de la implementación. ClickHouse puede devolver una fecha cero, lanzar una excepción o hacer un desbordamiento “natural”.
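+
+A modo de ilustración, un esquema mínimo con una fecha y hora de ejemplo; los valores esperados se indican en los comentarios:
+
+``` sql
+SELECT
+    toDateTime('2018-06-15 23:00:00') AS t,
+    toStartOfYear(t) AS inicio_del_anio,         -- 2018-01-01
+    toStartOfQuarter(t) AS inicio_del_trimestre, -- 2018-04-01
+    toStartOfMonth(t) AS inicio_del_mes          -- 2018-06-01
+```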
+
+## toMonday {#tomonday}
+
+Redondea una fecha o fecha con hora al lunes más cercano.
+Devuelve la fecha.
+
+## toStartOfWeek(t\[,mode\]) {#tostartofweektmode}
+
+Redondea una fecha o fecha con hora al domingo o lunes más cercano, según el modo.
+Devuelve la fecha.
+El argumento mode funciona exactamente igual que el argumento mode de toWeek(). Para la sintaxis de un solo argumento se utiliza el valor de modo 0.
+
+## toStartOfDay {#tostartofday}
+
+Redondea una fecha con hora hasta el comienzo del día.
+
+## toStartOfHour {#tostartofhour}
+
+Redondea una fecha con hora hasta el comienzo de la hora.
+
+## toStartOfMinute {#tostartofminute}
+
+Redondea una fecha con hora hasta el inicio del minuto.
+
+## toStartOfFiveMinute {#tostartoffiveminute}
+
+Redondea una fecha con hora hasta el inicio del intervalo de cinco minutos.
+
+## toStartOfTenMinutes {#tostartoftenminutes}
+
+Redondea una fecha con hora hasta el inicio del intervalo de diez minutos.
+
+## toStartOfFifteenMinutes {#tostartoffifteenminutes}
+
+Redondea una fecha con hora hasta el inicio del intervalo de quince minutos.
+
+## toStartOfInterval(time\_or\_data, INTERVAL x unit \[, time\_zone\]) {#tostartofintervaltime-or-data-interval-x-unit-time-zone}
+
+Esta es una generalización de las otras funciones llamadas `toStartOf*`. Por ejemplo,
+`toStartOfInterval(t, INTERVAL 1 year)` devuelve lo mismo que `toStartOfYear(t)`,
+`toStartOfInterval(t, INTERVAL 1 month)` devuelve lo mismo que `toStartOfMonth(t)`,
+`toStartOfInterval(t, INTERVAL 1 day)` devuelve lo mismo que `toStartOfDay(t)`,
+`toStartOfInterval(t, INTERVAL 15 minute)` devuelve lo mismo que `toStartOfFifteenMinutes(t)`, y así sucesivamente.
+
+## toTime {#totime}
+
+Convierte una fecha con hora a una determinada fecha fija, conservando la hora.
+
+## toRelativeYearNum {#torelativeyearnum}
+
+Convierte una fecha con hora o fecha en el número del año, a partir de un determinado punto fijo en el pasado.
+
+## toRelativeQuarterNum {#torelativequarternum}
+
+Convierte una fecha con hora o fecha en el número del trimestre, a partir de un determinado punto fijo en el pasado.
+
+## toRelativeMonthNum {#torelativemonthnum}
+
+Convierte una fecha con hora o fecha en el número del mes, a partir de un determinado punto fijo en el pasado.
+
+## toRelativeWeekNum {#torelativeweeknum}
+
+Convierte una fecha con hora o fecha en el número de la semana, a partir de un determinado punto fijo en el pasado.
+
+## toRelativeDayNum {#torelativedaynum}
+
+Convierte una fecha con hora o fecha en el número del día, a partir de un determinado punto fijo en el pasado.
+
+## toRelativeHourNum {#torelativehournum}
+
+Convierte una fecha con hora o fecha en el número de la hora, a partir de un determinado punto fijo en el pasado.
+
+## toRelativeMinuteNum {#torelativeminutenum}
+
+Convierte una fecha con hora o fecha en el número del minuto, a partir de un cierto punto fijo en el pasado.
+
+## toRelativeSecondNum {#torelativesecondnum}
+
+Convierte una fecha con hora o fecha en el número del segundo, a partir de un cierto punto fijo en el pasado.
+
+## toISOYear {#toisoyear}
+
+Convierte una fecha o fecha con hora en un número UInt16 que contiene el número de año ISO.
+
+## toISOWeek {#toisoweek}
+
+Convierte una fecha o fecha con hora en un número UInt8 que contiene el número de semana ISO.
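+
+Como ilustración del comportamiento en el límite del año, un esquema mínimo (el 1 de enero de 2017 cae en la última semana ISO de 2016):
+
+``` sql
+SELECT
+    toDate('2017-01-01') AS fecha,
+    toISOYear(fecha) AS anio_iso,  -- 2016
+    toISOWeek(fecha) AS semana_iso -- 52
+```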
+ +## ParaSemana(fecha\[,modo\]) {#toweekdatemode} + +Esta función devuelve el número de semana para la fecha o la fecha y hora. La forma de dos argumentos de toWeek() le permite especificar si la semana comienza el domingo o el lunes y si el valor de retorno debe estar en el rango de 0 a 53 o de 1 a 53. Si se omite el argumento mode, el modo predeterminado es 0. +`toISOWeek()`es una función de compatibilidad que es equivalente a `toWeek(date,3)`. +La siguiente tabla describe cómo funciona el argumento mode. + +| Modo | Primer día de la semana | Gama | Semana 1 es la primera semana … | +|------------|-------------------------|------------------|---------------------------------| +| Cero | Domingo | 0-53 | con un domingo de este año | +| Uno | Lunes | 0-53 | con 4 o más días este año | +| Cómo hacer | Domingo | Sistema abierto. | con un domingo de este año | +| Cómo hacer | Lunes | Sistema abierto. | con 4 o más días este año | +| Cuatro | Domingo | 0-53 | con 4 o más días este año | +| Cinco | Lunes | 0-53 | con un lunes de este año | +| Código | Domingo | Sistema abierto. | con 4 o más días este año | +| Siete | Lunes | Sistema abierto. | con un lunes de este año | +| Ocho | Domingo | Sistema abierto. | contiene 1 de enero | +| Nueve | Lunes | Sistema abierto. | contiene 1 de enero | + +Para valores de modo con un significado de “with 4 or more days this year,” semanas están numeradas según ISO 8601:1988: + +- Si la semana que contiene el 1 de enero tiene 4 o más días en el nuevo año, es la semana 1. + +- De lo contrario, es la última semana del año anterior, y la semana siguiente es la semana 1. + +Para valores de modo con un significado de “contains January 1”, la semana contiene 1 de enero es la semana 1. No importa cuántos días en el nuevo año la semana contenía, incluso si contenía sólo un día. + +``` sql +toWeek(date, [, mode][, Timezone]) +``` + +**Parámetros** + +- `date` – Fecha o DateTime. +- `mode` – Parámetro opcional, Rango de valores es \[0,9\], por defecto es 0. +- `Timezone` – Parámetro opcional, se comporta como cualquier otra función de conversión. + +**Ejemplo** + +``` sql +SELECT toDate('2016-12-27') AS date, toWeek(date) AS week0, toWeek(date,1) AS week1, toWeek(date,9) AS week9; +``` + +``` text +┌───────date─┬─week0─┬─week1─┬─week9─┐ +│ 2016-12-27 │ 52 │ 52 │ 1 │ +└────────────┴───────┴───────┴───────┘ +``` + +## aYearWeek(fecha\[,modo\]) {#toyearweekdatemode} + +Devuelve año y semana para una fecha. El año en el resultado puede ser diferente del año en el argumento de fecha para la primera y la última semana del año. + +El argumento mode funciona exactamente igual que el argumento mode a toWeek() . Para la sintaxis de argumento único, se utiliza un valor de modo de 0. + +`toISOYear()`es una función de compatibilidad que es equivalente a `intDiv(toYearWeek(date,3),100)`. + +**Ejemplo** + +``` sql +SELECT toDate('2016-12-27') AS date, toYearWeek(date) AS yearWeek0, toYearWeek(date,1) AS yearWeek1, toYearWeek(date,9) AS yearWeek9; +``` + +``` text +┌───────date─┬─yearWeek0─┬─yearWeek1─┬─yearWeek9─┐ +│ 2016-12-27 │ 201652 │ 201652 │ 201701 │ +└────────────┴───────────┴───────────┴───────────┘ +``` + +## ahora {#now} + +Acepta cero argumentos y devuelve la hora actual en uno de los momentos de ejecución de la solicitud. +Esta función devuelve una constante, incluso si la solicitud tardó mucho en completarse. + +## Hoy {#today} + +Acepta cero argumentos y devuelve la fecha actual en uno de los momentos de ejecución de la solicitud. +Lo mismo que ‘toDate(now())’. 
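+
+Un esquema mínimo que ilustra la relación entre ambas funciones:
+
+``` sql
+SELECT
+    now() AS hora_actual,
+    today() AS fecha_actual,
+    toDate(now()) = today() AS coinciden -- 1, salvo que la consulta se ejecute justo al cruzar la medianoche
+```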
+ +## enfermería {#yesterday} + +Acepta cero argumentos y devuelve la fecha de ayer en uno de los momentos de ejecución de la solicitud. +Lo mismo que ‘today() - 1’. + +## timeSlot {#timeslot} + +Redondea el tiempo a la media hora. +Esta función es específica de Yandex.Metrica, ya que media hora es la cantidad mínima de tiempo para dividir una sesión en dos sesiones si una etiqueta de seguimiento muestra las páginas vistas consecutivas de un solo usuario que difieren en el tiempo en estrictamente más de esta cantidad. Esto significa que las tuplas (el ID de etiqueta, el ID de usuario y el intervalo de tiempo) se pueden usar para buscar páginas vistas que se incluyen en la sesión correspondiente. + +## paraYYYYMM {#toyyyymm} + +Convierte una fecha o fecha con hora en un número UInt32 que contiene el número de año y mes (YYYY \* 100 + MM). + +## paraYYYYMMDD {#toyyyymmdd} + +Convierte una fecha o fecha con hora en un número UInt32 que contiene el número de año y mes (AAAA \* 10000 + MM \* 100 + DD). + +## paraYYYYMMDDhhmmss {#toyyyymmddhhmmss} + +Convierte una fecha o fecha con hora en un número UInt64 que contiene el número de año y mes (YYYY \* 10000000000 + MM \* 100000000 + DD \* 1000000 + hh \* 10000 + mm \* 100 + ss). + +## Por ejemplo, en el caso de que el usuario pueda acceder a la página de inicio de sesión, seleccione la página de inicio de sesión. {#addyears-addmonths-addweeks-adddays-addhours-addminutes-addseconds-addquarters} + +La función agrega un intervalo de fecha / fecha y hora a una fecha / fecha y hora y luego devuelve la fecha / fecha y hora. Por ejemplo: + +``` sql +WITH + toDate('2018-01-01') AS date, + toDateTime('2018-01-01 00:00:00') AS date_time +SELECT + addYears(date, 1) AS add_years_with_date, + addYears(date_time, 1) AS add_years_with_date_time +``` + +``` text +┌─add_years_with_date─┬─add_years_with_date_time─┐ +│ 2019-01-01 │ 2019-01-01 00:00:00 │ +└─────────────────────┴──────────────────────────┘ +``` + +## restarAños, restarMeses, restarSemanas, restarDías, restarHoras, restarMinutos, restarSegundos, restarCuartos {#subtractyears-subtractmonths-subtractweeks-subtractdays-subtracthours-subtractminutes-subtractseconds-subtractquarters} + +La función resta un intervalo de fecha / fecha y hora a una fecha / fecha y hora y luego devuelve la fecha / fecha y hora. Por ejemplo: + +``` sql +WITH + toDate('2019-01-01') AS date, + toDateTime('2019-01-01 00:00:00') AS date_time +SELECT + subtractYears(date, 1) AS subtract_years_with_date, + subtractYears(date_time, 1) AS subtract_years_with_date_time +``` + +``` text +┌─subtract_years_with_date─┬─subtract_years_with_date_time─┐ +│ 2018-01-01 │ 2018-01-01 00:00:00 │ +└──────────────────────────┴───────────────────────────────┘ +``` + +## dateDiff {#datediff} + +Devuelve la diferencia entre dos valores Date o DateTime. + +**Sintaxis** + +``` sql +dateDiff('unit', startdate, enddate, [timezone]) +``` + +**Parámetros** + +- `unit` — Unidad de tiempo, en la que se expresa el valor devuelto. [Cadena](../syntax.md#syntax-string-literal). + + Supported values: + + | unit | + | ------ | + |second | + |minute | + |hour | + |day | + |week | + |month | + |quarter | + |year | + +- `startdate` — El primer valor de tiempo para comparar. [Fecha](../../data_types/date.md) o [FechaHora](../../data_types/datetime.md). + +- `enddate` — El segundo valor de tiempo para comparar. [Fecha](../../data_types/date.md) o [FechaHora](../../data_types/datetime.md). + +- `timezone` — Parámetro opcional. 
Si se especifica, se aplica a ambos `startdate` y `enddate`. Si no se especifica, las zonas horarias `startdate` y `enddate` se utilizan. Si no son lo mismo, el resultado no está especificado. + +**Valor devuelto** + +Diferencia entre `startdate` y `enddate` expresado en `unit`. + +Tipo: `int`. + +**Ejemplo** + +Consulta: + +``` sql +SELECT dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00')); +``` + +Resultado: + +``` text +┌─dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00'))─┐ +│ 25 │ +└────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +## timeSlots(Hora de inicio, Duración, \[, Tamaño\]) {#timeslotsstarttime-duration-size} + +Para un intervalo de tiempo a partir de ‘StartTime’ y continuando por ‘Duration’ segundos, devuelve una matriz de momentos en el tiempo, que consiste en puntos de este intervalo ‘Size’ en segundos. ‘Size’ es un parámetro opcional: una constante UInt32, establecida en 1800 por defecto. +Por ejemplo, `timeSlots(toDateTime('2012-01-01 12:20:00'), 600) = [toDateTime('2012-01-01 12:00:00'), toDateTime('2012-01-01 12:30:00')]`. +Esto es necesario para buscar páginas vistas en la sesión correspondiente. + +## FormatDateTime(Hora, Formato\[, Zona horaria\]) {#formatdatetime} + +La función da formato a una hora según una cadena de formato dada. NB: El formato es una expresión constante, por ejemplo, no puede tener múltiples formatos para una sola columna de resultado. + +Modificadores compatibles para Formato: +(“Example” columna muestra el resultado de formateo para el tiempo `2018-01-02 22:33:44`) + +| Modificador | Descripción | Ejemplo | +|------------------|------------------------------------------------------------------|----------------------------------| +| %C | año dividido por 100 y truncado a entero (00-99) | Veinte | +| %d | día del mes, cero acolchado (01-31) | Bienvenido | +| %D | Fecha corta de MM/DD/YY, equivalente a %m/%d/%a | Método de codificación de datos: | +| %e | día del mes, espacio acolchado ( 1-31) | Cómo hacer | +| Categoría | fecha corta AAAA-MM-DD, equivalente de la onu %Y-%m-%d | Sistema abierto. | +| %H | hora en formato 24h (00-23) | Veintidos | +| %Me | hora en formato 12h (01-12) | Diez | +| %j | día del año (001-366) | Categoría | +| Más información | mes como un número decimal (01-12) | Acerca de | +| Más información | minutos (00-59) | Sistema abierto. | +| %y | carácter de nueva línea («») | | +| %p | Designación AM o PM | PM | +| %R | HH de 24 horas: Tiempo del milímetro, equivalente a %H: %M | 22:33 | +| %S | segundo (00-59) | Sistema abierto. | +| % t | carácter de pestaña horizontal (') | | +| Tipo de artículo | Formato de hora ISO 8601 (HH:MM:SS), equivalente a %H:%M:%S | 22:33:44 | +| %u | ISO 8601 día de la semana como número con el lunes como 1 (1-7) | Cómo hacer | +| %V | Número de semana ISO 8601 (01-53) | Acerca de | +| % w | día de la semana como un número decimal con domingo como 0 (0-6) | Cómo hacer | +| %y | Año, últimos dos dígitos (00-99) | Acerca de | +| %Y | Año | 2018 Nueva York | +| %% | Por qué? 
| % | + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/date_time_functions/) diff --git a/docs/es/query_language/functions/encoding_functions.md b/docs/es/query_language/functions/encoding_functions.md new file mode 100644 index 00000000000..e8b59c85588 --- /dev/null +++ b/docs/es/query_language/functions/encoding_functions.md @@ -0,0 +1,168 @@ +# Funciones de codificación {#encoding-functions} + +## char {#char} + +Devuelve la cadena con la longitud como el número de argumentos pasados y cada byte tiene el valor del argumento correspondiente. Acepta varios argumentos de tipos numéricos. Si el valor del argumento está fuera del rango del tipo de datos UInt8, se convierte a UInt8 con posible redondeo y desbordamiento. + +**Sintaxis** + +``` sql +char(number_1, [number_2, ..., number_n]); +``` + +**Parámetros** + +- `number_1, number_2, ..., number_n` — Argumentos numéricos interpretados como enteros. Tipo: [En](../../data_types/int_uint.md), [Flotante](../../data_types/float.md). + +**Valor devuelto** + +- una cadena de bytes. + +Tipo: `String`. + +**Ejemplo** + +Consulta: + +``` sql +SELECT char(104.1, 101, 108.9, 108.9, 111) AS hello +``` + +Resultado: + +``` text +┌─hello─┐ +│ hello │ +└───────┘ +``` + +Puede construir una cadena de codificación arbitraria pasando los bytes correspondientes. Aquí hay un ejemplo para UTF-8: + +Consulta: + +``` sql +SELECT char(0xD0, 0xBF, 0xD1, 0x80, 0xD0, 0xB8, 0xD0, 0xB2, 0xD0, 0xB5, 0xD1, 0x82) AS hello; +``` + +Resultado: + +``` text +┌─hello──┐ +│ привет │ +└────────┘ +``` + +Consulta: + +``` sql +SELECT char(0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD) AS hello; +``` + +Resultado: + +``` text +┌─hello─┐ +│ 你好 │ +└───────┘ +``` + +## hexagonal {#hex} + +Devuelve una cadena que contiene la representación hexadecimal del argumento. + +**Sintaxis** + +``` sql +hex(arg) +``` + +La función está usando letras mayúsculas `A-F` y no usar ningún prefijo (como `0x`) o sufijos (como `h`). + +Para argumentos enteros, imprime dígitos hexadecimales (“nibbles”) del más significativo al menos significativo (big endian o “human readable” orden). Comienza con el byte distinto de cero más significativo (se omiten los cero bytes principales) pero siempre imprime ambos dígitos de cada byte incluso si el dígito inicial es cero. + +Ejemplo: + +**Ejemplo** + +Consulta: + +``` sql +SELECT hex(1); +``` + +Resultado: + +``` text +01 +``` + +Valores de tipo `Date` y `DateTime` están formateados como enteros correspondientes (el número de días desde Epoch para Date y el valor de Unix Timestamp para DateTime). + +Para `String` y `FixedString`, todos los bytes son simplemente codificados como dos números hexadecimales. No se omiten cero bytes. + +Los valores de los tipos de coma flotante y Decimal se codifican como su representación en la memoria. Como apoyamos la pequeña arquitectura endian, están codificados en little endian. No se omiten cero bytes iniciales / finales. + +**Parámetros** + +- `arg` — Un valor para convertir a hexadecimal. Tipo: [Cadena](../../data_types/string.md), [UInt](../../data_types/int_uint.md), [Flotante](../../data_types/float.md), [Decimal](../../data_types/decimal.md), [Fecha](../../data_types/date.md) o [FechaHora](../../data_types/datetime.md). + +**Valor devuelto** + +- Una cadena con la representación hexadecimal del argumento. + +Tipo: `String`. 
+ +**Ejemplo** + +Consulta: + +``` sql +SELECT hex(toFloat32(number)) as hex_presentation FROM numbers(15, 2); +``` + +Resultado: + +``` text +┌─hex_presentation─┐ +│ 00007041 │ +│ 00008041 │ +└──────────────────┘ +``` + +Consulta: + +``` sql +SELECT hex(toFloat64(number)) as hex_presentation FROM numbers(15, 2); +``` + +Resultado: + +``` text +┌─hex_presentation─┐ +│ 0000000000002E40 │ +│ 0000000000003040 │ +└──────────────────┘ +``` + +## unhex(str) {#unhexstr} + +Acepta una cadena que contiene cualquier número de dígitos hexadecimales y devuelve una cadena que contiene los bytes correspondientes. Admite letras mayúsculas y minúsculas A-F. El número de dígitos hexadecimales no tiene que ser par. Si es impar, el último dígito se interpreta como la mitad menos significativa del byte 00-0F. Si la cadena de argumento contiene algo distinto de los dígitos hexadecimales, se devuelve algún resultado definido por la implementación (no se produce una excepción). +Si desea convertir el resultado en un número, puede usar el ‘reverse’ y ‘reinterpretAsType’ función. + +## UUIDStringToNum (str) {#uuidstringtonumstr} + +Acepta una cadena que contiene 36 caracteres en el formato `123e4567-e89b-12d3-a456-426655440000`, y lo devuelve como un conjunto de bytes en un FixedString(16). + +## UUIDNumToString (str) {#uuidnumtostringstr} + +Acepta un valor de FixedString(16). Devuelve una cadena que contiene 36 caracteres en formato de texto. + +## ¿Cómo puedo hacerlo?) {#bitmasktolistnum} + +Acepta un entero. Devuelve una cadena que contiene la lista de potencias de dos que suman el número de origen cuando se suma. Están separados por comas sin espacios en formato de texto, en orden ascendente. + +## ¿Qué puedes encontrar en Neodigit) {#bitmasktoarraynum} + +Acepta un entero. Devuelve una matriz de números UInt64 que contiene la lista de potencias de dos que suman el número de origen cuando se suma. Los números en la matriz están en orden ascendente. + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/encoding_functions/) diff --git a/docs/es/query_language/functions/ext_dict_functions.md b/docs/es/query_language/functions/ext_dict_functions.md new file mode 100644 index 00000000000..d5d4a481141 --- /dev/null +++ b/docs/es/query_language/functions/ext_dict_functions.md @@ -0,0 +1,198 @@ +# Funciones para trabajar con diccionarios externos {#ext-dict-functions} + +Para obtener información sobre cómo conectar y configurar diccionarios externos, consulte [Diccionarios externos](../dicts/external_dicts.md). + +## dictGet {#dictget} + +Recupera un valor de un diccionario externo. + +``` sql +dictGet('dict_name', 'attr_name', id_expr) +dictGetOrDefault('dict_name', 'attr_name', id_expr, default_value_expr) +``` + +**Parámetros** + +- `dict_name` — Nombre del diccionario. [Literal de cadena](../syntax.md#syntax-string-literal). +- `attr_name` — Nombre de la columna del diccionario. [Literal de cadena](../syntax.md#syntax-string-literal). +- `id_expr` — Valor clave. [Expresion](../syntax.md#syntax-expressions) devolviendo un [UInt64](../../data_types/int_uint.md) o [Tupla](../../data_types/tuple.md)valor -type dependiendo de la configuración del diccionario. +- `default_value_expr` — Valor devuelto si el diccionario no contiene una fila `id_expr` Nivel de Cifrado WEP [Expresion](../syntax.md#syntax-expressions) devolviendo el valor en el tipo de datos configurado para `attr_name` atributo. 
+ +**Valor devuelto** + +- Si ClickHouse analiza el atributo correctamente en el [tipo de datos del atributo](../../query_language/dicts/external_dicts_dict_structure.md#ext_dict_structure-attributes), funciones devuelven el valor del atributo de diccionario que corresponde a `id_expr`. + +- Si no hay la clave, correspondiente a `id_expr` es el diccionario, entonces: + + - `dictGet` returns the content of the `` element specified for the attribute in the dictionary configuration. + - `dictGetOrDefault` returns the value passed as the `default_value_expr` parameter. + +ClickHouse produce una excepción si no puede analizar el valor del atributo o si el valor no coincide con el tipo de datos del atributo. + +**Ejemplo** + +Crear un archivo de texto `ext-dict-text.csv` que contiene los siguientes: + +``` text +1,1 +2,2 +``` + +La primera columna es `id` la segunda columna es `c1`. + +Configurar el diccionario externo: + +``` xml + + + ext-dict-test + + + /path-to/ext-dict-test.csv + CSV + + + + + + + + id + + + c1 + UInt32 + + + + 0 + + +``` + +Realizar la consulta: + +``` sql +SELECT + dictGetOrDefault('ext-dict-test', 'c1', number + 1, toUInt32(number * 10)) AS val, + toTypeName(val) AS type +FROM system.numbers +LIMIT 3 +``` + +``` text +┌─val─┬─type───┐ +│ 1 │ UInt32 │ +│ 2 │ UInt32 │ +│ 20 │ UInt32 │ +└─────┴────────┘ +``` + +**Ver también** + +- [Diccionarios externos](../dicts/external_dicts.md) + +## dictHas {#dicthas} + +Comprueba si hay una clave en un diccionario. + +``` sql +dictHas('dict_name', id_expr) +``` + +**Parámetros** + +- `dict_name` — Nombre del diccionario. [Literal de cadena](../syntax.md#syntax-string-literal). +- `id_expr` — Valor clave. [Expresion](../syntax.md#syntax-expressions) devolviendo un [UInt64](../../data_types/int_uint.md)-tipo de valor. + +**Valor devuelto** + +- 0, si no hay clave. +- 1, si hay una llave. + +Tipo: `UInt8`. + +## dictGetHierarchy {#dictgethierarchy} + +Crea una matriz, que contiene todos los padres de una clave [diccionario jerárquico](../dicts/external_dicts_dict_hierarchical.md). + +**Sintaxis** + +``` sql +dictGetHierarchy('dict_name', key) +``` + +**Parámetros** + +- `dict_name` — Nombre del diccionario. [Literal de cadena](../syntax.md#syntax-string-literal). +- `key` — Valor clave. [Expresion](../syntax.md#syntax-expressions) devolviendo un [UInt64](../../data_types/int_uint.md)-tipo de valor. + +**Valor devuelto** + +- Padres por la llave. + +Tipo: [Matriz (UInt64)](../../data_types/array.md). + +## DictIsIn {#dictisin} + +Comprueba el antecesor de una clave a través de toda la cadena jerárquica en el diccionario. + +``` sql +dictIsIn('dict_name', child_id_expr, ancestor_id_expr) +``` + +**Parámetros** + +- `dict_name` — Nombre del diccionario. [Literal de cadena](../syntax.md#syntax-string-literal). +- `child_id_expr` — Clave a comprobar. [Expresion](../syntax.md#syntax-expressions) devolviendo un [UInt64](../../data_types/int_uint.md)-tipo de valor. +- `ancestor_id_expr` — Presunto ancestro de la `child_id_expr` Nivel de Cifrado WEP [Expresion](../syntax.md#syntax-expressions) devolviendo un [UInt64](../../data_types/int_uint.md)-tipo de valor. + +**Valor devuelto** + +- 0, si `child_id_expr` no es un niño de `ancestor_id_expr`. +- 1, si `child_id_expr` es un niño de `ancestor_id_expr` o si `child_id_expr` es una `ancestor_id_expr`. + +Tipo: `UInt8`. 
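+
+A modo de ilustración, un esquema mínimo que combina ambas funciones, suponiendo un diccionario jerárquico hipotético llamado `regions_dict` en el que la clave 213 desciende de la clave 1:
+
+``` sql
+SELECT
+    dictGetHierarchy('regions_dict', toUInt64(213)) AS cadena_de_ancestros, -- p. ej. [213, 1]
+    dictIsIn('regions_dict', toUInt64(213), toUInt64(1)) AS es_descendiente -- 1
+```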
+ +## Otras funciones {#ext-dict-functions-other} + +ClickHouse admite funciones especializadas que convierten los valores de atributo de diccionario a un tipo de datos específico, independientemente de la configuración del diccionario. + +Función: + +- `dictGetInt8`, `dictGetInt16`, `dictGetInt32`, `dictGetInt64` +- `dictGetUInt8`, `dictGetUInt16`, `dictGetUInt32`, `dictGetUInt64` +- `dictGetFloat32`, `dictGetFloat64` +- `dictGetDate` +- `dictGetDateTime` +- `dictGetUUID` +- `dictGetString` + +Todas estas funciones tienen el `OrDefault` modificación. Por ejemplo, `dictGetDateOrDefault`. + +Sintaxis: + +``` sql +dictGet[Type]('dict_name', 'attr_name', id_expr) +dictGet[Type]OrDefault('dict_name', 'attr_name', id_expr, default_value_expr) +``` + +**Parámetros** + +- `dict_name` — Nombre del diccionario. [Literal de cadena](../syntax.md#syntax-string-literal). +- `attr_name` — Nombre de la columna del diccionario. [Literal de cadena](../syntax.md#syntax-string-literal). +- `id_expr` — Valor clave. [Expresion](../syntax.md#syntax-expressions) devolviendo un [UInt64](../../data_types/int_uint.md)-tipo de valor. +- `default_value_expr` — Valor que se devuelve si el diccionario no contiene una fila `id_expr` Nivel de Cifrado WEP [Expresion](../syntax.md#syntax-expressions) devolviendo un valor en el tipo de datos configurado para `attr_name` atributo. + +**Valor devuelto** + +- Si ClickHouse analiza el atributo correctamente en el [tipo de datos del atributo](../../query_language/dicts/external_dicts_dict_structure.md#ext_dict_structure-attributes), funciones devuelven el valor del atributo de diccionario que corresponde a `id_expr`. + +- Si no se solicita `id_expr` es el diccionario entonces: + + - `dictGet[Type]` returns the content of the `` element specified for the attribute in the dictionary configuration. + - `dictGet[Type]OrDefault` returns the value passed as the `default_value_expr` parameter. + +ClickHouse produce una excepción si no puede analizar el valor del atributo o si el valor no coincide con el tipo de datos del atributo. + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/ext_dict_functions/) diff --git a/docs/es/query_language/functions/functions_for_nulls.md b/docs/es/query_language/functions/functions_for_nulls.md new file mode 100644 index 00000000000..f3c71fc7b89 --- /dev/null +++ b/docs/es/query_language/functions/functions_for_nulls.md @@ -0,0 +1,305 @@ +# Funciones para trabajar con agregados anulables {#functions-for-working-with-nullable-aggregates} + +## IsNull {#isnull} + +Comprueba si el argumento es [NULO](../syntax.md#null). + +``` sql +isNull(x) +``` + +**Parámetros** + +- `x` — Un valor con un tipo de datos no compuesto. + +**Valor devuelto** + +- `1` si `x` ser `NULL`. +- `0` si `x` no es `NULL`. + +**Ejemplo** + +Tabla de entrada + +``` text +┌─x─┬────y─┐ +│ 1 │ ᴺᵁᴸᴸ │ +│ 2 │ 3 │ +└───┴──────┘ +``` + +Consulta + +``` sql +SELECT x FROM t_null WHERE isNull(y) +``` + +``` text +┌─x─┐ +│ 1 │ +└───┘ +``` + +## isNotNull {#isnotnull} + +Comprueba si el argumento es [NULO](../syntax.md#null). + +``` sql +isNotNull(x) +``` + +**Parámetros:** + +- `x` — Un valor con un tipo de datos no compuesto. + +**Valor devuelto** + +- `0` si `x` ser `NULL`. +- `1` si `x` no es `NULL`. 
+ +**Ejemplo** + +Tabla de entrada + +``` text +┌─x─┬────y─┐ +│ 1 │ ᴺᵁᴸᴸ │ +│ 2 │ 3 │ +└───┴──────┘ +``` + +Consulta + +``` sql +SELECT x FROM t_null WHERE isNotNull(y) +``` + +``` text +┌─x─┐ +│ 2 │ +└───┘ +``` + +## Coalesce {#coalesce} + +Comprueba de izquierda a derecha si `NULL` se aprobaron argumentos y devuelve el primer no-`NULL` argumento. + +``` sql +coalesce(x,...) +``` + +**Parámetros:** + +- Cualquier número de parámetros de un tipo no compuesto. Todos los parámetros deben ser compatibles por tipo de datos. + +**Valores devueltos** + +- El primer no-`NULL` argumento. +- `NULL` si todos los argumentos hijo `NULL`. + +**Ejemplo** + +Considere una lista de contactos que pueden especificar varias formas de contactar a un cliente. + +``` text +┌─name─────┬─mail─┬─phone─────┬──icq─┐ +│ client 1 │ ᴺᵁᴸᴸ │ 123-45-67 │ 123 │ +│ client 2 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ +└──────────┴──────┴───────────┴──────┘ +``` + +El `mail` y `phone` los campos son de tipo String, pero el `icq` campo `UInt32`, por lo que necesita ser convertido a `String`. + +Obtenga el primer método de contacto disponible para el cliente de la lista de contactos: + +``` sql +SELECT coalesce(mail, phone, CAST(icq,'Nullable(String)')) FROM aBook +``` + +``` text +┌─name─────┬─coalesce(mail, phone, CAST(icq, 'Nullable(String)'))─┐ +│ client 1 │ 123-45-67 │ +│ client 2 │ ᴺᵁᴸᴸ │ +└──────────┴──────────────────────────────────────────────────────┘ +``` + +## ifNull {#ifnull} + +Devuelve un valor alternativo si el argumento principal es `NULL`. + +``` sql +ifNull(x,alt) +``` + +**Parámetros:** + +- `x` — El valor que se debe comprobar `NULL`. +- `alt` — El valor que devuelve la función si `x` ser `NULL`. + +**Valores devueltos** + +- Valor `x` si `x` no es `NULL`. +- Valor `alt` si `x` ser `NULL`. + +**Ejemplo** + +``` sql +SELECT ifNull('a', 'b') +``` + +``` text +┌─ifNull('a', 'b')─┐ +│ a │ +└──────────────────┘ +``` + +``` sql +SELECT ifNull(NULL, 'b') +``` + +``` text +┌─ifNull(NULL, 'b')─┐ +│ b │ +└───────────────────┘ +``` + +## nullIf {#nullif} + +Devoluciones `NULL` si los argumentos son iguales. + +``` sql +nullIf(x, y) +``` + +**Parámetros:** + +`x`, `y` — Valores de comparación. Deben ser tipos compatibles, o ClickHouse generará una excepción. + +**Valores devueltos** + +- `NULL` si los argumentos son iguales. +- El `x` valor, si los argumentos no son iguales. + +**Ejemplo** + +``` sql +SELECT nullIf(1, 1) +``` + +``` text +┌─nullIf(1, 1)─┐ +│ ᴺᵁᴸᴸ │ +└──────────────┘ +``` + +``` sql +SELECT nullIf(1, 2) +``` + +``` text +┌─nullIf(1, 2)─┐ +│ 1 │ +└──────────────┘ +``` + +## assumeNotNull {#assumenotnull} + +Resultados en un valor de tipo [NULO](../../data_types/nullable.md) para un no- `Nullable` si el valor no es `NULL`. + +``` sql +assumeNotNull(x) +``` + +**Parámetros:** + +- `x` — El valor original. + +**Valores devueltos** + +- El valor original del-`Nullable` tipo, si no es `NULL`. +- El valor predeterminado para el-`Nullable` tipo si el valor original fue `NULL`. + +**Ejemplo** + +Considere el `t_null` tabla. + +``` sql +SHOW CREATE TABLE t_null +``` + +``` text +┌─statement─────────────────────────────────────────────────────────────────┐ +│ CREATE TABLE default.t_null ( x Int8, y Nullable(Int8)) ENGINE = TinyLog │ +└───────────────────────────────────────────────────────────────────────────┘ +``` + +``` text +┌─x─┬────y─┐ +│ 1 │ ᴺᵁᴸᴸ │ +│ 2 │ 3 │ +└───┴──────┘ +``` + +Aplicar el `assumeNotNull` función a la `y` columna. 
+ +``` sql +SELECT assumeNotNull(y) FROM t_null +``` + +``` text +┌─assumeNotNull(y)─┐ +│ 0 │ +│ 3 │ +└──────────────────┘ +``` + +``` sql +SELECT toTypeName(assumeNotNull(y)) FROM t_null +``` + +``` text +┌─toTypeName(assumeNotNull(y))─┐ +│ Int8 │ +│ Int8 │ +└──────────────────────────────┘ +``` + +## Acerca de Nosotros {#tonullable} + +Convierte el tipo de argumento a `Nullable`. + +``` sql +toNullable(x) +``` + +**Parámetros:** + +- `x` — El valor de cualquier tipo no compuesto. + +**Valor devuelto** + +- El valor de entrada con un `Nullable` tipo. + +**Ejemplo** + +``` sql +SELECT toTypeName(10) +``` + +``` text +┌─toTypeName(10)─┐ +│ UInt8 │ +└────────────────┘ +``` + +``` sql +SELECT toTypeName(toNullable(10)) +``` + +``` text +┌─toTypeName(toNullable(10))─┐ +│ Nullable(UInt8) │ +└────────────────────────────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/functions_for_nulls/) diff --git a/docs/es/query_language/functions/geo.md b/docs/es/query_language/functions/geo.md new file mode 100644 index 00000000000..62083a6a09e --- /dev/null +++ b/docs/es/query_language/functions/geo.md @@ -0,0 +1,503 @@ +# Funciones para trabajar con coordenadas geográficas {#functions-for-working-with-geographical-coordinates} + +## GranCircleDistance {#greatcircledistance} + +Calcule la distancia entre dos puntos en la superficie de la Tierra usando [la fórmula del gran círculo](https://en.wikipedia.org/wiki/Great-circle_distance). + +``` sql +greatCircleDistance(lon1Deg, lat1Deg, lon2Deg, lat2Deg) +``` + +**Parámetros de entrada** + +- `lon1Deg` — Longitud del primer punto en grados. Gama: `[-180°, 180°]`. +- `lat1Deg` — Latitud del primer punto en grados. Gama: `[-90°, 90°]`. +- `lon2Deg` — Longitud del segundo punto en grados. Gama: `[-180°, 180°]`. +- `lat2Deg` — Latitud del segundo punto en grados. Gama: `[-90°, 90°]`. + +Los valores positivos corresponden a latitud norte y longitud este, y los valores negativos corresponden a latitud sur y longitud oeste. + +**Valor devuelto** + +La distancia entre dos puntos en la superficie de la Tierra, en metros. + +Genera una excepción cuando los valores de los parámetros de entrada están fuera del intervalo. + +**Ejemplo** + +``` sql +SELECT greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673) +``` + +``` text +┌─greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673)─┐ +│ 14132374.194975413 │ +└───────────────────────────────────────────────────────────────────┘ +``` + +## pointInEllipses {#pointinellipses} + +Comprueba si el punto pertenece al menos a una de las elipses. +Las coordenadas son geométricas en el sistema de coordenadas cartesianas. + +``` sql +pointInEllipses(x, y, x₀, y₀, a₀, b₀,...,xₙ, yₙ, aₙ, bₙ) +``` + +**Parámetros de entrada** + +- `x, y` — Coordenadas de un punto en el plano. +- `xᵢ, yᵢ` — Coordenadas del centro de la `i`-ésimo puntos suspensivos. +- `aᵢ, bᵢ` — Ejes del `i`-ésimo puntos suspensivos en unidades de coordenadas x, y. + +Los parámetros de entrada deben ser `2+4⋅n`, donde `n` es el número de puntos suspensivos. + +**Valores devueltos** + +`1` si el punto está dentro de al menos una de las elipses; `0`si no lo es. + +**Ejemplo** + +``` sql +SELECT pointInEllipses(10., 10., 10., 9.1, 1., 0.9999) +``` + +``` text +┌─pointInEllipses(10., 10., 10., 9.1, 1., 0.9999)─┐ +│ 1 │ +└─────────────────────────────────────────────────┘ +``` + +## pointInPolygon {#pointinpolygon} + +Comprueba si el punto pertenece al polígono en el plano. 
+ +``` sql +pointInPolygon((x, y), [(a, b), (c, d) ...], ...) +``` + +**Valores de entrada** + +- `(x, y)` — Coordenadas de un punto en el plano. Tipo de datos — [Tupla](../../data_types/tuple.md) — Una tupla de dos números. +- `[(a, b), (c, d) ...]` — Vértices de polígono. Tipo de datos — [Matriz](../../data_types/array.md). Cada vértice está representado por un par de coordenadas `(a, b)`. Los vértices deben especificarse en sentido horario o antihorario. El número mínimo de vértices es 3. El polígono debe ser constante. +- La función también admite polígonos con agujeros (secciones recortadas). En este caso, agregue polígonos que definan las secciones recortadas utilizando argumentos adicionales de la función. La función no admite polígonos no simplemente conectados. + +**Valores devueltos** + +`1` si el punto está dentro del polígono, `0` si no lo es. +Si el punto está en el límite del polígono, la función puede devolver 0 o 1. + +**Ejemplo** + +``` sql +SELECT pointInPolygon((3., 3.), [(6, 0), (8, 4), (5, 8), (0, 2)]) AS res +``` + +``` text +┌─res─┐ +│ 1 │ +└─────┘ +``` + +## geohashEncode {#geohashencode} + +Codifica la latitud y la longitud como una cadena geohash, consulte (http://geohash.org/, https://en.wikipedia.org/wiki/Geohash). + +``` sql +geohashEncode(longitude, latitude, [precision]) +``` + +**Valores de entrada** + +- longitud - longitud parte de la coordenada que desea codificar. Flotando en el rango`[-180°, 180°]` +- latitude : parte de latitud de la coordenada que desea codificar. Flotando en el rango `[-90°, 90°]` +- precision - Opcional, longitud de la cadena codificada resultante, por defecto es `12`. Entero en el rango `[1, 12]`. Cualquier valor menor que `1` o mayor que `12` se convierte silenciosamente a `12`. + +**Valores devueltos** + +- alfanumérico `String` de coordenadas codificadas (se utiliza la versión modificada del alfabeto de codificación base32). + +**Ejemplo** + +``` sql +SELECT geohashEncode(-5.60302734375, 42.593994140625, 0) AS res +``` + +``` text +┌─res──────────┐ +│ ezs42d000000 │ +└──────────────┘ +``` + +## geohashDecode {#geohashdecode} + +Decodifica cualquier cadena codificada por geohash en longitud y latitud. + +**Valores de entrada** + +- cadena codificada - cadena codificada geohash. + +**Valores devueltos** + +- (longitud, latitud) - 2-tupla de `Float64` valores de longitud y latitud. + +**Ejemplo** + +``` sql +SELECT geohashDecode('ezs42') AS res +``` + +``` text +┌─res─────────────────────────────┐ +│ (-5.60302734375,42.60498046875) │ +└─────────────────────────────────┘ +``` + +## geoToH3 {#geotoh3} + +Devoluciones [Hombre](https://uber.github.io/h3/#/documentation/overview/introduction) índice de punto `(lon, lat)` con la resolución especificada. + +[Hombre](https://uber.github.io/h3/#/documentation/overview/introduction) es un sistema de indexación geográfica donde la superficie de la Tierra se divide en incluso azulejos hexagonales. Este sistema es jerárquico, es decir, cada hexágono en el nivel superior se puede dividir en siete incluso pero más pequeños y así sucesivamente. + +Este índice se utiliza principalmente para ubicaciones de bucketing y otras manipulaciones geoespaciales. + +**Sintaxis** + +``` sql +geoToH3(lon, lat, resolution) +``` + +**Parámetros** + +- `lon` — Longitud. Tipo: [Float64](../../data_types/float.md). +- `lat` — Latitud. Tipo: [Float64](../../data_types/float.md). +- `resolution` — Resolución del índice. Gama: `[0, 15]`. Tipo: [UInt8](../../data_types/int_uint.md). 
+ +**Valores devueltos** + +- Número de índice hexagonal. +- 0 en caso de error. + +Tipo: `UInt64`. + +**Ejemplo** + +Consulta: + +``` sql +SELECT geoToH3(37.79506683, 55.71290588, 15) as h3Index +``` + +Resultado: + +``` text +┌────────────h3Index─┐ +│ 644325524701193974 │ +└────────────────────┘ +``` + +## Información adicional {#geohashesinbox} + +Devuelve una matriz de cadenas codificadas por geohash de precisión dada que caen dentro e intersecan los límites de un cuadro dado, básicamente una cuadrícula 2D aplanada en una matriz. + +**Valores de entrada** + +- longitude\_min - longitud mínima, valor flotante en el rango `[-180°, 180°]` +- latitude\_min - latitud mínima, valor flotante en el rango `[-90°, 90°]` +- longitude\_max - longitud máxima, valor flotante en el rango `[-180°, 180°]` +- latitude\_max - latitud máxima, valor flotante en el rango `[-90°, 90°]` +- precisión - precisión del geohash, `UInt8` es el rango `[1, 12]` + +Tenga en cuenta que todos los parámetros de coordenadas deben ser del mismo tipo: `Float32` o `Float64`. + +**Valores devueltos** + +- matriz de cadenas de precisión largas de geohash-cajas que cubren el área proporcionada, no debe confiar en el orden de los artículos. +- \[\] - matriz vacía si *minuto* Valores de *latitud* y *longitud* no son menos que correspondiente *máximo* valor. + +Tenga en cuenta que la función arrojará una excepción si la matriz resultante tiene más de 10'000'000 elementos de longitud. + +**Ejemplo** + +``` sql +SELECT geohashesInBox(24.48, 40.56, 24.785, 40.81, 4) AS thasos +``` + +``` text +┌─thasos──────────────────────────────────────┐ +│ ['sx1q','sx1r','sx32','sx1w','sx1x','sx38'] │ +└─────────────────────────────────────────────┘ +``` + +## h3GetBaseCell {#h3getbasecell} + +Devuelve el número de celda base del índice. + +**Sintaxis** + +``` sql +h3GetBaseCell(index) +``` + +**Parámetros** + +- `index` — Número de índice hexagonal. Tipo: [UInt64](../../data_types/int_uint.md). + +**Valores devueltos** + +- Número de celda base hexagonal. Tipo: [UInt8](../../data_types/int_uint.md). + +**Ejemplo** + +Consulta: + +``` sql +SELECT h3GetBaseCell(612916788725809151) as basecell +``` + +Resultado: + +``` text +┌─basecell─┐ +│ 12 │ +└──────────┘ +``` + +## H3HexAreaM2 {#h3hexaream2} + +Área hexagonal promedio en metros cuadrados a la resolución dada. + +**Sintaxis** + +``` sql +h3HexAreaM2(resolution) +``` + +**Parámetros** + +- `resolution` — Resolución del índice. Gama: `[0, 15]`. Tipo: [UInt8](../../data_types/int_uint.md). + +**Valores devueltos** + +- Superficie en m². Tipo: [Float64](../../data_types/float.md). + +**Ejemplo** + +Consulta: + +``` sql +SELECT h3HexAreaM2(13) as area +``` + +Resultado: + +``` text +┌─area─┐ +│ 43.9 │ +└──────┘ +``` + +## h3IndexesAreNeighbors {#h3indexesareneighbors} + +Devuelve si los H3Indexes proporcionados son vecinos o no. + +**Sintaxis** + +``` sql +h3IndexesAreNeighbors(index1, index2) +``` + +**Parámetros** + +- `index1` — Número de índice hexagonal. Tipo: [UInt64](../../data_types/int_uint.md). +- `index2` — Número de índice hexagonal. Tipo: [UInt64](../../data_types/int_uint.md). + +**Valores devueltos** + +- Devoluciones `1` si los índices son vecinos, `0` de lo contrario. Tipo: [UInt8](../../data_types/int_uint.md). + +**Ejemplo** + +Consulta: + +``` sql +SELECT h3IndexesAreNeighbors(617420388351344639, 617420388352655359) AS n +``` + +Resultado: + +``` text +┌─n─┐ +│ 1 │ +└───┘ +``` + +## H3ToChildren {#h3tochildren} + +Devuelve una matriz con los índices secundarios del índice dado. 
+ +**Sintaxis** + +``` sql +h3ToChildren(index, resolution) +``` + +**Parámetros** + +- `index` — Número de índice hexagonal. Tipo: [UInt64](../../data_types/int_uint.md). +- `resolution` — Resolución del índice. Gama: `[0, 15]`. Tipo: [UInt8](../../data_types/int_uint.md). + +**Valores devueltos** + +- Matriz con los índices H3 hijo. Matriz de tipo: [UInt64](../../data_types/int_uint.md). + +**Ejemplo** + +Consulta: + +``` sql +SELECT h3ToChildren(599405990164561919, 6) AS children +``` + +Resultado: + +``` text +┌─children───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ [603909588852408319,603909588986626047,603909589120843775,603909589255061503,603909589389279231,603909589523496959,603909589657714687] │ +└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +## H3ToParent {#h3toparent} + +Devuelve el índice primario (más grueso) que contiene el índice dado. + +**Sintaxis** + +``` sql +h3ToParent(index, resolution) +``` + +**Parámetros** + +- `index` — Número de índice hexagonal. Tipo: [UInt64](../../data_types/int_uint.md). +- `resolution` — Resolución del índice. Gama: `[0, 15]`. Tipo: [UInt8](../../data_types/int_uint.md). + +**Valores devueltos** + +- Índice padre H3. Tipo: [UInt64](../../data_types/int_uint.md). + +**Ejemplo** + +Consulta: + +``` sql +SELECT h3ToParent(599405990164561919, 3) as parent +``` + +Resultado: + +``` text +┌─────────────parent─┐ +│ 590398848891879423 │ +└────────────────────┘ +``` + +## H3ToString {#h3tostring} + +Convierte la representación H3Index del índice en la representación de cadena. + +``` sql +h3ToString(index) +``` + +**Parámetros** + +- `index` — Número de índice hexagonal. Tipo: [UInt64](../../data_types/int_uint.md). + +**Valores devueltos** + +- Representación de cadena del índice H3. Tipo: [Cadena](../../data_types/string.md). + +**Ejemplo** + +Consulta: + +``` sql +SELECT h3ToString(617420388352917503) as h3_string +``` + +Resultado: + +``` text +┌─h3_string───────┐ +│ 89184926cdbffff │ +└─────────────────┘ +``` + +## stringToH3 {#stringtoh3} + +Convierte la representación de cadena en representación H3Index (UInt64). + +``` sql +stringToH3(index_str) +``` + +**Parámetros** + +- `index_str` — Representación de cadena del índice H3. Tipo: [Cadena](../../data_types/string.md). + +**Valores devueltos** + +- Número de índice hexagonal. Devuelve 0 en caso de error. Tipo: [UInt64](../../data_types/int_uint.md). + +**Ejemplo** + +Consulta: + +``` sql +SELECT stringToH3('89184926cc3ffff') as index +``` + +Resultado: + +``` text +┌──────────────index─┐ +│ 617420388351344639 │ +└────────────────────┘ +``` + +## h3GetResolution {#h3getresolution} + +Devuelve la resolución del índice. + +**Sintaxis** + +``` sql +h3GetResolution(index) +``` + +**Parámetros** + +- `index` — Número de índice hexagonal. Tipo: [UInt64](../../data_types/int_uint.md). + +**Valores devueltos** + +- Resolución del índice. Gama: `[0, 15]`. Tipo: [UInt8](../../data_types/int_uint.md). 
+ +**Ejemplo** + +Consulta: + +``` sql +SELECT h3GetResolution(617420388352917503) as res +``` + +Resultado: + +``` text +┌─res─┐ +│ 9 │ +└─────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/geo/) diff --git a/docs/es/query_language/functions/hash_functions.md b/docs/es/query_language/functions/hash_functions.md new file mode 100644 index 00000000000..aac60e3f2f2 --- /dev/null +++ b/docs/es/query_language/functions/hash_functions.md @@ -0,0 +1,439 @@ +# Funciones hash {#hash-functions} + +Las funciones Hash se pueden usar para la barajada pseudoaleatoria determinista de elementos. + +## HalfMD5 {#hash-functions-halfmd5} + +[Interpretar](../../query_language/functions/type_conversion_functions.md#type_conversion_functions-reinterpretAsString) todos los parámetros de entrada como cadenas y calcula el [MD5](https://en.wikipedia.org/wiki/MD5) valor hash para cada uno de ellos. Luego combina hashes, toma los primeros 8 bytes del hash de la cadena resultante y los interpreta como `UInt64` en orden de bytes de big-endian. + +``` sql +halfMD5(par1, ...) +``` + +La función es relativamente lenta (5 millones de cadenas cortas por segundo por núcleo del procesador). +Considere usar el [sipHash64](#hash_functions-siphash64) función en su lugar. + +**Parámetros** + +La función toma un número variable de parámetros de entrada. Los parámetros pueden ser cualquiera de los [Tipos de datos compatibles](../../data_types/index.md). + +**Valor devuelto** + +Naciones [UInt64](../../data_types/int_uint.md) tipo de datos valor hash. + +**Ejemplo** + +``` sql +SELECT halfMD5(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS halfMD5hash, toTypeName(halfMD5hash) AS type +``` + +``` text +┌────────halfMD5hash─┬─type───┐ +│ 186182704141653334 │ UInt64 │ +└────────────────────┴────────┘ +``` + +## MD5 {#hash-functions-md5} + +Calcula el MD5 de una cadena y devuelve el conjunto de bytes resultante como FixedString(16). +Si no necesita MD5 en particular, pero necesita un hash criptográfico de 128 bits decente, use el ‘sipHash128’ función en su lugar. +Si desea obtener el mismo resultado que la salida de la utilidad md5sum, use lower(hex(MD5(s)) . + +## sipHash64 {#hash-functions-siphash64} + +Producir un [SipHash](https://131002.net/siphash/) hachís valor. + +``` sql +sipHash64(par1,...) +``` + +Esta es una función hash criptográfica. Funciona al menos tres veces más rápido que el [MD5](#hash_functions-md5) función. + +Función [interpretar](../../query_language/functions/type_conversion_functions.md#type_conversion_functions-reinterpretAsString) todos los parámetros de entrada como cadenas y calcula el valor hash para cada uno de ellos. Luego combina hashes por el siguiente algoritmo: + +1. Después de hash todos los parámetros de entrada, la función obtiene la matriz de hashes. +2. La función toma el primero y el segundo elementos y calcula un hash para la matriz de ellos. +3. Luego, la función toma el valor hash, calculado en el paso anterior, y el tercer elemento de la matriz hash inicial, y calcula un hash para la matriz de ellos. +4. El paso anterior se repite para todos los elementos restantes de la matriz hash inicial. + +**Parámetros** + +La función toma un número variable de parámetros de entrada. Los parámetros pueden ser cualquiera de los [Tipos de datos compatibles](../../data_types/index.md). + +**Valor devuelto** + +Naciones [UInt64](../../data_types/int_uint.md) tipo de datos valor hash. 
+ +**Ejemplo** + +``` sql +SELECT sipHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS SipHash, toTypeName(SipHash) AS type +``` + +``` text +┌──────────────SipHash─┬─type───┐ +│ 13726873534472839665 │ UInt64 │ +└──────────────────────┴────────┘ +``` + +## sipHash128 {#hash-functions-siphash128} + +Calcula SipHash a partir de una cadena. +Acepta un argumento de tipo String. ¿Cómo puedo hacerlo? +Difiere de sipHash64 en que el estado final de plegado xor solo se realiza hasta 128 bits. + +## cityHash64 {#cityhash64} + +Producir un [Método de codificación de datos:](https://github.com/google/cityhash) hachís valor. + +``` sql +cityHash64(par1,...) +``` + +Esta es una función hash rápida no criptográfica. Utiliza el algoritmo CityHash para parámetros de cadena y la función hash no criptográfica rápida específica de la implementación para parámetros con otros tipos de datos. La función utiliza el combinador CityHash para obtener los resultados finales. + +**Parámetros** + +La función toma un número variable de parámetros de entrada. Los parámetros pueden ser cualquiera de los [Tipos de datos compatibles](../../data_types/index.md). + +**Valor devuelto** + +Naciones [UInt64](../../data_types/int_uint.md) tipo de datos valor hash. + +**Ejemplos** + +Ejemplo de llamada: + +``` sql +SELECT cityHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS CityHash, toTypeName(CityHash) AS type +``` + +``` text +┌─────────────CityHash─┬─type───┐ +│ 12072650598913549138 │ UInt64 │ +└──────────────────────┴────────┘ +``` + +En el ejemplo siguiente se muestra cómo calcular la suma de comprobación de toda la tabla con precisión hasta el orden de fila: + +``` sql +SELECT groupBitXor(cityHash64(*)) FROM table +``` + +## intHash32 {#inthash32} + +Calcula un código hash de 32 bits a partir de cualquier tipo de entero. +Esta es una función hash no criptográfica relativamente rápida de calidad media para los números. + +## intHash64 {#inthash64} + +Calcula un código hash de 64 bits a partir de cualquier tipo de entero. +Funciona más rápido que intHash32. Calidad media. + +## SHA1 {#sha1} + +## SHA224 {#sha224} + +## SHA256 {#sha256} + +Calcula SHA-1, SHA-224 o SHA-256 de una cadena y devuelve el conjunto de bytes resultante como FixedString(20), FixedString(28) o FixedString(32). +La función funciona bastante lentamente (SHA-1 procesa alrededor de 5 millones de cadenas cortas por segundo por núcleo del procesador, mientras que SHA-224 y SHA-256 procesan alrededor de 2.2 millones). +Recomendamos usar esta función solo en los casos en que necesite una función hash específica y no pueda seleccionarla. +Incluso en estos casos, recomendamos aplicar la función offline y precalcular valores al insertarlos en la tabla, en lugar de aplicarlo en SELECTS. + +## Nombre de la red inalámbrica (SSID):\]) {#urlhashurl-n} + +Una función hash no criptográfica rápida y de calidad decente para una cadena obtenida de una URL utilizando algún tipo de normalización. +`URLHash(s)` – Calcula un hash de una cadena sin uno de los símbolos finales `/`,`?` o `#` al final, si está presente. +`URLHash(s, N)` – Calcula un hash de una cadena hasta el nivel N en la jerarquía de URL, sin uno de los símbolos finales `/`,`?` o `#` al final, si está presente. +Los niveles son los mismos que en URLHierarchy. Esta función es específica de Yandex.Métrica. + +## Método de codificación de datos: {#farmhash64} + +Producir un [Método de codificación de datos:](https://github.com/google/farmhash) hachís valor. 
+ +``` sql +farmHash64(par1, ...) +``` + +La función utiliza el `Hash64` de todos [métodos disponibles](https://github.com/google/farmhash/blob/master/src/farmhash.h). + +**Parámetros** + +La función toma un número variable de parámetros de entrada. Los parámetros pueden ser cualquiera de los [Tipos de datos compatibles](../../data_types/index.md). + +**Valor devuelto** + +Naciones [UInt64](../../data_types/int_uint.md) tipo de datos valor hash. + +**Ejemplo** + +``` sql +SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS FarmHash, toTypeName(FarmHash) AS type +``` + +``` text +┌─────────────FarmHash─┬─type───┐ +│ 17790458267262532859 │ UInt64 │ +└──────────────────────┴────────┘ +``` + +## Nombre de la red inalámbrica (SSID): {#hash-functions-javahash} + +Calcular [Nivel de Cifrado WEP](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) de una cuerda. Esta función hash no es rápida ni tiene una buena calidad. La única razón para usarlo es cuando este algoritmo ya se usa en otro sistema y debe calcular exactamente el mismo resultado. + +**Sintaxis** + +``` sql +SELECT javaHash(''); +``` + +**Valor devuelto** + +Naciones `Int32` tipo de datos valor hash. + +**Ejemplo** + +Consulta: + +``` sql +SELECT javaHash('Hello, world!'); +``` + +Resultado: + +``` text +┌─javaHash('Hello, world!')─┐ +│ -1880044555 │ +└───────────────────────────┘ +``` + +## javaHashUTF16LE {#javahashutf16le} + +Calcular [Nivel de Cifrado WEP](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) de una cadena, suponiendo que contiene bytes que representan una cadena en codificación UTF-16LE. + +**Sintaxis** + +``` sql +javaHashUTF16LE(stringUtf16le) +``` + +**Parámetros** + +- `stringUtf16le` — una cadena en codificación UTF-16LE. + +**Valor devuelto** + +Naciones `Int32` tipo de datos valor hash. + +**Ejemplo** + +Consulta correcta con cadena codificada UTF-16LE. + +Consulta: + +``` sql +SELECT javaHashUTF16LE(convertCharset('test', 'utf-8', 'utf-16le')) +``` + +Resultado: + +``` text +┌─javaHashUTF16LE(convertCharset('test', 'utf-8', 'utf-16le'))─┐ +│ 3556498 │ +└──────────────────────────────────────────────────────────────┘ +``` + +## HiveHash {#hash-functions-hivehash} + +Calcular `HiveHash` de una cuerda. + +``` sql +SELECT hiveHash(''); +``` + +Esto es sólo [Nivel de Cifrado WEP](#hash_functions-javahash) con poco de signo puesto a cero. Esta función se utiliza en [Colmena de Apache](https://en.wikipedia.org/wiki/Apache_Hive) para versiones anteriores a la 3.0. Esta función hash no es rápida ni tiene una buena calidad. La única razón para usarlo es cuando este algoritmo ya se usa en otro sistema y debe calcular exactamente el mismo resultado. + +**Valor devuelto** + +Naciones `Int32` tipo de datos valor hash. + +Tipo: `hiveHash`. + +**Ejemplo** + +Consulta: + +``` sql +SELECT hiveHash('Hello, world!'); +``` + +Resultado: + +``` text +┌─hiveHash('Hello, world!')─┐ +│ 267439093 │ +└───────────────────────────┘ +``` + +## Método de codificación de datos: {#metrohash64} + +Producir un [Método de codificación de datos:](http://www.jandrewrogers.com/2015/05/27/metrohash/) hachís valor. + +``` sql +metroHash64(par1, ...) +``` + +**Parámetros** + +La función toma un número variable de parámetros de entrada. Los parámetros pueden ser cualquiera de los [Tipos de datos compatibles](../../data_types/index.md). 
+ +**Valor devuelto** + +Naciones [UInt64](../../data_types/int_uint.md) tipo de datos valor hash. + +**Ejemplo** + +``` sql +SELECT metroHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MetroHash, toTypeName(MetroHash) AS type +``` + +``` text +┌────────────MetroHash─┬─type───┐ +│ 14235658766382344533 │ UInt64 │ +└──────────────────────┴────────┘ +``` + +## SaltarConsistentHash {#jumpconsistenthash} + +Calcula JumpConsistentHash forma un UInt64. +Acepta dos argumentos: una clave de tipo UInt64 y el número de cubos. Devuelve Int32. +Para obtener más información, consulte el enlace: [SaltarConsistentHash](https://arxiv.org/pdf/1406.2294.pdf) + +## murmurHash2\_32, murmurHash2\_64 {#murmurhash2-32-murmurhash2-64} + +Producir un [Método de codificación de datos:](https://github.com/aappleby/smhasher) hachís valor. + +``` sql +murmurHash2_32(par1, ...) +murmurHash2_64(par1, ...) +``` + +**Parámetros** + +Ambas funciones toman un número variable de parámetros de entrada. Los parámetros pueden ser cualquiera de los [Tipos de datos compatibles](../../data_types/index.md). + +**Valor devuelto** + +- El `murmurHash2_32` función devuelve el valor hash que tiene el [UInt32](../../data_types/int_uint.md) tipo de datos. +- El `murmurHash2_64` función devuelve el valor hash que tiene el [UInt64](../../data_types/int_uint.md) tipo de datos. + +**Ejemplo** + +``` sql +SELECT murmurHash2_64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MurmurHash2, toTypeName(MurmurHash2) AS type +``` + +``` text +┌──────────MurmurHash2─┬─type───┐ +│ 11832096901709403633 │ UInt64 │ +└──────────────────────┴────────┘ +``` + +## murmurHash3\_32, murmurHash3\_64 {#murmurhash3-32-murmurhash3-64} + +Producir un [Método de codificación de datos:](https://github.com/aappleby/smhasher) hachís valor. + +``` sql +murmurHash3_32(par1, ...) +murmurHash3_64(par1, ...) +``` + +**Parámetros** + +Ambas funciones toman un número variable de parámetros de entrada. Los parámetros pueden ser cualquiera de los [Tipos de datos compatibles](../../data_types/index.md). + +**Valor devuelto** + +- El `murmurHash3_32` función devuelve un [UInt32](../../data_types/int_uint.md) tipo de datos valor hash. +- El `murmurHash3_64` función devuelve un [UInt64](../../data_types/int_uint.md) tipo de datos valor hash. + +**Ejemplo** + +``` sql +SELECT murmurHash3_32(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MurmurHash3, toTypeName(MurmurHash3) AS type +``` + +``` text +┌─MurmurHash3─┬─type───┐ +│ 2152717 │ UInt32 │ +└─────────────┴────────┘ +``` + +## murmurHash3\_128 {#murmurhash3-128} + +Producir un [Método de codificación de datos:](https://github.com/aappleby/smhasher) hachís valor. + +``` sql +murmurHash3_128( expr ) +``` + +**Parámetros** + +- `expr` — [Expresiones](../syntax.md#syntax-expressions) devolviendo un [Cadena](../../data_types/string.md)-tipo de valor. + +**Valor devuelto** + +Naciones [Cadena fija (16)](../../data_types/fixedstring.md) tipo de datos valor hash. + +**Ejemplo** + +``` sql +SELECT murmurHash3_128('example_string') AS MurmurHash3, toTypeName(MurmurHash3) AS type +``` + +``` text +┌─MurmurHash3──────┬─type────────────┐ +│ 6�1�4"S5KT�~~q │ FixedString(16) │ +└──────────────────┴─────────────────┘ +``` + +## xxHash32, xxHash64 {#hash-functions-xxhash32} + +Calcular `xxHash` de una cuerda. Se propone en dos sabores, 32 y 64 bits. 
+ +``` sql +SELECT xxHash32(''); + +OR + +SELECT xxHash64(''); +``` + +**Valor devuelto** + +Naciones `Uint32` o `Uint64` tipo de datos valor hash. + +Tipo: `xxHash`. + +**Ejemplo** + +Consulta: + +``` sql +SELECT xxHash32('Hello, world!'); +``` + +Resultado: + +``` text +┌─xxHash32('Hello, world!')─┐ +│ 834093149 │ +└───────────────────────────┘ +``` + +**Ver también** + +- [xxHash](http://cyan4973.github.io/xxHash/). + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/hash_functions/) diff --git a/docs/es/query_language/functions/higher_order_functions.md b/docs/es/query_language/functions/higher_order_functions.md new file mode 100644 index 00000000000..8b66442115f --- /dev/null +++ b/docs/es/query_language/functions/higher_order_functions.md @@ -0,0 +1,257 @@ +# Funciones de orden superior {#higher-order-functions} + +## `->` operador, función lambda (params, expr) {#operator-lambdaparams-expr-function} + +Permite describir una función lambda para pasar a una función de orden superior. El lado izquierdo de la flecha tiene un parámetro formal, que es cualquier ID, o múltiples parámetros formales: cualquier ID en una tupla. El lado derecho de la flecha tiene una expresión que puede usar estos parámetros formales, así como cualquier columnas de tabla. + +Ejemplos: `x -> 2 * x, str -> str != Referer.` + +Las funciones de orden superior solo pueden aceptar funciones lambda como su argumento funcional. + +Una función lambda que acepta múltiples argumentos se puede pasar a una función de orden superior. En este caso, a la función de orden superior se le pasan varias matrices de idéntica longitud a las que corresponderán estos argumentos. + +Para algunas funciones, tales como [arrayCount](#higher_order_functions-array-count) o [arraySum](#higher_order_functions-array-count), el primer argumento (la función lambda) se puede omitir. En este caso, se supone un mapeo idéntico. + +No se puede omitir una función lambda para las siguientes funciones: + +- [arrayMap](#higher_order_functions-array-map) +- [arrayFilter](#higher_order_functions-array-filter) +- [arrayFill](#higher_order_functions-array-fill) +- [arrayReverseFill](#higher_order_functions-array-reverse-fill) +- [arraySplit](#higher_order_functions-array-split) +- [arrayReverseSplit](#higher_order_functions-array-reverse-split) +- [arrayFirst](#higher_order_functions-array-first) +- [arrayFirstIndex](#higher_order_functions-array-first-index) + +### ¿Cómo puedo hacerlo?, …) {#higher-order-functions-array-map} + +Devuelve una matriz obtenida de la aplicación original `func` función a cada elemento en el `arr` matriz. + +Ejemplos: + +``` sql +SELECT arrayMap(x -> (x + 2), [1, 2, 3]) as res; +``` + +``` text +┌─res─────┐ +│ [3,4,5] │ +└─────────┘ +``` + +En el ejemplo siguiente se muestra cómo crear una tupla de elementos de diferentes matrices: + +``` sql +SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res +``` + +``` text +┌─res─────────────────┐ +│ [(1,4),(2,5),(3,6)] │ +└─────────────────────┘ +``` + +Tenga en cuenta que el primer argumento (función lambda) no se puede omitir en el `arrayMap` función. + +### ¿Cómo puedo hacerlo?, …) {#higher-order-functions-array-filter} + +Devuelve una matriz que contiene sólo los elementos en `arr1` para los cuales `func` devuelve algo distinto de 0. 
+ +Ejemplos: + +``` sql +SELECT arrayFilter(x -> x LIKE '%World%', ['Hello', 'abc World']) AS res +``` + +``` text +┌─res───────────┐ +│ ['abc World'] │ +└───────────────┘ +``` + +``` sql +SELECT + arrayFilter( + (i, x) -> x LIKE '%World%', + arrayEnumerate(arr), + ['Hello', 'abc World'] AS arr) + AS res +``` + +``` text +┌─res─┐ +│ [2] │ +└─────┘ +``` + +Tenga en cuenta que el primer argumento (función lambda) no se puede omitir en el `arrayFilter` función. + +### ¿Cómo puedo hacerlo?, …) {#higher-order-functions-array-fill} + +Escanear a través de `arr1` desde el primer elemento hasta el último elemento y reemplazar `arr1[i]` por `arr1[i - 1]` si `func` devuelve 0. El primer elemento de `arr1` no será reemplazado. + +Ejemplos: + +``` sql +SELECT arrayFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, null, null]) AS res +``` + +``` text +┌─res──────────────────────────────┐ +│ [1,1,3,11,12,12,12,5,6,14,14,14] │ +└──────────────────────────────────┘ +``` + +Tenga en cuenta que el primer argumento (función lambda) no se puede omitir en el `arrayFill` función. + +### ¿Cómo puedo hacerlo?, …) {#higher-order-functions-array-reverse-fill} + +Escanear a través de `arr1` del último elemento al primer elemento y reemplace `arr1[i]` por `arr1[i + 1]` si `func` devuelve 0. El último elemento de `arr1` no será reemplazado. + +Ejemplos: + +``` sql +SELECT arrayReverseFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14, null, null]) AS res +``` + +``` text +┌─res────────────────────────────────┐ +│ [1,3,3,11,12,5,5,5,6,14,NULL,NULL] │ +└────────────────────────────────────┘ +``` + +Tenga en cuenta que el primer argumento (función lambda) no se puede omitir en el `arrayReverseFill` función. + +### ¿Cómo puedo hacerlo?, …) {#higher-order-functions-array-split} + +Dividir `arr1` en múltiples matrices. Cuando `func` devuelve algo distinto de 0, la matriz se dividirá en el lado izquierdo del elemento. La matriz no se dividirá antes del primer elemento. + +Ejemplos: + +``` sql +SELECT arraySplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res +``` + +``` text +┌─res─────────────┐ +│ [[1,2,3],[4,5]] │ +└─────────────────┘ +``` + +Tenga en cuenta que el primer argumento (función lambda) no se puede omitir en el `arraySplit` función. + +### ¿Cómo puedo hacerlo?, …) {#higher-order-functions-array-reverse-split} + +Dividir `arr1` en múltiples matrices. Cuando `func` devuelve algo distinto de 0, la matriz se dividirá en el lado derecho del elemento. La matriz no se dividirá después del último elemento. + +Ejemplos: + +``` sql +SELECT arrayReverseSplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res +``` + +``` text +┌─res───────────────┐ +│ [[1],[2,3,4],[5]] │ +└───────────────────┘ +``` + +Tenga en cuenta que el primer argumento (función lambda) no se puede omitir en el `arraySplit` función. + +### ¿Cómo puedo hacerlo?, …) {#higher-order-functions-array-count} + +Devuelve el número de elementos de la matriz arr para los cuales func devuelve algo distinto de 0. Si ‘func’ no se especifica, devuelve el número de elementos distintos de cero en la matriz. + +### ¿Cómo puedo hacerlo?, …) {#arrayexistsfunc-arr1} + +Devuelve 1 si hay al menos un elemento en ‘arr’ para los cuales ‘func’ devuelve algo distinto de 0. De lo contrario, devuelve 0. + +### ¿Cómo puedo hacerlo?, …) {#arrayallfunc-arr1} + +Devuelve 1 si ‘func’ devuelve algo distinto de 0 para todos los elementos en ‘arr’. De lo contrario, devuelve 0. 
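+
+Un ejemplo ilustrativo de `arrayCount`, `arrayExists` y `arrayAll` sobre una matriz de prueba:
+
+``` sql
+SELECT
+    arrayCount(x -> x % 2 = 0, [1, 2, 3, 4]) AS pares,          -- 2
+    arrayExists(x -> x > 3, [1, 2, 3, 4]) AS hay_mayores_que_3, -- 1
+    arrayAll(x -> x > 0, [1, 2, 3, 4]) AS todos_positivos       -- 1
+```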
+ +### ¿Cómo puedo hacerlo?, …) {#higher-order-functions-array-sum} + +Devuelve la suma de la ‘func’ valor. Si se omite la función, simplemente devuelve la suma de los elementos de la matriz. + +### ¿Cómo puedo hacerlo?, …) {#higher-order-functions-array-first} + +Devuelve el primer elemento en el ‘arr1’ matriz para la cual ‘func’ devuelve algo distinto de 0. + +Tenga en cuenta que el primer argumento (función lambda) no se puede omitir en el `arrayFirst` función. + +### ¿Cómo puedo hacerlo?, …) {#higher-order-functions-array-first-index} + +Devuelve el índice del primer elemento ‘arr1’ matriz para la cual ‘func’ devuelve algo distinto de 0. + +Tenga en cuenta que el primer argumento (función lambda) no se puede omitir en el `arrayFirstIndex` función. + +### ¿Cómo puedo hacerlo?, …) {#arraycumsumfunc-arr1} + +Devuelve una matriz de sumas parciales de elementos en la matriz de origen (una suma en ejecución). Si el `func` se especifica la función, luego los valores de los elementos de la matriz se convierten mediante esta función antes de sumar. + +Ejemplo: + +``` sql +SELECT arrayCumSum([1, 1, 1, 1]) AS res +``` + +``` text +┌─res──────────┐ +│ [1, 2, 3, 4] │ +└──────────────┘ +``` + +### Información de archivo) {#arraycumsumnonnegativearr} + +Lo mismo que `arrayCumSum`, devuelve una matriz de sumas parciales de elementos en la matriz de origen (una suma en ejecución). Diferente `arrayCumSum`, cuando el valor devuelto contiene un valor menor que cero, el valor se reemplaza con cero y el cálculo posterior se realiza con cero parámetros. Por ejemplo: + +``` sql +SELECT arrayCumSumNonNegative([1, 1, -4, 1]) AS res +``` + +``` text +┌─res───────┐ +│ [1,2,0,1] │ +└───────────┘ +``` + +### ¿Cómo puedo hacerlo?, …) {#arraysortfunc-arr1} + +Devuelve una matriz como resultado de ordenar los elementos de `arr1` en orden ascendente. Si el `func` se especifica la función, el orden de clasificación se determina por el resultado de la función `func` aplicado a los elementos de la matriz (arrays) + +El [Transformación de Schwartzian](https://en.wikipedia.org/wiki/Schwartzian_transform) se utiliza para mejorar la eficiencia de clasificación. + +Ejemplo: + +``` sql +SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]); +``` + +``` text +┌─res────────────────┐ +│ ['world', 'hello'] │ +└────────────────────┘ +``` + +Para obtener más información sobre el `arraySort` método, véase el [Funciones para trabajar con matrices](array_functions.md#array_functions-sort) apartado. + +### ¿Cómo puedo hacerlo?, …) {#arrayreversesortfunc-arr1} + +Devuelve una matriz como resultado de ordenar los elementos de `arr1` en orden descendente. Si el `func` se especifica la función, el orden de clasificación se determina por el resultado de la función `func` Aplicado a los elementos de la matriz (arrays). + +Ejemplo: + +``` sql +SELECT arrayReverseSort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; +``` + +``` text +┌─res───────────────┐ +│ ['hello','world'] │ +└───────────────────┘ +``` + +Para obtener más información sobre el `arrayReverseSort` método, véase el [Funciones para trabajar con matrices](array_functions.md#array_functions-reverse-sort) apartado. 
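+
+Un ejemplo combinado y orientativo de `arraySum`, `arrayFirst` y `arrayFirstIndex`, descritas más arriba:
+
+``` sql
+SELECT
+    arraySum(x -> x * x, [1, 2, 3]) AS suma_de_cuadrados,    -- 14
+    arrayFirst(x -> x > 1, [1, 2, 3]) AS primer_elemento,    -- 2
+    arrayFirstIndex(x -> x > 1, [1, 2, 3]) AS primer_indice  -- 2
+```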
+
+[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/higher_order_functions/)
diff --git a/docs/es/query_language/functions/in_functions.md b/docs/es/query_language/functions/in_functions.md
new file mode 100644
index 00000000000..a0ebd07a3ea
--- /dev/null
+++ b/docs/es/query_language/functions/in_functions.md
@@ -0,0 +1,19 @@
+# Funciones para implementar el operador IN {#functions-for-implementing-the-in-operator}
+
+## in, notIn, globalIn, globalNotIn {#in-functions}
+
+Vea la sección [Operadores IN](../select.md#select-in-operators).
+
+## tuple(x, y, …), operador (x, y, …) {#tuplex-y-operator-x-y}
+
+Una función que permite agrupar varias columnas.
+Para columnas con los tipos T1, T2, …, devuelve una tupla de tipo Tuple(T1, T2, …) que contiene estas columnas. Ejecutar la función no tiene ningún costo.
+Las tuplas se usan normalmente como valores intermedios para un argumento de operadores IN, o para crear una lista de parámetros formales de funciones lambda. Las tuplas no se pueden escribir en una tabla.
+
+## tupleElement(tuple, n), operador x.N {#tupleelementtuple-n-operator-x-n}
+
+Una función que permite obtener una columna de una tupla.
+‘N’ es el índice de columna, comenzando desde 1. ‘N’ debe ser una constante y un entero positivo estricto no mayor que el tamaño de la tupla.
+Ejecutar la función no tiene ningún costo.
+
+[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/in_functions/)
diff --git a/docs/es/query_language/functions/index.md b/docs/es/query_language/functions/index.md
new file mode 100644
index 00000000000..8e69d670b59
--- /dev/null
+++ b/docs/es/query_language/functions/index.md
@@ -0,0 +1,66 @@
+# Funciones {#functions}
+
+Hay al menos \* dos tipos de funciones: las funciones regulares (a las que simplemente se llama “functions”) y las funciones agregadas. Son conceptos completamente diferentes. Las funciones regulares funcionan como si se aplicaran a cada fila por separado (para cada fila, el resultado de la función no depende de las otras filas). Las funciones agregadas acumulan un conjunto de valores de varias filas (es decir, dependen de todo el conjunto de filas).
+
+En esta sección discutimos las funciones regulares. Para las funciones agregadas, consulte la sección “Aggregate functions”.
+
+\* - Existe un tercer tipo de función, al que pertenece la función ‘arrayJoin’; las funciones de tabla también se pueden mencionar por separado.\*
+
+## Tipado fuerte {#strong-typing}
+
+A diferencia del SQL estándar, ClickHouse tiene un tipado fuerte. En otras palabras, no hace conversiones implícitas entre tipos. Cada función funciona para un conjunto específico de tipos. Esto significa que a veces necesita usar funciones de conversión de tipos.
+
+## Eliminación de subexpresiones comunes {#common-subexpression-elimination}
+
+Se considera que todas las expresiones de una consulta que tienen el mismo AST (el mismo registro o el mismo resultado del análisis sintáctico) tienen valores idénticos. Tales expresiones se concatenan y se ejecutan una sola vez. Las subconsultas idénticas también se eliminan de esta manera.
+
+## Tipos de resultados {#types-of-results}
+
+Todas las funciones devuelven un único valor como resultado (no varios valores, ni cero valores). El tipo de resultado generalmente se define solo por los tipos de los argumentos, no por los valores. Las excepciones son la función tupleElement (el operador a.N) y la función toFixedString.
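+
+Un ejemplo orientativo con `tuple` y `tupleElement` (descritas más arriba) que muestra cómo el tipo del resultado depende de los tipos de los argumentos:
+
+``` sql
+SELECT
+    tuple(1, 'a') AS t,               -- el tipo del resultado es Tuple(UInt8, String)
+    tupleElement(t, 2) AS segundo,    -- equivale al operador t.2
+    toTypeName(t) AS tipo
+```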
+ +## Constante {#constants} + +Para simplificar, ciertas funciones solo pueden funcionar con constantes para algunos argumentos. Por ejemplo, el argumento correcto del operador LIKE debe ser una constante. +Casi todas las funciones devuelven una constante para argumentos constantes. La excepción son las funciones que generan números aleatorios. +El ‘now’ función devuelve valores diferentes para las consultas que se ejecutaron en diferentes momentos, pero el resultado se considera una constante, ya que la constancia solo es importante dentro de una sola consulta. +Una expresión constante también se considera una constante (por ejemplo, la mitad derecha del operador LIKE se puede construir a partir de múltiples constantes). + +Las funciones se pueden implementar de diferentes maneras para argumentos constantes y no constantes (se ejecuta un código diferente). Pero los resultados para una constante y para una columna verdadera que contiene solo el mismo valor deben coincidir entre sí. + +## Procesamiento NULL {#null-processing} + +Las funciones tienen los siguientes comportamientos: + +- Si al menos uno de los argumentos de la función es `NULL` el resultado de la función es también `NULL`. +- Comportamiento especial que se especifica individualmente en la descripción de cada función. En el código fuente de ClickHouse, estas funciones tienen `UseDefaultImplementationForNulls=false`. + +## Constancia {#constancy} + +Las funciones no pueden cambiar los valores de sus argumentos; cualquier cambio se devuelve como resultado. Por lo tanto, el resultado del cálculo de funciones separadas no depende del orden en que se escriban las funciones en la consulta. + +## Manejo de errores {#error-handling} + +Algunas funciones pueden producir una excepción si los datos no son válidos. En este caso, la consulta se cancela y se devuelve un texto de error al cliente. Para el procesamiento distribuido, cuando se produce una excepción en uno de los servidores, los otros servidores también intentan anular la consulta. + +## Evaluación de expresiones de argumento {#evaluation-of-argument-expressions} + +En casi todos los lenguajes de programación, uno de los argumentos puede no evaluarse para ciertos operadores. Esto suele ser los operadores `&&`, `||`, y `?:`. +Pero en ClickHouse, los argumentos de las funciones (operadores) siempre se evalúan. Esto se debe a que partes enteras de columnas se evalúan a la vez, en lugar de calcular cada fila por separado. + +## Realización de funciones para el procesamiento de consultas distribuidas {#performing-functions-for-distributed-query-processing} + +Para el procesamiento de consultas distribuidas, se realizan tantas etapas de procesamiento de consultas como sea posible en servidores remotos, y el resto de las etapas (fusionando resultados intermedios y todo lo posterior) se realizan en el servidor solicitante. + +Esto significa que las funciones se pueden realizar en diferentes servidores. +Por ejemplo, en la consulta `SELECT f(sum(g(x))) FROM distributed_table GROUP BY h(y),` + +- Más información `distributed_table` tiene al menos dos fragmentos, las funciones ‘g’ y ‘h’ se realizan en servidores remotos, y la función ‘f’ se realiza en el servidor solicitante. +- Más información `distributed_table` tiene sólo un fragmento, todos los ‘f’, ‘g’, y ‘h’ funciones se realizan en el servidor de este fragmento. + +El resultado de una función generalmente no depende del servidor en el que se realice. Sin embargo, a veces esto es importante. 
+Por ejemplo, las funciones que funcionan con diccionarios utilizan el diccionario que existe en el servidor en el que se están ejecutando. +Otro ejemplo es el `hostName` función, que devuelve el nombre del servidor en el que se está ejecutando para `GROUP BY` por servidores en un `SELECT` consulta. + +Si se realiza una función en una consulta en el servidor solicitante, pero debe realizarla en servidores remotos, puede envolverla en un ‘any’ agregar o agregarlo a una clave en `GROUP BY`. + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/) diff --git a/docs/es/query_language/functions/introspection.md b/docs/es/query_language/functions/introspection.md new file mode 100644 index 00000000000..dd0fb0c61f5 --- /dev/null +++ b/docs/es/query_language/functions/introspection.md @@ -0,0 +1,303 @@ +# Funciones de introspección {#introspection-functions} + +Puede utilizar las funciones descritas en este capítulo para [ELFO](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) y [ENANO](https://en.wikipedia.org/wiki/DWARF) para la creación de perfiles de consultas. + +!!! warning "Advertencia" + Estas funciones son lentas y pueden imponer consideraciones de seguridad. + +Para el correcto funcionamiento de las funciones de introspección: + +- Instale el `clickhouse-common-static-dbg` paquete. + +- Establezca el [allow\_introspection\_functions](../../operations/settings/settings.md#settings-allow_introspection_functions) a 1. + + For security reasons introspection functions are disabled by default. + +ClickHouse guarda los informes del generador de perfiles [trace\_log](../../operations/system_tables.md#system_tables-trace_log) tabla del sistema. Asegúrese de que la tabla y el generador de perfiles estén configurados correctamente. + +## addressToLine {#addresstoline} + +Convierte la dirección de memoria virtual dentro del proceso del servidor ClickHouse en el nombre de archivo y el número de línea en el código fuente de ClickHouse. + +Si utiliza paquetes oficiales de ClickHouse, debe instalar el `clickhouse-common-static-dbg` paquete. + +**Sintaxis** + +``` sql +addressToLine(address_of_binary_instruction) +``` + +**Parámetros** + +- `address_of_binary_instruction` ([UInt64](../../data_types/int_uint.md)) — Dirección de la instrucción en un proceso en ejecución. + +**Valor devuelto** + +- Nombre de archivo del código fuente y el número de línea en este archivo delimitado por dos puntos. + + For example, `/build/obj-x86_64-linux-gnu/../dbms/src/Common/ThreadPool.cpp:199`, where `199` is a line number. + +- Nombre de un binario, si la función no pudo encontrar la información de depuración. + +- Cadena vacía, si la dirección no es válida. + +Tipo: [Cadena](../../data_types/string.md). + +**Ejemplo** + +Habilitación de las funciones de introspección: + +``` sql +SET allow_introspection_functions=1 +``` + +Seleccionando la primera cadena de la `trace_log` tabla del sistema: + +``` sql +SELECT * FROM system.trace_log LIMIT 1 \G +``` + +``` text +Row 1: +────── +event_date: 2019-11-19 +event_time: 2019-11-19 18:57:23 +revision: 54429 +timer_type: Real +thread_number: 48 +query_id: 421b6855-1858-45a5-8f37-f383409d6d72 +trace: [140658411141617,94784174532828,94784076370703,94784076372094,94784076361020,94784175007680,140658411116251,140658403895439] +``` + +El `trace` campo contiene el seguimiento de la pila en el momento del muestreo. 
+ +Obtener el nombre de archivo del código fuente y el número de línea para una sola dirección: + +``` sql +SELECT addressToLine(94784076370703) \G +``` + +``` text +Row 1: +────── +addressToLine(94784076370703): /build/obj-x86_64-linux-gnu/../dbms/src/Common/ThreadPool.cpp:199 +``` + +Aplicando la función a todo el seguimiento de la pila: + +``` sql +SELECT + arrayStringConcat(arrayMap(x -> addressToLine(x), trace), '\n') AS trace_source_code_lines +FROM system.trace_log +LIMIT 1 +\G +``` + +El [arrayMap](higher_order_functions.md#higher_order_functions-array-map) permite procesar cada elemento individual de la `trace` matriz por el `addressToLine` función. El resultado de este procesamiento se ve en el `trace_source_code_lines` columna de salida. + +``` text +Row 1: +────── +trace_source_code_lines: /lib/x86_64-linux-gnu/libpthread-2.27.so +/usr/lib/debug/usr/bin/clickhouse +/build/obj-x86_64-linux-gnu/../dbms/src/Common/ThreadPool.cpp:199 +/build/obj-x86_64-linux-gnu/../dbms/src/Common/ThreadPool.h:155 +/usr/include/c++/9/bits/atomic_base.h:551 +/usr/lib/debug/usr/bin/clickhouse +/lib/x86_64-linux-gnu/libpthread-2.27.so +/build/glibc-OTsEL5/glibc-2.27/misc/../sysdeps/unix/sysv/linux/x86_64/clone.S:97 +``` + +## addressToSymbol {#addresstosymbol} + +Convierte la dirección de memoria virtual dentro del proceso del servidor ClickHouse en el símbolo de los archivos de objetos ClickHouse. + +**Sintaxis** + +``` sql +addressToSymbol(address_of_binary_instruction) +``` + +**Parámetros** + +- `address_of_binary_instruction` ([UInt64](../../data_types/int_uint.md)) — Dirección de la instrucción en un proceso en ejecución. + +**Valor devuelto** + +- Símbolo de archivos de objetos ClickHouse. +- Cadena vacía, si la dirección no es válida. + +Tipo: [Cadena](../../data_types/string.md). + +**Ejemplo** + +Habilitación de las funciones de introspección: + +``` sql +SET allow_introspection_functions=1 +``` + +Seleccionando la primera cadena de la `trace_log` tabla del sistema: + +``` sql +SELECT * FROM system.trace_log LIMIT 1 \G +``` + +``` text +Row 1: +────── +event_date: 2019-11-20 +event_time: 2019-11-20 16:57:59 +revision: 54429 +timer_type: Real +thread_number: 48 +query_id: 724028bf-f550-45aa-910d-2af6212b94ac +trace: [94138803686098,94138815010911,94138815096522,94138815101224,94138815102091,94138814222988,94138806823642,94138814457211,94138806823642,94138814457211,94138806823642,94138806795179,94138806796144,94138753770094,94138753771646,94138753760572,94138852407232,140399185266395,140399178045583] +``` + +El `trace` campo contiene el seguimiento de la pila en el momento del muestreo. + +Obtener un símbolo para una sola dirección: + +``` sql +SELECT addressToSymbol(94138803686098) \G +``` + +``` text +Row 1: +────── +addressToSymbol(94138803686098): _ZNK2DB24IAggregateFunctionHelperINS_20AggregateFunctionSumImmNS_24AggregateFunctionSumDataImEEEEE19addBatchSinglePlaceEmPcPPKNS_7IColumnEPNS_5ArenaE +``` + +Aplicando la función a todo el seguimiento de la pila: + +``` sql +SELECT + arrayStringConcat(arrayMap(x -> addressToSymbol(x), trace), '\n') AS trace_symbols +FROM system.trace_log +LIMIT 1 +\G +``` + +El [arrayMap](higher_order_functions.md#higher_order_functions-array-map) permite procesar cada elemento individual de la `trace` matriz por el `addressToSymbols` función. El resultado de este procesamiento se ve en el `trace_symbols` columna de salida. 
+ +``` text +Row 1: +────── +trace_symbols: _ZNK2DB24IAggregateFunctionHelperINS_20AggregateFunctionSumImmNS_24AggregateFunctionSumDataImEEEEE19addBatchSinglePlaceEmPcPPKNS_7IColumnEPNS_5ArenaE +_ZNK2DB10Aggregator21executeWithoutKeyImplERPcmPNS0_28AggregateFunctionInstructionEPNS_5ArenaE +_ZN2DB10Aggregator14executeOnBlockESt6vectorIN3COWINS_7IColumnEE13immutable_ptrIS3_EESaIS6_EEmRNS_22AggregatedDataVariantsERS1_IPKS3_SaISC_EERS1_ISE_SaISE_EERb +_ZN2DB10Aggregator14executeOnBlockERKNS_5BlockERNS_22AggregatedDataVariantsERSt6vectorIPKNS_7IColumnESaIS9_EERS6_ISB_SaISB_EERb +_ZN2DB10Aggregator7executeERKSt10shared_ptrINS_17IBlockInputStreamEERNS_22AggregatedDataVariantsE +_ZN2DB27AggregatingBlockInputStream8readImplEv +_ZN2DB17IBlockInputStream4readEv +_ZN2DB26ExpressionBlockInputStream8readImplEv +_ZN2DB17IBlockInputStream4readEv +_ZN2DB26ExpressionBlockInputStream8readImplEv +_ZN2DB17IBlockInputStream4readEv +_ZN2DB28AsynchronousBlockInputStream9calculateEv +_ZNSt17_Function_handlerIFvvEZN2DB28AsynchronousBlockInputStream4nextEvEUlvE_E9_M_invokeERKSt9_Any_data +_ZN14ThreadPoolImplI20ThreadFromGlobalPoolE6workerESt14_List_iteratorIS0_E +_ZZN20ThreadFromGlobalPoolC4IZN14ThreadPoolImplIS_E12scheduleImplIvEET_St8functionIFvvEEiSt8optionalImEEUlvE1_JEEEOS4_DpOT0_ENKUlvE_clEv +_ZN14ThreadPoolImplISt6threadE6workerESt14_List_iteratorIS0_E +execute_native_thread_routine +start_thread +clone +``` + +## demangle {#demangle} + +Convierte un símbolo que puede obtener utilizando el [addressToSymbol](#addresstosymbol) función a la función de C++ nombre. + +**Sintaxis** + +``` sql +demangle(symbol) +``` + +**Parámetros** + +- `symbol` ([Cadena](../../data_types/string.md)) — Símbolo de un archivo de objeto. + +**Valor devuelto** + +- Nombre de la función C++ +- Cadena vacía si un símbolo no es válido. + +Tipo: [Cadena](../../data_types/string.md). + +**Ejemplo** + +Habilitación de las funciones de introspección: + +``` sql +SET allow_introspection_functions=1 +``` + +Seleccionando la primera cadena de la `trace_log` tabla del sistema: + +``` sql +SELECT * FROM system.trace_log LIMIT 1 \G +``` + +``` text +Row 1: +────── +event_date: 2019-11-20 +event_time: 2019-11-20 16:57:59 +revision: 54429 +timer_type: Real +thread_number: 48 +query_id: 724028bf-f550-45aa-910d-2af6212b94ac +trace: [94138803686098,94138815010911,94138815096522,94138815101224,94138815102091,94138814222988,94138806823642,94138814457211,94138806823642,94138814457211,94138806823642,94138806795179,94138806796144,94138753770094,94138753771646,94138753760572,94138852407232,140399185266395,140399178045583] +``` + +El `trace` campo contiene el seguimiento de la pila en el momento del muestreo. + +Obtener un nombre de función para una sola dirección: + +``` sql +SELECT demangle(addressToSymbol(94138803686098)) \G +``` + +``` text +Row 1: +────── +demangle(addressToSymbol(94138803686098)): DB::IAggregateFunctionHelper > >::addBatchSinglePlace(unsigned long, char*, DB::IColumn const**, DB::Arena*) const +``` + +Aplicando la función a todo el seguimiento de la pila: + +``` sql +SELECT + arrayStringConcat(arrayMap(x -> demangle(addressToSymbol(x)), trace), '\n') AS trace_functions +FROM system.trace_log +LIMIT 1 +\G +``` + +El [arrayMap](higher_order_functions.md#higher_order_functions-array-map) permite procesar cada elemento individual de la `trace` matriz por el `demangle` función. El resultado de este procesamiento se ve en el `trace_functions` columna de salida. 
+ +``` text +Row 1: +────── +trace_functions: DB::IAggregateFunctionHelper > >::addBatchSinglePlace(unsigned long, char*, DB::IColumn const**, DB::Arena*) const +DB::Aggregator::executeWithoutKeyImpl(char*&, unsigned long, DB::Aggregator::AggregateFunctionInstruction*, DB::Arena*) const +DB::Aggregator::executeOnBlock(std::vector::immutable_ptr, std::allocator::immutable_ptr > >, unsigned long, DB::AggregatedDataVariants&, std::vector >&, std::vector >, std::allocator > > >&, bool&) +DB::Aggregator::executeOnBlock(DB::Block const&, DB::AggregatedDataVariants&, std::vector >&, std::vector >, std::allocator > > >&, bool&) +DB::Aggregator::execute(std::shared_ptr const&, DB::AggregatedDataVariants&) +DB::AggregatingBlockInputStream::readImpl() +DB::IBlockInputStream::read() +DB::ExpressionBlockInputStream::readImpl() +DB::IBlockInputStream::read() +DB::ExpressionBlockInputStream::readImpl() +DB::IBlockInputStream::read() +DB::AsynchronousBlockInputStream::calculate() +std::_Function_handler::_M_invoke(std::_Any_data const&) +ThreadPoolImpl::worker(std::_List_iterator) +ThreadFromGlobalPool::ThreadFromGlobalPool::scheduleImpl(std::function, int, std::optional)::{lambda()#3}>(ThreadPoolImpl::scheduleImpl(std::function, int, std::optional)::{lambda()#3}&&)::{lambda()#1}::operator()() const +ThreadPoolImpl::worker(std::_List_iterator) +execute_native_thread_routine +start_thread +clone +``` diff --git a/docs/es/query_language/functions/ip_address_functions.md b/docs/es/query_language/functions/ip_address_functions.md new file mode 100644 index 00000000000..3705168a6e2 --- /dev/null +++ b/docs/es/query_language/functions/ip_address_functions.md @@ -0,0 +1,241 @@ +# Funciones para trabajar con direcciones IP {#functions-for-working-with-ip-addresses} + +## Número de código: IPv4NumToString (num) {#ipv4numtostringnum} + +Toma un número UInt32. Lo interpreta como una dirección IPv4 en big endian. Devuelve una cadena que contiene la dirección IPv4 correspondiente en el formato A.B.C.d (números separados por puntos en forma decimal). + +## Nombre de la red inalámbrica (SSID):) {#ipv4stringtonums} + +La función inversa de IPv4NumToString. Si la dirección IPv4 tiene un formato no válido, devuelve 0. + +## Cómo hacer esto?) {#ipv4numtostringclasscnum} + +Similar a IPv4NumToString, pero usando xxx en lugar del último octeto. + +Ejemplo: + +``` sql +SELECT + IPv4NumToStringClassC(ClientIP) AS k, + count() AS c +FROM test.hits +GROUP BY k +ORDER BY c DESC +LIMIT 10 +``` + +``` text +┌─k──────────────┬─────c─┐ +│ 83.149.9.xxx │ 26238 │ +│ 217.118.81.xxx │ 26074 │ +│ 213.87.129.xxx │ 25481 │ +│ 83.149.8.xxx │ 24984 │ +│ 217.118.83.xxx │ 22797 │ +│ 78.25.120.xxx │ 22354 │ +│ 213.87.131.xxx │ 21285 │ +│ 78.25.121.xxx │ 20887 │ +│ 188.162.65.xxx │ 19694 │ +│ 83.149.48.xxx │ 17406 │ +└────────────────┴───────┘ +``` + +Desde el uso ‘xxx’ es altamente inusual, esto puede cambiarse en el futuro. Le recomendamos que no confíe en el formato exacto de este fragmento. + +### ¿Cómo puedo hacerlo?) {#ipv6numtostringx} + +Acepta un valor FixedString(16) que contiene la dirección IPv6 en formato binario. Devuelve una cadena que contiene esta dirección en formato de texto. +Las direcciones IPv4 asignadas a IPv6 se emiten en el formato ::ffff:111.222.33.44. 
Ejemplos: + +``` sql +SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr +``` + +``` text +┌─addr─────────┐ +│ 2a02:6b8::11 │ +└──────────────┘ +``` + +``` sql +SELECT + IPv6NumToString(ClientIP6 AS k), + count() AS c +FROM hits_all +WHERE EventDate = today() AND substring(ClientIP6, 1, 12) != unhex('00000000000000000000FFFF') +GROUP BY k +ORDER BY c DESC +LIMIT 10 +``` + +``` text +┌─IPv6NumToString(ClientIP6)──────────────┬─────c─┐ +│ 2a02:2168:aaa:bbbb::2 │ 24695 │ +│ 2a02:2698:abcd:abcd:abcd:abcd:8888:5555 │ 22408 │ +│ 2a02:6b8:0:fff::ff │ 16389 │ +│ 2a01:4f8:111:6666::2 │ 16016 │ +│ 2a02:2168:888:222::1 │ 15896 │ +│ 2a01:7e00::ffff:ffff:ffff:222 │ 14774 │ +│ 2a02:8109:eee:ee:eeee:eeee:eeee:eeee │ 14443 │ +│ 2a02:810b:8888:888:8888:8888:8888:8888 │ 14345 │ +│ 2a02:6b8:0:444:4444:4444:4444:4444 │ 14279 │ +│ 2a01:7e00::ffff:ffff:ffff:ffff │ 13880 │ +└─────────────────────────────────────────┴───────┘ +``` + +``` sql +SELECT + IPv6NumToString(ClientIP6 AS k), + count() AS c +FROM hits_all +WHERE EventDate = today() +GROUP BY k +ORDER BY c DESC +LIMIT 10 +``` + +``` text +┌─IPv6NumToString(ClientIP6)─┬──────c─┐ +│ ::ffff:94.26.111.111 │ 747440 │ +│ ::ffff:37.143.222.4 │ 529483 │ +│ ::ffff:5.166.111.99 │ 317707 │ +│ ::ffff:46.38.11.77 │ 263086 │ +│ ::ffff:79.105.111.111 │ 186611 │ +│ ::ffff:93.92.111.88 │ 176773 │ +│ ::ffff:84.53.111.33 │ 158709 │ +│ ::ffff:217.118.11.22 │ 154004 │ +│ ::ffff:217.118.11.33 │ 148449 │ +│ ::ffff:217.118.11.44 │ 148243 │ +└────────────────────────────┴────────┘ +``` + +## Nombre de la red inalámbrica (SSID):) {#ipv6stringtonums} + +La función inversa de IPv6NumToString. Si la dirección IPv6 tiene un formato no válido, devuelve una cadena de bytes nulos. +HEX puede ser mayúscula o minúscula. + +## IPv4ToIPv6 (x) {#ipv4toipv6x} + +Toma un `UInt32` numero. Lo interpreta como una dirección IPv4 en [gran endian](https://en.wikipedia.org/wiki/Endianness). Devuelve un `FixedString(16)` valor que contiene la dirección IPv6 en formato binario. Ejemplos: + +``` sql +SELECT IPv6NumToString(IPv4ToIPv6(IPv4StringToNum('192.168.0.1'))) AS addr +``` + +``` text +┌─addr───────────────┐ +│ ::ffff:192.168.0.1 │ +└────────────────────┘ +``` + +## Para obtener más información, consulta nuestra Política de privacidad y nuestras Condiciones de uso) {#cutipv6x-bitstocutforipv6-bitstocutforipv4} + +Acepta un valor FixedString(16) que contiene la dirección IPv6 en formato binario. Devuelve una cadena que contiene la dirección del número especificado de bits eliminados en formato de texto. Por ejemplo: + +``` sql +WITH + IPv6StringToNum('2001:0DB8:AC10:FE01:FEED:BABE:CAFE:F00D') AS ipv6, + IPv4ToIPv6(IPv4StringToNum('192.168.0.1')) AS ipv4 +SELECT + cutIPv6(ipv6, 2, 0), + cutIPv6(ipv4, 0, 2) +``` + +``` text +┌─cutIPv6(ipv6, 2, 0)─────────────────┬─cutIPv6(ipv4, 0, 2)─┐ +│ 2001:db8:ac10:fe01:feed:babe:cafe:0 │ ::ffff:192.168.0.0 │ +└─────────────────────────────────────┴─────────────────────┘ +``` + +## ¿Cómo puedo hacerlo?), {#ipv4cidrtorangeipv4-cidr} + +Acepta un valor IPv4 y UInt8 que contiene el valor [CIDR](https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing). Devuelve una tupla con dos IPv4 que contienen el rango inferior y el rango superior de la subred. 
+ +``` sql +SELECT IPv4CIDRToRange(toIPv4('192.168.5.2'), 16) +``` + +``` text +┌─IPv4CIDRToRange(toIPv4('192.168.5.2'), 16)─┐ +│ ('192.168.0.0','192.168.255.255') │ +└────────────────────────────────────────────┘ +``` + +## ¿Cómo puedo hacerlo?), {#ipv6cidrtorangeipv6-cidr} + +Acepta un valor IPv6 y UInt8 que contiene el CIDR. Devuelve una tupla con dos IPv6 que contienen el rango inferior y el rango superior de la subred. + +``` sql +SELECT IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32); +``` + +``` text +┌─IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32)─┐ +│ ('2001:db8::','2001:db8:ffff:ffff:ffff:ffff:ffff:ffff') │ +└────────────────────────────────────────────────────────────────────────┘ +``` + +## Acerca de nosotros) {#toipv4string} + +Un alias para `IPv4StringToNum()` que toma una forma de cadena de dirección IPv4 y devuelve el valor de [IPv4](../../data_types/domains/ipv4.md) tipo, que es binario igual al valor devuelto por `IPv4StringToNum()`. + +``` sql +WITH + '171.225.130.45' as IPv4_string +SELECT + toTypeName(IPv4StringToNum(IPv4_string)), + toTypeName(toIPv4(IPv4_string)) +``` + +``` text +┌─toTypeName(IPv4StringToNum(IPv4_string))─┬─toTypeName(toIPv4(IPv4_string))─┐ +│ UInt32 │ IPv4 │ +└──────────────────────────────────────────┴─────────────────────────────────┘ +``` + +``` sql +WITH + '171.225.130.45' as IPv4_string +SELECT + hex(IPv4StringToNum(IPv4_string)), + hex(toIPv4(IPv4_string)) +``` + +``` text +┌─hex(IPv4StringToNum(IPv4_string))─┬─hex(toIPv4(IPv4_string))─┐ +│ ABE1822D │ ABE1822D │ +└───────────────────────────────────┴──────────────────────────┘ +``` + +## Acerca de nosotros) {#toipv6string} + +Un alias para `IPv6StringToNum()` que toma una forma de cadena de dirección IPv6 y devuelve el valor de [IPv6](../../data_types/domains/ipv6.md) tipo, que es binario igual al valor devuelto por `IPv6StringToNum()`. + +``` sql +WITH + '2001:438:ffff::407d:1bc1' as IPv6_string +SELECT + toTypeName(IPv6StringToNum(IPv6_string)), + toTypeName(toIPv6(IPv6_string)) +``` + +``` text +┌─toTypeName(IPv6StringToNum(IPv6_string))─┬─toTypeName(toIPv6(IPv6_string))─┐ +│ FixedString(16) │ IPv6 │ +└──────────────────────────────────────────┴─────────────────────────────────┘ +``` + +``` sql +WITH + '2001:438:ffff::407d:1bc1' as IPv6_string +SELECT + hex(IPv6StringToNum(IPv6_string)), + hex(toIPv6(IPv6_string)) +``` + +``` text +┌─hex(IPv6StringToNum(IPv6_string))─┬─hex(toIPv6(IPv6_string))─────────┐ +│ 20010438FFFF000000000000407D1BC1 │ 20010438FFFF000000000000407D1BC1 │ +└───────────────────────────────────┴──────────────────────────────────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/ip_address_functions/) diff --git a/docs/es/query_language/functions/json_functions.md b/docs/es/query_language/functions/json_functions.md new file mode 100644 index 00000000000..10c6b4b3d6f --- /dev/null +++ b/docs/es/query_language/functions/json_functions.md @@ -0,0 +1,224 @@ +# Funciones para trabajar con JSON {#functions-for-working-with-json} + +En el Yandex.Metrica, JSON es transmitido por los usuarios como parámetros de sesión. Hay algunas funciones especiales para trabajar con este JSON. (Aunque en la mayoría de los casos, los JSON también se procesan previamente, y los valores resultantes se colocan en columnas separadas en su formato procesado.) Todas estas funciones se basan en sólidas suposiciones sobre lo que puede ser el JSON, pero tratan de hacer lo menos posible para hacer el trabajo. 
+ +Se hacen las siguientes suposiciones: + +1. El nombre de campo (argumento de función) debe ser una constante. +2. El nombre del campo de alguna manera está codificado canónicamente en JSON. Por ejemplo: `visitParamHas('{"abc":"def"}', 'abc') = 1` pero `visitParamHas('{"\\u0061\\u0062\\u0063":"def"}', 'abc') = 0` +3. Los campos se buscan en cualquier nivel de anidación, indiscriminadamente. Si hay varios campos coincidentes, se utiliza la primera aparición. +4. El JSON no tiene caracteres de espacio fuera de los literales de cadena. + +## visitParamHas (params, nombre) {#visitparamhasparams-name} + +Comprueba si hay un campo con el ‘name’ nombre. + +## visitParamExtractUInt (params, nombre) {#visitparamextractuintparams-name} + +Analiza UInt64 a partir del valor del campo denominado ‘name’. Si se trata de un campo de cadena, intenta analizar un número desde el principio de la cadena. Si el campo no existe, o existe pero no contiene un número, devuelve 0. + +## visitParamExtractInt (params, nombre) {#visitparamextractintparams-name} + +Lo mismo que para Int64. + +## visitParamExtractFloat (params, nombre) {#visitparamextractfloatparams-name} + +Lo mismo que para Float64. + +## visitParamExtractBool (params, nombre) {#visitparamextractboolparams-name} + +Analiza un valor verdadero/falso. El resultado es UInt8. + +## visitParamExtractRaw (params, nombre) {#visitparamextractrawparams-name} + +Devuelve el valor de un campo, incluidos los separadores. + +Ejemplos: + +``` sql +visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"' +visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}' +``` + +## Por favor, consulte el siguiente enlace:) {#visitparamextractstringparams-name} + +Analiza la cadena entre comillas dobles. El valor es sin escape. Si no se pudo desescapar, devuelve una cadena vacía. + +Ejemplos: + +``` sql +visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' +visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺' +visitParamExtractString('{"abc":"\\u263"}', 'abc') = '' +visitParamExtractString('{"abc":"hello}', 'abc') = '' +``` + +Actualmente no hay soporte para puntos de código en el formato `\uXXXX\uYYYY` que no son del plano multilingüe básico (se convierten a CESU-8 en lugar de UTF-8). + +Las siguientes funciones se basan en [simdjson](https://github.com/lemire/simdjson) diseñado para requisitos de análisis JSON más complejos. La suposición 2 mencionada anteriormente todavía se aplica. + +## ¿Qué puedes encontrar en Neodigit) {#isvalidjsonjson} + +Comprueba que la cadena pasada es un json válido. + +Ejemplos: + +``` sql +SELECT isValidJSON('{"a": "hello", "b": [-100, 200.0, 300]}') = 1 +SELECT isValidJSON('not a json') = 0 +``` + +## ¿Cómo puedo hacerlo?\]…) {#jsonhasjson-indices-or-keys} + +Si el valor existe en el documento JSON, `1` serán devueltos. + +Si el valor no existe, `0` serán devueltos. + +Ejemplos: + +``` sql +SELECT JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 1 +SELECT JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4) = 0 +``` + +`indices_or_keys` es una lista de cero o más argumentos, cada uno de ellos puede ser de cadena o entero. + +- Cadena = miembro del objeto de acceso por clave. +- Entero positivo = acceder al n-ésimo miembro / clave desde el principio. +- Entero negativo = acceder al n-ésimo miembro / clave desde el final. + +El índice mínimo del elemento es 1. Por lo tanto, el elemento 0 no existe. + +Puede usar enteros para acceder a matrices JSON y objetos JSON. 
+ +Entonces, por ejemplo: + +``` sql +SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'a' +SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 2) = 'b' +SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -1) = 'b' +SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'a' +SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' +``` + +## ¿Cómo puedo hacerlo?\]…) {#jsonlengthjson-indices-or-keys} + +Devuelve la longitud de una matriz JSON o un objeto JSON. + +Si el valor no existe o tiene un tipo incorrecto, `0` serán devueltos. + +Ejemplos: + +``` sql +SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3 +SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 +``` + +## ¿Cómo puedo hacerlo?\]…) {#jsontypejson-indices-or-keys} + +Devuelve el tipo de un valor JSON. + +Si el valor no existe, `Null` serán devueltos. + +Ejemplos: + +``` sql +SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}') = 'Object' +SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String' +SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' +``` + +## ¿Cómo puedo hacerlo?\]…) {#jsonextractuintjson-indices-or-keys} + +## ¿Cómo puedo hacerlo?\]…) {#jsonextractintjson-indices-or-keys} + +## ¿Cómo puedo hacerlo?\]…) {#jsonextractfloatjson-indices-or-keys} + +## ¿Cómo puedo hacerlo?\]…) {#jsonextractbooljson-indices-or-keys} + +Analiza un JSON y extrae un valor. Estas funciones son similares a `visitParam` función. + +Si el valor no existe o tiene un tipo incorrecto, `0` serán devueltos. + +Ejemplos: + +``` sql +SELECT JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100 +SELECT JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200.0 +SELECT JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 +``` + +## Por ejemplo, puede utilizar el siguiente ejemplo:\]…) {#jsonextractstringjson-indices-or-keys} + +Analiza un JSON y extrae una cadena. Esta función es similar a `visitParamExtractString` función. + +Si el valor no existe o tiene un tipo incorrecto, se devolverá una cadena vacía. + +El valor es sin escape. Si no se pudo desescapar, devuelve una cadena vacía. + +Ejemplos: + +``` sql +SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'hello' +SELECT JSONExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' +SELECT JSONExtractString('{"abc":"\\u263a"}', 'abc') = '☺' +SELECT JSONExtractString('{"abc":"\\u263"}', 'abc') = '' +SELECT JSONExtractString('{"abc":"hello}', 'abc') = '' +``` + +## Por ejemplo, se puede utilizar el siguiente método:) {#jsonextractjson-indices-or-keys-return-type} + +Analiza un JSON y extrae un valor del tipo de datos ClickHouse dado. + +Esta es una generalización de la anterior `JSONExtract` función. +Esto significa +`JSONExtract(..., 'String')` devuelve exactamente lo mismo que `JSONExtractString()`, +`JSONExtract(..., 'Float64')` devuelve exactamente lo mismo que `JSONExtractFloat()`. 
+ +Ejemplos: + +``` sql +SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(String, Array(Float64))') = ('hello',[-100,200,300]) +SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(b Array(Float64), a String)') = ([-100,200,300],'hello') +SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 'Array(Nullable(Int8))') = [-100, NULL, NULL] +SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4, 'Nullable(Int64)') = NULL +SELECT JSONExtract('{"passed": true}', 'passed', 'UInt8') = 1 +SELECT JSONExtract('{"day": "Thursday"}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Thursday' +SELECT JSONExtract('{"day": 5}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Friday' +``` + +## JSONExtractKeysAndValues(json\[, indices\_or\_keys…\], value\_type) {#jsonextractkeysandvaluesjson-indices-or-keys-value-type} + +Analizar pares clave-valor de un JSON donde los valores son del tipo de datos ClickHouse dado. + +Ejemplo: + +``` sql +SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8') = [('a',5),('b',7),('c',11)]; +``` + +## ¿Cómo puedo hacerlo?\]…) {#jsonextractrawjson-indices-or-keys} + +Devuelve una parte de JSON. + +Si la pieza no existe o tiene un tipo incorrecto, se devolverá una cadena vacía. + +Ejemplo: + +``` sql +SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]' +``` + +## Por ejemplo, puede utilizar el siguiente ejemplo:\]…) {#jsonextractarrayrawjson-indices-or-keys} + +Devuelve una matriz con elementos de matriz JSON, cada uno representado como cadena sin analizar. + +Si la parte no existe o no es una matriz, se devolverá una matriz vacía. + +Ejemplo: + +``` sql +SELECT JSONExtractArrayRaw('{"a": "hello", "b": [-100, 200.0, "hello"]}', 'b') = ['-100', '200.0', '"hello"']' +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/json_functions/) diff --git a/docs/es/query_language/functions/logical_functions.md b/docs/es/query_language/functions/logical_functions.md new file mode 100644 index 00000000000..ecfb4c51f30 --- /dev/null +++ b/docs/es/query_language/functions/logical_functions.md @@ -0,0 +1,15 @@ +# Funciones lógicas {#logical-functions} + +Las funciones lógicas aceptan cualquier tipo numérico, pero devuelven un número UInt8 igual a 0 o 1. + +Cero como argumento se considera “false,” mientras que cualquier valor distinto de cero se considera “true”. + +## y, Y operador {#and-and-operator} + +## o, operador O {#or-or-operator} + +## no, NO operador {#not-not-operator} + +## xor {#xor} + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/logical_functions/) diff --git a/docs/es/query_language/functions/machine_learning_functions.md b/docs/es/query_language/functions/machine_learning_functions.md new file mode 100644 index 00000000000..340231e20de --- /dev/null +++ b/docs/es/query_language/functions/machine_learning_functions.md @@ -0,0 +1,13 @@ +# Funciones de aprendizaje automático {#machine-learning-functions} + +## evalMLMethod (predicción) {#machine-learning-methods-evalmlmethod} + +Predicción utilizando modelos de regresión ajustados utiliza `evalMLMethod` función. Ver enlace en `linearRegression`. 
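+
+A modo de orientación, un esquema mínimo de uso (los nombres `train_data`, `test_data`, `your_model`, `target`, `param1` y `param2` son hipotéticos): primero se guarda el estado del modelo ajustado usando la combinación `-State` de la función agregada y después se aplica `evalMLMethod` sobre datos nuevos.
+
+``` sql
+-- Ajustar el modelo y guardar su estado en una tabla auxiliar.
+CREATE TABLE your_model ENGINE = Memory AS
+SELECT stochasticLinearRegressionState(0.1, 0.0, 5, 'SGD')(target, param1, param2) AS state
+FROM train_data;
+
+-- Predecir sobre datos nuevos con el estado guardado.
+WITH (SELECT state FROM your_model) AS model
+SELECT evalMLMethod(model, param1, param2)
+FROM test_data;
+```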
+
+### Regresión lineal estocástica {#stochastic-linear-regression}
+
+La función agregada [stochasticLinearRegression](../agg_functions/reference.md#agg_functions-stochasticlinearregression) implementa el método de descenso de gradiente estocástico con un modelo lineal y la función de pérdida MSE. Utilice `evalMLMethod` para predecir sobre datos nuevos.
+
+### Regresión logística estocástica {#stochastic-logistic-regression}
+
+La función agregada [stochasticLogisticRegression](../agg_functions/reference.md#agg_functions-stochasticlogisticregression) implementa el método de descenso de gradiente estocástico para el problema de clasificación binaria. Utilice `evalMLMethod` para predecir sobre datos nuevos.
diff --git a/docs/es/query_language/functions/math_functions.md b/docs/es/query_language/functions/math_functions.md
new file mode 100644
index 00000000000..77ab4dff4c1
--- /dev/null
+++ b/docs/es/query_language/functions/math_functions.md
@@ -0,0 +1,109 @@
+# Funciones matemáticas {#mathematical-functions}
+
+Todas las funciones devuelven un número Float64. La precisión del resultado es cercana a la precisión máxima posible, pero el resultado puede no coincidir con el número representable de máquina más cercano al número real correspondiente.
+
+## e() {#e}
+
+Devuelve un número Float64 cercano al número e.
+
+## pi() {#pi}
+
+Devuelve un número Float64 cercano al número π.
+
+## exp(x) {#expx}
+
+Acepta un argumento numérico y devuelve un número Float64 cercano a la exponencial del argumento.
+
+## log(x), ln(x) {#logx-lnx}
+
+Acepta un argumento numérico y devuelve un número Float64 cercano al logaritmo natural del argumento.
+
+## exp2(x) {#exp2x}
+
+Acepta un argumento numérico y devuelve un número Float64 cercano a 2 elevado a la potencia de x.
+
+## log2(x) {#log2x}
+
+Acepta un argumento numérico y devuelve un número Float64 cercano al logaritmo binario del argumento.
+
+## exp10(x) {#exp10x}
+
+Acepta un argumento numérico y devuelve un número Float64 cercano a 10 elevado a la potencia de x.
+
+## log10(x) {#log10x}
+
+Acepta un argumento numérico y devuelve un número Float64 cercano al logaritmo decimal del argumento.
+
+## sqrt(x) {#sqrtx}
+
+Acepta un argumento numérico y devuelve un número Float64 cercano a la raíz cuadrada del argumento.
+
+## cbrt(x) {#cbrtx}
+
+Acepta un argumento numérico y devuelve un número Float64 cercano a la raíz cúbica del argumento.
+
+## erf(x) {#erfx}
+
+Si ‘x’ no es negativo, entonces `erf(x / σ√2)` es la probabilidad de que una variable aleatoria con distribución normal y desviación estándar ‘σ’ tome un valor que se aleje del valor esperado en más de ‘x’.
+
+Ejemplo (regla de las tres sigmas):
+
+``` sql
+SELECT erf(3 / sqrt(2))
+```
+
+``` text
+┌─erf(divide(3, sqrt(2)))─┐
+│ 0.9973002039367398 │
+└─────────────────────────┘
+```
+
+## erfc(x) {#erfcx}
+
+Acepta un argumento numérico y devuelve un número Float64 cercano a 1 - erf(x), pero sin pérdida de precisión para valores grandes de ‘x’.
+
+## lgamma(x) {#lgammax}
+
+El logaritmo de la función gamma.
+
+## tgamma(x) {#tgammax}
+
+Función gamma.
+
+## sin(x) {#sinx}
+
+El seno.
+
+## cos(x) {#cosx}
+
+El coseno.
+
+## tan(x) {#tanx}
+
+La tangente.
+
+## asin(x) {#asinx}
+
+El arco seno.
+
+## acos(x) {#acosx}
+
+El arco coseno.
+
+## atan(x) {#atanx}
+
+El arco tangente.
+
+## pow(x, y), power(x, y) {#powx-y-powerx-y}
+
+Toma dos argumentos numéricos x e y. Devuelve un número Float64 cercano a x elevado a la potencia de y.
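+
+Por ejemplo, una consulta ilustrativa (ambos nombres, `pow` y `power`, se refieren a la misma función):
+
+``` sql
+SELECT pow(2, 10), power(2, 0.5)
+```
+
+El primer valor debería ser 1024 y el segundo, aproximadamente 1.4142.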
+ +## IntExp2 {#intexp2} + +Acepta un argumento numérico y devuelve un número UInt64 cercano a 2 a la potencia de x. + +## IntExp10 {#intexp10} + +Acepta un argumento numérico y devuelve un número UInt64 cercano a 10 a la potencia de x. + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/math_functions/) diff --git a/docs/es/query_language/functions/other_functions.md b/docs/es/query_language/functions/other_functions.md new file mode 100644 index 00000000000..109cb1560f6 --- /dev/null +++ b/docs/es/query_language/functions/other_functions.md @@ -0,0 +1,1072 @@ +# Otras funciones {#other-functions} + +## nombre de host() {#hostname} + +Devuelve una cadena con el nombre del host en el que se realizó esta función. Para el procesamiento distribuido, este es el nombre del host del servidor remoto, si la función se realiza en un servidor remoto. + +## FQDN {#fqdn} + +Devuelve el nombre de dominio completo. + +**Sintaxis** + +``` sql +fqdn(); +``` + +Esta función no distingue entre mayúsculas y minúsculas. + +**Valor devuelto** + +- Cadena con el nombre de dominio completo. + +Tipo: `String`. + +**Ejemplo** + +Consulta: + +``` sql +SELECT FQDN(); +``` + +Resultado: + +``` text +┌─FQDN()──────────────────────────┐ +│ clickhouse.ru-central1.internal │ +└─────────────────────────────────┘ +``` + +## Nombre básico {#basename} + +Extrae la parte final de una cadena después de la última barra o barra invertida. Esta función se utiliza a menudo para extraer el nombre de archivo de una ruta. + +``` sql +basename( expr ) +``` + +**Parámetros** + +- `expr` — Expresión que da como resultado un [Cadena](../../data_types/string.md) valor de tipo. Todas las barras diagonales inversas deben escaparse en el valor resultante. + +**Valor devuelto** + +Una cadena que contiene: + +- La parte final de una cadena después de la última barra o barra invertida. + + If the input string contains a path ending with slash or backslash, for example, `/` or `c:\`, the function returns an empty string. + +- La cadena original si no hay barras diagonales o barras diagonales inversas. + +**Ejemplo** + +``` sql +SELECT 'some/long/path/to/file' AS a, basename(a) +``` + +``` text +┌─a──────────────────────┬─basename('some\\long\\path\\to\\file')─┐ +│ some\long\path\to\file │ file │ +└────────────────────────┴────────────────────────────────────────┘ +``` + +``` sql +SELECT 'some\\long\\path\\to\\file' AS a, basename(a) +``` + +``` text +┌─a──────────────────────┬─basename('some\\long\\path\\to\\file')─┐ +│ some\long\path\to\file │ file │ +└────────────────────────┴────────────────────────────────────────┘ +``` + +``` sql +SELECT 'some-file-name' AS a, basename(a) +``` + +``` text +┌─a──────────────┬─basename('some-file-name')─┐ +│ some-file-name │ some-file-name │ +└────────────────┴────────────────────────────┘ +``` + +## Ancho visible (x) {#visiblewidthx} + +Calcula el ancho aproximado al enviar valores a la consola en formato de texto (separado por tabuladores). +Esta función es utilizada por el sistema para implementar formatos Pretty. + +`NULL` se representa como una cadena correspondiente a `NULL` es `Pretty` formato. + +``` sql +SELECT visibleWidth(NULL) +``` + +``` text +┌─visibleWidth(NULL)─┐ +│ 4 │ +└────────────────────┘ +``` + +## ¿Cómo puedo hacerlo?) {#totypenamex} + +Devuelve una cadena que contiene el nombre de tipo del argumento pasado. 
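+
+Por ejemplo, una consulta ilustrativa:
+
+``` sql
+SELECT toTypeName(0), toTypeName('hola'), toTypeName(now())
+```
+
+En una instalación típica devuelve `UInt8`, `String` y `DateTime`, respectivamente.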
+ +Si `NULL` se pasa a la función como entrada, luego devuelve el `Nullable(Nothing)` tipo, que corresponde a un tipo interno `NULL` representación en ClickHouse. + +## BlockSize() {#function-blocksize} + +Obtiene el tamaño del bloque. +En ClickHouse, las consultas siempre se ejecutan en bloques (conjuntos de partes de columna). Esta función permite obtener el tamaño del bloque al que lo llamó. + +## materializar (x) {#materializex} + +Convierte una constante en una columna completa que contiene solo un valor. +En ClickHouse, las columnas completas y las constantes se representan de manera diferente en la memoria. Las funciones funcionan de manera diferente para argumentos constantes y argumentos normales (se ejecuta un código diferente), aunque el resultado es casi siempre el mismo. Esta función es para depurar este comportamiento. + +## ignorar(…) {#ignore} + +Acepta cualquier argumento, incluyendo `NULL`. Siempre devuelve 0. +Sin embargo, el argumento aún se evalúa. Esto se puede usar para puntos de referencia. + +## sueño (segundos) {#sleepseconds} + +Dormir ‘seconds’ segundos en cada bloque de datos. Puede especificar un número entero o un número de punto flotante. + +## SueñoCada fila(segundos) {#sleepeachrowseconds} + +Dormir ‘seconds’ segundos en cada fila. Puede especificar un número entero o un número de punto flotante. + +## currentDatabase() {#currentdatabase} + +Devuelve el nombre de la base de datos actual. +Puede utilizar esta función en los parámetros del motor de tablas en una consulta CREATE TABLE donde debe especificar la base de datos. + +## currentUser() {#other-function-currentuser} + +Devuelve el inicio de sesión del usuario actual. El inicio de sesión del usuario, que inició la consulta, se devolverá en caso de consulta distibuted. + +``` sql +SELECT currentUser(); +``` + +Apodo: `user()`, `USER()`. + +**Valores devueltos** + +- Inicio de sesión del usuario actual. +- Inicio de sesión del usuario que inició la consulta en caso de consulta distribuida. + +Tipo: `String`. + +**Ejemplo** + +Consulta: + +``` sql +SELECT currentUser(); +``` + +Resultado: + +``` text +┌─currentUser()─┐ +│ default │ +└───────────────┘ +``` + +## isFinite(x) {#isfinitex} + +Acepta Float32 y Float64 y devuelve UInt8 igual a 1 si el argumento no es infinito y no es un NaN, de lo contrario 0. + +## IsInfinite (x) {#isinfinitex} + +Acepta Float32 y Float64 y devuelve UInt8 igual a 1 si el argumento es infinito, de lo contrario 0. Tenga en cuenta que se devuelve 0 para un NaN. + +## ifNotFinite {#ifnotfinite} + +Comprueba si el valor de punto flotante es finito. + +**Sintaxis** + + ifNotFinite(x,y) + +**Parámetros** + +- `x` — Valor que debe comprobarse para el infinito. Tipo: [Flotante\*](../../data_types/float.md). +- `y` — Valor de reserva. Tipo: [Flotante\*](../../data_types/float.md). + +**Valor devuelto** + +- `x` si `x` es finito. +- `y` si `x` no es finito. + +**Ejemplo** + +Consulta: + + SELECT 1/0 as infimum, ifNotFinite(infimum,42) + +Resultado: + + ┌─infimum─┬─ifNotFinite(divide(1, 0), 42)─┐ + │ inf │ 42 │ + └─────────┴───────────────────────────────┘ + +Puede obtener un resultado similar usando [Operador ternario](conditional_functions.md#ternary-operator): `isFinite(x) ? x : y`. + +## isNaN(x) {#isnanx} + +Acepta Float32 y Float64 y devuelve UInt8 igual a 1 si el argumento es un NaN, de lo contrario 0. 
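+
+Por ejemplo, una consulta ilustrativa que combina estas comprobaciones (la división de coma flotante produce `inf` y `nan` en lugar de lanzar un error):
+
+``` sql
+SELECT isFinite(1/0), isInfinite(1/0), isNaN(0/0), ifNotFinite(1/0, 42)
+```
+
+Debería devolver 0, 1, 1 y 42, respectivamente.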
+ +## hasColumnInTable(\[‘hostname’\[, ‘username’\[, ‘password’\]\],\] ‘database’, ‘table’, ‘column’) {#hascolumnintablehostname-username-password-database-table-column} + +Acepta cadenas constantes: nombre de base de datos, nombre de tabla y nombre de columna. Devuelve una expresión constante UInt8 igual a 1 si hay una columna; de lo contrario, 0. Si se establece el parámetro hostname, la prueba se ejecutará en un servidor remoto. +La función produce una excepción si la tabla no existe. +Para los elementos de una estructura de datos anidada, la función comprueba la existencia de una columna. Para la propia estructura de datos anidados, la función devuelve 0. + +## Bar {#function-bar} + +Permite construir un diagrama unicode-art. + +`bar(x, min, max, width)` Dibuja una banda con un ancho proporcional a `(x - min)` E igual a `width` Caracteres cuando `x = max`. + +Parámetros: + +- `x` — Tamaño para mostrar. +- `min, max` — Constantes enteras. El valor debe encajar en `Int64`. +- `width` — Constante, entero positivo, puede ser fraccional. + +La banda se dibuja con precisión a un octavo de un símbolo. + +Ejemplo: + +``` sql +SELECT + toHour(EventTime) AS h, + count() AS c, + bar(c, 0, 600000, 20) AS bar +FROM test.hits +GROUP BY h +ORDER BY h ASC +``` + +``` text +┌──h─┬──────c─┬─bar────────────────┐ +│ 0 │ 292907 │ █████████▋ │ +│ 1 │ 180563 │ ██████ │ +│ 2 │ 114861 │ ███▋ │ +│ 3 │ 85069 │ ██▋ │ +│ 4 │ 68543 │ ██▎ │ +│ 5 │ 78116 │ ██▌ │ +│ 6 │ 113474 │ ███▋ │ +│ 7 │ 170678 │ █████▋ │ +│ 8 │ 278380 │ █████████▎ │ +│ 9 │ 391053 │ █████████████ │ +│ 10 │ 457681 │ ███████████████▎ │ +│ 11 │ 493667 │ ████████████████▍ │ +│ 12 │ 509641 │ ████████████████▊ │ +│ 13 │ 522947 │ █████████████████▍ │ +│ 14 │ 539954 │ █████████████████▊ │ +│ 15 │ 528460 │ █████████████████▌ │ +│ 16 │ 539201 │ █████████████████▊ │ +│ 17 │ 523539 │ █████████████████▍ │ +│ 18 │ 506467 │ ████████████████▊ │ +│ 19 │ 520915 │ █████████████████▎ │ +│ 20 │ 521665 │ █████████████████▍ │ +│ 21 │ 542078 │ ██████████████████ │ +│ 22 │ 493642 │ ████████████████▍ │ +│ 23 │ 400397 │ █████████████▎ │ +└────┴────────┴────────────────────┘ +``` + +## Ciudad {#transform} + +Transforma un valor de acuerdo con la asignación explícitamente definida de algunos elementos a otros. +Hay dos variaciones de esta función: + +### ¿Cómo puedo hacerlo?) {#transformx-array-from-array-to-default} + +`x` – Qué transformar. + +`array_from` – Matriz constante de valores para la conversión. + +`array_to` – Matriz constante de valores para convertir los valores en ‘from’ Naciones. + +`default` – ¿Qué valor utilizar si ‘x’ no es igual a ninguno de los valores en ‘from’. + +`array_from` y `array_to` – Matrices del mismo tamaño. + +Tipo: + +`transform(T, Array(T), Array(U), U) -> U` + +`T` y `U` pueden ser tipos numéricos, de cadena o de fecha o de fecha y hora. +Cuando se indica la misma letra (T o U), para los tipos numéricos pueden no ser tipos coincidentes, sino tipos que tienen un tipo común. +Por ejemplo, el primer argumento puede tener el tipo Int64, mientras que el segundo tiene el tipo Array(UInt16). + +Si el ‘x’ valor es igual a uno de los elementos en el ‘array\_from’ matriz, devuelve el elemento existente (que está numerado igual) de la ‘array\_to’ matriz. De lo contrario, devuelve ‘default’. Si hay varios elementos coincidentes en ‘array\_from’, devuelve una de las coincidencias. 
+ +Ejemplo: + +``` sql +SELECT + transform(SearchEngineID, [2, 3], ['Yandex', 'Google'], 'Other') AS title, + count() AS c +FROM test.hits +WHERE SearchEngineID != 0 +GROUP BY title +ORDER BY c DESC +``` + +``` text +┌─title─────┬──────c─┐ +│ Yandex │ 498635 │ +│ Google │ 229872 │ +│ Other │ 104472 │ +└───────────┴────────┘ +``` + +### ¿Cómo puedo hacerlo?) {#transformx-array-from-array-to} + +Difiere de la primera variación en que el ‘default’ se omite el argumento. +Si el ‘x’ valor es igual a uno de los elementos en el ‘array\_from’ matriz, devuelve el elemento coincidente (que está numerado igual) de la ‘array\_to’ matriz. De lo contrario, devuelve ‘x’. + +Tipo: + +`transform(T, Array(T), Array(T)) -> T` + +Ejemplo: + +``` sql +SELECT + transform(domain(Referer), ['yandex.ru', 'google.ru', 'vk.com'], ['www.yandex', 'example.com']) AS s, + count() AS c +FROM test.hits +GROUP BY domain(Referer) +ORDER BY count() DESC +LIMIT 10 +``` + +``` text +┌─s──────────────┬───────c─┐ +│ │ 2906259 │ +│ www.yandex │ 867767 │ +│ ███████.ru │ 313599 │ +│ mail.yandex.ru │ 107147 │ +│ ██████.ru │ 100355 │ +│ █████████.ru │ 65040 │ +│ news.yandex.ru │ 64515 │ +│ ██████.net │ 59141 │ +│ example.com │ 57316 │ +└────────────────┴─────────┘ +``` + +## Tamaño de formatoReadable (x) {#formatreadablesizex} + +Acepta el tamaño (número de bytes). Devuelve un tamaño redondeado con un sufijo (KiB, MiB, etc.) como una cadena. + +Ejemplo: + +``` sql +SELECT + arrayJoin([1, 1024, 1024*1024, 192851925]) AS filesize_bytes, + formatReadableSize(filesize_bytes) AS filesize +``` + +``` text +┌─filesize_bytes─┬─filesize───┐ +│ 1 │ 1.00 B │ +│ 1024 │ 1.00 KiB │ +│ 1048576 │ 1.00 MiB │ +│ 192851925 │ 183.92 MiB │ +└────────────────┴────────────┘ +``` + +## menos (a, b) {#leasta-b} + +Devuelve el valor más pequeño de a y b. + +## alcalde(a, b) {#greatesta-b} + +Devuelve el valor más grande de a y b. + +## operatividad() {#uptime} + +Devuelve el tiempo de actividad del servidor en segundos. + +## versión() {#version} + +Devuelve la versión del servidor como una cadena. + +## Zona horaria() {#timezone} + +Devuelve la zona horaria del servidor. + +## blockNumber {#blocknumber} + +Devuelve el número de secuencia del bloque de datos donde se encuentra la fila. + +## rowNumberInBlock {#function-rownumberinblock} + +Devuelve el número ordinal de la fila en el bloque de datos. Los diferentes bloques de datos siempre se recalculan. + +## rowNumberInAllBlocks() {#rownumberinallblocks} + +Devuelve el número ordinal de la fila en el bloque de datos. Esta función solo considera los bloques de datos afectados. + +## vecino {#neighbor} + +La función de ventana que proporciona acceso a una fila en un desplazamiento especificado que viene antes o después de la fila actual de una columna determinada. + +**Sintaxis** + +``` sql +neighbor(column, offset[, default_value]) +``` + +El resultado de la función depende de los bloques de datos afectados y del orden de los datos en el bloque. +Si realiza una subconsulta con ORDER BY y llama a la función desde fuera de la subconsulta, puede obtener el resultado esperado. + +**Parámetros** + +- `column` — Un nombre de columna o una expresión escalar. +- `offset` — El número de filas hacia delante o hacia atrás desde la fila actual de `column`. [Int64](../../data_types/int_uint.md). +- `default_value` — Opcional. El valor que se devolverá si offset va más allá del alcance del bloque. Tipo de bloques de datos afectados. 
+ +**Valores devueltos** + +- Valor para `column` es `offset` distancia de la fila actual si `offset` valor no está fuera de los límites del bloque. +- Valor predeterminado para `column` si `offset` valor está fuera de los límites del bloque. Si `default_value` se da, entonces será utilizado. + +Tipo: tipo de bloques de datos afectados o tipo de valor predeterminado. + +**Ejemplo** + +Consulta: + +``` sql +SELECT number, neighbor(number, 2) FROM system.numbers LIMIT 10; +``` + +Resultado: + +``` text +┌─number─┬─neighbor(number, 2)─┐ +│ 0 │ 2 │ +│ 1 │ 3 │ +│ 2 │ 4 │ +│ 3 │ 5 │ +│ 4 │ 6 │ +│ 5 │ 7 │ +│ 6 │ 8 │ +│ 7 │ 9 │ +│ 8 │ 0 │ +│ 9 │ 0 │ +└────────┴─────────────────────┘ +``` + +Consulta: + +``` sql +SELECT number, neighbor(number, 2, 999) FROM system.numbers LIMIT 10; +``` + +Resultado: + +``` text +┌─number─┬─neighbor(number, 2, 999)─┐ +│ 0 │ 2 │ +│ 1 │ 3 │ +│ 2 │ 4 │ +│ 3 │ 5 │ +│ 4 │ 6 │ +│ 5 │ 7 │ +│ 6 │ 8 │ +│ 7 │ 9 │ +│ 8 │ 999 │ +│ 9 │ 999 │ +└────────┴──────────────────────────┘ +``` + +Esta función se puede utilizar para calcular el valor métrico interanual: + +Consulta: + +``` sql +WITH toDate('2018-01-01') AS start_date +SELECT + toStartOfMonth(start_date + (number * 32)) AS month, + toInt32(month) % 100 AS money, + neighbor(money, -12) AS prev_year, + round(prev_year / money, 2) AS year_over_year +FROM numbers(16) +``` + +Resultado: + +``` text +┌──────month─┬─money─┬─prev_year─┬─year_over_year─┐ +│ 2018-01-01 │ 32 │ 0 │ 0 │ +│ 2018-02-01 │ 63 │ 0 │ 0 │ +│ 2018-03-01 │ 91 │ 0 │ 0 │ +│ 2018-04-01 │ 22 │ 0 │ 0 │ +│ 2018-05-01 │ 52 │ 0 │ 0 │ +│ 2018-06-01 │ 83 │ 0 │ 0 │ +│ 2018-07-01 │ 13 │ 0 │ 0 │ +│ 2018-08-01 │ 44 │ 0 │ 0 │ +│ 2018-09-01 │ 75 │ 0 │ 0 │ +│ 2018-10-01 │ 5 │ 0 │ 0 │ +│ 2018-11-01 │ 36 │ 0 │ 0 │ +│ 2018-12-01 │ 66 │ 0 │ 0 │ +│ 2019-01-01 │ 97 │ 32 │ 0.33 │ +│ 2019-02-01 │ 28 │ 63 │ 2.25 │ +│ 2019-03-01 │ 56 │ 91 │ 1.62 │ +│ 2019-04-01 │ 87 │ 22 │ 0.25 │ +└────────────┴───────┴───────────┴────────────────┘ +``` + +## EjecuciónDiferencia (x) {#other-functions-runningdifference} + +Calcula la diferencia entre los sucesivos valores de fila en el bloque de datos. +Devuelve 0 para la primera fila y la diferencia con respecto a la fila anterior para cada fila subsiguiente. + +El resultado de la función depende de los bloques de datos afectados y del orden de los datos en el bloque. +Si realiza una subconsulta con ORDER BY y llama a la función desde fuera de la subconsulta, puede obtener el resultado esperado. + +Ejemplo: + +``` sql +SELECT + EventID, + EventTime, + runningDifference(EventTime) AS delta +FROM +( + SELECT + EventID, + EventTime + FROM events + WHERE EventDate = '2016-11-24' + ORDER BY EventTime ASC + LIMIT 5 +) +``` + +``` text +┌─EventID─┬───────────EventTime─┬─delta─┐ +│ 1106 │ 2016-11-24 00:00:04 │ 0 │ +│ 1107 │ 2016-11-24 00:00:05 │ 1 │ +│ 1108 │ 2016-11-24 00:00:05 │ 0 │ +│ 1109 │ 2016-11-24 00:00:09 │ 4 │ +│ 1110 │ 2016-11-24 00:00:10 │ 1 │ +└─────────┴─────────────────────┴───────┘ +``` + +Tenga en cuenta que el tamaño del bloque afecta el resultado. Con cada nuevo bloque, el `runningDifference` estado de reset. + +``` sql +SELECT + number, + runningDifference(number + 1) AS diff +FROM numbers(100000) +WHERE diff != 1 +``` + +``` text +┌─number─┬─diff─┐ +│ 0 │ 0 │ +└────────┴──────┘ +┌─number─┬─diff─┐ +│ 65536 │ 0 │ +└────────┴──────┘ +``` + +``` sql +set max_block_size=100000 -- default value is 65536! 
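+-- Comentario ilustrativo: con max_block_size = 100000, las 100000 filas de numbers(100000)
+-- caben en un solo bloque, por lo que runningDifference ya no se reinicia en la fila 65536.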
+ +SELECT + number, + runningDifference(number + 1) AS diff +FROM numbers(100000) +WHERE diff != 1 +``` + +``` text +┌─number─┬─diff─┐ +│ 0 │ 0 │ +└────────┴──────┘ +``` + +## runningDifferenceStartingWithFirstvalue {#runningdifferencestartingwithfirstvalue} + +Lo mismo que para [runningDifference](./other_functions.md#other_functions-runningdifference), la diferencia es el valor de la primera fila, devolvió el valor de la primera fila, y cada fila subsiguiente devuelve la diferencia de la fila anterior. + +## ¿Cómo puedo hacerlo?) {#macnumtostringnum} + +Acepta un número UInt64. Lo interpreta como una dirección MAC en big endian. Devuelve una cadena que contiene la dirección MAC correspondiente con el formato AA:BB:CC:DD:EE:FF (números separados por dos puntos en forma hexadecimal). + +## Sistema abierto.) {#macstringtonums} + +La función inversa de MACNumToString. Si la dirección MAC tiene un formato no válido, devuelve 0. + +## Sistema abierto.) {#macstringtoouis} + +Acepta una dirección MAC con el formato AA:BB:CC:DD:EE:FF (números separados por dos puntos en forma hexadecimal). Devuelve los primeros tres octetos como un número UInt64. Si la dirección MAC tiene un formato no válido, devuelve 0. + +## getSizeOfEnumType {#getsizeofenumtype} + +Devuelve el número de campos en [Enum](../../data_types/enum.md). + +``` sql +getSizeOfEnumType(value) +``` + +**Parámetros:** + +- `value` — Valor del tipo `Enum`. + +**Valores devueltos** + +- El número de campos con `Enum` valores de entrada. +- Se produce una excepción si el tipo no es `Enum`. + +**Ejemplo** + +``` sql +SELECT getSizeOfEnumType( CAST('a' AS Enum8('a' = 1, 'b' = 2) ) ) AS x +``` + +``` text +┌─x─┐ +│ 2 │ +└───┘ +``` + +## BlockSerializedSize {#blockserializedsize} + +Devuelve el tamaño en el disco (sin tener en cuenta la compresión). + +``` sql +blockSerializedSize(value[, value[, ...]]) +``` + +**Parámetros:** + +- `value` — Cualquier valor. + +**Valores devueltos** + +- El número de bytes que se escribirán en el disco para el bloque de valores (sin compresión). + +**Ejemplo** + +``` sql +SELECT blockSerializedSize(maxState(1)) as x +``` + +``` text +┌─x─┐ +│ 2 │ +└───┘ +``` + +## ToColumnTypeName {#tocolumntypename} + +Devuelve el nombre de la clase que representa el tipo de datos de la columna en la RAM. + +``` sql +toColumnTypeName(value) +``` + +**Parámetros:** + +- `value` — Cualquier tipo de valor. + +**Valores devueltos** + +- Una cadena con el nombre de la clase que se utiliza para representar la `value` tipo de datos en la memoria RAM. + +**Ejemplo de la diferencia entre`toTypeName ' and ' toColumnTypeName`** + +``` sql +SELECT toTypeName(CAST('2018-01-01 01:02:03' AS DateTime)) +``` + +``` text +┌─toTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ +│ DateTime │ +└─────────────────────────────────────────────────────┘ +``` + +``` sql +SELECT toColumnTypeName(CAST('2018-01-01 01:02:03' AS DateTime)) +``` + +``` text +┌─toColumnTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ +│ Const(UInt32) │ +└───────────────────────────────────────────────────────────┘ +``` + +El ejemplo muestra que el `DateTime` tipo de datos se almacena en la memoria como `Const(UInt32)`. + +## dumpColumnStructure {#dumpcolumnstructure} + +Produce una descripción detallada de las estructuras de datos en la memoria RAM + +``` sql +dumpColumnStructure(value) +``` + +**Parámetros:** + +- `value` — Cualquier tipo de valor. 
+ +**Valores devueltos** + +- Una cadena que describe la estructura que se utiliza para representar el `value` tipo de datos en la memoria RAM. + +**Ejemplo** + +``` sql +SELECT dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime')) +``` + +``` text +┌─dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ +│ DateTime, Const(size = 1, UInt32(size = 1)) │ +└──────────────────────────────────────────────────────────────┘ +``` + +## defaultValueOfArgumentType {#defaultvalueofargumenttype} + +Genera el valor predeterminado para el tipo de datos. + +No incluye valores predeterminados para columnas personalizadas establecidas por el usuario. + +``` sql +defaultValueOfArgumentType(expression) +``` + +**Parámetros:** + +- `expression` — Tipo de valor arbitrario o una expresión que da como resultado un valor de tipo arbitrario. + +**Valores devueltos** + +- `0` para los números. +- Cadena vacía para cadenas. +- `ᴺᵁᴸᴸ` para [NULO](../../data_types/nullable.md). + +**Ejemplo** + +``` sql +SELECT defaultValueOfArgumentType( CAST(1 AS Int8) ) +``` + +``` text +┌─defaultValueOfArgumentType(CAST(1, 'Int8'))─┐ +│ 0 │ +└─────────────────────────────────────────────┘ +``` + +``` sql +SELECT defaultValueOfArgumentType( CAST(1 AS Nullable(Int8) ) ) +``` + +``` text +┌─defaultValueOfArgumentType(CAST(1, 'Nullable(Int8)'))─┐ +│ ᴺᵁᴸᴸ │ +└───────────────────────────────────────────────────────┘ +``` + +## replicar {#other-functions-replicate} + +Crea una matriz con un solo valor. + +Utilizado para la implementación interna de [arrayJoin](array_join.md#functions_arrayjoin). + +``` sql +SELECT replicate(x, arr); +``` + +**Parámetros:** + +- `arr` — Matriz original. ClickHouse crea una nueva matriz de la misma longitud que el original y la llena con el valor `x`. +- `x` — El valor con el que se llenará la matriz resultante. + +**Valor devuelto** + +Una matriz llena con el valor `x`. + +Tipo: `Array`. + +**Ejemplo** + +Consulta: + +``` sql +SELECT replicate(1, ['a', 'b', 'c']) +``` + +Resultado: + +``` text +┌─replicate(1, ['a', 'b', 'c'])─┐ +│ [1,1,1] │ +└───────────────────────────────┘ +``` + +## Sistema de archivosDisponible {#filesystemavailable} + +Devuelve la cantidad de espacio restante en el sistema de archivos donde se encuentran los archivos de las bases de datos. Siempre es más pequeño que el espacio libre total ([Sistema de archivosLibre](#filesystemfree)) porque algo de espacio está reservado para el sistema operativo. + +**Sintaxis** + +``` sql +filesystemAvailable() +``` + +**Valor devuelto** + +- La cantidad de espacio restante disponible en bytes. + +Tipo: [UInt64](../../data_types/int_uint.md). + +**Ejemplo** + +Consulta: + +``` sql +SELECT formatReadableSize(filesystemAvailable()) AS "Available space", toTypeName(filesystemAvailable()) AS "Type"; +``` + +Resultado: + +``` text +┌─Available space─┬─Type───┐ +│ 30.75 GiB │ UInt64 │ +└─────────────────┴────────┘ +``` + +## Sistema de archivosLibre {#filesystemfree} + +Devuelve la cantidad total del espacio libre en el sistema de archivos donde se encuentran los archivos de las bases de datos. Ver también `filesystemAvailable` + +**Sintaxis** + +``` sql +filesystemFree() +``` + +**Valor devuelto** + +- Cantidad de espacio libre en bytes. + +Tipo: [UInt64](../../data_types/int_uint.md). 
+ +**Ejemplo** + +Consulta: + +``` sql +SELECT formatReadableSize(filesystemFree()) AS "Free space", toTypeName(filesystemFree()) AS "Type"; +``` + +Resultado: + +``` text +┌─Free space─┬─Type───┐ +│ 32.39 GiB │ UInt64 │ +└────────────┴────────┘ +``` + +## sistema de archivosCapacidad {#filesystemcapacity} + +Devuelve la capacidad del sistema de archivos en bytes. Para la evaluación, el [camino](../../operations/server_settings/settings.md#server_settings-path) al directorio de datos debe estar configurado. + +**Sintaxis** + +``` sql +filesystemCapacity() +``` + +**Valor devuelto** + +- Información de capacidad del sistema de archivos en bytes. + +Tipo: [UInt64](../../data_types/int_uint.md). + +**Ejemplo** + +Consulta: + +``` sql +SELECT formatReadableSize(filesystemCapacity()) AS "Capacity", toTypeName(filesystemCapacity()) AS "Type" +``` + +Resultado: + +``` text +┌─Capacity──┬─Type───┐ +│ 39.32 GiB │ UInt64 │ +└───────────┴────────┘ +``` + +## finalizeAggregation {#function-finalizeaggregation} + +Toma el estado de la función agregada. Devuelve el resultado de la agregación (estado finalizado). + +## runningAccumulate {#function-runningaccumulate} + +Toma los estados de la función agregada y devuelve una columna con valores, son el resultado de la acumulación de estos estados para un conjunto de líneas de bloque, desde la primera hasta la línea actual. +Por ejemplo, toma el estado de la función agregada (ejemplo runningAccumulate(uniqState(UserID)) ), y para cada fila de bloque, devuelve el resultado de la función agregada en la fusión de estados de todas las filas anteriores y la fila actual. +Por lo tanto, el resultado de la función depende de la partición de los datos en los bloques y del orden de los datos en el bloque. + +## joinGet {#joinget} + +La función le permite extraer datos de la tabla de la misma manera que [Diccionario](../../query_language/dicts/index.md). + +Obtiene datos de [Unir](../../operations/table_engines/join.md#creating-a-table) usando la clave de unión especificada. + +Solo admite tablas creadas con `ENGINE = Join(ANY, LEFT, )` instrucción. + +**Sintaxis** + +``` sql +joinGet(join_storage_table_name, `value_column`, join_keys) +``` + +**Parámetros** + +- `join_storage_table_name` — un [identificador](../syntax.md#syntax-identifiers) indica dónde se realiza la búsqueda. El identificador se busca en la base de datos predeterminada (ver parámetro `default_database` en el archivo de configuración). Para reemplazar la base de datos predeterminada, utilice `USE db_name` o especifique la base de datos y la tabla a través del separador `db_name.db_table`, ver el ejemplo. +- `value_column` — nombre de la columna de la tabla que contiene los datos necesarios. +- `join_keys` — lista de claves. + +**Valor devuelto** + +Devuelve la lista de valores correspondientes a la lista de claves. + +Si cierto no existe en la tabla fuente, entonces `0` o `null` será devuelto basado en [Sistema abierto.](../../operations/settings/settings.md#join_use_nulls) configuración. + +Más información sobre `join_use_nulls` es [Únase a la operación](../../operations/table_engines/join.md). 
+ +**Ejemplo** + +Tabla de entrada: + +``` sql +CREATE DATABASE db_test +CREATE TABLE db_test.id_val(`id` UInt32, `val` UInt32) ENGINE = Join(ANY, LEFT, id) SETTINGS join_use_nulls = 1 +INSERT INTO db_test.id_val VALUES (1,11)(2,12)(4,13) +``` + +``` text +┌─id─┬─val─┐ +│ 4 │ 13 │ +│ 2 │ 12 │ +│ 1 │ 11 │ +└────┴─────┘ +``` + +Consulta: + +``` sql +SELECT joinGet(db_test.id_val,'val',toUInt32(number)) from numbers(4) SETTINGS join_use_nulls = 1 +``` + +Resultado: + +``` text +┌─joinGet(db_test.id_val, 'val', toUInt32(number))─┐ +│ 0 │ +│ 11 │ +│ 12 │ +│ 0 │ +└──────────────────────────────────────────────────┘ +``` + +## modelEvaluate (nombre\_modelo, …) {#function-modelevaluate} + +Evaluar modelo externo. +Acepta un nombre de modelo y argumentos de modelo. Devuelve Float64. + +## ¿Cómo puedo hacerlo?\]) {#throwifx-custom-message} + +Lance una excepción si el argumento no es cero. +custom\_message - es un parámetro opcional: una cadena constante, proporciona un mensaje de error + +``` sql +SELECT throwIf(number = 3, 'Too many') FROM numbers(10); +``` + +``` text +↙ Progress: 0.00 rows, 0.00 B (0.00 rows/s., 0.00 B/s.) Received exception from server (version 19.14.1): +Code: 395. DB::Exception: Received from localhost:9000. DB::Exception: Too many. +``` + +## identidad {#identity} + +Devuelve el mismo valor que se usó como argumento. Se utiliza para la depuración y pruebas, permite cancelar el uso de índice, y obtener el rendimiento de la consulta de un análisis completo. Cuando se analiza la consulta para el posible uso del índice, el analizador no mira dentro `identity` función. + +**Sintaxis** + +``` sql +identity(x) +``` + +**Ejemplo** + +Consulta: + +``` sql +SELECT identity(42) +``` + +Resultado: + +``` text +┌─identity(42)─┐ +│ 42 │ +└──────────────┘ +``` + +## randomPrintableASCII {#randomascii} + +Genera una cadena con un conjunto aleatorio de [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters) caracteres imprimibles. + +**Sintaxis** + +``` sql +randomPrintableASCII(length) +``` + +**Parámetros** + +- `length` — Longitud de cadena resultante. Entero positivo. + + If you pass `length < 0`, behavior of the function is undefined. + +**Valor devuelto** + +- Cadena con un conjunto aleatorio de [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters) caracteres imprimibles. + +Tipo: [Cadena](../../data_types/string.md) + +**Ejemplo** + +``` sql +SELECT number, randomPrintableASCII(30) as str, length(str) FROM system.numbers LIMIT 3 +``` + +``` text +┌─number─┬─str────────────────────────────┬─length(randomPrintableASCII(30))─┐ +│ 0 │ SuiCOSTvC0csfABSw=UcSzp2.`rv8x │ 30 │ +│ 1 │ 1Ag NlJ &RCN:*>HVPG;PE-nO"SUFD │ 30 │ +│ 2 │ /"+<"wUTh:=LjJ Vm!c&hI*m#XTfzz │ 30 │ +└────────┴────────────────────────────────┴──────────────────────────────────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/other_functions/) diff --git a/docs/es/query_language/functions/random_functions.md b/docs/es/query_language/functions/random_functions.md new file mode 100644 index 00000000000..fd9496bec03 --- /dev/null +++ b/docs/es/query_language/functions/random_functions.md @@ -0,0 +1,23 @@ +# Funciones para generar números pseudoaleatorios {#functions-for-generating-pseudo-random-numbers} + +Se utilizan generadores no criptográficos de números pseudoaleatorios. + +Todas las funciones aceptan cero argumentos o un argumento. +Si se pasa un argumento, puede ser de cualquier tipo y su valor no se usa para nada. 
+El único propósito de este argumento es evitar la eliminación de subexpresiones comunes, de modo que dos instancias diferentes de la misma función devuelvan columnas diferentes con números aleatorios diferentes.
+
+## rand {#rand}
+
+Devuelve un número pseudoaleatorio UInt32, distribuido uniformemente entre todos los números de tipo UInt32.
+Utiliza un generador congruente lineal.
+
+## rand64 {#rand64}
+
+Devuelve un número pseudoaleatorio UInt64, distribuido uniformemente entre todos los números de tipo UInt64.
+Utiliza un generador congruente lineal.
+
+## randConstant {#randconstant}
+
+Devuelve un número pseudoaleatorio UInt32; el valor es el mismo para los distintos bloques de datos.
+
+[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/random_functions/)
diff --git a/docs/es/query_language/functions/rounding_functions.md b/docs/es/query_language/functions/rounding_functions.md
new file mode 100644
index 00000000000..aa51e567aee
--- /dev/null
+++ b/docs/es/query_language/functions/rounding_functions.md
@@ -0,0 +1,183 @@
+# Funciones de redondeo {#rounding-functions}
+
+## floor(x\[, N\]) {#floorx-n}
+
+Devuelve el número redondo más grande que es menor o igual que `x`. Un número redondo es un múltiplo de 1/10^N, o el número más cercano del tipo de datos apropiado si 1/10^N no es exacto.
+‘N’ es una constante entera, parámetro opcional. Por defecto es cero, lo que significa redondear a un entero.
+‘N’ puede ser negativo.
+
+Ejemplos: `floor(123.45, 1) = 123.4, floor(123.45, -1) = 120.`
+
+`x` es cualquier tipo numérico. El resultado es un número del mismo tipo.
+Para argumentos enteros, tiene sentido redondear con un valor de `N` negativo (para `N` no negativo, la función no hace nada).
+Si el redondeo causa desbordamiento (por ejemplo, floor(-128, -1)), se devuelve un resultado específico de la implementación.
+
+## ceil(x\[, N\]), ceiling(x\[, N\]) {#ceilx-n-ceilingx-n}
+
+Devuelve el número redondo más pequeño que es mayor o igual que `x`. En todos los demás sentidos, es lo mismo que la función `floor` (véase más arriba).
+
+## trunc(x\[, N\]), truncate(x\[, N\]) {#truncx-n-truncatex-n}
+
+Devuelve el número redondo con el mayor valor absoluto cuyo valor absoluto es menor o igual que el de `x`. En todos los demás sentidos, es lo mismo que la función `floor` (véase más arriba).
+
+## round(x\[, N\]) {#rounding-functions-round}
+
+Redondea un valor a un número especificado de decimales.
+
+La función devuelve el número más cercano del orden especificado. Si el número dado está a la misma distancia de los números circundantes, la función utiliza el redondeo del banquero para los tipos de coma flotante y redondea alejándose de cero para los demás tipos numéricos.
+
+``` sql
+round(expression [, decimal_places])
+```
+
+**Parámetros:**
+
+- `expression` — Un número que se redondeará. Puede ser cualquier [expresión](../syntax.md#syntax-expressions) que devuelva un [tipo de datos](../../data_types/index.md#data_types) numérico.
+- `decimal-places` — Un valor entero.
+  - Si `decimal-places > 0`, la función redondea el valor a la derecha del punto decimal.
+  - Si `decimal-places < 0`, la función redondea el valor a la izquierda del punto decimal.
+  - Si `decimal-places = 0`, la función redondea el valor a un entero. En este caso, el argumento puede omitirse.
+
+**Valor devuelto:**
+
+El número redondeado del mismo tipo que el número de entrada.
+ +### Ejemplos {#examples} + +**Ejemplo de uso** + +``` sql +SELECT number / 2 AS x, round(x) FROM system.numbers LIMIT 3 +``` + +``` text +┌───x─┬─round(divide(number, 2))─┐ +│ 0 │ 0 │ +│ 0.5 │ 0 │ +│ 1 │ 1 │ +└─────┴──────────────────────────┘ +``` + +**Ejemplos de redondeo** + +Redondeando al número más cercano. + +``` text +round(3.2, 0) = 3 +round(4.1267, 2) = 4.13 +round(22,-1) = 20 +round(467,-2) = 500 +round(-467,-2) = -500 +``` + +Redondeo del banquero. + +``` text +round(3.5) = 4 +round(4.5) = 4 +round(3.55, 1) = 3.6 +round(3.65, 1) = 3.6 +``` + +**Ver también** + +- [roundBankers](#roundbankers) + +## roundBankers {#roundbankers} + +Redondea un número a una posición decimal especificada. + +- Si el número de redondeo está a medio camino entre dos números, la función utiliza el redondeo de banquero. + + Banker's rounding is a method of rounding fractional numbers. When the rounding number is halfway between two numbers, it's rounded to the nearest even digit at the specified decimal position. For example: 3.5 rounds up to 4, 2.5 rounds down to 2. + + It's the default rounding method for floating point numbers defined in [IEEE 754](https://en.wikipedia.org/wiki/IEEE_754#Roundings_to_nearest). The [round](#rounding_functions-round) function performs the same rounding for floating point numbers. The `roundBankers` function also rounds integers the same way, for example, `roundBankers(45, -1) = 40`. + +- En otros casos, la función redondea los números al entero más cercano. + +Usando el redondeo del banquero, puede reducir el efecto que tiene el redondeo de números en los resultados de sumar o restar estos números. + +Por ejemplo, suma números 1.5, 2.5, 3.5, 4.5 con redondeo diferente: + +- Sin redondeo: 1.5 + 2.5 + 3.5 + 4.5 = 12. +- Redondeo del banquero: 2 + 2 + 4 + 4 = 12. +- Redondeando al entero más cercano: 2 + 3 + 4 + 5 = 14. + +**Sintaxis** + +``` sql +roundBankers(expression [, decimal_places]) +``` + +**Parámetros** + +- `expression` — Un número que se redondeará. Puede ser cualquier [expresión](../syntax.md#syntax-expressions) devolviendo el numérico [Tipo de datos](../../data_types/index.md#data_types). +- `decimal-places` — Lugares decimales. Un número entero. + - `decimal-places > 0` — La función redondea el número a la derecha del punto decimal. Ejemplo: `roundBankers(3.55, 1) = 3.6`. + - `decimal-places < 0` — La función redondea el número a la izquierda del punto decimal. Ejemplo: `roundBankers(24.55, -1) = 20`. + - `decimal-places = 0` — La función redondea el número a un entero. En este caso, el argumento puede omitirse. Ejemplo: `roundBankers(2.5) = 2`. + +**Valor devuelto** + +Un valor redondeado por el método de redondeo del banquero. + +### Ejemplos {#examples-1} + +**Ejemplo de uso** + +Consulta: + +``` sql + SELECT number / 2 AS x, roundBankers(x, 0) AS b fROM system.numbers limit 10 +``` + +Resultado: + +``` text +┌───x─┬─b─┐ +│ 0 │ 0 │ +│ 0.5 │ 0 │ +│ 1 │ 1 │ +│ 1.5 │ 2 │ +│ 2 │ 2 │ +│ 2.5 │ 2 │ +│ 3 │ 3 │ +│ 3.5 │ 4 │ +│ 4 │ 4 │ +│ 4.5 │ 4 │ +└─────┴───┘ +``` + +**Ejemplos de redondeo de Banker** + +``` text +roundBankers(0.4) = 0 +roundBankers(-3.5) = -4 +roundBankers(4.5) = 4 +roundBankers(3.55, 1) = 3.6 +roundBankers(3.65, 1) = 3.6 +roundBankers(10.35, 1) = 10.4 +roundBankers(10.755, 2) = 11,76 +``` + +**Ver también** + +- [Ronda](#rounding_functions-round) + +## ¿Cómo puedo hacerlo?) {#roundtoexp2num} + +Acepta un número. Si el número es menor que uno, devuelve 0. De lo contrario, redondea el número al grado más cercano (todo no negativo) de dos. 
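+
+Ejemplo orientativo de `roundToExp2`, suponiendo el comportamiento descrito arriba (redondear hacia abajo a la potencia de dos más cercana; los valores indicados en el comentario son los esperados bajo esa suposición):
+
+``` sql
+-- roundToExp2 devuelve 0 para valores menores que uno y,
+-- en el resto de casos, la mayor potencia de dos que no supera el argumento.
+SELECT roundToExp2(0.7), roundToExp2(10), roundToExp2(32)
+-- Resultado esperado: 0, 8 y 32, respectivamente.
+```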
+
+## roundDuration(num) {#rounddurationnum}
+
+Acepta un número. Si el número es menor que uno, devuelve 0. De lo contrario, redondea el número a números del conjunto: 1, 10, 30, 60, 120, 180, 240, 300, 600, 1200, 1800, 3600, 7200, 18000, 36000. Esta función es específica de Yandex.Métrica y se utiliza para aplicar el informe sobre la duración del período de sesiones.
+
+## roundAge(num) {#roundagenum}
+
+Acepta un número. Si el número es menor que 18, devuelve 0. De lo contrario, redondea el número a un número del conjunto: 18, 25, 35, 45, 55. Esta función es específica de Yandex.Métrica y se utiliza para la aplicación del informe sobre la edad del usuario.
+
+## roundDown(num, arr) {#rounddownnum-arr}
+
+Acepta un número y lo redondea hacia abajo a un elemento de la matriz especificada. Si el valor es menor que el límite más bajo, se devuelve el límite más bajo.
+
+[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/rounding_functions/)
diff --git a/docs/es/query_language/functions/splitting_merging_functions.md b/docs/es/query_language/functions/splitting_merging_functions.md
new file mode 100644
index 00000000000..37c76748e2f
--- /dev/null
+++ b/docs/es/query_language/functions/splitting_merging_functions.md
@@ -0,0 +1,67 @@
+# Funciones para dividir y fusionar cadenas y matrices {#functions-for-splitting-and-merging-strings-and-arrays}
+
+## splitByChar(separator, s) {#splitbycharseparator-s}
+
+Divide una cadena en subcadenas separadas por ‘separator’. El separador debe ser una constante de cadena que consta de exactamente un carácter.
+Devuelve una matriz de subcadenas seleccionadas. Se pueden seleccionar subcadenas vacías si el separador aparece al principio o al final de la cadena, o si hay varios separadores consecutivos.
+
+**Ejemplo:**
+
+``` sql
+SELECT splitByChar(',', '1,2,3,abcde')
+```
+
+``` text
+┌─splitByChar(',', '1,2,3,abcde')─┐
+│ ['1','2','3','abcde'] │
+└─────────────────────────────────┘
+```
+
+## splitByString(separator, s) {#splitbystringseparator-s}
+
+Lo mismo que el anterior, pero usa una cadena de varios caracteres como separador. Si el separador está vacío, divide la cadena en una matriz de caracteres individuales.
+
+**Ejemplo:**
+
+``` sql
+SELECT splitByString(', ', '1, 2 3, 4,5, abcde')
+```
+
+``` text
+┌─splitByString(', ', '1, 2 3, 4,5, abcde')─┐
+│ ['1','2 3','4,5','abcde'] │
+└───────────────────────────────────────────┘
+```
+
+``` sql
+SELECT splitByString('', 'abcde')
+```
+
+``` text
+┌─splitByString('', 'abcde')─┐
+│ ['a','b','c','d','e'] │
+└────────────────────────────┘
+```
+
+## arrayStringConcat(arr\[, separator\]) {#arraystringconcatarr-separator}
+
+Concatena las cadenas enumeradas en la matriz con el separador. El separador es un parámetro opcional: una cadena constante, establecida en una cadena vacía por defecto.
+Devuelve la cadena.
+
+## alphaTokens(s) {#alphatokenss}
+
+Selecciona subcadenas de bytes consecutivos de los rangos a-z y A-Z. Devuelve una matriz de subcadenas.
+ +**Ejemplo:** + +``` sql +SELECT alphaTokens('abca1abc') +``` + +``` text +┌─alphaTokens('abca1abc')─┐ +│ ['abca','abc'] │ +└─────────────────────────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/splitting_merging_functions/) diff --git a/docs/es/query_language/functions/string_functions.md b/docs/es/query_language/functions/string_functions.md new file mode 100644 index 00000000000..8e756452aed --- /dev/null +++ b/docs/es/query_language/functions/string_functions.md @@ -0,0 +1,482 @@ +# Funciones para trabajar con cadenas {#functions-for-working-with-strings} + +## Vaciar {#empty} + +Devuelve 1 para una cadena vacía o 0 para una cadena no vacía. +El tipo de resultado es UInt8. +Una cadena se considera no vacía si contiene al menos un byte, incluso si se trata de un espacio o un byte nulo. +La función también funciona para matrices. + +## notEmpty {#notempty} + +Devuelve 0 para una cadena vacía o 1 para una cadena no vacía. +El tipo de resultado es UInt8. +La función también funciona para matrices. + +## longitud {#length} + +Devuelve la longitud de una cadena en bytes (no en caracteres y no en puntos de código). +El tipo de resultado es UInt64. +La función también funciona para matrices. + +## longitudUTF8 {#lengthutf8} + +Devuelve la longitud de una cadena en puntos de código Unicode (no en caracteres), suponiendo que la cadena contiene un conjunto de bytes que componen texto codificado en UTF-8. Si no se cumple esta suposición, devuelve algún resultado (no arroja una excepción). +El tipo de resultado es UInt64. + +## char\_length, CHAR\_LENGTH {#char-length} + +Devuelve la longitud de una cadena en puntos de código Unicode (no en caracteres), suponiendo que la cadena contiene un conjunto de bytes que componen texto codificado en UTF-8. Si no se cumple esta suposición, devuelve algún resultado (no arroja una excepción). +El tipo de resultado es UInt64. + +## character\_length, CHARACTER\_LENGTH {#character-length} + +Devuelve la longitud de una cadena en puntos de código Unicode (no en caracteres), suponiendo que la cadena contiene un conjunto de bytes que componen texto codificado en UTF-8. Si no se cumple esta suposición, devuelve algún resultado (no arroja una excepción). +El tipo de resultado es UInt64. + +## inferior, lcase {#lower} + +Convierte símbolos latinos ASCII en una cadena a minúsculas. + +## superior, ucase {#upper} + +Convierte los símbolos latinos ASCII en una cadena a mayúsculas. + +## Método de codificación de datos: {#lowerutf8} + +Convierte una cadena en minúsculas, suponiendo que la cadena contiene un conjunto de bytes que componen un texto codificado en UTF-8. +No detecta el idioma. Entonces, para el turco, el resultado podría no ser exactamente correcto. +Si la longitud de la secuencia de bytes UTF-8 es diferente para mayúsculas y minúsculas de un punto de código, el resultado puede ser incorrecto para este punto de código. +Si la cadena contiene un conjunto de bytes que no es UTF-8, entonces el comportamiento no está definido. + +## Todos los derechos reservados. {#upperutf8} + +Convierte una cadena en mayúsculas, suponiendo que la cadena contiene un conjunto de bytes que componen un texto codificado en UTF-8. +No detecta el idioma. Entonces, para el turco, el resultado podría no ser exactamente correcto. +Si la longitud de la secuencia de bytes UTF-8 es diferente para mayúsculas y minúsculas de un punto de código, el resultado puede ser incorrecto para este punto de código. 
+Si la cadena contiene un conjunto de bytes que no es UTF-8, entonces el comportamiento no está definido. + +## Sistema abierto. {#isvalidutf8} + +Devuelve 1, si el conjunto de bytes es válido codificado en UTF-8, de lo contrario 0. + +## Acerca de Nosotros {#tovalidutf8} + +Reemplaza los caracteres UTF-8 no válidos por `�` (U+FFFD) carácter. Todos los caracteres no válidos que se ejecutan en una fila se contraen en el único carácter de reemplazo. + +``` sql +toValidUTF8( input_string ) +``` + +Parámetros: + +- input\_string — Cualquier conjunto de bytes representados como el [Cadena](../../data_types/string.md) objeto de tipo de datos. + +Valor devuelto: cadena UTF-8 válida. + +**Ejemplo** + +``` sql +SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') +``` + +``` text +┌─toValidUTF8('a����b')─┐ +│ a�b │ +└───────────────────────┘ +``` + +## repetir {#repeat} + +Repite una cadena tantas veces como se especifique y concatena los valores replicados como una única cadena. + +**Sintaxis** + +``` sql +repeat(s, n) +``` + +**Parámetros** + +- `s` — La cadena de repetir. [Cadena](../../data_types/string.md). +- `n` — El número de veces para repetir la cadena. [UInt](../../data_types/int_uint.md). + +**Valor devuelto** + +La cadena única, que contiene la cadena `s` repetir `n` tiempo. Si `n` \< 1, la función devuelve cadena vacía. + +Tipo: `String`. + +**Ejemplo** + +Consulta: + +``` sql +SELECT repeat('abc', 10) +``` + +Resultado: + +``` text +┌─repeat('abc', 10)──────────────┐ +│ abcabcabcabcabcabcabcabcabcabc │ +└────────────────────────────────┘ +``` + +## Inverso {#reverse} + +Invierte la cadena (como una secuencia de bytes). + +## reverseUTF8 {#reverseutf8} + +Invierte una secuencia de puntos de código Unicode, suponiendo que la cadena contiene un conjunto de bytes que representan un texto UTF-8. De lo contrario, hace otra cosa (no arroja una excepción). + +## Formato (patrón, s0, s1, …) {#format} + +Formatear el patrón constante con la cadena enumerada en los argumentos. `pattern` es un patrón de formato de Python simplificado. La cadena de formato contiene “replacement fields” rodeado de llaves `{}`. Cualquier cosa que no esté contenida entre llaves se considera texto literal, que se copia sin cambios en la salida. Si necesita incluir un carácter de llave en el texto literal, se puede escapar duplicando: `{{ '{{' }}` y `{{ '}}' }}`. Los nombres de campo pueden ser números (comenzando desde cero) o vacíos (luego se tratan como números de consecuencia). + +``` sql +SELECT format('{1} {0} {1}', 'World', 'Hello') +``` + +``` text +┌─format('{1} {0} {1}', 'World', 'Hello')─┐ +│ Hello World Hello │ +└─────────────────────────────────────────┘ +``` + +``` sql +SELECT format('{} {}', 'Hello', 'World') +``` + +``` text +┌─format('{} {}', 'Hello', 'World')─┐ +│ Hello World │ +└───────────────────────────────────┘ +``` + +## concat {#concat} + +Concatena las cadenas enumeradas en los argumentos, sin un separador. + +**Sintaxis** + +``` sql +concat(s1, s2, ...) +``` + +**Parámetros** + +Valores de tipo String o FixedString. + +**Valores devueltos** + +Devuelve la cadena que resulta de concatenar los argumentos. + +Si alguno de los valores de argumento `NULL`, `concat` devoluciones `NULL`. + +**Ejemplo** + +Consulta: + +``` sql +SELECT concat('Hello, ', 'World!') +``` + +Resultado: + +``` text +┌─concat('Hello, ', 'World!')─┐ +│ Hello, World! 
│ +└─────────────────────────────┘ +``` + +## ConcatAssumeInjective {#concatassumeinjective} + +Lo mismo que [concat](#concat), la diferencia es que usted necesita asegurar eso `concat(s1, s2, ...) → sn` es inyectivo, se utilizará para la optimización de GROUP BY. + +La función se llama “injective” si siempre devuelve un resultado diferente para diferentes valores de argumentos. En otras palabras: diferentes argumentos nunca arrojan un resultado idéntico. + +**Sintaxis** + +``` sql +concatAssumeInjective(s1, s2, ...) +``` + +**Parámetros** + +Valores de tipo String o FixedString. + +**Valores devueltos** + +Devuelve la cadena que resulta de concatenar los argumentos. + +Si alguno de los valores de argumento `NULL`, `concatAssumeInjective` devoluciones `NULL`. + +**Ejemplo** + +Tabla de entrada: + +``` sql +CREATE TABLE key_val(`key1` String, `key2` String, `value` UInt32) ENGINE = TinyLog; +INSERT INTO key_val VALUES ('Hello, ','World',1), ('Hello, ','World',2), ('Hello, ','World!',3), ('Hello',', World!',2); +SELECT * from key_val; +``` + +``` text +┌─key1────┬─key2─────┬─value─┐ +│ Hello, │ World │ 1 │ +│ Hello, │ World │ 2 │ +│ Hello, │ World! │ 3 │ +│ Hello │ , World! │ 2 │ +└─────────┴──────────┴───────┘ +``` + +Consulta: + +``` sql +SELECT concat(key1, key2), sum(value) FROM key_val GROUP BY concatAssumeInjective(key1, key2) +``` + +Resultado: + +``` text +┌─concat(key1, key2)─┬─sum(value)─┐ +│ Hello, World! │ 3 │ +│ Hello, World! │ 2 │ +│ Hello, World │ 3 │ +└────────────────────┴────────────┘ +``` + +## substring(s, desplazamiento, longitud), mid(s, desplazamiento, longitud), substr(s, desplazamiento, longitud) {#substring} + +Devuelve una subcadena que comienza con el byte ‘offset’ índice que es ‘length’ bytes de largo. La indexación de caracteres comienza desde uno (como en SQL estándar). El ‘offset’ y ‘length’ los argumentos deben ser constantes. + +## substringUTF8(s, desplazamiento, longitud) {#substringutf8} + +Lo mismo que ‘substring’, pero para puntos de código Unicode. Funciona bajo el supuesto de que la cadena contiene un conjunto de bytes que representan un texto codificado en UTF-8. Si no se cumple esta suposición, devuelve algún resultado (no arroja una excepción). + +## Aquí hay algunas opciones) {#appendtrailingcharifabsent} + +Si el ‘s’ cadena no está vacía y no contiene el ‘c’ carácter al final, se añade el ‘c’ carácter hasta el final. + +## convertirCharset(s), de, a) {#convertcharset} + +Devuelve la cadena ‘s’ que se convirtió de la codificación en ‘from’ a la codificación en ‘to’. + +## Sistema abierto.) {#base64encode} + +Codificar ‘s’ cadena en base64 + +## base64Decode(s)))) {#base64decode} + +Decodificar cadena codificada en base64 ‘s’ en la cadena original. En caso de fallo plantea una excepción. + +## tryBase64Decode(s) {#trybase64decode} + +Similar a base64Decode, pero en caso de error se devolverá una cadena vacía. + +## terminaCon(s, sufijo) {#endswith} + +Devuelve si se debe terminar con el sufijo especificado. Devuelve 1 si la cadena termina con el sufijo especificado, de lo contrario devuelve 0. + +## Comienza con (str, prefijo) {#startswith} + +Devuelve 1 si la cadena comienza con el prefijo especificado, de lo contrario devuelve 0. + +``` sql +SELECT startsWith('Spider-Man', 'Spi'); +``` + +**Valores devueltos** + +- 1, si la cadena comienza con el prefijo especificado. +- 0, si la cadena no comienza con el prefijo especificado. 
+ +**Ejemplo** + +Consulta: + +``` sql +SELECT startsWith('Hello, world!', 'He'); +``` + +Resultado: + +``` text +┌─startsWith('Hello, world!', 'He')─┐ +│ 1 │ +└───────────────────────────────────┘ +``` + +## recortar {#trim} + +Quita todos los caracteres especificados del inicio o el final de una cadena. +De forma predeterminada, elimina todas las apariciones consecutivas de espacios en blanco comunes (carácter ASCII 32) de ambos extremos de una cadena. + +**Sintaxis** + +``` sql +trim([[LEADING|TRAILING|BOTH] trim_character FROM] input_string) +``` + +**Parámetros** + +- `trim_character` — caracteres especificados para el recorte. [Cadena](../../data_types/string.md). +- `input_string` — cuerda para el acabado. [Cadena](../../data_types/string.md). + +**Valor devuelto** + +Una cadena sin caracteres especificados iniciales y (o) finales. + +Tipo: `String`. + +**Ejemplo** + +Consulta: + +``` sql +SELECT trim(BOTH ' ()' FROM '( Hello, world! )') +``` + +Resultado: + +``` text +┌─trim(BOTH ' ()' FROM '( Hello, world! )')─┐ +│ Hello, world! │ +└───────────────────────────────────────────────┘ +``` + +## trimLeft {#trimleft} + +Quita todas las apariciones consecutivas de espacios en blanco comunes (carácter ASCII 32) desde el principio de una cadena. No elimina otros tipos de caracteres de espacio en blanco (tab, espacio sin interrupción, etc.). + +**Sintaxis** + +``` sql +trimLeft(input_string) +``` + +Apodo: `ltrim(input_string)`. + +**Parámetros** + +- `input_string` — cuerda para recortar. [Cadena](../../data_types/string.md). + +**Valor devuelto** + +Una cadena sin espacios en blanco comunes iniciales. + +Tipo: `String`. + +**Ejemplo** + +Consulta: + +``` sql +SELECT trimLeft(' Hello, world! ') +``` + +Resultado: + +``` text +┌─trimLeft(' Hello, world! ')─┐ +│ Hello, world! │ +└─────────────────────────────────────┘ +``` + +## trimRight {#trimright} + +Quita todas las apariciones consecutivas de espacios en blanco comunes (carácter ASCII 32) del final de una cadena. No elimina otros tipos de caracteres de espacio en blanco (tab, espacio sin interrupción, etc.). + +**Sintaxis** + +``` sql +trimRight(input_string) +``` + +Apodo: `rtrim(input_string)`. + +**Parámetros** + +- `input_string` — cuerda para recortar. [Cadena](../../data_types/string.md). + +**Valor devuelto** + +Una cadena sin espacios en blanco comunes finales. + +Tipo: `String`. + +**Ejemplo** + +Consulta: + +``` sql +SELECT trimRight(' Hello, world! ') +``` + +Resultado: + +``` text +┌─trimRight(' Hello, world! ')─┐ +│ Hello, world! │ +└──────────────────────────────────────┘ +``` + +## AjusteTanto {#trimboth} + +Quita todas las apariciones consecutivas de espacios en blanco comunes (carácter ASCII 32) de ambos extremos de una cadena. No elimina otros tipos de caracteres de espacio en blanco (tab, espacio sin interrupción, etc.). + +**Sintaxis** + +``` sql +trimBoth(input_string) +``` + +Apodo: `trim(input_string)`. + +**Parámetros** + +- `input_string` — cuerda para recortar. [Cadena](../../data_types/string.md). + +**Valor devuelto** + +Una cadena sin espacios en blanco comunes iniciales y finales. + +Tipo: `String`. + +**Ejemplo** + +Consulta: + +``` sql +SELECT trimBoth(' Hello, world! ') +``` + +Resultado: + +``` text +┌─trimBoth(' Hello, world! ')─┐ +│ Hello, world! │ +└─────────────────────────────────────┘ +``` + +## CRC32(s)))) {#crc32} + +Devuelve la suma de comprobación CRC32 de una cadena, utilizando el polinomio CRC-32-IEEE 802.3 y el valor inicial `0xffffffff` (implementación zlib). 
+ +El tipo de resultado es UInt32. + +## CRC32IEEE(s) {#crc32ieee} + +Devuelve la suma de comprobación CRC32 de una cadena, utilizando el polinomio CRC-32-IEEE 802.3. + +El tipo de resultado es UInt32. + +## CRC64(s)))) {#crc64} + +Devuelve la suma de comprobación CRC64 de una cadena, utilizando el polinomio CRC-64-ECMA. + +El tipo de resultado es UInt64. + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/string_functions/) diff --git a/docs/es/query_language/functions/string_replace_functions.md b/docs/es/query_language/functions/string_replace_functions.md new file mode 100644 index 00000000000..66bb57e70a0 --- /dev/null +++ b/docs/es/query_language/functions/string_replace_functions.md @@ -0,0 +1,87 @@ +# Funciones para buscar y reemplazar en cadenas {#functions-for-searching-and-replacing-in-strings} + +## replaceOne(pajar, patrón, reemplazo) {#replaceonehaystack-pattern-replacement} + +Sustituye la primera aparición, si existe, de la ‘pattern’ subcadena en ‘haystack’ con el ‘replacement’ subcadena. +Sucesivo, ‘pattern’ y ‘replacement’ deben ser constantes. + +## replaceAll (pajar, patrón, reemplazo), replace (pajar, patrón, reemplazo) {#replaceallhaystack-pattern-replacement-replacehaystack-pattern-replacement} + +Sustituye todas las apariciones del ‘pattern’ subcadena en ‘haystack’ con el ‘replacement’ subcadena. + +## replaceRegexpOne (pajar, patrón, reemplazo) {#replaceregexponehaystack-pattern-replacement} + +Reemplazo usando el ‘pattern’ expresión regular. Una expresión regular re2. +Sustituye sólo la primera ocurrencia, si existe. +Un patrón se puede especificar como ‘replacement’. Este patrón puede incluir sustituciones `\0-\9`. +Sustitución `\0` incluye toda la expresión regular. Sustitución `\1-\9` corresponden a los números de subpatrón. `\` es una plantilla, escapar de ella usando `\`. +También tenga en cuenta que un literal de cadena requiere un escape adicional. + +Ejemplo 1. Conversión de la fecha a formato americano: + +``` sql +SELECT DISTINCT + EventDate, + replaceRegexpOne(toString(EventDate), '(\\d{4})-(\\d{2})-(\\d{2})', '\\2/\\3/\\1') AS res +FROM test.hits +LIMIT 7 +FORMAT TabSeparated +``` + +``` text +2014-03-17 03/17/2014 +2014-03-18 03/18/2014 +2014-03-19 03/19/2014 +2014-03-20 03/20/2014 +2014-03-21 03/21/2014 +2014-03-22 03/22/2014 +2014-03-23 03/23/2014 +``` + +Ejemplo 2. Copiar una cadena diez veces: + +``` sql +SELECT replaceRegexpOne('Hello, World!', '.*', '\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0') AS res +``` + +``` text +┌─res────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ +│ Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World! │ +└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +## replaceRegexpAll (pajar, patrón, reemplazo) {#replaceregexpallhaystack-pattern-replacement} + +Esto hace lo mismo, pero reemplaza todas las ocurrencias. Ejemplo: + +``` sql +SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res +``` + +``` text +┌─res────────────────────────┐ +│ HHeelllloo,, WWoorrlldd!! │ +└────────────────────────────┘ +``` + +Como excepción, si una expresión regular funcionó en una subcadena vacía, el reemplazo no se realiza más de una vez. 
+Ejemplo: + +``` sql +SELECT replaceRegexpAll('Hello, World!', '^', 'here: ') AS res +``` + +``` text +┌─res─────────────────┐ +│ here: Hello, World! │ +└─────────────────────┘ +``` + +## Sistema abierto.) {#regexpquotemetas} + +La función agrega una barra invertida antes de algunos caracteres predefinidos en la cadena. +Caracteres predefinidos: ‘0’, ‘\\’, ‘\|’, ‘(’, ‘)’, ‘^’, ‘$’, ‘.’, ‘\[’Lugar de origen, ‘?’Acerca de\*‘,’+‘,’{‘,’:‘,’¿Por qué? +Esta implementación difiere ligeramente de re2::RE2::QuoteMeta. Escapa de byte cero como \\0 en lugar de 00 y escapa solo de los caracteres requeridos. +Para obtener más información, consulte el enlace: [Bienvenido](https://github.com/google/re2/blob/master/re2/re2.cc#L473) + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/string_replace_functions/) diff --git a/docs/es/query_language/functions/string_search_functions.md b/docs/es/query_language/functions/string_search_functions.md new file mode 100644 index 00000000000..657c6eb999a --- /dev/null +++ b/docs/es/query_language/functions/string_search_functions.md @@ -0,0 +1,372 @@ +# Funciones para buscar cadenas {#functions-for-searching-strings} + +La búsqueda distingue entre mayúsculas y minúsculas de forma predeterminada en todas estas funciones. Hay variantes separadas para la búsqueda insensible a mayúsculas y minúsculas. + +## posición (pajar, aguja), localizar (pajar, aguja) {#position} + +Devuelve la posición (en bytes) de la subcadena encontrada en la cadena, comenzando desde 1. + +Funciona bajo el supuesto de que la cadena contiene un conjunto de bytes que representan un texto codificado de un solo byte. Si no se cumple esta suposición y un carácter no se puede representar con un solo byte, la función no produce una excepción y devuelve algún resultado inesperado. Si el carácter se puede representar usando dos bytes, usará dos bytes y así sucesivamente. + +Para una búsqueda sin distinción de mayúsculas y minúsculas, utilice la función [positionCaseInsensitive](#positioncaseinsensitive). + +**Sintaxis** + +``` sql +position(haystack, needle) +``` + +Apodo: `locate(haystack, needle)`. + +**Parámetros** + +- `haystack` — cadena, en la que se buscará la subcadena. [Cadena](../syntax.md#syntax-string-literal). +- `needle` — subcadena que se va a buscar. [Cadena](../syntax.md#syntax-string-literal). + +**Valores devueltos** + +- Posición inicial en bytes (contando desde 1), si se encontró subcadena. +- 0, si no se encontró la subcadena. + +Tipo: `Integer`. + +**Ejemplos** + +Frase «Hello, world!» contiene un conjunto de bytes que representan un texto codificado de un solo byte. La función devuelve algún resultado esperado: + +Consulta: + +``` sql +SELECT position('Hello, world!', '!') +``` + +Resultado: + +``` text +┌─position('Hello, world!', '!')─┐ +│ 13 │ +└────────────────────────────────┘ +``` + +La misma frase en ruso contiene caracteres que no se pueden representar usando un solo byte. La función devuelve algún resultado inesperado (uso [PosiciónUTF8](#positionutf8) función para texto codificado de varios bytes): + +Consulta: + +``` sql +SELECT position('Привет, мир!', '!') +``` + +Resultado: + +``` text +┌─position('Привет, мир!', '!')─┐ +│ 21 │ +└───────────────────────────────┘ +``` + +## positionCaseInsensitive {#positioncaseinsensitive} + +Lo mismo que [posición](#position) devuelve la posición (en bytes) de la subcadena encontrada en la cadena, comenzando desde 1. Utilice la función para una búsqueda que no distingue entre mayúsculas y minúsculas. 
+ +Funciona bajo el supuesto de que la cadena contiene un conjunto de bytes que representan un texto codificado de un solo byte. Si no se cumple esta suposición y un carácter no se puede representar con un solo byte, la función no produce una excepción y devuelve algún resultado inesperado. Si el carácter se puede representar usando dos bytes, usará dos bytes y así sucesivamente. + +**Sintaxis** + +``` sql +positionCaseInsensitive(haystack, needle) +``` + +**Parámetros** + +- `haystack` — cadena, en la que se buscará la subcadena. [Cadena](../syntax.md#syntax-string-literal). +- `needle` — subcadena que se va a buscar. [Cadena](../syntax.md#syntax-string-literal). + +**Valores devueltos** + +- Posición inicial en bytes (contando desde 1), si se encontró subcadena. +- 0, si no se encontró la subcadena. + +Tipo: `Integer`. + +**Ejemplo** + +Consulta: + +``` sql +SELECT positionCaseInsensitive('Hello, world!', 'hello') +``` + +Resultado: + +``` text +┌─positionCaseInsensitive('Hello, world!', 'hello')─┐ +│ 1 │ +└───────────────────────────────────────────────────┘ +``` + +## PosiciónUTF8 {#positionutf8} + +Devuelve la posición (en puntos Unicode) de la subcadena encontrada en la cadena, comenzando desde 1. + +Funciona bajo el supuesto de que la cadena contiene un conjunto de bytes que representan un texto codificado en UTF-8. Si no se cumple esta suposición, la función no produce una excepción y devuelve algún resultado inesperado. Si el carácter se puede representar usando dos puntos Unicode, usará dos y así sucesivamente. + +Para una búsqueda sin distinción de mayúsculas y minúsculas, utilice la función [PosiciónCasoInsensitiveUTF8](#positioncaseinsensitiveutf8). + +**Sintaxis** + +``` sql +positionUTF8(haystack, needle) +``` + +**Parámetros** + +- `haystack` — cadena, en la que se buscará la subcadena. [Cadena](../syntax.md#syntax-string-literal). +- `needle` — subcadena que se va a buscar. [Cadena](../syntax.md#syntax-string-literal). + +**Valores devueltos** + +- Posición inicial en puntos Unicode (contando desde 1), si se encontró subcadena. +- 0, si no se encontró la subcadena. + +Tipo: `Integer`. + +**Ejemplos** + +Frase «Hello, world!» en ruso contiene un conjunto de puntos Unicode que representan un texto codificado de un solo punto. La función devuelve algún resultado esperado: + +Consulta: + +``` sql +SELECT positionUTF8('Привет, мир!', '!') +``` + +Resultado: + +``` text +┌─positionUTF8('Привет, мир!', '!')─┐ +│ 12 │ +└───────────────────────────────────┘ +``` + +Frase «Salut, étudiante!», donde el carácter `é` Puede ser representado usando un punto (`U+00E9`) o dos puntos (`U+0065U+0301`) la función se puede devolver algún resultado inesperado: + +Consulta de la carta `é`, que se representa un punto Unicode `U+00E9`: + +``` sql +SELECT positionUTF8('Salut, étudiante!', '!') +``` + +Resultado: + +``` text +┌─positionUTF8('Salut, étudiante!', '!')─┐ +│ 17 │ +└────────────────────────────────────────┘ +``` + +Consulta de la carta `é`, que se representa dos puntos Unicode `U+0065U+0301`: + +``` sql +SELECT positionUTF8('Salut, étudiante!', '!') +``` + +Resultado: + +``` text +┌─positionUTF8('Salut, étudiante!', '!')─┐ +│ 18 │ +└────────────────────────────────────────┘ +``` + +## PosiciónCasoInsensitiveUTF8 {#positioncaseinsensitiveutf8} + +Lo mismo que [PosiciónUTF8](#positionutf8), pero no distingue entre mayúsculas y minúsculas. Devuelve la posición (en puntos Unicode) de la subcadena encontrada en la cadena, comenzando desde 1. 
+ +Funciona bajo el supuesto de que la cadena contiene un conjunto de bytes que representan un texto codificado en UTF-8. Si no se cumple esta suposición, la función no produce una excepción y devuelve algún resultado inesperado. Si el carácter se puede representar usando dos puntos Unicode, usará dos y así sucesivamente. + +**Sintaxis** + +``` sql +positionCaseInsensitiveUTF8(haystack, needle) +``` + +**Parámetros** + +- `haystack` — cadena, en la que se buscará la subcadena. [Cadena](../syntax.md#syntax-string-literal). +- `needle` — subcadena que se va a buscar. [Cadena](../syntax.md#syntax-string-literal). + +**Valor devuelto** + +- Posición inicial en puntos Unicode (contando desde 1), si se encontró subcadena. +- 0, si no se encontró la subcadena. + +Tipo: `Integer`. + +**Ejemplo** + +Consulta: + +``` sql +SELECT positionCaseInsensitiveUTF8('Привет, мир!', 'Мир') +``` + +Resultado: + +``` text +┌─positionCaseInsensitiveUTF8('Привет, мир!', 'Мир')─┐ +│ 9 │ +└────────────────────────────────────────────────────┘ +``` + +## multiSearchAllPositions {#multisearchallpositions} + +Lo mismo que [posición](string_search_functions.md#position) pero devuelve `Array` posiciones (en bytes) de las subcadenas correspondientes encontradas en la cadena. Las posiciones se indexan a partir de 1. + +La búsqueda se realiza en secuencias de bytes sin respecto a la codificación de cadenas y la intercalación. + +- Para la búsqueda ASCII sin distinción de mayúsculas y minúsculas, utilice la función `multiSearchAllPositionsCaseInsensitive`. +- Para buscar en UTF-8, use la función [MultiSearchAllPositionsUTF8](#multiSearchAllPositionsUTF8). +- Para la búsqueda UTF-8 sin distinción de mayúsculas y minúsculas, utilice la función multiSearchAllPositionsCaseInsensitiveUTF8. + +**Sintaxis** + +``` sql +multiSearchAllPositions(haystack, [needle1, needle2, ..., needlen]) +``` + +**Parámetros** + +- `haystack` — cadena, en la que se buscará la subcadena. [Cadena](../syntax.md#syntax-string-literal). +- `needle` — subcadena que se va a buscar. [Cadena](../syntax.md#syntax-string-literal). + +**Valores devueltos** + +- Matriz de posiciones iniciales en bytes (contando desde 1), si se encontró la subcadena correspondiente y 0 si no se encuentra. + +**Ejemplo** + +Consulta: + +``` sql +SELECT multiSearchAllPositions('Hello, World!', ['hello', '!', 'world']) +``` + +Resultado: + +``` text +┌─multiSearchAllPositions('Hello, World!', ['hello', '!', 'world'])─┐ +│ [0,13,0] │ +└───────────────────────────────────────────────────────────────────┘ +``` + +## MultiSearchAllPositionsUTF8 {#multisearchallpositionsutf8} + +Ver `multiSearchAllPositions`. + +## multiSearchFirstPosition(pajar, \[agujaUno, agujaCómo hacer, …, agujay\]) {#multisearchfirstposition} + +Lo mismo que `position` pero devuelve el desplazamiento más a la izquierda de la cadena `haystack` que se corresponde con algunas de las agujas. + +Para una búsqueda que no distingue entre mayúsculas y minúsculas o / y en formato UTF-8, use funciones `multiSearchFirstPositionCaseInsensitive, multiSearchFirstPositionUTF8, multiSearchFirstPositionCaseInsensitiveUTF8`. + +## multiSearchFirstIndex(pajar, \[agujaUno, agujaCómo hacer, …, agujay\]) {#multisearchfirstindexhaystack-needle1-needle2-needlen} + +Devuelve el índice `i` (a partir de 1) de la aguja encontrada más a la izquierdame es la cadena `haystack` y 0 de lo contrario. 
+ +Para una búsqueda que no distingue entre mayúsculas y minúsculas o / y en formato UTF-8, use funciones `multiSearchFirstIndexCaseInsensitive, multiSearchFirstIndexUTF8, multiSearchFirstIndexCaseInsensitiveUTF8`. + +## multiSearchAny(pajar, \[agujaUno, agujaCómo hacer, …, agujay\]) {#function-multisearchany} + +Devuelve 1, si al menos una aguja de cuerdame coincide con la cadena `haystack` y 0 de lo contrario. + +Para una búsqueda que no distingue entre mayúsculas y minúsculas o / y en formato UTF-8, use funciones `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`. + +!!! note "Nota" + En todos `multiSearch*` el número de agujas debe ser inferior a 2Ocho debido a la especificación de implementación. + +## match (pajar, patrón) {#matchhaystack-pattern} + +Comprueba si la cadena coincide con la `pattern` expresión regular. Un `re2` expresión regular. El [sintaxis](https://github.com/google/re2/wiki/Syntax) de la `re2` expresiones regulares es más limitada que la sintaxis de las expresiones regulares de Perl. + +Devuelve 0 si no coinciden, o 1 si coinciden. + +Tenga en cuenta que el símbolo de barra invertida (`\`) se utiliza para escapar en la expresión regular. El mismo símbolo se usa para escapar en literales de cadena. Por lo tanto, para escapar del símbolo en una expresión regular, debe escribir dos barras invertidas (\\) en un literal de cadena. + +La expresión regular funciona con la cadena como si fuera un conjunto de bytes. La expresión regular no puede contener bytes nulos. +Para que los patrones busquen subcadenas en una cadena, es mejor usar LIKE o ‘position’, ya que trabajan mucho más rápido. + +## multiMatchAny(pajar, \[patrónUno, patrónCómo hacer, …, patróny\]) {#multimatchanyhaystack-pattern1-pattern2-patternn} + +Lo mismo que `match`, pero devuelve 0 si ninguna de las expresiones regulares coinciden y 1 si alguno de los patrones coinciden. Se utiliza [hyperscan](https://github.com/intel/hyperscan) biblioteca. Para que los patrones busquen subcadenas en una cadena, es mejor usar `multiSearchAny` ya que funciona mucho más rápido. + +!!! note "Nota" + La longitud de cualquiera de los `haystack` Cadena debe ser inferior a 2Nivel de Cifrado WEP bytes de lo contrario, se lanza la excepción. Esta restricción tiene lugar debido a la API de hiperscan. + +## multiMatchAnyIndex(pajar, \[patrónUno, patrónCómo hacer, …, patróny\]) {#multimatchanyindexhaystack-pattern1-pattern2-patternn} + +Lo mismo que `multiMatchAny`, pero devuelve cualquier índice que coincida con el pajar. + +## ¿Cómo puedo obtener más información?Uno, patrónCómo hacer, …, patróny\]) {#multimatchallindiceshaystack-pattern1-pattern2-patternn} + +Lo mismo que `multiMatchAny`, pero devuelve la matriz de todas las indicaciones que coinciden con el pajar en cualquier orden. + +## multiFuzzyMatchAny(pajar, distancia, \[patrónUno, patrónCómo hacer, …, patróny\]) {#multifuzzymatchanyhaystack-distance-pattern1-pattern2-patternn} + +Lo mismo que `multiMatchAny`, pero devuelve 1 si algún patrón coincide con el pajar dentro de una constante [Editar distancia](https://en.wikipedia.org/wiki/Edit_distance). Esta función también está en modo experimental y puede ser extremadamente lenta. Para obtener más información, consulte [documentación de hyperscan](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching). 
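+
+Un esquema orientativo de uso de `multiFuzzyMatchAny`: el patrón `ClckHouse` está a distancia de edición 1 de la cadena buscada, por lo que, suponiendo el comportamiento de coincidencia aproximada de hyperscan descrito arriba, cabe esperar un 1 como resultado.
+
+``` sql
+-- Se espera 1: el patrón coincide dentro de una distancia de edición de 1.
+SELECT multiFuzzyMatchAny('ClickHouse', 1, ['ClckHouse'])
+```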
+ +## multiFuzzyMatchAnyIndex(pajar, distancia, \[patrónUno, patrónCómo hacer, …, patróny\]) {#multifuzzymatchanyindexhaystack-distance-pattern1-pattern2-patternn} + +Lo mismo que `multiFuzzyMatchAny`, pero devuelve cualquier índice que coincida con el pajar dentro de una distancia de edición constante. + +## multiFuzzyMatchAllIndices(pajar, distancia, \[patrónUno, patrónCómo hacer, …, patróny\]) {#multifuzzymatchallindiceshaystack-distance-pattern1-pattern2-patternn} + +Lo mismo que `multiFuzzyMatchAny`, pero devuelve la matriz de todos los índices en cualquier orden que coincida con el pajar dentro de una distancia de edición constante. + +!!! note "Nota" + `multiFuzzyMatch*` las funciones no admiten expresiones regulares UTF-8, y dichas expresiones se tratan como bytes debido a la restricción de hiperscan. + +!!! note "Nota" + Para desactivar todas las funciones que utilizan hyperscan, utilice la configuración `SET allow_hyperscan = 0;`. + +## extracto(pajar, patrón) {#extracthaystack-pattern} + +Extrae un fragmento de una cadena utilizando una expresión regular. Si ‘haystack’ no coinciden con el ‘pattern’ regex, se devuelve una cadena vacía. Si la expresión regular no contiene subpatrones, toma el fragmento que coincide con toda la expresión regular. De lo contrario, toma el fragmento que coincide con el primer subpatrón. + +## extractAll(pajar, patrón) {#extractallhaystack-pattern} + +Extrae todos los fragmentos de una cadena utilizando una expresión regular. Si ‘haystack’ no coinciden con el ‘pattern’ regex, se devuelve una cadena vacía. Devuelve una matriz de cadenas que consiste en todas las coincidencias con la expresión regular. En general, el comportamiento es el mismo que el ‘extract’ función (toma el primer subpatrón, o la expresión completa si no hay un subpatrón). + +## como (pajar, patrón), operador de patrón COMO pajar {#function-like} + +Comprueba si una cadena coincide con una expresión regular simple. +La expresión regular puede contener los metasímbolos `%` y `_`. + +`%` indica cualquier cantidad de bytes (incluidos cero caracteres). + +`_` indica cualquier byte. + +Utilice la barra invertida (`\`) para escapar de metasímbolos. Vea la nota sobre el escape en la descripción del ‘match’ función. + +Para expresiones regulares como `%needle%`, el código es más óptimo y trabaja tan rápido como el `position` función. +Para otras expresiones regulares, el código es el mismo que para ‘match’ función. + +## notLike(haystack, pattern), haystack NOT LIKE operador de patrón {#function-notlike} + +Lo mismo que ‘like’ pero negativo. + +## ngramDistance(pajar, aguja) {#ngramdistancehaystack-needle} + +Calcula la distancia de 4 gramos entre `haystack` y `needle`: cuenta la diferencia simétrica entre dos conjuntos múltiples de 4 gramos y la normaliza por la suma de sus cardinalidades. Devuelve un número flotante de 0 a 1: cuanto más cerca de cero, más cadenas son similares entre sí. Si la constante `needle` o `haystack` es más de 32Kb, arroja una excepción. Si algunos de los no constantes `haystack` o `needle` Las cadenas son más de 32Kb, la distancia es siempre una. + +Para la búsqueda sin distinción de mayúsculas y minúsculas o / y en formato UTF-8, use funciones `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`. 
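+
+Ejemplo orientativo de `ngramDistance`; los valores exactos dependen del cálculo de 4-gramas, pero lo relevante es que las cadenas parecidas producen valores cercanos a cero y las distintas, cercanos a uno:
+
+``` sql
+-- Cuanto menor es el valor, más parecidas son las cadenas.
+SELECT
+    ngramDistance('ClickHouse', 'ClickHose') AS parecidas,
+    ngramDistance('ClickHouse', 'PostgreSQL') AS distintas
+```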
+
+## ngramSearch(pajar, aguja) {#ngramsearchhaystack-needle}
+
+Lo mismo que `ngramDistance`, pero calcula la diferencia no simétrica entre `needle` y `haystack`: el número de n-gramas de la aguja menos el número de n-gramas en común, normalizado por el número de n-gramas de `needle`. Cuanto más cerca de uno, más probable es que `needle` esté en `haystack`. Puede ser útil para la búsqueda difusa de cadenas.
+
+Para la búsqueda sin distinción de mayúsculas y minúsculas o/y en formato UTF-8, use las funciones `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`.
+
+!!! note "Nota"
+    Para el caso UTF-8 usamos una distancia de 3 gramos. Todas estas no son distancias n-grama perfectamente justas. Usamos hashes de 2 bytes para los n-gramas y luego calculamos la diferencia (no) simétrica entre estas tablas hash: pueden ocurrir colisiones. Con el formato UTF-8 sin distinción de mayúsculas y minúsculas no usamos una función `tolower` exacta: ponemos a cero el quinto bit (contando desde cero) de cada byte del punto de código, y el primer bit del byte cero si hay más de un byte; esto funciona para el latín y principalmente para todas las letras cirílicas.
+
+[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/string_search_functions/)
diff --git a/docs/es/query_language/functions/type_conversion_functions.md b/docs/es/query_language/functions/type_conversion_functions.md
new file mode 100644
index 00000000000..2b4aa569ba9
--- /dev/null
+++ b/docs/es/query_language/functions/type_conversion_functions.md
@@ -0,0 +1,419 @@
+# Funciones de conversión de tipos {#type-conversion-functions}
+
+## Problemas comunes de conversiones numéricas {#numeric-conversion-issues}
+
+Cuando convierte un valor de un tipo de datos a otro, debe recordar que, en el caso general, es una operación insegura que puede provocar una pérdida de datos. Puede producirse una pérdida de datos si intenta ajustar el valor de un tipo de datos más grande a un tipo de datos más pequeño, o si convierte valores entre diferentes tipos de datos.
+
+ClickHouse tiene el [mismo comportamiento que los programas de C++](https://en.cppreference.com/w/cpp/language/implicit_conversion).
+
+## toInt(8\|16\|32\|64) {#toint8163264}
+
+Convierte un valor de entrada al tipo de datos [Int](../../data_types/int_uint.md). Esta familia de funciones incluye:
+
+- `toInt8(expr)` — Da como resultado el tipo de datos `Int8`.
+- `toInt16(expr)` — Da como resultado el tipo de datos `Int16`.
+- `toInt32(expr)` — Da como resultado el tipo de datos `Int32`.
+- `toInt64(expr)` — Da como resultado el tipo de datos `Int64`.
+
+**Parámetros**
+
+- `expr` — [Expresión](../syntax.md#syntax-expressions) que devuelve un número o una cadena con la representación decimal de un número. No se admiten representaciones binarias, octales ni hexadecimales de números. Los ceros iniciales se eliminan.
+
+**Valor devuelto**
+
+Valor entero del tipo de datos `Int8`, `Int16`, `Int32` o `Int64`.
+
+Las funciones usan [redondeo hacia cero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), lo que significa que truncan los dígitos fraccionarios de los números.
+
+El comportamiento de las funciones para los argumentos [NaN e Inf](../../data_types/float.md#data_type-float-nan-inf) no está definido. Recuerde los [problemas de conversión numérica](#numeric-conversion-issues) al usar las funciones.
+ +**Ejemplo** + +``` sql +SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) +``` + +``` text +┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ +│ -9223372036854775808 │ 32 │ 16 │ 8 │ +└──────────────────────┴─────────────┴───────────────┴─────────────┘ +``` + +## ¿Cómo puedo obtener más información? {#toint8163264orzero} + +Toma un argumento de tipo String e intenta analizarlo en Int (8 \| 16 \| 32 \| 64). Si falla, devuelve 0. + +**Ejemplo** + +``` sql +select toInt64OrZero('123123'), toInt8OrZero('123qwe123') +``` + +``` text +┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ +│ 123123 │ 0 │ +└─────────────────────────┴───────────────────────────┘ +``` + +## ¿Cómo puedo hacerlo? {#toint8163264ornull} + +Toma un argumento de tipo String e intenta analizarlo en Int (8 \| 16 \| 32 \| 64). Si falla, devuelve NULL. + +**Ejemplo** + +``` sql +select toInt64OrNull('123123'), toInt8OrNull('123qwe123') +``` + +``` text +┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ +│ 123123 │ ᴺᵁᴸᴸ │ +└─────────────────────────┴───────────────────────────┘ +``` + +## ¿Cómo puedo obtener más información?) {#touint8163264} + +Convierte un valor de entrada en el [UInt](../../data_types/int_uint.md) tipo de datos. Esta familia de funciones incluye: + +- `toUInt8(expr)` — Resultados en el `UInt8` tipo de datos. +- `toUInt16(expr)` — Resultados en el `UInt16` tipo de datos. +- `toUInt32(expr)` — Resultados en el `UInt32` tipo de datos. +- `toUInt64(expr)` — Resultados en el `UInt64` tipo de datos. + +**Parámetros** + +- `expr` — [Expresion](../syntax.md#syntax-expressions) devolviendo un número o una cadena con la representación decimal de un número. No se admiten representaciones binarias, octales y hexadecimales de números. Los ceros principales son despojados. + +**Valor devuelto** + +Valor entero en el `UInt8`, `UInt16`, `UInt32`, o `UInt64` tipo de datos. + +Funciones de uso [Tetas grandes](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), lo que significa que truncan dígitos fraccionarios de números. + +El comportamiento de las funciones para los instrumentos negativos y para [NaN y Inf](../../data_types/float.md#data_type-float-nan-inf) los argumentos no están definidos. Si pasa una cadena con un número negativo, por ejemplo `'-32'`, ClickHouse genera una excepción. Recuerde acerca de [problemas de conversión numérica](#numeric-conversion-issues), al usar las funciones. + +**Ejemplo** + +``` sql +SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) +``` + +``` text +┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ +│ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ +└─────────────────────┴───────────────┴────────────────┴──────────────┘ +``` + +## ¿Cómo puedo obtener más información? {#touint8163264orzero} + +## ¿Cómo puedo hacerlo? {#touint8163264ornull} + +## ¿Cómo puedo obtener más información?) {#tofloat3264} + +## ¿Cómo puedo hacerlo? {#tofloat3264orzero} + +## ¿Cómo puedo hacerlo? {#tofloat3264ornull} + +## Fecha {#todate} + +## Todos los derechos reservados {#todateorzero} + +## ToDateOrNull {#todateornull} + +## toDateTime {#todatetime} + +## ToDateTimeOrZero {#todatetimeorzero} + +## ToDateTimeOrNull {#todatetimeornull} + +## toDecimal(32/64/128) {#todecimal3264128} + +Convertir `value` Angeles [Decimal](../../data_types/decimal.md) tipo de datos con precisión de `S`. El `value` puede ser un número o una cadena. El `S` (escala) parámetro especifica el número de decimales. 
+ +- `toDecimal32(value, S)` +- `toDecimal64(value, S)` +- `toDecimal128(value, S)` + +## ¿Cómo puedo hacer esto? {#todecimal3264128ornull} + +Convierte una cadena de entrada en un [Información detallada))](../../data_types/decimal.md) valor de tipo de datos. Esta familia de funciones incluye: + +- `toDecimal32OrNull(expr, S)` — Resultados en `Nullable(Decimal32(S))` tipo de datos. +- `toDecimal64OrNull(expr, S)` — Resultados en `Nullable(Decimal64(S))` tipo de datos. +- `toDecimal128OrNull(expr, S)` — Resultados en `Nullable(Decimal128(S))` tipo de datos. + +Estas funciones deben usarse en lugar de `toDecimal*()` funciones, si usted prefiere conseguir un `NULL` valor de entrada en lugar de una excepción en el caso de un error de análisis de valor de entrada. + +**Parámetros** + +- `expr` — [Expresion](../syntax.md#syntax-expressions), devuelve un valor en el [Cadena](../../data_types/string.md) tipo de datos. ClickHouse espera la representación textual del número decimal. Por ejemplo, `'1.111'`. +- `S` — Escala, el número de decimales en el valor resultante. + +**Valor devuelto** + +Un valor en el `Nullable(Decimal(P,S))` tipo de datos. El valor contiene: + +- Número con `S` lugares decimales, si ClickHouse interpreta la cadena de entrada como un número. +- `NULL`, si ClickHouse no puede interpretar la cadena de entrada como un número o si el número de entrada contiene más de `S` lugares decimales. + +**Ejemplos** + +``` sql +SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) +``` + +``` text +┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ +│ -1.11100 │ Nullable(Decimal(9, 5)) │ +└──────────┴────────────────────────────────────────────────────┘ +``` + +``` sql +SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) +``` + +``` text +┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ +│ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ +└──────┴────────────────────────────────────────────────────┘ +``` + +## Por ejemplo: {#todecimal3264128orzero} + +Convierte un valor de entrada en el [Decimal (P, S)](../../data_types/decimal.md) tipo de datos. Esta familia de funciones incluye: + +- `toDecimal32OrZero( expr, S)` — Resultados en `Decimal32(S)` tipo de datos. +- `toDecimal64OrZero( expr, S)` — Resultados en `Decimal64(S)` tipo de datos. +- `toDecimal128OrZero( expr, S)` — Resultados en `Decimal128(S)` tipo de datos. + +Estas funciones deben usarse en lugar de `toDecimal*()` funciones, si usted prefiere conseguir un `0` valor de entrada en lugar de una excepción en el caso de un error de análisis de valor de entrada. + +**Parámetros** + +- `expr` — [Expresion](../syntax.md#syntax-expressions), devuelve un valor en el [Cadena](../../data_types/string.md) tipo de datos. ClickHouse espera la representación textual del número decimal. Por ejemplo, `'1.111'`. +- `S` — Escala, el número de decimales en el valor resultante. + +**Valor devuelto** + +Un valor en el `Nullable(Decimal(P,S))` tipo de datos. El valor contiene: + +- Número con `S` lugares decimales, si ClickHouse interpreta la cadena de entrada como un número. +- 0 con `S` decimales, si ClickHouse no puede interpretar la cadena de entrada como un número o si el número de entrada contiene más de `S` lugares decimales. 
+ +**Ejemplo** + +``` sql +SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) +``` + +``` text +┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ +│ -1.11100 │ Decimal(9, 5) │ +└──────────┴────────────────────────────────────────────────────┘ +``` + +``` sql +SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val) +``` + +``` text +┌──val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 2))─┐ +│ 0.00 │ Decimal(9, 2) │ +└──────┴────────────────────────────────────────────────────┘ +``` + +## ToString {#tostring} + +Funciones para convertir entre números, cadenas (pero no cadenas fijas), fechas y fechas con horas. +Todas estas funciones aceptan un argumento. + +Al convertir a o desde una cadena, el valor se formatea o se analiza utilizando las mismas reglas que para el formato TabSeparated (y casi todos los demás formatos de texto). Si no se puede analizar la cadena, se produce una excepción y se cancela la solicitud. + +Al convertir fechas a números o viceversa, la fecha corresponde al número de días desde el comienzo de la época Unix. +Al convertir fechas con horas a números o viceversa, la fecha con hora corresponde al número de segundos desde el comienzo de la época Unix. + +Los formatos de fecha y fecha con hora para las funciones toDate/toDateTime se definen de la siguiente manera: + +``` text +YYYY-MM-DD +YYYY-MM-DD hh:mm:ss +``` + +Como excepción, si convierte de tipos numéricos UInt32, Int32, UInt64 o Int64 a Date, y si el número es mayor o igual que 65536, el número se interpreta como una marca de tiempo Unix (y no como el número de días) y se redondea a la fecha. Esto permite soporte para la ocurrencia común de la escritura ‘toDate(unix\_timestamp)’, que de otra manera sería un error y requeriría escribir el más engorroso ‘toDate(toDateTime(unix\_timestamp))’. + +La conversión entre una fecha y una fecha con la hora se realiza de la manera natural: agregando un tiempo nulo o eliminando el tiempo. + +La conversión entre tipos numéricos utiliza las mismas reglas que las asignaciones entre diferentes tipos numéricos en C++. + +Además, la función toString del argumento DateTime puede tomar un segundo argumento String que contiene el nombre de la zona horaria. Ejemplo: `Asia/Yekaterinburg` En este caso, la hora se formatea de acuerdo con la zona horaria especificada. + +``` sql +SELECT + now() AS now_local, + toString(now(), 'Asia/Yekaterinburg') AS now_yekat +``` + +``` text +┌───────────now_local─┬─now_yekat───────────┐ +│ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ +└─────────────────────┴─────────────────────┘ +``` + +Ver también el `toUnixTimestamp` función. + +## ¿Qué puedes encontrar en Neodigit) {#tofixedstrings-n} + +Convierte un argumento de tipo String en un tipo FixedString(N) (una cadena con longitud fija N). N debe ser una constante. +Si la cadena tiene menos bytes que N, se pasa con bytes nulos a la derecha. Si la cadena tiene más bytes que N, se produce una excepción. + +## Todos los derechos reservados.) {#tostringcuttozeros} + +Acepta un argumento String o FixedString. Devuelve la cadena con el contenido truncado en el primer byte cero encontrado. 
+ +Ejemplo: + +``` sql +SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut +``` + +``` text +┌─s─────────────┬─s_cut─┐ +│ foo\0\0\0\0\0 │ foo │ +└───────────────┴───────┘ +``` + +``` sql +SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut +``` + +``` text +┌─s──────────┬─s_cut─┐ +│ foo\0bar\0 │ foo │ +└────────────┴───────┘ +``` + +## ¿Cómo puedo obtener más información?) {#reinterpretasuint8163264} + +## ¿Cómo puedo obtener más información?) {#reinterpretasint8163264} + +## ¿Cómo puedo obtener más información?) {#reinterpretasfloat3264} + +## reinterpretAsDate {#reinterpretasdate} + +## reinterpretAsDateTime {#reinterpretasdatetime} + +Estas funciones aceptan una cadena e interpretan los bytes colocados al principio de la cadena como un número en orden de host (little endian). Si la cadena no es lo suficientemente larga, las funciones funcionan como si la cadena estuviera rellenada con el número necesario de bytes nulos. Si la cadena es más larga de lo necesario, se ignoran los bytes adicionales. Una fecha se interpreta como el número de días desde el comienzo de la época Unix, y una fecha con hora se interpreta como el número de segundos desde el comienzo de la época Unix. + +## reinterpretAsString {#type-conversion-functions-reinterpretasstring} + +Esta función acepta un número o fecha o fecha con hora, y devuelve una cadena que contiene bytes que representan el valor correspondiente en orden de host (little endian). Los bytes nulos se eliminan desde el final. Por ejemplo, un valor de tipo UInt32 de 255 es una cadena que tiene un byte de longitud. + +## reinterpretAsFixedString {#reinterpretasfixedstring} + +Esta función acepta un número o fecha o fecha con hora, y devuelve un FixedString que contiene bytes que representan el valor correspondiente en orden de host (little endian). Los bytes nulos se eliminan desde el final. Por ejemplo, un valor de tipo UInt32 de 255 es un FixedString que tiene un byte de longitud. + +## CAST(x, t) {#type-conversion-function-cast} + +Convertir ‘x’ Angeles ‘t’ tipo de datos. La sintaxis CAST(x AS t) también es compatible. + +Ejemplo: + +``` sql +SELECT + '2016-06-15 23:00:00' AS timestamp, + CAST(timestamp AS DateTime) AS datetime, + CAST(timestamp AS Date) AS date, + CAST(timestamp, 'String') AS string, + CAST(timestamp, 'FixedString(22)') AS fixed_string +``` + +``` text +┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ +│ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ +└─────────────────────┴─────────────────────┴────────────┴─────────────────────┴───────────────────────────┘ +``` + +La conversión a FixedString(N) solo funciona para argumentos de tipo String o FixedString(N). + +Conversión de tipo a [NULO](../../data_types/nullable.md) y la espalda es compatible. Ejemplo: + +``` sql +SELECT toTypeName(x) FROM t_null +``` + +``` text +┌─toTypeName(x)─┐ +│ Int8 │ +│ Int8 │ +└───────────────┘ +``` + +``` sql +SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null +``` + +``` text +┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ +│ Nullable(UInt16) │ +│ Nullable(UInt16) │ +└─────────────────────────────────────────┘ +``` + +## toInterval(Year\|Quarter\|Month\|Week\|Day\|Hour\|Minute\|Second) {#function-tointerval} + +Convierte un argumento de tipo Number en un [Intervalo](../../data_types/special_data_types/interval.md) tipo de datos. 
+ +**Sintaxis** + +``` sql +toIntervalSecond(number) +toIntervalMinute(number) +toIntervalHour(number) +toIntervalDay(number) +toIntervalWeek(number) +toIntervalMonth(number) +toIntervalQuarter(number) +toIntervalYear(number) +``` + +**Parámetros** + +- `number` — Duración del intervalo. Número entero positivo. + +**Valores devueltos** + +- El valor en `Interval` tipo de datos. + +**Ejemplo** + +``` sql +WITH + toDate('2019-01-01') AS date, + INTERVAL 1 WEEK AS interval_week, + toIntervalWeek(1) AS interval_to_week +SELECT + date + interval_week, + date + interval_to_week +``` + +``` text +┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ +│ 2019-01-08 │ 2019-01-08 │ +└───────────────────────────┴──────────────────────────────┘ +``` + +## parseDateTimeBestEffort {#type-conversion-functions-parsedatetimebesteffort} + +Analice un argumento de tipo numérico a un tipo Date o DateTime. +diferente de toDate y toDateTime, parseDateTimeBestEffort puede progresar en un formato de fecha más complejo. +Para obtener más información, consulte el enlace: [Formato de fecha compleja](https://xkcd.com/1179/) + +## parseDateTimeBestEffortOrNull {#parsedatetimebesteffortornull} + +Lo mismo que para [parseDateTimeBestEffort](#type_conversion_functions-parsedatetimebesteffort) excepto que devuelve null cuando encuentra un formato de fecha que no se puede procesar. + +## parseDateTimeBestEffortOrZero {#parsedatetimebesteffortorzero} + +Lo mismo que para [parseDateTimeBestEffort](#type_conversion_functions-parsedatetimebesteffort) que devuelve una fecha cero o una fecha cero cuando encuentra un formato de fecha que no se puede procesar. + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/type_conversion_functions/) diff --git a/docs/es/query_language/functions/url_functions.md b/docs/es/query_language/functions/url_functions.md new file mode 100644 index 00000000000..7601f357e92 --- /dev/null +++ b/docs/es/query_language/functions/url_functions.md @@ -0,0 +1,202 @@ +# Funciones para trabajar con URL {#functions-for-working-with-urls} + +Todas estas funciones no siguen el RFC. Se simplifican al máximo para mejorar el rendimiento. + +## Funciones que extraen partes de una URL {#functions-that-extract-parts-of-a-url} + +Si la parte relevante no está presente en una URL, se devuelve una cadena vacía. + +### protocolo {#protocol} + +Extrae el protocolo de una URL. + +Ejemplos de valores típicos devueltos: http, https, ftp, mailto, tel, magnet… + +### dominio {#domain} + +Extrae el nombre de host de una dirección URL. + +``` sql +domain(url) +``` + +**Parámetros** + +- `url` — URL. Tipo: [Cadena](../../data_types/string.md). + +La URL se puede especificar con o sin un esquema. Ejemplos: + +``` text +svn+ssh://some.svn-hosting.com:80/repo/trunk +some.svn-hosting.com:80/repo/trunk +https://yandex.com/time/ +``` + +Para estos ejemplos, el `domain` función devuelve los siguientes resultados: + +``` text +some.svn-hosting.com +some.svn-hosting.com +yandex.com +``` + +**Valores devueltos** + +- Nombre de host. Si ClickHouse puede analizar la cadena de entrada como una URL. +- Cadena vacía. Si ClickHouse no puede analizar la cadena de entrada como una URL. + +Tipo: `String`. 
+ +**Ejemplo** + +``` sql +SELECT domain('svn+ssh://some.svn-hosting.com:80/repo/trunk') +``` + +``` text +┌─domain('svn+ssh://some.svn-hosting.com:80/repo/trunk')─┐ +│ some.svn-hosting.com │ +└────────────────────────────────────────────────────────┘ +``` + +### Nuestros servicios {#domainwithoutwww} + +Devuelve el dominio y no elimina más de uno ‘www.’ desde el principio de la misma, si está presente. + +### topLevelDomain {#topleveldomain} + +Extrae el dominio de nivel superior de una URL. + +``` sql +topLevelDomain(url) +``` + +**Parámetros** + +- `url` — URL. Tipo: [Cadena](../../data_types/string.md). + +La URL se puede especificar con o sin un esquema. Ejemplos: + +``` text +svn+ssh://some.svn-hosting.com:80/repo/trunk +some.svn-hosting.com:80/repo/trunk +https://yandex.com/time/ +``` + +**Valores devueltos** + +- Nombre de dominio. Si ClickHouse puede analizar la cadena de entrada como una URL. +- Cadena vacía. Si ClickHouse no puede analizar la cadena de entrada como una URL. + +Tipo: `String`. + +**Ejemplo** + +``` sql +SELECT topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk') +``` + +``` text +┌─topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk')─┐ +│ com │ +└────────────────────────────────────────────────────────────────────┘ +``` + +### FirstSignificantSubdomain {#firstsignificantsubdomain} + +Devuelve el “first significant subdomain”. Este es un concepto no estándar específico de Yandex.Métrica. El primer subdominio significativo es un dominio de segundo nivel si es ‘com’, ‘net’, ‘org’, o ‘co’. De lo contrario, es un dominio de tercer nivel. Por ejemplo, firstSignificantSubdomain (‘https://news.yandex.ru/’Número ‘yandex’, firstSignificantSubdomain (‘https://news.yandex.com.tr/’Número ‘yandex’. La lista de “insignificant” dominios de segundo nivel y otros detalles de implementación pueden cambiar en el futuro. + +### cutToFirstSignificantSubdomain {#cuttofirstsignificantsubdomain} + +Devuelve la parte del dominio que incluye subdominios de nivel superior “first significant subdomain” (véase la explicación anterior). + +Por ejemplo, `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`. + +### camino {#path} + +Devuelve la ruta de acceso. Ejemplo: `/top/news.html` La ruta de acceso no incluye la cadena de consulta. + +### pathFull {#pathfull} + +Lo mismo que el anterior, pero incluyendo cadena de consulta y fragmento. Ejemplo: /top/news.html?Página = 2 \# comentarios + +### queryString {#querystring} + +Devuelve la cadena de consulta. Ejemplo: page=1&lr=213. query-string no incluye el signo de interrogación inicial, así como \# y todo después de \#. + +### fragmento {#fragment} + +Devuelve el identificador de fragmento. el fragmento no incluye el símbolo hash inicial. + +### queryStringAndFragment {#querystringandfragment} + +Devuelve la cadena de consulta y el identificador de fragmento. Ejemplo: page=1\#29390. + +### extractURLParameter(URL, nombre) {#extracturlparameterurl-name} + +Devuelve el valor de la ‘name’ parámetro en la URL, si está presente. De lo contrario, una cadena vacía. Si hay muchos parámetros con este nombre, devuelve la primera aparición. Esta función funciona bajo el supuesto de que el nombre del parámetro está codificado en la URL exactamente de la misma manera que en el argumento pasado. + +### extractURLParameters (URL) {#extracturlparametersurl} + +Devuelve una matriz de cadenas name=value correspondientes a los parámetros de URL. Los valores no se decodifican de ninguna manera. 
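+
+A modo de esbozo (la URL del ejemplo y el resultado mostrado son solo orientativos), ambas funciones se pueden comparar sobre una misma URL:
+
+``` sql
+SELECT
+    extractURLParameter('http://example.com/?page=1&lr=213', 'page') AS page,
+    extractURLParameters('http://example.com/?page=1&lr=213') AS all_params
+```
+
+``` text
+┌─page─┬─all_params──────────┐
+│ 1    │ ['page=1','lr=213'] │
+└──────┴─────────────────────┘
+```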
+ +### ExtractURLParameterNames (URL) {#extracturlparameternamesurl} + +Devuelve una matriz de cadenas de nombre correspondientes a los nombres de los parámetros de URL. Los valores no se decodifican de ninguna manera. + +### URLJerarquía (URL) {#urlhierarchyurl} + +Devuelve una matriz que contiene la URL, truncada al final por los símbolos /,? en la ruta y la cadena de consulta. Los caracteres separadores consecutivos se cuentan como uno. El corte se realiza en la posición después de todos los caracteres separadores consecutivos. + +### URLPathHierarchy (URL) {#urlpathhierarchyurl} + +Lo mismo que el anterior, pero sin el protocolo y el host en el resultado. El elemento / (raíz) no está incluido. Ejemplo: la función se utiliza para implementar informes de árbol de la URL en Yandex. Métrica. + +``` text +URLPathHierarchy('https://example.com/browse/CONV-6788') = +[ + '/browse/', + '/browse/CONV-6788' +] +``` + +### decodeURLComponent (URL) {#decodeurlcomponenturl} + +Devuelve la dirección URL decodificada. +Ejemplo: + +``` sql +SELECT decodeURLComponent('http://127.0.0.1:8123/?query=SELECT%201%3B') AS DecodedURL; +``` + +``` text +┌─DecodedURL─────────────────────────────┐ +│ http://127.0.0.1:8123/?query=SELECT 1; │ +└────────────────────────────────────────┘ +``` + +## Funciones que eliminan parte de una URL. {#functions-that-remove-part-of-a-url} + +Si la URL no tiene nada similar, la URL permanece sin cambios. + +### Sistema abierto. {#cutwww} + +Elimina no más de uno ‘www.’ desde el principio del dominio de la URL, si está presente. + +### cutQueryString {#cutquerystring} + +Quita la cadena de consulta. El signo de interrogación también se elimina. + +### cutFragment {#cutfragment} + +Quita el identificador de fragmento. El signo de número también se elimina. + +### cutQueryStringAndFragment {#cutquerystringandfragment} + +Quita la cadena de consulta y el identificador de fragmento. El signo de interrogación y el signo de número también se eliminan. + +### cutURLParameter(URL, nombre) {#cuturlparameterurl-name} + +Elimina el ‘name’ Parámetro URL, si está presente. Esta función funciona bajo el supuesto de que el nombre del parámetro está codificado en la URL exactamente de la misma manera que en el argumento pasado. + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/url_functions/) diff --git a/docs/es/query_language/functions/uuid_functions.md b/docs/es/query_language/functions/uuid_functions.md new file mode 100644 index 00000000000..469e4dd807c --- /dev/null +++ b/docs/es/query_language/functions/uuid_functions.md @@ -0,0 +1,116 @@ +# Funciones para trabajar con UUID {#functions-for-working-with-uuid} + +Las funciones para trabajar con UUID se enumeran a continuación. + +## GenerateUUIDv4 {#uuid-function-generate} + +Genera el [UUID](../../data_types/uuid.md) de [versión 4](https://tools.ietf.org/html/rfc4122#section-4.4). + +``` sql +generateUUIDv4() +``` + +**Valor devuelto** + +El valor de tipo UUID. + +**Ejemplo de uso** + +En este ejemplo se muestra la creación de una tabla con la columna de tipo UUID e insertar un valor en la tabla. + +``` sql +CREATE TABLE t_uuid (x UUID) ENGINE=TinyLog + +INSERT INTO t_uuid SELECT generateUUIDv4() + +SELECT * FROM t_uuid +``` + +``` text +┌────────────────────────────────────x─┐ +│ f4bf890f-f9dc-4332-ad5c-0c18e73f28e9 │ +└──────────────────────────────────────┘ +``` + +## paraUUID (x) {#touuid-x} + +Convierte el valor de tipo de cadena en tipo UUID. 
+ +``` sql +toUUID(String) +``` + +**Valor devuelto** + +El valor de tipo UUID. + +**Ejemplo de uso** + +``` sql +SELECT toUUID('61f0c404-5cb3-11e7-907b-a6006ad3dba0') AS uuid +``` + +``` text +┌─────────────────────────────────uuid─┐ +│ 61f0c404-5cb3-11e7-907b-a6006ad3dba0 │ +└──────────────────────────────────────┘ +``` + +## UUIDStringToNum {#uuidstringtonum} + +Acepta una cadena que contiene 36 caracteres en el formato `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`, y lo devuelve como un conjunto de bytes en un [Cadena fija (16)](../../data_types/fixedstring.md). + +``` sql +UUIDStringToNum(String) +``` + +**Valor devuelto** + +Cadena fija (16) + +**Ejemplos de uso** + +``` sql +SELECT + '612f3c40-5d3b-217e-707b-6a546a3d7b29' AS uuid, + UUIDStringToNum(uuid) AS bytes +``` + +``` text + +┌─uuid─────────────────────────────────┬─bytes────────────┐ +│ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ a/<@];!~p{jTj={) │ +└──────────────────────────────────────┴──────────────────┘ +``` + +## UUIDNumToString {#uuidnumtostring} + +Acepta un [Cadena fija (16)](../../data_types/fixedstring.md) valor, y devuelve una cadena que contiene 36 caracteres en formato de texto. + +``` sql +UUIDNumToString(FixedString(16)) +``` + +**Valor devuelto** + +Cadena. + +**Ejemplo de uso** + +``` sql +SELECT + 'a/<@];!~p{jTj={)' AS bytes, + UUIDNumToString(toFixedString(bytes, 16)) AS uuid +``` + +``` text +┌─bytes────────────┬─uuid─────────────────────────────────┐ +│ a/<@];!~p{jTj={) │ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ +└──────────────────┴──────────────────────────────────────┘ +``` + +## Ver también {#see-also} + +- [dictGetUUID](ext_dict_functions.md#ext_dict_functions-other) + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/uuid_function/) diff --git a/docs/es/query_language/functions/ym_dict_functions.md b/docs/es/query_language/functions/ym_dict_functions.md new file mode 100644 index 00000000000..f80204f777f --- /dev/null +++ b/docs/es/query_language/functions/ym_dict_functions.md @@ -0,0 +1,126 @@ +# Funciones para trabajar con Yandex.Métrica, diccionarios {#functions-for-working-with-yandex-metrica-dictionaries} + +Para que las funciones a continuación funcionen, la configuración del servidor debe especificar las rutas y direcciones para obtener todo el Yandex.Diccionarios Metrica. Los diccionarios se cargan en la primera llamada de cualquiera de estas funciones. Si no se pueden cargar las listas de referencia, se produce una excepción. + +Para obtener información sobre cómo crear listas de referencia, consulte la sección “Dictionaries”. + +## Múltiples geobases {#multiple-geobases} + +ClickHouse admite trabajar con múltiples geobases alternativas (jerarquías regionales) simultáneamente, con el fin de soportar diversas perspectivas sobre a qué países pertenecen ciertas regiones. + +El ‘clickhouse-server’ config especifica el archivo con la jerarquía regional::`/opt/geo/regions_hierarchy.txt` + +Además de este archivo, también busca archivos cercanos que tengan el símbolo \_ y cualquier sufijo anexado al nombre (antes de la extensión del archivo). +Por ejemplo, también encontrará el archivo `/opt/geo/regions_hierarchy_ua.txt` si está presente. + +`ua` se llama la clave del diccionario. Para un diccionario sin un sufijo, la clave es una cadena vacía. + +Todos los diccionarios se vuelven a cargar en tiempo de ejecución (una vez cada cierto número de segundos, como se define en el parámetro de configuración builtin\_dictionaries\_reload\_interval , o una vez por hora por defecto). 
Sin embargo, la lista de diccionarios disponibles se define una vez, cuando se inicia el servidor. + +Todas las funciones para trabajar con regiones tienen un argumento opcional al final: la clave del diccionario. Se conoce como la geobase. +Ejemplo: + +``` sql +regionToCountry(RegionID) – Uses the default dictionary: /opt/geo/regions_hierarchy.txt +regionToCountry(RegionID, '') – Uses the default dictionary: /opt/geo/regions_hierarchy.txt +regionToCountry(RegionID, 'ua') – Uses the dictionary for the 'ua' key: /opt/geo/regions_hierarchy_ua.txt +``` + +### ¿Cómo puedo hacerlo?\]) {#regiontocityid-geobase} + +Acepta un número UInt32: el ID de región de la geobase de Yandex. Si esta región es una ciudad o parte de una ciudad, devuelve el ID de región para la ciudad apropiada. De lo contrario, devuelve 0. + +### ¿Cómo puedo hacerlo?\]) {#regiontoareaid-geobase} + +Convierte una región en un área (tipo 5 en la geobase). En todos los demás sentidos, esta función es la misma que ‘regionToCity’. + +``` sql +SELECT DISTINCT regionToName(regionToArea(toUInt32(number), 'ua')) +FROM system.numbers +LIMIT 15 +``` + +``` text +┌─regionToName(regionToArea(toUInt32(number), \'ua\'))─┐ +│ │ +│ Moscow and Moscow region │ +│ St. Petersburg and Leningrad region │ +│ Belgorod region │ +│ Ivanovsk region │ +│ Kaluga region │ +│ Kostroma region │ +│ Kursk region │ +│ Lipetsk region │ +│ Orlov region │ +│ Ryazan region │ +│ Smolensk region │ +│ Tambov region │ +│ Tver region │ +│ Tula region │ +└──────────────────────────────────────────────────────┘ +``` + +### ¿Cómo puedo hacerlo?\]) {#regiontodistrictid-geobase} + +Convierte una región en un distrito federal (tipo 4 en la geobase). En todos los demás sentidos, esta función es la misma que ‘regionToCity’. + +``` sql +SELECT DISTINCT regionToName(regionToDistrict(toUInt32(number), 'ua')) +FROM system.numbers +LIMIT 15 +``` + +``` text +┌─regionToName(regionToDistrict(toUInt32(number), \'ua\'))─┐ +│ │ +│ Central federal district │ +│ Northwest federal district │ +│ South federal district │ +│ North Caucases federal district │ +│ Privolga federal district │ +│ Ural federal district │ +│ Siberian federal district │ +│ Far East federal district │ +│ Scotland │ +│ Faroe Islands │ +│ Flemish region │ +│ Brussels capital region │ +│ Wallonia │ +│ Federation of Bosnia and Herzegovina │ +└──────────────────────────────────────────────────────────┘ +``` + +### ¿Cómo puedo hacerlo?\]) {#regiontocountryid-geobase} + +Convierte una región en un país. En todos los demás sentidos, esta función es la misma que ‘regionToCity’. +Ejemplo: `regionToCountry(toUInt32(213)) = 225` convierte Moscú (213) a Rusia (225). + +### Aquí está el código de identificación.\]) {#regiontocontinentid-geobase} + +Convierte una región en un continente. En todos los demás sentidos, esta función es la misma que ‘regionToCity’. +Ejemplo: `regionToContinent(toUInt32(213)) = 10001` convierte Moscú (213) a Eurasia (10001). + +### Aquí está el código de identificación de la población.\]) {#regiontopopulationid-geobase} + +Obtiene la población de una región. +La población se puede registrar en archivos con la geobase. Vea la sección “External dictionaries”. +Si la población no se registra para la región, devuelve 0. +En la geobase de Yandex, la población podría registrarse para las regiones secundarias, pero no para las regiones parentales. + +### ¿Cómo puedo hacerlo?\]) {#regioninlhs-rhs-geobase} + +Comprueba si un ‘lhs’ región pertenece a un ‘rhs’ regi. 
Devuelve un número UInt8 igual a 1 si pertenece, o 0 si no pertenece. +La relación es reflexiva: cualquier región también pertenece a sí misma. + +### RegiónJerarquía (id\[, geobase\]) {#regionhierarchyid-geobase} + +Acepta un número UInt32: el ID de región de la geobase de Yandex. Devuelve una matriz de ID de región que consiste en la región pasada y todos los elementos primarios a lo largo de la cadena. +Ejemplo: `regionHierarchy(toUInt32(213)) = [213,1,3,225,10001,10000]`. + +### ¿Cómo puedo hacerlo?\]) {#regiontonameid-lang} + +Acepta un número UInt32: el ID de región de la geobase de Yandex. Una cadena con el nombre del idioma se puede pasar como un segundo argumento. Los idiomas soportados son: ru, en, ua, uk, by, kz, tr. Si se omite el segundo argumento, el idioma ‘ru’ se utiliza. Si el idioma no es compatible, se produce una excepción. Devuelve una cadena: el nombre de la región en el idioma correspondiente. Si la región con el ID especificado no existe, se devuelve una cadena vacía. + +`ua` y `uk` ambos significan ucraniano. + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/functions/ym_dict_functions/) diff --git a/docs/es/query_language/index.md b/docs/es/query_language/index.md new file mode 100644 index 00000000000..a7f21a41441 --- /dev/null +++ b/docs/es/query_language/index.md @@ -0,0 +1,9 @@ +# Referencia SQL {#sql-reference} + +- [SELECCIONAR](select.md) +- [ES INSERTAR](insert_into.md) +- [CREAR](create.md) +- [ALTERAR](alter.md#query_language_queries_alter) +- [Otros tipos de consultas](misc.md) + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/) diff --git a/docs/es/query_language/insert_into.md b/docs/es/query_language/insert_into.md new file mode 100644 index 00000000000..8222bcc3a7a --- /dev/null +++ b/docs/es/query_language/insert_into.md @@ -0,0 +1,73 @@ +## INSERTAR {#insert} + +Adición de datos. + +Formato de consulta básico: + +``` sql +INSERT INTO [db.]table [(c1, c2, c3)] VALUES (v11, v12, v13), (v21, v22, v23), ... +``` + +La consulta puede especificar una lista de columnas para insertar `[(c1, c2, c3)]`. En este caso, el resto de las columnas se llenan con: + +- Los valores calculados a partir del `DEFAULT` expresiones especificadas en la definición de la tabla. +- Ceros y cadenas vacías, si `DEFAULT` expresiones no están definidas. + +Si [strict\_insert\_defaults=1](../operations/settings/settings.md), columnas que no tienen `DEFAULT` definido debe figurar en la consulta. + +Los datos se pueden pasar al INSERT en cualquier [Formato](../interfaces/formats.md#formats) con el apoyo de ClickHouse. El formato debe especificarse explícitamente en la consulta: + +``` sql +INSERT INTO [db.]table [(c1, c2, c3)] FORMAT format_name data_set +``` + +Por ejemplo, el siguiente formato de consulta es idéntico a la versión básica de INSERT … VALUES: + +``` sql +INSERT INTO [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23), ... +``` + +ClickHouse elimina todos los espacios y un avance de línea (si hay uno) antes de los datos. Al formar una consulta, recomendamos colocar los datos en una nueva línea después de los operadores de consulta (esto es importante si los datos comienzan con espacios). + +Ejemplo: + +``` sql +INSERT INTO t FORMAT TabSeparated +11 Hello, world! +22 Qwerty +``` + +Puede insertar datos por separado de la consulta mediante el cliente de línea de comandos o la interfaz HTTP. Para obtener más información, consulte la sección “[Interfaz](../interfaces/index.md#interfaces)”. 
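+
+Como esbozo de la lista de columnas descrita más arriba (la tabla `insert_defaults` y sus columnas son hipotéticas), las columnas omitidas se rellenan con sus expresiones `DEFAULT`:
+
+``` sql
+CREATE TABLE insert_defaults
+(
+    id UInt32,
+    name String DEFAULT 'unknown',
+    created Date DEFAULT today()
+) ENGINE = MergeTree() ORDER BY id;
+
+-- Solo se indica `id`; `name` y `created` se completan con sus expresiones DEFAULT
+INSERT INTO insert_defaults (id) VALUES (1), (2);
+
+SELECT * FROM insert_defaults;
+```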
+ +### Limitación {#constraints} + +Si la tabla tiene [limitación](create.md#constraints), sus expresiones se verificarán para cada fila de datos insertados. Si alguna de esas restricciones no se satisface, el servidor generará una excepción que contenga el nombre y la expresión de la restricción, la consulta se detendrá. + +### Insertar los resultados de `SELECT` {#insert-query-insert-select} + +``` sql +INSERT INTO [db.]table [(c1, c2, c3)] SELECT ... +``` + +Las columnas se asignan de acuerdo con su posición en la cláusula SELECT. Sin embargo, sus nombres en la expresión SELECT y la tabla para INSERT pueden diferir. Si es necesario, se realiza la fundición de tipo. + +Ninguno de los formatos de datos, excepto Valores, permite establecer valores para expresiones como `now()`, `1 + 2` y así sucesivamente. El formato Values permite el uso limitado de expresiones, pero esto no se recomienda, porque en este caso se usa código ineficiente para su ejecución. + +No se admiten otras consultas para modificar partes de datos: `UPDATE`, `DELETE`, `REPLACE`, `MERGE`, `UPSERT`, `INSERT UPDATE`. +Sin embargo, puede eliminar datos antiguos usando `ALTER TABLE ... DROP PARTITION`. + +`FORMAT` cláusula debe especificarse al final de la consulta si `SELECT` cláusula contiene la función de tabla [entrada()](table_functions/input.md). + +### Consideraciones de rendimiento {#performance-considerations} + +`INSERT` ordena los datos de entrada por clave principal y los divide en particiones por una clave de partición. Si inserta datos en varias particiones a la vez, puede reducir significativamente el rendimiento del `INSERT` consulta. Para evitar esto: + +- Agregue datos en lotes bastante grandes, como 100.000 filas a la vez. +- Agrupe los datos por una clave de partición antes de cargarlos en ClickHouse. + +El rendimiento no disminuirá si: + +- Los datos se agregan en tiempo real. +- Carga datos que normalmente están ordenados por tiempo. + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/insert_into/) diff --git a/docs/es/query_language/misc.md b/docs/es/query_language/misc.md new file mode 100644 index 00000000000..a31d24f1aa1 --- /dev/null +++ b/docs/es/query_language/misc.md @@ -0,0 +1,245 @@ +# Consultas Misceláneas {#miscellaneous-queries} + +## CONECTAR {#attach} + +Esta consulta es exactamente la misma que `CREATE` pero + +- En lugar de la palabra `CREATE` utiliza la palabra `ATTACH`. +- La consulta no crea datos en el disco, pero supone que los datos ya están en los lugares apropiados, y simplemente agrega información sobre la tabla al servidor. + Después de ejecutar una consulta ATTACH, el servidor sabrá sobre la existencia de la tabla. + +Si la tabla se separó previamente (`DETACH`), lo que significa que su estructura es conocida, puede usar taquigrafía sin definir la estructura. + +``` sql +ATTACH TABLE [IF NOT EXISTS] [db.]name [ON CLUSTER cluster] +``` + +Esta consulta se utiliza al iniciar el servidor. El servidor almacena los metadatos de la tabla como archivos con `ATTACH` consultas, que simplemente se ejecuta en el lanzamiento (con la excepción de las tablas del sistema, que se crean explícitamente en el servidor). + +## MESA DE VERIFICACIÓN {#check-table} + +Comprueba si los datos de la tabla están dañados. + +``` sql +CHECK TABLE [db.]name +``` + +El `CHECK TABLE` query compara los tamaños de archivo reales con los valores esperados que se almacenan en el servidor. Si los tamaños de archivo no coinciden con los valores almacenados, significa que los datos están dañados. 
Esto puede deberse, por ejemplo, a un bloqueo del sistema durante la ejecución de la consulta. + +La respuesta de consulta contiene el `result` columna con una sola fila. La fila tiene un valor de +[Booleana](../data_types/boolean.md) Tipo: + +- 0 - Los datos de la tabla están dañados. +- 1 - Los datos mantienen la integridad. + +El `CHECK TABLE` query admite los siguientes motores de tablas: + +- [Registro](../operations/table_engines/log.md) +- [TinyLog](../operations/table_engines/tinylog.md) +- [StripeLog](../operations/table_engines/stripelog.md) +- [Familia MergeTree](../operations/table_engines/mergetree.md) + +Realizado sobre las tablas con otros motores de tabla causa una excepción. + +Motores del `*Log` La familia no proporciona recuperación automática de datos en caso de fallo. Utilice el `CHECK TABLE` consulta para rastrear la pérdida de datos de manera oportuna. + +Para `MergeTree` motores familiares, el `CHECK TABLE` query muestra un estado de comprobación para cada parte de datos individual de una tabla en el servidor local. + +**Si los datos están dañados** + +Si la tabla está dañada, puede copiar los datos no dañados a otra tabla. Para hacer esto: + +1. Cree una nueva tabla con la misma estructura que la tabla dañada. Para ello, ejecute la consulta `CREATE TABLE AS `. +2. Establezca el [max\_threads](../operations/settings/settings.md#settings-max_threads) valor a 1 para procesar la siguiente consulta en un único subproceso. Para ello, ejecute la consulta `SET max_threads = 1`. +3. Ejecutar la consulta `INSERT INTO SELECT * FROM `. Esta solicitud copia los datos no dañados de la tabla dañada a otra tabla. Solo se copiarán los datos anteriores a la parte dañada. +4. Reinicie el `clickhouse-client` para restablecer el `max_threads` valor. + +## TABLA DE DESCRIBE {#misc-describe-table} + +``` sql +DESC|DESCRIBE TABLE [db.]table [INTO OUTFILE filename] [FORMAT format] +``` + +Devuelve lo siguiente `String` Tipo columnas: + +- `name` — Nombre de la columna. +- `type`— Tipo de columna. +- `default_type` — Cláusula utilizada en [expresión predeterminada](create.md#create-default-values) (`DEFAULT`, `MATERIALIZED` o `ALIAS`). La columna contiene una cadena vacía, si no se especifica la expresión predeterminada. +- `default_expression` — Valor especificado en el `DEFAULT` clausula. +- `comment_expression` — Texto de comentario. + +Las estructuras de datos anidadas se generan en “expanded” formato. Cada columna se muestra por separado, con el nombre después de un punto. + +## SEPARAR {#detach} + +Elimina información sobre el ‘name’ tabla desde el servidor. El servidor deja de conocer la existencia de la tabla. + +``` sql +DETACH TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] +``` + +Esto no elimina los datos ni los metadatos de la tabla. En el próximo lanzamiento del servidor, el servidor leerá los metadatos y volverá a conocer la tabla. +Del mismo modo, un “detached” se puede volver a conectar usando el `ATTACH` consulta (con la excepción de las tablas del sistema, que no tienen metadatos almacenados para ellas). + +No hay `DETACH DATABASE` consulta. + +## GOTA {#drop} + +Esta consulta tiene dos tipos: `DROP DATABASE` y `DROP TABLE`. + +``` sql +DROP DATABASE [IF EXISTS] db [ON CLUSTER cluster] +``` + +Elimina todas las tablas dentro del ‘db’ base de datos, a continuación, elimina ‘db’ base de datos en sí. +Si `IF EXISTS` se especifica, no devuelve un error si la base de datos no existe. 
+ +``` sql +DROP [TEMPORARY] TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] +``` + +Elimina la tabla. +Si `IF EXISTS` se especifica, no devuelve un error si la tabla no existe o la base de datos no existe. + + DROP DICTIONARY [IF EXISTS] [db.]name + +Elimina el diccionario. +Si `IF EXISTS` se especifica, no devuelve un error si la tabla no existe o la base de datos no existe. + +## EXISTIR {#exists} + +``` sql +EXISTS [TEMPORARY] [TABLE|DICTIONARY] [db.]name [INTO OUTFILE filename] [FORMAT format] +``` + +Devuelve una sola `UInt8`columna -type, que contiene el valor único `0` si la tabla o base de datos no existe, o `1` si la tabla existe en la base de datos especificada. + +## Matar consulta {#kill-query} + +``` sql +KILL QUERY [ON CLUSTER cluster] + WHERE + [SYNC|ASYNC|TEST] + [FORMAT format] +``` + +Intenta terminar por la fuerza las consultas que se están ejecutando actualmente. +Las consultas a finalizar se seleccionan en el sistema.tabla de procesos utilizando los criterios definidos en el `WHERE` cláusula de la `KILL` consulta. + +Ejemplos: + +``` sql +-- Forcibly terminates all queries with the specified query_id: +KILL QUERY WHERE query_id='2-857d-4a57-9ee0-327da5d60a90' + +-- Synchronously terminates all queries run by 'username': +KILL QUERY WHERE user='username' SYNC +``` + +Los usuarios de solo lectura solo pueden detener sus propias consultas. + +De forma predeterminada, se utiliza la versión asincrónica de las consultas (`ASYNC`), que no espera la confirmación de que las consultas se han detenido. + +La versión síncrona (`SYNC`) espera a que se detengan todas las consultas y muestra información sobre cada proceso a medida que se detiene. +La respuesta contiene el `kill_status` columna, que puede tomar los siguientes valores: + +1. ‘finished’ – La consulta se ha finalizado correctamente. +2. ‘waiting’ – Esperando a que finalice la consulta después de enviarle una señal para finalizar. +3. Los otros valores consultan por qué no se puede detener. + +Una consulta de prueba (`TEST`) sólo comprueba los derechos del usuario y muestra una lista de consultas para detener. + +## MUTACIÓN DE MATAR {#kill-mutation} + +``` sql +KILL MUTATION [ON CLUSTER cluster] + WHERE + [TEST] + [FORMAT format] +``` + +Intenta cancelar y quitar [mutación](alter.md#alter-mutations) que se están ejecutando actualmente. Las mutaciones para cancelar se seleccionan en el [`system.mutations`](../operations/system_tables.md#system_tables-mutations) utilizando el filtro especificado por el `WHERE` cláusula de la `KILL` consulta. + +Una consulta de prueba (`TEST`) sólo comprueba los derechos del usuario y muestra una lista de consultas para detener. + +Ejemplos: + +``` sql +-- Cancel and remove all mutations of the single table: +KILL MUTATION WHERE database = 'default' AND table = 'table' + +-- Cancel the specific mutation: +KILL MUTATION WHERE database = 'default' AND table = 'table' AND mutation_id = 'mutation_3.txt' +``` + +La consulta es útil cuando una mutación está bloqueada y no puede finalizar (por ejemplo, si alguna función en la consulta de mutación arroja una excepción cuando se aplica a los datos contenidos en la tabla). + +Los cambios ya realizados por la mutación no se revierten. 
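+
+Como esbozo orientativo, antes de cancelar una mutación bloqueada puede localizarla consultando la tabla `system.mutations` (el filtro por `database = 'default'` es solo un ejemplo):
+
+``` sql
+-- Lista las mutaciones aún no terminadas y el último motivo de fallo registrado
+SELECT database, table, mutation_id, command, is_done, latest_fail_reason
+FROM system.mutations
+WHERE database = 'default' AND is_done = 0
+```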
+ +## OPTIMIZAR {#misc-operations-optimize} + +``` sql +OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE] +``` + +Esta consulta intenta inicializar una combinación no programada de partes de datos para tablas con un motor de tablas [Método de codificación de datos:](../operations/table_engines/mergetree.md) Familia. + +El `OPTMIZE` consulta también es compatible con el [Método de codificación de datos:](../operations/table_engines/materializedview.md) y el [Búfer](../operations/table_engines/buffer.md) motor. No se admiten otros motores de tabla. + +Cuando `OPTIMIZE` se utiliza con el [ReplicatedMergeTree](../operations/table_engines/replication.md) la familia de motores de tablas, ClickHouse crea una tarea para fusionar y espera la ejecución en todos los nodos (si `replication_alter_partitions_sync` está habilitada la configuración). + +- Si `OPTIMIZE` no realiza una fusión por ningún motivo, no notifica al cliente. Para habilitar las notificaciones, [Optize\_throw\_if\_noop](../operations/settings/settings.md#setting-optimize_throw_if_noop) configuración. +- Si especifica un `PARTITION`, sólo la partición especificada está optimizada. [Cómo establecer la expresión de partición](alter.md#alter-how-to-specify-part-expr). +- Si especifica `FINAL`, la optimización se realiza incluso cuando todos los datos ya están en una parte. +- Si especifica `DEDUPLICATE`, luego se deduplicarán filas completamente idénticas (se comparan todas las columnas), tiene sentido solo para el motor MergeTree. + +!!! warning "Advertencia" + `OPTIMIZE` no se puede arreglar el “Too many parts” error. + +## Renombrar {#misc-operations-rename} + +Cambia el nombre de una o más tablas. + +``` sql +RENAME TABLE [db11.]name11 TO [db12.]name12, [db21.]name21 TO [db22.]name22, ... [ON CLUSTER cluster] +``` + +Todas las tablas se renombran bajo bloqueo global. Cambiar el nombre de las tablas es una operación ligera. Si ha indicado otra base de datos después de TO, la tabla se moverá a esta base de datos. Sin embargo, los directorios con bases de datos deben residir en el mismo sistema de archivos (de lo contrario, se devuelve un error). + +## ESTABLECER {#query-set} + +``` sql +SET param = value +``` + +Asignar `value` Angeles `param` [configuración](../operations/settings/index.md) para la sesión actual. No se puede cambiar [configuración del servidor](../operations/server_settings/index.md) de esta manera. + +También puede establecer todos los valores del perfil de configuración especificado en una sola consulta. + +``` sql +SET profile = 'profile-name-from-the-settings-file' +``` + +Para obtener más información, consulte [Configuración](../operations/settings/settings.md). + +## TRUNCAR {#truncate} + +``` sql +TRUNCATE TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] +``` + +Elimina todos los datos de una tabla. Cuando la cláusula `IF EXISTS` se omite, la consulta devuelve un error si la tabla no existe. + +El `TRUNCATE` consulta no es compatible con [Vista](../operations/table_engines/view.md), [File](../operations/table_engines/file.md), [URL](../operations/table_engines/url.md) y [Nulo](../operations/table_engines/null.md) motores de mesa. + +## UTILIZAR {#use} + +``` sql +USE db +``` + +Permite establecer la base de datos actual para la sesión. +La base de datos actual se utiliza para buscar tablas si la base de datos no está definida explícitamente en la consulta con un punto antes del nombre de la tabla. 
+Esta consulta no se puede realizar cuando se utiliza el protocolo HTTP, ya que no existe un concepto de sesión. + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/misc/) diff --git a/docs/es/query_language/operators.md b/docs/es/query_language/operators.md new file mode 100644 index 00000000000..d6883ef9c18 --- /dev/null +++ b/docs/es/query_language/operators.md @@ -0,0 +1,271 @@ +# Operador {#operators} + +Todos los operadores se transforman a sus funciones correspondientes en la etapa de análisis de consulta de acuerdo con su precedencia y asociatividad. +Los grupos de operadores se enumeran en orden de prioridad (cuanto más alto esté en la lista, más temprano estará conectado el operador a sus argumentos). + +## Operadores de acceso {#access-operators} + +`a[N]` – Acceso a un elemento de una matriz. El `arrayElement(a, N)` función. + +`a.N` – El acceso a un elemento de tupla. El `tupleElement(a, N)` función. + +## Operador de negación numérica {#numeric-negation-operator} + +`-a` – El `negate (a)` función. + +## Operadores de multiplicación y división {#multiplication-and-division-operators} + +`a * b` – El `multiply (a, b)` función. + +`a / b` – El `divide(a, b)` función. + +`a % b` – El `modulo(a, b)` función. + +## Operadores de suma y resta {#addition-and-subtraction-operators} + +`a + b` – El `plus(a, b)` función. + +`a - b` – El `minus(a, b)` función. + +## Operadores de comparación {#comparison-operators} + +`a = b` – El `equals(a, b)` función. + +`a == b` – El `equals(a, b)` función. + +`a != b` – El `notEquals(a, b)` función. + +`a <> b` – El `notEquals(a, b)` función. + +`a <= b` – El `lessOrEquals(a, b)` función. + +`a >= b` – El `greaterOrEquals(a, b)` función. + +`a < b` – El `less(a, b)` función. + +`a > b` – El `greater(a, b)` función. + +`a LIKE s` – El `like(a, b)` función. + +`a NOT LIKE s` – El `notLike(a, b)` función. + +`a BETWEEN b AND c` – Lo mismo que `a >= b AND a <= c`. + +`a NOT BETWEEN b AND c` – Lo mismo que `a < b OR a > c`. + +## Operadores para trabajar con conjuntos de datos {#operators-for-working-with-data-sets} + +*Ver [IN operadores](select.md#select-in-operators).* + +`a IN ...` – El `in(a, b)` función. + +`a NOT IN ...` – El `notIn(a, b)` función. + +`a GLOBAL IN ...` – El `globalIn(a, b)` función. + +`a GLOBAL NOT IN ...` – El `globalNotIn(a, b)` función. + +## Operadores para trabajar con fechas y horas {#operators-datetime} + +### EXTRAER {#operator-extract} + +``` sql +EXTRACT(part FROM date); +``` + +Extrae una parte de una fecha determinada. Por ejemplo, puede recuperar un mes a partir de una fecha determinada o un segundo a partir de una hora. + +El `part` parámetro especifica qué parte de la fecha se va a recuperar. Los siguientes valores están disponibles: + +- `DAY` — El día del mes. Valores posibles: 1-31. +- `MONTH` — El número de un mes. Valores posibles: 1-12. +- `YEAR` — Año. +- `SECOND` — Segundo. Valores posibles: 0–59. +- `MINUTE` — Minuto. Valores posibles: 0–59. +- `HOUR` — Hora. Valores posibles: 0–23. + +El `part` El parámetro no distingue entre mayúsculas y minúsculas. + +El `date` parámetro especifica la fecha o la hora a procesar. Bien [Fecha](../data_types/date.md) o [FechaHora](../data_types/datetime.md) tipo es compatible. + +Ejemplos: + +``` sql +SELECT EXTRACT(DAY FROM toDate('2017-06-15')); +SELECT EXTRACT(MONTH FROM toDate('2017-06-15')); +SELECT EXTRACT(YEAR FROM toDate('2017-06-15')); +``` + +En el siguiente ejemplo creamos una tabla e insertamos en ella un valor con el `DateTime` tipo. 
+ +``` sql +CREATE TABLE test.Orders +( + OrderId UInt64, + OrderName String, + OrderDate DateTime +) +ENGINE = Log; +``` + +``` sql +INSERT INTO test.Orders VALUES (1, 'Jarlsberg Cheese', toDateTime('2008-10-11 13:23:44')); +``` + +``` sql +SELECT + toYear(OrderDate) AS OrderYear, + toMonth(OrderDate) AS OrderMonth, + toDayOfMonth(OrderDate) AS OrderDay, + toHour(OrderDate) AS OrderHour, + toMinute(OrderDate) AS OrderMinute, + toSecond(OrderDate) AS OrderSecond +FROM test.Orders; +``` + +``` text +┌─OrderYear─┬─OrderMonth─┬─OrderDay─┬─OrderHour─┬─OrderMinute─┬─OrderSecond─┐ +│ 2008 │ 10 │ 11 │ 13 │ 23 │ 44 │ +└───────────┴────────────┴──────────┴───────────┴─────────────┴─────────────┘ +``` + +Puedes ver más ejemplos en [prueba](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00619_extract.sql). + +### INTERVALO {#operator-interval} + +Crea un [Intervalo](../data_types/special_data_types/interval.md)-type valor que debe utilizarse en operaciones aritméticas con [Fecha](../data_types/date.md) y [FechaHora](../data_types/datetime.md)-tipo valores. + +Tipos de intervalos: +- `SECOND` +- `MINUTE` +- `HOUR` +- `DAY` +- `WEEK` +- `MONTH` +- `QUARTER` +- `YEAR` + +!!! warning "Advertencia" + Los intervalos con diferentes tipos no se pueden combinar. No puede usar expresiones como `INTERVAL 4 DAY 1 HOUR`. Exprese los intervalos en unidades que son más pequeñas o iguales a la unidad más pequeña del intervalo, por ejemplo `INTERVAL 25 HOUR`. Puede usar operaciones consequtive como en el siguiente ejemplo. + +Ejemplo: + +``` sql +SELECT now() AS current_date_time, current_date_time + INTERVAL 4 DAY + INTERVAL 3 HOUR +``` + +``` text +┌───current_date_time─┬─plus(plus(now(), toIntervalDay(4)), toIntervalHour(3))─┐ +│ 2019-10-23 11:16:28 │ 2019-10-27 14:16:28 │ +└─────────────────────┴────────────────────────────────────────────────────────┘ +``` + +**Ver también** + +- [Intervalo](../data_types/special_data_types/interval.md) Tipo de datos +- [ToInterval](functions/type_conversion_functions.md#function-tointerval) funciones de conversión de tipo + +## Operador de Negación Lógica {#logical-negation-operator} + +`NOT a` – El `not(a)` función. + +## Operador lógico and {#logical-and-operator} + +`a AND b` – El`and(a, b)` función. + +## Operador lógico or {#logical-or-operator} + +`a OR b` – El `or(a, b)` función. + +## Operador condicional {#conditional-operator} + +`a ? b : c` – El `if(a, b, c)` función. + +Nota: + +El operador condicional calcula los valores de b y c, luego verifica si se cumple la condición a y luego devuelve el valor correspondiente. Si `b` o `C` es una [arrayJoin()](functions/array_join.md#functions_arrayjoin) función, cada fila se replicará independientemente de la “a” condición. + +## Expresión condicional {#operator-case} + +``` sql +CASE [x] + WHEN a THEN b + [WHEN ... THEN ...] + [ELSE c] +END +``` + +Si `x` se especifica, entonces `transform(x, [a, ...], [b, ...], c)` se utiliza la función. De lo contrario – `multiIf(a, b, ..., c)`. + +Si no hay `ELSE c` cláusula en la expresión, el valor predeterminado es `NULL`. + +El `transform` no funciona con `NULL`. + +## Operador de Concatenación {#concatenation-operator} + +`s1 || s2` – El `concat(s1, s2) function.` + +## Operador de Creación Lambda {#lambda-creation-operator} + +`x -> expr` – El `lambda(x, expr) function.` + +Los siguientes operadores no tienen prioridad, ya que son corchetes: + +## Operador de creación de matrices {#array-creation-operator} + +`[x1, ...]` – El `array(x1, ...) 
function.` + +## Operador de creación de tupla {#tuple-creation-operator} + +`(x1, x2, ...)` – El `tuple(x2, x2, ...) function.` + +## Asociatividad {#associativity} + +Todos los operadores binarios han dejado asociatividad. Por ejemplo, `1 + 2 + 3` se transforma a `plus(plus(1, 2), 3)`. +A veces esto no funciona de la manera que usted espera. Por ejemplo, `SELECT 4 > 2 > 3` resultará en 0. + +Para la eficiencia, el `and` y `or` funciones aceptan cualquier número de argumentos. Las cadenas correspondientes de `AND` y `OR` operadores se transforman en una sola llamada de estas funciones. + +## Comprobación de `NULL` {#checking-for-null} + +ClickHouse soporta el `IS NULL` y `IS NOT NULL` operador. + +### ES NULO {#operator-is-null} + +- Para [NULO](../data_types/nullable.md) valores de tipo, el `IS NULL` operador devuelve: + - `1` Español `NULL`. + - `0` de lo contrario. +- Para otros valores, el `IS NULL` operador siempre devuelve `0`. + + + +``` sql +SELECT x+100 FROM t_null WHERE y IS NULL +``` + +``` text +┌─plus(x, 100)─┐ +│ 101 │ +└──────────────┘ +``` + +### NO ES NULO {#is-not-null} + +- Para [NULO](../data_types/nullable.md) valores de tipo, el `IS NOT NULL` operador devuelve: + - `0` Español `NULL`. + - `1` de lo contrario. +- Para otros valores, el `IS NOT NULL` operador siempre devuelve `1`. + + + +``` sql +SELECT * FROM t_null WHERE y IS NOT NULL +``` + +``` text +┌─x─┬─y─┐ +│ 2 │ 3 │ +└───┴───┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/operators/) diff --git a/docs/es/query_language/select.md b/docs/es/query_language/select.md new file mode 100644 index 00000000000..e886b0f0fcb --- /dev/null +++ b/docs/es/query_language/select.md @@ -0,0 +1,1375 @@ +# SELECCIONAR consultas Sintaxis {#select-queries-syntax} + +`SELECT` realiza la recuperación de datos. + +``` sql +[WITH expr_list|(subquery)] +SELECT [DISTINCT] expr_list +[FROM [db.]table | (subquery) | table_function] [FINAL] +[SAMPLE sample_coeff] +[ARRAY JOIN ...] +[GLOBAL] [ANY|ALL] [INNER|LEFT|RIGHT|FULL|CROSS] [OUTER] JOIN (subquery)|table USING columns_list +[PREWHERE expr] +[WHERE expr] +[GROUP BY expr_list] [WITH TOTALS] +[HAVING expr] +[ORDER BY expr_list] +[LIMIT [offset_value, ]n BY columns] +[LIMIT [n, ]m] +[UNION ALL ...] +[INTO OUTFILE filename] +[FORMAT format] +``` + +Todas las cláusulas son opcionales, excepto la lista requerida de expresiones inmediatamente después de SELECT. +Las siguientes cláusulas se describen casi en el mismo orden que en el transportador de ejecución de consultas. + +Si la consulta omite el `DISTINCT`, `GROUP BY` y `ORDER BY` cláusulas y el `IN` y `JOIN` subconsultas, la consulta se procesará por completo, utilizando O (1) cantidad de RAM. +De lo contrario, la consulta podría consumir mucha RAM si no se especifican las restricciones adecuadas: `max_memory_usage`, `max_rows_to_group_by`, `max_rows_to_sort`, `max_rows_in_distinct`, `max_bytes_in_distinct`, `max_rows_in_set`, `max_bytes_in_set`, `max_rows_in_join`, `max_bytes_in_join`, `max_bytes_before_external_sort`, `max_bytes_before_external_group_by`. Para obtener más información, consulte la sección «Settings». Es posible utilizar la clasificación externa (guardar tablas temporales en un disco) y la agregación externa. `The system does not have "merge join"`. + +### CON Cláusula {#with-clause} + +Esta sección proporciona soporte para expresiones de tabla común ([CTE](https://en.wikipedia.org/wiki/Hierarchical_and_recursive_queries_in_SQL)), con algunas limitaciones: +1. 
No se admiten consultas recursivas +2. Cuando se usa una subconsulta dentro de la sección WITH, su resultado debe ser escalar con exactamente una fila +3. Los resultados de la expresión no están disponibles en las subconsultas +Los resultados de las expresiones de la cláusula WITH se pueden usar dentro de la cláusula SELECT. + +Ejemplo 1: Usar expresión constante como «variable» + +``` sql +WITH '2019-08-01 15:23:00' as ts_upper_bound +SELECT * +FROM hits +WHERE + EventDate = toDate(ts_upper_bound) AND + EventTime <= ts_upper_bound +``` + +Ejemplo 2: Evictar el resultado de la expresión de sum(bytes) de la lista de columnas de la cláusula SELECT + +``` sql +WITH sum(bytes) as s +SELECT + formatReadableSize(s), + table +FROM system.parts +GROUP BY table +ORDER BY s +``` + +Ejemplo 3: Uso de los resultados de la subconsulta escalar + +``` sql +/* this example would return TOP 10 of most huge tables */ +WITH + ( + SELECT sum(bytes) + FROM system.parts + WHERE active + ) AS total_disk_usage +SELECT + (sum(bytes) / total_disk_usage) * 100 AS table_disk_usage, + table +FROM system.parts +GROUP BY table +ORDER BY table_disk_usage DESC +LIMIT 10 +``` + +Ejemplo 4: Reutilización de la expresión en subconsulta +Como solución alternativa para la limitación actual para el uso de expresiones en subconsultas, puede duplicarla. + +``` sql +WITH ['hello'] AS hello +SELECT + hello, + * +FROM +( + WITH ['hello'] AS hello + SELECT hello +) +``` + +``` text +┌─hello─────┬─hello─────┐ +│ ['hello'] │ ['hello'] │ +└───────────┴───────────┘ +``` + +### Cláusula FROM {#select-from} + +Si se omite la cláusula FROM, los datos se leerán desde el `system.one` tabla. +El `system.one` table contiene exactamente una fila (esta tabla cumple el mismo propósito que la tabla DUAL que se encuentra en otros DBMS). + +El `FROM` cláusula especifica la fuente de la que se leen los datos: + +- Tabla +- Subconsultas +- [Función de la tabla](table_functions/index.md) + +`ARRAY JOIN` y el regular `JOIN` también se pueden incluir (ver más abajo). + +En lugar de una tabla, el `SELECT` subconsulta se puede especificar entre paréntesis. +A diferencia del SQL estándar, no es necesario especificar un sinónimo después de una subconsulta. + +Para ejecutar una consulta, todas las columnas enumeradas en la consulta se extraen de la tabla adecuada. Las columnas no necesarias para la consulta externa se eliminan de las subconsultas. +Si una consulta no muestra ninguna columnas (por ejemplo, `SELECT count() FROM t`), alguna columna se extrae de la tabla de todos modos (se prefiere la más pequeña), para calcular el número de filas. + +#### Modificador FINAL {#select-from-final} + +Aplicable al seleccionar datos de tablas del [Método de codificación de datos:](../operations/table_engines/mergetree.md)-Familia de motores distintos de `GraphiteMergeTree`. Cuando `FINAL` ClickHouse fusiona completamente los datos antes de devolver el resultado y, por lo tanto, realiza todas las transformaciones de datos que ocurren durante las fusiones para el motor de tabla dado. + +También soportado para: +- [Replicado](../operations/table_engines/replication.md) versiones de `MergeTree` motor. +- [Vista](../operations/table_engines/view.md), [Búfer](../operations/table_engines/buffer.md), [Distribuido](../operations/table_engines/distributed.md), y [Método de codificación de datos:](../operations/table_engines/materializedview.md) motores que funcionan sobre otros motores, siempre que se hayan creado sobre `MergeTree`-mesas de motor. 
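+
+A modo de esbozo (el nombre de la tabla es hipotético y se asume un motor `ReplacingMergeTree`), `FINAL` obliga a aplicar la lógica de fusión del motor antes de devolver el resultado:
+
+``` sql
+SELECT *
+FROM replacing_table
+FINAL
+WHERE CounterID = 34
+```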
+ +Consultas que usan `FINAL` se ejecutan no tan rápido como las consultas similares que no lo hacen, porque: + +- La consulta se ejecuta en un solo subproceso y los datos se combinan durante la ejecución de la consulta. +- Consultas con `FINAL` leer columnas de clave primaria además de las columnas especificadas en la consulta. + +En la mayoría de los casos, evite usar `FINAL`. + +### Cláusula SAMPLE {#select-sample-clause} + +El `SAMPLE` cláusula permite un procesamiento de consultas aproximado. + +Cuando se habilita el muestreo de datos, la consulta no se realiza en todos los datos, sino solo en una cierta fracción de datos (muestra). Por ejemplo, si necesita calcular estadísticas para todas las visitas, es suficiente ejecutar la consulta en la fracción 1/10 de todas las visitas y luego multiplicar el resultado por 10. + +El procesamiento de consultas aproximado puede ser útil en los siguientes casos: + +- Cuando tiene requisitos de temporización estrictos (como \<100 ms) pero no puede justificar el costo de recursos de hardware adicionales para cumplirlos. +- Cuando sus datos brutos no son precisos, por lo que la aproximación no degrada notablemente la calidad. +- Los requisitos comerciales se centran en los resultados aproximados (por rentabilidad o para comercializar los resultados exactos a los usuarios premium). + +!!! note "Nota" + Sólo puede utilizar el muestreo con las tablas en el [Método de codificación de datos:](../operations/table_engines/mergetree.md) familia, y sólo si la expresión de muestreo se especificó durante la creación de la tabla (ver [Motor MergeTree](../operations/table_engines/mergetree.md#table_engine-mergetree-creating-a-table)). + +Las características del muestreo de datos se enumeran a continuación: + +- El muestreo de datos es un mecanismo determinista. El resultado de la misma `SELECT .. SAMPLE` la consulta es siempre la misma. +- El muestreo funciona consistentemente para diferentes tablas. Para tablas con una sola clave de muestreo, una muestra con el mismo coeficiente siempre selecciona el mismo subconjunto de datos posibles. Por ejemplo, una muestra de ID de usuario toma filas con el mismo subconjunto de todos los ID de usuario posibles de diferentes tablas. Esto significa que puede utilizar el ejemplo en subconsultas [ES](#select-in-operators) clausula. Además, puede unir muestras usando el [UNIR](#select-join) clausula. +- El muestreo permite leer menos datos de un disco. Tenga en cuenta que debe especificar la clave de muestreo correctamente. Para obtener más información, consulte [Creación de una tabla MergeTree](../operations/table_engines/mergetree.md#table_engine-mergetree-creating-a-table). + +Para el `SAMPLE` cláusula se admite la siguiente sintaxis: + +| Sintaxis de la cláusula MUESTRA | Descripción | +|---------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `SAMPLE k` | Aquí `k` es el número de 0 a 1.
La consulta se ejecuta en `k` de datos. Por ejemplo, `SAMPLE 0.1` Ejecuta la consulta en el 10% de los datos. [Leer más](#select-sample-k) | +| `SAMPLE n` | Aquí `n` es un entero suficientemente grande.
La consulta se ejecuta en una muestra de al menos `n` filas (pero no significativamente más que esto). Por ejemplo, `SAMPLE 10000000` ejecuta la consulta en un mínimo de 10.000.000 de filas. [Leer más](#select-sample-n) | +| `SAMPLE k OFFSET m` | Aquí `k` y `m` son los números del 0 al 1.
La consulta se ejecuta en una muestra de `k` de los datos. Los datos utilizados para el ejemplo se compensan por `m` fracción. [Leer más](#select-sample-offset) | + +#### MUESTRA k {#select-sample-k} + +Aquí `k` es el número de 0 a 1 (se admiten notaciones fraccionarias y decimales). Por ejemplo, `SAMPLE 1/2` o `SAMPLE 0.5`. + +En un `SAMPLE k` cláusula, la muestra se toma de la `k` de datos. El ejemplo se muestra a continuación: + +``` sql +SELECT + Title, + count() * 10 AS PageViews +FROM hits_distributed +SAMPLE 0.1 +WHERE + CounterID = 34 +GROUP BY Title +ORDER BY PageViews DESC LIMIT 1000 +``` + +En este ejemplo, la consulta se ejecuta en una muestra de 0,1 (10%) de datos. Los valores de las funciones agregadas no se corrigen automáticamente, por lo que para obtener un resultado aproximado, el valor `count()` se multiplica manualmente por 10. + +#### MUESTRA n {#select-sample-n} + +Aquí `n` es un entero suficientemente grande. Por ejemplo, `SAMPLE 10000000`. + +En este caso, la consulta se ejecuta en una muestra de al menos `n` filas (pero no significativamente más que esto). Por ejemplo, `SAMPLE 10000000` ejecuta la consulta en un mínimo de 10.000.000 de filas. + +Dado que la unidad mínima para la lectura de datos es un gránulo (su tamaño se establece mediante el `index_granularity` ajuste), tiene sentido establecer una muestra que es mucho más grande que el tamaño del gránulo. + +Cuando se utiliza el `SAMPLE n` cláusula, no sabe qué porcentaje relativo de datos se procesó. Por lo tanto, no conoce el coeficiente por el que se deben multiplicar las funciones agregadas. Utilice el `_sample_factor` columna virtual para obtener el resultado aproximado. + +El `_sample_factor` columna contiene coeficientes relativos que se calculan dinámicamente. Esta columna se crea automáticamente cuando [crear](../operations/table_engines/mergetree.md#table_engine-mergetree-creating-a-table) una tabla con la clave de muestreo especificada. Los ejemplos de uso del `_sample_factor` columna se muestran a continuación. + +Consideremos la tabla `visits`, que contiene las estadísticas sobre las visitas al sitio. El primer ejemplo muestra cómo calcular el número de páginas vistas: + +``` sql +SELECT sum(PageViews * _sample_factor) +FROM visits +SAMPLE 10000000 +``` + +El siguiente ejemplo muestra cómo calcular el número total de visitas: + +``` sql +SELECT sum(_sample_factor) +FROM visits +SAMPLE 10000000 +``` + +El siguiente ejemplo muestra cómo calcular la duración media de la sesión. Tenga en cuenta que no necesita usar el coeficiente relativo para calcular los valores promedio. + +``` sql +SELECT avg(Duration) +FROM visits +SAMPLE 10000000 +``` + +#### MUESTRA k OFFSET m {#select-sample-offset} + +Aquí `k` y `m` son números del 0 al 1. Los ejemplos se muestran a continuación. + +**Ejemplo 1** + +``` sql +SAMPLE 1/10 +``` + +En este ejemplo, la muestra es 1/10 de todos los datos: + +`[++------------------]` + +**Ejemplo 2** + +``` sql +SAMPLE 1/10 OFFSET 1/2 +``` + +Aquí, se toma una muestra del 10% de la segunda mitad de los datos. + +`[----------++--------]` + +### ARRAY JOIN Cláusula {#select-array-join-clause} + +Permite ejecutar `JOIN` con una matriz o estructura de datos anidada. La intención es similar a la [arrayJoin](functions/array_join.md#functions_arrayjoin) función, pero su funcionalidad es más amplia. + +``` sql +SELECT +FROM +[LEFT] ARRAY JOIN +[WHERE|PREWHERE ] +... +``` + +Sólo puede especificar una sola `ARRAY JOIN` cláusula en una consulta. 
+ +El orden de ejecución de la consulta se optimiza cuando se ejecuta `ARRAY JOIN`. Aunque `ARRAY JOIN` debe especificarse siempre antes de la `WHERE/PREWHERE` cláusula, se puede realizar ya sea antes `WHERE/PREWHERE` (si el resultado es necesario en esta cláusula), o después de completarlo (para reducir el volumen de cálculos). El optimizador de consultas controla el orden de procesamiento. + +Tipos admitidos de `ARRAY JOIN` se enumeran a continuación: + +- `ARRAY JOIN` - En este caso, las matrices vacías no se incluyen en el resultado de `JOIN`. +- `LEFT ARRAY JOIN` - El resultado de `JOIN` contiene filas con matrices vacías. El valor de una matriz vacía se establece en el valor predeterminado para el tipo de elemento de matriz (normalmente 0, cadena vacía o NULL). + +Los siguientes ejemplos demuestran el uso de la `ARRAY JOIN` y `LEFT ARRAY JOIN` clausula. Vamos a crear una tabla con un [Matriz](../data_types/array.md) escriba la columna e inserte valores en ella: + +``` sql +CREATE TABLE arrays_test +( + s String, + arr Array(UInt8) +) ENGINE = Memory; + +INSERT INTO arrays_test +VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []); +``` + +``` text +┌─s───────────┬─arr─────┐ +│ Hello │ [1,2] │ +│ World │ [3,4,5] │ +│ Goodbye │ [] │ +└─────────────┴─────────┘ +``` + +El siguiente ejemplo utiliza el `ARRAY JOIN` clausula: + +``` sql +SELECT s, arr +FROM arrays_test +ARRAY JOIN arr; +``` + +``` text +┌─s─────┬─arr─┐ +│ Hello │ 1 │ +│ Hello │ 2 │ +│ World │ 3 │ +│ World │ 4 │ +│ World │ 5 │ +└───────┴─────┘ +``` + +El siguiente ejemplo utiliza el `LEFT ARRAY JOIN` clausula: + +``` sql +SELECT s, arr +FROM arrays_test +LEFT ARRAY JOIN arr; +``` + +``` text +┌─s───────────┬─arr─┐ +│ Hello │ 1 │ +│ Hello │ 2 │ +│ World │ 3 │ +│ World │ 4 │ +│ World │ 5 │ +│ Goodbye │ 0 │ +└─────────────┴─────┘ +``` + +#### Uso de alias {#using-aliases} + +Se puede especificar un alias para una matriz en el `ARRAY JOIN` clausula. En este caso, este alias puede acceder a un elemento de matriz, pero el nombre original tiene acceso a la matriz en sí. Ejemplo: + +``` sql +SELECT s, arr, a +FROM arrays_test +ARRAY JOIN arr AS a; +``` + +``` text +┌─s─────┬─arr─────┬─a─┐ +│ Hello │ [1,2] │ 1 │ +│ Hello │ [1,2] │ 2 │ +│ World │ [3,4,5] │ 3 │ +│ World │ [3,4,5] │ 4 │ +│ World │ [3,4,5] │ 5 │ +└───────┴─────────┴───┘ +``` + +Usando alias, puede realizar `ARRAY JOIN` con una matriz externa. Por ejemplo: + +``` sql +SELECT s, arr_external +FROM arrays_test +ARRAY JOIN [1, 2, 3] AS arr_external; +``` + +``` text +┌─s───────────┬─arr_external─┐ +│ Hello │ 1 │ +│ Hello │ 2 │ +│ Hello │ 3 │ +│ World │ 1 │ +│ World │ 2 │ +│ World │ 3 │ +│ Goodbye │ 1 │ +│ Goodbye │ 2 │ +│ Goodbye │ 3 │ +└─────────────┴──────────────┘ +``` + +Múltiples matrices se pueden separar por comas en el `ARRAY JOIN` clausula. En este caso, `JOIN` se realiza con ellos simultáneamente (la suma directa, no el producto cartesiano). Tenga en cuenta que todas las matrices deben tener el mismo tamaño. 
Ejemplo: + +``` sql +SELECT s, arr, a, num, mapped +FROM arrays_test +ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS mapped; +``` + +``` text +┌─s─────┬─arr─────┬─a─┬─num─┬─mapped─┐ +│ Hello │ [1,2] │ 1 │ 1 │ 2 │ +│ Hello │ [1,2] │ 2 │ 2 │ 3 │ +│ World │ [3,4,5] │ 3 │ 1 │ 4 │ +│ World │ [3,4,5] │ 4 │ 2 │ 5 │ +│ World │ [3,4,5] │ 5 │ 3 │ 6 │ +└───────┴─────────┴───┴─────┴────────┘ +``` + +El siguiente ejemplo utiliza el [arrayEnumerate](functions/array_functions.md#array_functions-arrayenumerate) función: + +``` sql +SELECT s, arr, a, num, arrayEnumerate(arr) +FROM arrays_test +ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num; +``` + +``` text +┌─s─────┬─arr─────┬─a─┬─num─┬─arrayEnumerate(arr)─┐ +│ Hello │ [1,2] │ 1 │ 1 │ [1,2] │ +│ Hello │ [1,2] │ 2 │ 2 │ [1,2] │ +│ World │ [3,4,5] │ 3 │ 1 │ [1,2,3] │ +│ World │ [3,4,5] │ 4 │ 2 │ [1,2,3] │ +│ World │ [3,4,5] │ 5 │ 3 │ [1,2,3] │ +└───────┴─────────┴───┴─────┴─────────────────────┘ +``` + +#### ARRAY JOIN con estructura de datos anidada {#array-join-with-nested-data-structure} + +`ARRAY`JOIN\`\` también funciona con [estructuras de datos anidados](../data_types/nested_data_structures/nested.md). Ejemplo: + +``` sql +CREATE TABLE nested_test +( + s String, + nest Nested( + x UInt8, + y UInt32) +) ENGINE = Memory; + +INSERT INTO nested_test +VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [], []); +``` + +``` text +┌─s───────┬─nest.x──┬─nest.y─────┐ +│ Hello │ [1,2] │ [10,20] │ +│ World │ [3,4,5] │ [30,40,50] │ +│ Goodbye │ [] │ [] │ +└─────────┴─────────┴────────────┘ +``` + +``` sql +SELECT s, `nest.x`, `nest.y` +FROM nested_test +ARRAY JOIN nest; +``` + +``` text +┌─s─────┬─nest.x─┬─nest.y─┐ +│ Hello │ 1 │ 10 │ +│ Hello │ 2 │ 20 │ +│ World │ 3 │ 30 │ +│ World │ 4 │ 40 │ +│ World │ 5 │ 50 │ +└───────┴────────┴────────┘ +``` + +Al especificar nombres de estructuras de datos anidadas en `ARRAY JOIN` el significado es el mismo `ARRAY JOIN` con todos los elementos de la matriz en los que consiste. Los ejemplos se enumeran a continuación: + +``` sql +SELECT s, `nest.x`, `nest.y` +FROM nested_test +ARRAY JOIN `nest.x`, `nest.y`; +``` + +``` text +┌─s─────┬─nest.x─┬─nest.y─┐ +│ Hello │ 1 │ 10 │ +│ Hello │ 2 │ 20 │ +│ World │ 3 │ 30 │ +│ World │ 4 │ 40 │ +│ World │ 5 │ 50 │ +└───────┴────────┴────────┘ +``` + +Esta variación también tiene sentido: + +``` sql +SELECT s, `nest.x`, `nest.y` +FROM nested_test +ARRAY JOIN `nest.x`; +``` + +``` text +┌─s─────┬─nest.x─┬─nest.y─────┐ +│ Hello │ 1 │ [10,20] │ +│ Hello │ 2 │ [10,20] │ +│ World │ 3 │ [30,40,50] │ +│ World │ 4 │ [30,40,50] │ +│ World │ 5 │ [30,40,50] │ +└───────┴────────┴────────────┘ +``` + +Se puede usar un alias para una estructura de datos anidada, con el fin de seleccionar `JOIN` resultado o la matriz de origen. 
Ejemplo:
+
+``` sql
+SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`
+FROM nested_test
+ARRAY JOIN nest AS n;
+```
+
+``` text
+┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┐
+│ Hello │   1 │  10 │ [1,2]   │ [10,20]    │
+│ Hello │   2 │  20 │ [1,2]   │ [10,20]    │
+│ World │   3 │  30 │ [3,4,5] │ [30,40,50] │
+│ World │   4 │  40 │ [3,4,5] │ [30,40,50] │
+│ World │   5 │  50 │ [3,4,5] │ [30,40,50] │
+└───────┴─────┴─────┴─────────┴────────────┘
+```
+
+Ejemplo de uso de la función [arrayEnumerate](functions/array_functions.md#array_functions-arrayenumerate):
+
+``` sql
+SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`, num
+FROM nested_test
+ARRAY JOIN nest AS n, arrayEnumerate(`nest.x`) AS num;
+```
+
+``` text
+┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┬─num─┐
+│ Hello │   1 │  10 │ [1,2]   │ [10,20]    │   1 │
+│ Hello │   2 │  20 │ [1,2]   │ [10,20]    │   2 │
+│ World │   3 │  30 │ [3,4,5] │ [30,40,50] │   1 │
+│ World │   4 │  40 │ [3,4,5] │ [30,40,50] │   2 │
+│ World │   5 │  50 │ [3,4,5] │ [30,40,50] │   3 │
+└───────┴─────┴─────┴─────────┴────────────┴─────┘
+```
+
+### Cláusula JOIN {#select-join}
+
+Une los datos en el sentido de [JOIN de SQL](https://en.wikipedia.org/wiki/Join_(SQL)).
+
+!!! info "Nota"
+    No está relacionado con [ARRAY JOIN](#select-array-join-clause).
+
+``` sql
+SELECT <expr_list>
+FROM <left_subquery>
+[GLOBAL] [ANY|ALL] [INNER|LEFT|RIGHT|FULL|CROSS] [OUTER] JOIN <right_subquery>
+(ON <expr_list>)|(USING <column_list>) ...
+```
+
+Se pueden especificar nombres de tabla en lugar de `<left_subquery>` y `<right_subquery>`. Esto es equivalente a la subconsulta `SELECT * FROM table`, excepto en un caso especial: cuando la tabla tiene el motor [Join](../operations/table_engines/join.md), una estructura preparada para la unión.
+
+#### Tipos admitidos de `JOIN` {#select-join-types}
+
+- `INNER JOIN` (o `JOIN`)
+- `LEFT JOIN` (o `LEFT OUTER JOIN`)
+- `RIGHT JOIN` (o `RIGHT OUTER JOIN`)
+- `FULL JOIN` (o `FULL OUTER JOIN`)
+- `CROSS JOIN` (o `,`)
+
+Véase la descripción estándar de [JOIN en SQL](https://en.wikipedia.org/wiki/Join_(SQL)).
+
+#### JOIN múltiple {#multiple-join}
+
+Al ejecutar consultas, ClickHouse reescribe las uniones de varias tablas como una secuencia de uniones de dos tablas. Por ejemplo, si hay cuatro tablas que unir, ClickHouse une la primera con la segunda, luego une el resultado con la tercera tabla y, en el último paso, une la cuarta.
+
+Si una consulta contiene una cláusula `WHERE`, ClickHouse intenta empujar los filtros de esta cláusula hacia abajo, a través de las uniones intermedias. Si no puede aplicar el filtro a cada unión intermedia, ClickHouse aplica los filtros después de completarse todas las uniones.
+
+Recomendamos la sintaxis `JOIN ON` o `JOIN USING` para crear consultas. Por ejemplo:
+
+``` sql
+SELECT * FROM t1 JOIN t2 ON t1.a = t2.a JOIN t3 ON t1.a = t3.a
+```
+
+Puede utilizar listas de tablas separadas por comas en la cláusula `FROM`. Por ejemplo:
+
+``` sql
+SELECT * FROM t1, t2, t3 WHERE t1.a = t2.a AND t1.a = t3.a
+```
+
+No mezcle estas sintaxis.
+
+ClickHouse no admite directamente la sintaxis con comas, por lo que no recomendamos usarla. El algoritmo intenta reescribir la consulta en términos de `CROSS JOIN` e `INNER JOIN` y luego continúa con el procesamiento de la consulta. Al reescribir la consulta, ClickHouse intenta optimizar el rendimiento y el consumo de memoria. De forma predeterminada, ClickHouse trata las comas como una cláusula `INNER JOIN` y convierte `INNER JOIN` en `CROSS JOIN` cuando el algoritmo no puede garantizar que `INNER JOIN` devuelva los datos requeridos.
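+
+A modo de ilustración (un esbozo con las mismas tablas `t1` y `t2` de los ejemplos anteriores), la reescritura descrita equivale aproximadamente a lo siguiente:
+
+``` sql
+-- Sintaxis con comas
+SELECT * FROM t1, t2 WHERE t1.a = t2.a;
+
+-- Reescritura cuando la condición de igualdad puede extraerse del WHERE
+SELECT * FROM t1 INNER JOIN t2 ON t1.a = t2.a;
+
+-- Si no se puede garantizar un INNER JOIN, se usa CROSS JOIN y el filtro permanece en WHERE
+SELECT * FROM t1 CROSS JOIN t2 WHERE t1.a = t2.a;
+```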
+
+#### Rigor {#select-join-strictness}
+
+- `ALL` — Si la tabla derecha tiene varias filas coincidentes, ClickHouse crea un [producto cartesiano](https://en.wikipedia.org/wiki/Cartesian_product) a partir de las filas coincidentes. Este es el comportamiento estándar de `JOIN` en SQL.
+- `ANY` — Si la tabla derecha tiene varias filas coincidentes, solo se une la primera encontrada. Si la tabla derecha tiene como máximo una fila coincidente, los resultados con las palabras clave `ANY` y `ALL` son los mismos.
+- `ASOF` — Para unir secuencias con una coincidencia no exacta. El uso de `ASOF JOIN` se describe a continuación.
+
+**Uso de ASOF JOIN**
+
+`ASOF JOIN` es útil cuando necesita unir registros que no tienen una coincidencia exacta.
+
+Las tablas para `ASOF JOIN` deben tener una columna de secuencia ordenada. Esta columna no puede ser la única columna de la tabla y debe ser de uno de estos tipos de datos: `UInt32`, `UInt64`, `Float32`, `Float64`, `Date` y `DateTime`.
+
+Sintaxis `ASOF JOIN ... ON`:
+
+``` sql
+SELECT expressions_list
+FROM table_1
+ASOF LEFT JOIN table_2
+ON equi_cond AND closest_match_cond
+```
+
+Puede usar cualquier número de condiciones de igualdad y exactamente una condición de coincidencia más cercana. Por ejemplo, `SELECT count() FROM table_1 ASOF LEFT JOIN table_2 ON table_1.a == table_2.b AND table_2.t <= table_1.t`.
+
+Condiciones admitidas para la coincidencia más cercana: `>`, `>=`, `<`, `<=`.
+
+Sintaxis `ASOF JOIN ... USING`:
+
+``` sql
+SELECT expressions_list
+FROM table_1
+ASOF JOIN table_2
+USING (equi_column1, ... equi_columnN, asof_column)
+```
+
+`ASOF JOIN` utiliza `equi_columnX` para la unión por igualdad y `asof_column` para la unión por la coincidencia más cercana, con la condición `table_1.asof_column >= table_2.asof_column`. La columna `asof_column` es siempre la última de la cláusula `USING`.
+
+Por ejemplo, considere las siguientes tablas:
+
+``` text
+     table_1                           table_2
+  event   | ev_time | user_id       event   | ev_time | user_id
+----------|---------|----------   ----------|---------|----------
+              ...                               ...
+event_1_1 |  12:00  |   42        event_2_1 |  11:59  |   42
+              ...                 event_2_2 |  12:30  |   42
+event_1_2 |  13:00  |   42        event_2_3 |  13:00  |   42
+              ...                               ...
+```
+
+`ASOF JOIN` puede tomar la marca de tiempo de un evento de usuario de `table_1` y encontrar un evento en `table_2` cuya marca de tiempo sea la más cercana según la condición de coincidencia. Los valores de marca de tiempo iguales son los más cercanos, si existen. Aquí, la columna `user_id` se puede utilizar para la unión por igualdad y la columna `ev_time` para la unión por la coincidencia más cercana. En nuestro ejemplo, `event_1_1` se puede unir con `event_2_1` y `event_1_2` se puede unir con `event_2_3`, pero `event_2_2` no se puede unir.
+
+!!! note "Nota"
+    La unión `ASOF` **no** es compatible con el motor de tablas [Join](../operations/table_engines/join.md).
+
+Para establecer el valor de rigor predeterminado, utilice el parámetro de configuración de sesión [join\_default\_strictness](../operations/settings/settings.md#settings-join_default_strictness).
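+
+A modo de esbozo (usando las tablas `table_1` y `table_2` del diagrama anterior, con las columnas `user_id` y `ev_time`), la unión descrita podría escribirse así:
+
+``` sql
+-- Para cada evento de table_1, busca el evento de table_2 del mismo usuario
+-- con la marca de tiempo más cercana que no sea posterior
+SELECT
+    t1.event,
+    t1.ev_time,
+    t2.event AS nearest_event,
+    t2.ev_time AS nearest_ev_time
+FROM table_1 AS t1
+ASOF LEFT JOIN table_2 AS t2
+ON t1.user_id = t2.user_id AND t2.ev_time <= t1.ev_time
+```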
+
+#### GLOBAL JOIN {#global-join}
+
+Cuando se utiliza un `JOIN` normal, la consulta se envía a los servidores remotos. Las subconsultas se ejecutan en cada uno de ellos para crear la tabla derecha, y la unión se realiza con esa tabla. En otras palabras, la tabla derecha se forma en cada servidor por separado.
+
+Cuando se utiliza `GLOBAL ... JOIN`, primero el servidor solicitante ejecuta una subconsulta para calcular la tabla derecha. Esta tabla temporal se pasa a cada servidor remoto y las consultas se ejecutan en ellos utilizando los datos temporales transmitidos.
+
+Tenga cuidado al usar `GLOBAL`. Para obtener más información, consulte la sección [Subconsultas distribuidas](#select-distributed-subqueries).
+
+#### Recomendaciones de uso {#usage-recommendations}
+
+Cuando se ejecuta un `JOIN`, no se optimiza el orden de ejecución en relación con las demás etapas de la consulta. La unión (una búsqueda en la tabla derecha) se ejecuta antes del filtrado en `WHERE` y antes de la agregación. Para establecer explícitamente el orden de procesamiento, recomendamos ejecutar el `JOIN` con subconsultas.
+
+Ejemplo:
+
+``` sql
+SELECT
+    CounterID,
+    hits,
+    visits
+FROM
+(
+    SELECT
+        CounterID,
+        count() AS hits
+    FROM test.hits
+    GROUP BY CounterID
+) ANY LEFT JOIN
+(
+    SELECT
+        CounterID,
+        sum(Sign) AS visits
+    FROM test.visits
+    GROUP BY CounterID
+) USING CounterID
+ORDER BY hits DESC
+LIMIT 10
+```
+
+``` text
+┌─CounterID─┬───hits─┬─visits─┐
+│   1143050 │ 523264 │  13665 │
+│    731962 │ 475698 │ 102716 │
+│    722545 │ 337212 │ 108187 │
+│    722889 │ 252197 │  10547 │
+│   2237260 │ 196036 │   9522 │
+│  23057320 │ 147211 │   7689 │
+│    722818 │  90109 │  17847 │
+│     48221 │  85379 │   4652 │
+│  19762435 │  77807 │   7026 │
+│    722884 │  77492 │  11056 │
+└───────────┴────────┴────────┘
+```
+
+Las subconsultas no permiten establecer nombres ni usarlos para hacer referencia a una columna de una subconsulta concreta.
+Las columnas especificadas en `USING` deben tener los mismos nombres en ambas subconsultas, y las demás columnas deben tener nombres diferentes. Puede usar alias para cambiar los nombres de las columnas en las subconsultas (el ejemplo usa los alias `hits` y `visits`).
+
+La cláusula `USING` especifica una o más columnas por las que unir, lo que establece la igualdad de estas columnas. La lista de columnas se indica sin paréntesis. No se admiten condiciones de unión más complejas.
+
+La tabla derecha (el resultado de la subconsulta) reside en la RAM. Si no hay suficiente memoria, no se puede ejecutar el `JOIN`.
+
+Cada vez que se ejecuta una consulta con el mismo `JOIN`, la subconsulta se ejecuta de nuevo porque el resultado no se almacena en caché. Para evitarlo, use el motor de tablas especial [Join](../operations/table_engines/join.md), que es una estructura preparada para la unión y que siempre está en RAM.
+
+En algunos casos es más eficiente usar `IN` en lugar de `JOIN`.
+Entre los diversos tipos de `JOIN`, el más eficiente es `ANY LEFT JOIN`, seguido de `ANY INNER JOIN`. Los menos eficientes son `ALL LEFT JOIN` y `ALL INNER JOIN`.
+
+Si necesita un `JOIN` para unir tablas de dimensiones (tablas relativamente pequeñas que contienen propiedades de dimensión, como nombres de campañas publicitarias), un `JOIN` puede no ser muy conveniente, porque se vuelve a acceder a la tabla derecha en cada consulta. Para esos casos existe la característica de «diccionarios externos», que debería utilizar en lugar de `JOIN`. Para obtener más información, consulte la sección [Diccionarios externos](dicts/external_dicts.md).
+
+**Limitaciones de memoria**
+
+ClickHouse utiliza el algoritmo de [hash join](https://en.wikipedia.org/wiki/Hash_join). ClickHouse toma la `<right_table>` y crea para ella una tabla hash en RAM.
Si necesita restringir el consumo de memoria de la operación de unión, use la siguiente configuración: + +- [Método de codificación de datos:](../operations/settings/query_complexity.md#settings-max_rows_in_join) — Limita el número de filas en la tabla hash. +- [Método de codificación de datos:](../operations/settings/query_complexity.md#settings-max_bytes_in_join) — Limita el tamaño de la tabla hash. + +Cuando se alcanza cualquiera de estos límites, ClickHouse actúa como el [join\_overflow\_mode](../operations/settings/query_complexity.md#settings-join_overflow_mode) configuración instruye. + +#### Procesamiento de celdas vacías o NULL {#processing-of-empty-or-null-cells} + +Al unir tablas, pueden aparecer las celdas vacías. Configuración [Sistema abierto.](../operations/settings/settings.md#join_use_nulls) definir cómo ClickHouse llena estas celdas. + +Si el `JOIN` las llaves hijo [NULO](../data_types/nullable.md) campos, las filas donde al menos una de las claves tiene el valor [NULO](syntax.md#null-literal) no se unen. + +#### Limitaciones de sintaxis {#syntax-limitations} + +Para múltiples `JOIN` cláusulas en una sola `SELECT` Consulta: + +- Tomando todas las columnas a través de `*` está disponible solo si se unen tablas, no subconsultas. +- El `PREWHERE` cláusula no está disponible. + +Para `ON`, `WHERE`, y `GROUP BY` clausula: + +- Las expresiones arbitrarias no se pueden utilizar en `ON`, `WHERE`, y `GROUP BY` cláusulas, pero puede definir una expresión en un `SELECT` cláusula y luego usarla en estas cláusulas a través de un alias. + +### DONDE Cláusula {#select-where} + +Si hay una cláusula where, debe contener una expresión con el tipo UInt8. Esta suele ser una expresión con comparación y operadores lógicos. +Esta expresión se usará para filtrar datos antes de todas las demás transformaciones. + +Si los índices son compatibles con el motor de tablas de base de datos, la expresión se evalúa en función de la capacidad de usar índices. + +### PREWHERE Cláusula {#prewhere-clause} + +Esta cláusula tiene el mismo significado que la cláusula where. La diferencia radica en qué datos se leen de la tabla. +Al usar PREWHERE, primero solo se leen las columnas necesarias para ejecutar PREWHERE. Luego se leen las otras columnas que son necesarias para ejecutar la consulta, pero solo aquellos bloques donde la expresión PREWHERE es verdadera. + +Tiene sentido usar PREWHERE si hay condiciones de filtración utilizadas por una minoría de las columnas de la consulta, pero que proporcionan una filtración de datos fuerte. Esto reduce el volumen de datos a leer. + +Por ejemplo, es útil escribir PREWHERE para consultas que extraen un gran número de columnas, pero que solo tienen filtración para unas pocas columnas. + +PREWHERE solo es compatible con tablas de la `*MergeTree` Familia. + +Una consulta puede especificar simultáneamente PREWHERE y WHERE. En este caso, PREWHERE precede WHERE. + +Si el ‘optimize\_move\_to\_prewhere’ La configuración se establece en 1 y PREWHERE se omite, el sistema utiliza la heurística para mover automáticamente partes de expresiones de WHERE a PREWHERE. + +### GRUPO POR Cláusula {#select-group-by-clause} + +Esta es una de las partes más importantes de un DBMS orientado a columnas. + +Si hay una cláusula GROUP BY, debe contener una lista de expresiones. Cada expresión se mencionará aquí como una «key». +Todas las expresiones de las cláusulas SELECT, HAVING y ORDER BY deben calcularse a partir de claves o de funciones agregadas. 
En otras palabras, cada columna seleccionada de la tabla debe usarse en claves o dentro de funciones agregadas. + +Si una consulta solo contiene columnas de tabla dentro de funciones agregadas, se puede omitir la cláusula GROUP BY y se asume la agregación mediante un conjunto vacío de claves. + +Ejemplo: + +``` sql +SELECT + count(), + median(FetchTiming > 60 ? 60 : FetchTiming), + count() - sum(Refresh) +FROM hits +``` + +Sin embargo, a diferencia del SQL estándar, si la tabla no tiene ninguna fila (o no hay ninguna, o no hay ninguna después de usar WHERE para filtrar), se devuelve un resultado vacío y no el resultado de una de las filas que contienen los valores iniciales de las funciones agregadas. + +A diferencia de MySQL (y conforme a SQL estándar), no puede obtener algún valor de alguna columna que no esté en una función clave o agregada (excepto expresiones constantes). Para evitar esto, puede usar el ‘any’ función de agregado (obtener el primer valor encontrado) o ‘min/max’. + +Ejemplo: + +``` sql +SELECT + domainWithoutWWW(URL) AS domain, + count(), + any(Title) AS title -- getting the first occurred page header for each domain. +FROM hits +GROUP BY domain +``` + +Para cada valor de clave diferente encontrado, GROUP BY calcula un conjunto de valores de función agregados. + +GROUP BY no se admite para columnas de matriz. + +No se puede especificar una constante como argumentos para funciones agregadas. Ejemplo: sum(1). En lugar de esto, puedes deshacerte de la constante. Ejemplo: `count()`. + +#### Procesamiento NULL {#null-processing} + +Para agrupar, ClickHouse interpreta [NULO](syntax.md) como valor, y `NULL=NULL`. + +He aquí un ejemplo para mostrar lo que esto significa. + +Supongamos que tienes esta tabla: + +``` text +┌─x─┬────y─┐ +│ 1 │ 2 │ +│ 2 │ ᴺᵁᴸᴸ │ +│ 3 │ 2 │ +│ 3 │ 3 │ +│ 3 │ ᴺᵁᴸᴸ │ +└───┴──────┘ +``` + +Consulta `SELECT sum(x), y FROM t_null_big GROUP BY y` resultados en: + +``` text +┌─sum(x)─┬────y─┐ +│ 4 │ 2 │ +│ 3 │ 3 │ +│ 5 │ ᴺᵁᴸᴸ │ +└────────┴──────┘ +``` + +Se puede ver que `GROUP BY` para `y = NULL` resumir `x` como si `NULL` es este valor. + +Si pasa varias teclas a `GROUP BY` el resultado le dará todas las combinaciones de la selección, como si `NULL` fueron un valor específico. + +#### CON TOTALS Modificador {#with-totals-modifier} + +Si se especifica el modificador WITH TOTALS, se calculará otra fila. Esta fila tendrá columnas clave que contienen valores predeterminados (zeros o líneas vacías) y columnas de funciones agregadas con los valores calculados en todas las filas (el «total» valor). + +Esta fila adicional se genera en formatos JSON \*, TabSeparated \* y Pretty \*, por separado de las otras filas. En los otros formatos, esta fila no se géneros. + +En los formatos JSON\*, esta fila se muestra como una ‘totals’ campo. En los formatos TabSeparated\*, la fila viene después del resultado principal, precedida por una fila vacía (después de los otros datos). En los formatos Pretty\*, la fila se muestra como una tabla separada después del resultado principal. + +`WITH TOTALS` se puede ejecutar de diferentes maneras cuando HAVING está presente. El comportamiento depende de la ‘totals\_mode’ configuración. +Predeterminada, `totals_mode = 'before_having'`. En este caso, ‘totals’ se calcula en todas las filas, incluidas las que no pasan por HAVING y ‘max\_rows\_to\_group\_by’. 
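+
+A modo de esbozo (la tabla `hits` y la función `domainWithoutWWW` aparecen en ejemplos anteriores; el umbral es hipotético), una consulta con `WITH TOTALS` y `HAVING` tiene esta forma; con el valor predeterminado `before_having`, la fila ‘totals’ se calcula sobre todas las filas, no solo sobre las que pasan el `HAVING`:
+
+``` sql
+SELECT
+    domainWithoutWWW(URL) AS domain,
+    count() AS pageviews
+FROM hits
+GROUP BY domain
+    WITH TOTALS
+HAVING pageviews > 100
+```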
+ +Las otras alternativas incluyen solo las filas que pasan por HAVING en ‘totals’, y comportarse de manera diferente con el ajuste `max_rows_to_group_by` y `group_by_overflow_mode = 'any'`. + +`after_having_exclusive` – No incluya filas que no hayan pasado `max_rows_to_group_by`. En otras palabras, ‘totals’ tendrá menos o el mismo número de filas que si `max_rows_to_group_by` se omitieron. + +`after_having_inclusive` – Incluir todas las filas que no pasaron ‘max\_rows\_to\_group\_by’ es ‘totals’. En otras palabras, ‘totals’ tendrá más o el mismo número de filas como lo haría si `max_rows_to_group_by` se omitieron. + +`after_having_auto` – Cuente el número de filas que pasaron por HAVING. Si es más de una cierta cantidad (de forma predeterminada, 50%), incluya todas las filas que no pasaron ‘max\_rows\_to\_group\_by’ es ‘totals’. De lo contrario, no los incluya. + +`totals_auto_threshold` – Por defecto, 0.5. El coeficiente para `after_having_auto`. + +Si `max_rows_to_group_by` y `group_by_overflow_mode = 'any'` no se utilizan, todas las variaciones de `after_having` son los mismos, y se puede utilizar cualquiera de ellos (por ejemplo, `after_having_auto`). + +Puede usar WITH TOTALS en subconsultas, incluidas las subconsultas en la cláusula JOIN (en este caso, se combinan los valores totales respectivos). + +#### GRUPO POR en memoria externa {#select-group-by-in-external-memory} + +Puede habilitar el volcado de datos temporales en el disco para restringir el uso de memoria durante `GROUP BY`. +El [max\_bytes\_before\_external\_group\_by](../operations/settings/settings.md#settings-max_bytes_before_external_group_by) determina el umbral de consumo de RAM para el dumping `GROUP BY` datos temporales al sistema de archivos. Si se establece en 0 (el valor predeterminado), está deshabilitado. + +Cuando se utiliza `max_bytes_before_external_group_by`, le recomendamos que establezca `max_memory_usage` aproximadamente el doble de alto. Esto es necesario porque hay dos etapas para la agregación: leer la fecha y formar datos intermedios (1) y fusionar los datos intermedios (2). El volcado de datos al sistema de archivos solo puede ocurrir durante la etapa 1. Si los datos temporales no se descargaron, la etapa 2 puede requerir hasta la misma cantidad de memoria que en la etapa 1. + +Por ejemplo, si [Método de codificación de datos:](../operations/settings/settings.md#settings_max_memory_usage) se estableció en 10000000000 y desea usar agregación externa, tiene sentido establecer `max_bytes_before_external_group_by` a 10000000000, y max\_memory\_usage a 20000000000. Cuando se activa la agregación externa (si hubo al menos un volcado de datos temporales), el consumo máximo de RAM es solo un poco más que `max_bytes_before_external_group_by`. + +Con el procesamiento de consultas distribuidas, la agregación externa se realiza en servidores remotos. Para que el servidor solicitante use solo una pequeña cantidad de RAM, establezca `distributed_aggregation_memory_efficient` a 1. + +Al fusionar datos en el disco, así como al fusionar resultados de servidores remotos cuando `distributed_aggregation_memory_efficient` la configuración está habilitada, consume hasta `1/256 * the_number_of_threads` de la cantidad total de RAM. + +Cuando la agregación externa está habilitada, si `max_bytes_before_external_group_by` de datos (es decir, los datos no se enjuagaron), la consulta se ejecuta tan rápido como sin agregación externa. 
Si se vació algún dato temporal, el tiempo de ejecución será varias veces más largo (aproximadamente tres veces). + +Si usted tiene un `ORDER BY` con un `LIMIT` despues `GROUP BY`, entonces la cantidad de RAM usada depende de la cantidad de datos en `LIMIT`, no en toda la tabla. Pero si el `ORDER BY` no tiene `LIMIT`, no se olvide de habilitar la clasificación externa (`max_bytes_before_external_sort`). + +### LIMITAR POR Cláusula {#limit-by-clause} + +Una consulta con el `LIMIT n BY expressions` cláusula selecciona la primera `n` para cada valor distinto de `expressions`. La clave para `LIMIT BY` puede contener cualquier número de [expresiones](syntax.md#syntax-expressions). + +ClickHouse admite la siguiente sintaxis: + +- `LIMIT [offset_value, ]n BY expressions` +- `LIMIT n OFFSET offset_value BY expressions` + +Durante el procesamiento de consultas, ClickHouse selecciona los datos ordenados por clave de ordenación. La clave de ordenación se establece explícitamente utilizando un [ORDEN POR](#select-order-by) cláusula o implícitamente como una propiedad del motor de tablas. Entonces se aplica ClickHouse `LIMIT n BY expressions` y devuelve la primera `n` filas para cada combinación distinta de `expressions`. Si `OFFSET` se especifica, a continuación, para cada bloque de datos que pertenece a una combinación distinta de `expressions`, ClickHouse salta `offset_value` número de filas desde el principio del bloque y devuelve un máximo de `n` filas como resultado. Si `offset_value` es mayor que el número de filas en el bloque de datos, ClickHouse devuelve cero filas del bloque. + +`LIMIT BY` no está relacionado con `LIMIT`. Ambos se pueden usar en la misma consulta. + +**Ejemplos** + +Tabla de muestra: + +``` sql +CREATE TABLE limit_by(id Int, val Int) ENGINE = Memory; +INSERT INTO limit_by values(1, 10), (1, 11), (1, 12), (2, 20), (2, 21); +``` + +Consulta: + +``` sql +SELECT * FROM limit_by ORDER BY id, val LIMIT 2 BY id +``` + +``` text +┌─id─┬─val─┐ +│ 1 │ 10 │ +│ 1 │ 11 │ +│ 2 │ 20 │ +│ 2 │ 21 │ +└────┴─────┘ +``` + +``` sql +SELECT * FROM limit_by ORDER BY id, val LIMIT 1, 2 BY id +``` + +``` text +┌─id─┬─val─┐ +│ 1 │ 11 │ +│ 1 │ 12 │ +│ 2 │ 21 │ +└────┴─────┘ +``` + +El `SELECT * FROM limit_by ORDER BY id, val LIMIT 2 OFFSET 1 BY id` consulta devuelve el mismo resultado. + +La siguiente consulta devuelve las 5 referencias principales para cada `domain, device_type` par con un máximo de 100 filas en total (`LIMIT n BY + LIMIT`). + +``` sql +SELECT + domainWithoutWWW(URL) AS domain, + domainWithoutWWW(REFERRER_URL) AS referrer, + device_type, + count() cnt +FROM hits +GROUP BY domain, referrer, device_type +ORDER BY cnt DESC +LIMIT 5 BY domain, device_type +LIMIT 100 +``` + +### Cláusula HAVING {#having-clause} + +Permite filtrar el resultado recibido después de GROUP BY, similar a la cláusula WHERE. +WHERE y HAVING difieren en que WHERE se realiza antes de la agregación (GROUP BY), mientras que HAVING se realiza después de ella. +Si no se realiza la agregación, no se puede usar HAVING. + +### ORDEN POR CLÁUSULA {#select-order-by} + +La cláusula ORDER BY contiene una lista de expresiones, a las que se puede asignar DESC o ASC (la dirección de clasificación). Si no se especifica la dirección, se supone ASC. ASC se ordena en orden ascendente y DESC en orden descendente. La dirección de ordenación se aplica a una sola expresión, no a toda la lista. Ejemplo: `ORDER BY Visits DESC, SearchPhrase` + +Para ordenar por valores de cadena, puede especificar la intercalación (comparación). 
Ejemplo: `ORDER BY SearchPhrase COLLATE 'tr'` - para ordenar por palabra clave en orden ascendente, utilizando el alfabeto turco, insensible a mayúsculas y minúsculas, suponiendo que las cadenas están codificadas en UTF-8. COLLATE se puede especificar o no para cada expresión en ORDER BY de forma independiente. Si se especifica ASC o DESC, se especifica COLLATE después de él. Cuando se usa COLLATE, la clasificación siempre distingue entre mayúsculas y minúsculas. + +Solo recomendamos usar COLLATE para la clasificación final de un pequeño número de filas, ya que la clasificación con COLLATE es menos eficiente que la clasificación normal por bytes. + +Las filas que tienen valores idénticos para la lista de expresiones de clasificación se generan en un orden arbitrario, que también puede ser no determinista (diferente cada vez). +Si se omite la cláusula ORDER BY, el orden de las filas tampoco está definido y también puede ser no determinista. + +`NaN` y `NULL` orden de clasificación: + +- Con el modificador `NULLS FIRST` — Primero `NULL`, entonces `NaN`, luego otros valores. +- Con el modificador `NULLS LAST` — Primero los valores, luego `NaN`, entonces `NULL`. +- Predeterminado: lo mismo que con el `NULLS LAST` modificador. + +Ejemplo: + +Para la mesa + +``` text +┌─x─┬────y─┐ +│ 1 │ ᴺᵁᴸᴸ │ +│ 2 │ 2 │ +│ 1 │ nan │ +│ 2 │ 2 │ +│ 3 │ 4 │ +│ 5 │ 6 │ +│ 6 │ nan │ +│ 7 │ ᴺᵁᴸᴸ │ +│ 6 │ 7 │ +│ 8 │ 9 │ +└───┴──────┘ +``` + +Ejecute la consulta `SELECT * FROM t_null_nan ORDER BY y NULLS FIRST` conseguir: + +``` text +┌─x─┬────y─┐ +│ 1 │ ᴺᵁᴸᴸ │ +│ 7 │ ᴺᵁᴸᴸ │ +│ 1 │ nan │ +│ 6 │ nan │ +│ 2 │ 2 │ +│ 2 │ 2 │ +│ 3 │ 4 │ +│ 5 │ 6 │ +│ 6 │ 7 │ +│ 8 │ 9 │ +└───┴──────┘ +``` + +Cuando se ordenan los números de coma flotante, los NaN están separados de los otros valores. Independientemente del orden de clasificación, los NaN vienen al final. En otras palabras, para la clasificación ascendente se colocan como si fueran más grandes que todos los demás números, mientras que para la clasificación descendente se colocan como si fueran más pequeños que el resto. + +Se usa menos RAM si se especifica un LIMIT lo suficientemente pequeño además de ORDER BY. De lo contrario, la cantidad de memoria gastada es proporcional al volumen de datos para clasificar. Para el procesamiento de consultas distribuidas, si se omite GROUP BY, la ordenación se realiza parcialmente en servidores remotos y los resultados se combinan en el servidor solicitante. Esto significa que para la ordenación distribuida, el volumen de datos a ordenar puede ser mayor que la cantidad de memoria en un único servidor. + +Si no hay suficiente RAM, es posible realizar la clasificación en la memoria externa (creando archivos temporales en un disco). Utilice el ajuste `max_bytes_before_external_sort` para este propósito. Si se establece en 0 (el valor predeterminado), la ordenación externa está deshabilitada. Si está habilitada, cuando el volumen de datos a ordenar alcanza el número especificado de bytes, los datos recopilados se ordenan y se vuelcan en un archivo temporal. Después de leer todos los datos, todos los archivos ordenados se fusionan y se generan los resultados. Los archivos se escriben en el directorio /var/lib/clickhouse/tmp/ en la configuración (de forma predeterminada, pero puede ‘tmp\_path’ parámetro para cambiar esta configuración). + +La ejecución de una consulta puede usar más memoria que ‘max\_bytes\_before\_external\_sort’. Por este motivo, esta configuración debe tener un valor significativamente menor que ‘max\_memory\_usage’. 
Como ejemplo, si su servidor tiene 128 GB de RAM y necesita ejecutar una sola consulta, establezca ‘max\_memory\_usage’ de hasta 100 GB, y ‘max\_bytes\_before\_external\_sort’ para 80 GB. + +La clasificación externa funciona con mucha menos eficacia que la clasificación en RAM. + +### SELECT Cláusula {#select-select} + +[Expresiones](syntax.md#syntax-expressions) especificado en el `SELECT` cláusula se calculan después de que todas las operaciones en las cláusulas descritas anteriormente hayan finalizado. Estas expresiones funcionan como si se aplicaran a filas separadas en el resultado. Si las expresiones en el `SELECT` cláusula contiene funciones agregadas, a continuación, ClickHouse procesa funciones agregadas y expresiones utilizadas como sus argumentos durante el [GRUPO POR](#select-group-by-clause) agregación. + +Si desea incluir todas las columnas en el resultado, use el asterisco (`*`) simbolo. Por ejemplo, `SELECT * FROM ...`. + +Para hacer coincidir algunas columnas en el resultado con un [Re2](https://en.wikipedia.org/wiki/RE2_(software)) expresión regular, puede utilizar el `COLUMNS` expresión. + +``` sql +COLUMNS('regexp') +``` + +Por ejemplo, considere la tabla: + +``` sql +CREATE TABLE default.col_names (aa Int8, ab Int8, bc Int8) ENGINE = TinyLog +``` + +La siguiente consulta selecciona datos de todas las columnas que contienen `a` símbolo en su nombre. + +``` sql +SELECT COLUMNS('a') FROM col_names +``` + +``` text +┌─aa─┬─ab─┐ +│ 1 │ 1 │ +└────┴────┘ +``` + +Las columnas seleccionadas no se devuelven en orden alfabético. + +Puede utilizar múltiples `COLUMNS` expresiones en una consulta y aplicarles funciones. + +Por ejemplo: + +``` sql +SELECT COLUMNS('a'), COLUMNS('c'), toTypeName(COLUMNS('c')) FROM col_names +``` + +``` text +┌─aa─┬─ab─┬─bc─┬─toTypeName(bc)─┐ +│ 1 │ 1 │ 1 │ Int8 │ +└────┴────┴────┴────────────────┘ +``` + +Cada columna devuelta por el `COLUMNS` expresión se pasa a la función como un argumento separado. También puede pasar otros argumentos a la función si los admite. Tenga cuidado al usar funciones. Si una función no admite el número de argumentos que le ha pasado, ClickHouse produce una excepción. + +Por ejemplo: + +``` sql +SELECT COLUMNS('a') + COLUMNS('c') FROM col_names +``` + +``` text +Received exception from server (version 19.14.1): +Code: 42. DB::Exception: Received from localhost:9000. DB::Exception: Number of arguments for function plus doesn't match: passed 3, should be 2. +``` + +En este ejemplo, `COLUMNS('a')` Todos los derechos reservados: `aa` y `ab`. `COLUMNS('c')` Nosotros `bc` columna. El `+` operador no puede aplicar a 3 argumentos, por lo que ClickHouse lanza una excepción con el mensaje relevante. + +Columnas que coinciden con el `COLUMNS` expresión puede tener diferentes tipos de datos. Si `COLUMNS` no coincide con ninguna columna y es la única expresión en `SELECT`, ClickHouse lanza una excepción. + +### Cláusula DISTINCT {#select-distinct} + +Si se especifica DISTINCT, sólo quedará una sola fila de todos los conjuntos de filas totalmente coincidentes en el resultado. +El resultado será el mismo que si GROUP BY se especificara en todos los campos especificados en SELECT sin funciones agregadas. Pero hay varias diferencias con GROUP BY: + +- DISTINCT se puede aplicar junto con GROUP BY. +- Cuando ORDER BY se omite y se define LIMIT, la consulta deja de ejecutarse inmediatamente después de leer el número necesario de filas diferentes. 
+- Los bloques de datos se generan a medida que se procesan, sin esperar a que finalice la ejecución de toda la consulta. + +DISTINCT no se admite si SELECT tiene al menos una columna de matriz. + +`DISTINCT` trabaja con [NULO](syntax.md) como si `NULL` Era un valor específico, y `NULL=NULL`. En otras palabras, en el `DISTINCT` resultados, diferentes combinaciones con `NULL` sólo ocurren una vez. + +ClickHouse admite el uso de `DISTINCT` y `ORDER BY` para diferentes columnas en una consulta. El `DISTINCT` cláusula se ejecuta antes de `ORDER BY` clausula. + +Tabla de ejemplo: + +``` text +┌─a─┬─b─┐ +│ 2 │ 1 │ +│ 1 │ 2 │ +│ 3 │ 3 │ +│ 2 │ 4 │ +└───┴───┘ +``` + +Al seleccionar datos con el `SELECT DISTINCT a FROM t1 ORDER BY b ASC` Consulta, obtenemos el siguiente resultado: + +``` text +┌─a─┐ +│ 2 │ +│ 1 │ +│ 3 │ +└───┘ +``` + +Si cambiamos la dirección de clasificación `SELECT DISTINCT a FROM t1 ORDER BY b DESC`, obtenemos el siguiente resultado: + +``` text +┌─a─┐ +│ 3 │ +│ 1 │ +│ 2 │ +└───┘ +``` + +Fila `2, 4` se cortó antes de clasificar. + +Tenga en cuenta esta especificidad de implementación al programar consultas. + +### Cláusula LIMIT {#limit-clause} + +`LIMIT m` permita seleccionar la primera `m` filas del resultado. + +`LIMIT n, m` permita seleccionar la primera `m` el resultado después de omitir la primera `n` películas. El `LIMIT m OFFSET n` sintaxis también es compatible. + +`n` y `m` deben ser enteros no negativos. + +Si no hay una `ORDER BY` cláusula que ordena explícitamente los resultados, el resultado puede ser arbitrario y no determinista. + +### UNION ALL Cláusula {#union-all-clause} + +Puede utilizar UNION ALL para combinar cualquier número de consultas. Ejemplo: + +``` sql +SELECT CounterID, 1 AS table, toInt64(count()) AS c + FROM test.hits + GROUP BY CounterID + +UNION ALL + +SELECT CounterID, 2 AS table, sum(Sign) AS c + FROM test.visits + GROUP BY CounterID + HAVING c > 0 +``` + +Solo se admite UNION ALL. La UNIÓN regular (UNION DISTINCT) no es compatible. Si necesita UNION DISTINCT, puede escribir SELECT DISTINCT desde una subconsulta que contenga UNION ALL. + +Las consultas que forman parte de UNION ALL se pueden ejecutar simultáneamente y sus resultados se pueden mezclar. + +La estructura de los resultados (el número y el tipo de columnas) debe coincidir con las consultas. Pero los nombres de columna pueden diferir. En este caso, los nombres de columna para el resultado final se tomarán de la primera consulta. La fundición de tipo se realiza para uniones. Por ejemplo, si dos consultas que se combinan tienen el mismo campo-`Nullable` y `Nullable` tipos de un tipo compatible, el resultado `UNION ALL` tiene una `Nullable` campo de tipo. + +Las consultas que forman parte de UNION ALL no se pueden encerrar entre paréntesis. ORDER BY y LIMIT se aplican a consultas separadas, no al resultado final. Si necesita aplicar una conversión al resultado final, puede colocar todas las consultas con UNION ALL en una subconsulta en la cláusula FROM. + +### INTO OUTFILE Cláusula {#into-outfile-clause} + +Añadir el `INTO OUTFILE filename` cláusula (donde filename es un literal de cadena) para redirigir la salida de la consulta al archivo especificado. +A diferencia de MySQL, el archivo se crea en el lado del cliente. La consulta fallará si ya existe un archivo con el mismo nombre de archivo. +Esta funcionalidad está disponible en el cliente de línea de comandos y clickhouse-local (una consulta enviada a través de la interfaz HTTP fallará). 
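+
+Un esbozo de uso desde el cliente de línea de comandos (el nombre del archivo es hipotético):
+
+``` sql
+SELECT CounterID, count() AS c
+FROM test.hits
+GROUP BY CounterID
+ORDER BY c DESC
+LIMIT 10
+INTO OUTFILE 'top_counters.tsv'
+```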
+ +El formato de salida predeterminado es TabSeparated (el mismo que en el modo de lote de cliente de línea de comandos). + +### FORMAT Cláusula {#format-clause} + +Especificar ‘FORMAT format’ para obtener datos en cualquier formato especificado. +Puede usar esto por conveniencia o para crear volcados. +Para obtener más información, consulte la sección «Formats». +Si se omite la cláusula FORMAT, se utiliza el formato predeterminado, que depende tanto de la configuración como de la interfaz utilizada para acceder a la base de datos. Para la interfaz HTTP y el cliente de línea de comandos en modo por lotes, el formato predeterminado es TabSeparated. Para el cliente de línea de comandos en modo interactivo, el formato predeterminado es PrettyCompact (tiene tablas atractivas y compactas). + +Cuando se utiliza el cliente de línea de comandos, los datos se pasan al cliente en un formato interno eficiente. El cliente interpreta independientemente la cláusula FORMAT de la consulta y da formato a los datos en sí (aliviando así la red y el servidor de la carga). + +### ES Operadores {#select-in-operators} + +El `IN`, `NOT IN`, `GLOBAL IN`, y `GLOBAL NOT IN` están cubiertos por separado, ya que su funcionalidad es bastante rica. + +El lado izquierdo del operador es una sola columna o una tupla. + +Ejemplos: + +``` sql +SELECT UserID IN (123, 456) FROM ... +SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... +``` + +Si el lado izquierdo es una sola columna que está en el índice, y el lado derecho es un conjunto de constantes, el sistema usa el índice para procesar la consulta. + +No enumere demasiados valores explícitamente (es decir, millones). Si un conjunto de datos es grande, colóquelo en una tabla temporal (por ejemplo, consulte la sección «External data for query processing»), luego use una subconsulta. + +El lado derecho del operador puede ser un conjunto de expresiones constantes, un conjunto de tuplas con expresiones constantes (mostradas en los ejemplos anteriores) o el nombre de una tabla de base de datos o subconsulta SELECT entre paréntesis. + +Si el lado derecho del operador es el nombre de una tabla (por ejemplo, `UserID IN users`), esto es equivalente a la subconsulta `UserID IN (SELECT * FROM users)`. Úselo cuando trabaje con datos externos que se envían junto con la consulta. Por ejemplo, la consulta se puede enviar junto con un conjunto de ID de usuario ‘users’ tabla temporal, que debe ser filtrada. + +Si el lado derecho del operador es un nombre de tabla que tiene el motor Set (un conjunto de datos preparado que siempre está en RAM), el conjunto de datos no se volverá a crear para cada consulta. + +La subconsulta puede especificar más de una columna para filtrar tuplas. +Ejemplo: + +``` sql +SELECT (CounterID, UserID) IN (SELECT CounterID, UserID FROM ...) FROM ... +``` + +Las columnas a la izquierda y a la derecha del operador IN deben tener el mismo tipo. + +El operador IN y la subconsulta pueden aparecer en cualquier parte de la consulta, incluidas las funciones agregadas y las funciones lambda. 
+Ejemplo: + +``` sql +SELECT + EventDate, + avg(UserID IN + ( + SELECT UserID + FROM test.hits + WHERE EventDate = toDate('2014-03-17') + )) AS ratio +FROM test.hits +GROUP BY EventDate +ORDER BY EventDate ASC +``` + +``` text +┌──EventDate─┬────ratio─┐ +│ 2014-03-17 │ 1 │ +│ 2014-03-18 │ 0.807696 │ +│ 2014-03-19 │ 0.755406 │ +│ 2014-03-20 │ 0.723218 │ +│ 2014-03-21 │ 0.697021 │ +│ 2014-03-22 │ 0.647851 │ +│ 2014-03-23 │ 0.648416 │ +└────────────┴──────────┘ +``` + +Para cada día después del 17 de marzo, cuente el porcentaje de páginas vistas realizadas por los usuarios que visitaron el sitio el 17 de marzo. +Una subconsulta en la cláusula IN siempre se ejecuta una sola vez en un único servidor. No hay subconsultas dependientes. + +#### Procesamiento NULL {#null-processing-1} + +Durante el procesamiento de la solicitud, el operador IN asume que el resultado de una operación [NULO](syntax.md) siempre es igual a `0`, independientemente de si `NULL` está en el lado derecho o izquierdo del operador. `NULL` Los valores no se incluyen en ningún conjunto de datos, no se corresponden entre sí y no se pueden comparar. + +Aquí hay un ejemplo con el `t_null` tabla: + +``` text +┌─x─┬────y─┐ +│ 1 │ ᴺᵁᴸᴸ │ +│ 2 │ 3 │ +└───┴──────┘ +``` + +Ejecución de la consulta `SELECT x FROM t_null WHERE y IN (NULL,3)` da el siguiente resultado: + +``` text +┌─x─┐ +│ 2 │ +└───┘ +``` + +Se puede ver que la fila en la que `y = NULL` se expulsa de los resultados de la consulta. Esto se debe a que ClickHouse no puede decidir si `NULL` está incluido en el `(NULL,3)` conjunto, devuelve `0` como resultado de la operación, y `SELECT` excluye esta fila de la salida final. + +``` sql +SELECT y IN (NULL, 3) +FROM t_null +``` + +``` text +┌─in(y, tuple(NULL, 3))─┐ +│ 0 │ +│ 1 │ +└───────────────────────┘ +``` + +#### Subconsultas distribuidas {#select-distributed-subqueries} + +Hay dos opciones para IN-s con subconsultas (similar a JOINs): normal `IN` / `JOIN` y `GLOBAL IN` / `GLOBAL JOIN`. Se diferencian en cómo se ejecutan para el procesamiento de consultas distribuidas. + +!!! attention "Atención" + Recuerde que los algoritmos descritos a continuación pueden funcionar de manera diferente dependiendo de la [configuración](../operations/settings/settings.md) `distributed_product_mode` configuración. + +Cuando se utiliza el IN normal, la consulta se envía a servidores remotos, y cada uno de ellos ejecuta las subconsultas en el `IN` o `JOIN` clausula. + +Cuando se utiliza `GLOBAL IN` / `GLOBAL JOINs`, primero todas las subconsultas se ejecutan para `GLOBAL IN` / `GLOBAL JOINs`, y los resultados se recopilan en tablas temporales. A continuación, las tablas temporales se envían a cada servidor remoto, donde las consultas se ejecutan utilizando estos datos temporales. + +Para una consulta no distribuida, utilice el `IN` / `JOIN`. + +Tenga cuidado al usar subconsultas en el `IN` / `JOIN` para el procesamiento de consultas distribuidas. + +Veamos algunos ejemplos. Supongamos que cada servidor del clúster tiene un **local\_table**. Cada servidor también tiene un **distributed\_table** mesa con el **Distribuido** tipo, que mira todos los servidores del clúster. + +Para una consulta al **distributed\_table**, la consulta se enviará a todos los servidores remotos y se ejecutará en ellos usando el **local\_table**. 
+ +Por ejemplo, la consulta + +``` sql +SELECT uniq(UserID) FROM distributed_table +``` + +se enviará a todos los servidores remotos como + +``` sql +SELECT uniq(UserID) FROM local_table +``` + +y ejecutar en cada uno de ellos en paralelo, hasta que llegue a la etapa donde se pueden combinar resultados intermedios. Luego, los resultados intermedios se devolverán al servidor solicitante y se fusionarán en él, y el resultado final se enviará al cliente. + +Ahora vamos a examinar una consulta con IN: + +``` sql +SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34) +``` + +- Cálculo de la intersección de audiencias de dos sitios. + +Esta consulta se enviará a todos los servidores remotos como + +``` sql +SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34) +``` + +En otras palabras, los datos establecidos en la cláusula IN se recopilarán en cada servidor de forma independiente, solo a través de los datos que se almacenan localmente en cada uno de los servidores. + +Esto funcionará correctamente y de manera óptima si está preparado para este caso y ha distribuido datos en los servidores de clúster de modo que los datos de un único ID de usuario residen completamente en un único servidor. En este caso, todos los datos necesarios estarán disponibles localmente en cada servidor. De lo contrario, el resultado será inexacto. Nos referimos a esta variación de la consulta como «local IN». + +Para corregir cómo funciona la consulta cuando los datos se distribuyen aleatoriamente entre los servidores de clúster, puede especificar **distributed\_table** dentro de una subconsulta. La consulta se vería así: + +``` sql +SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) +``` + +Esta consulta se enviará a todos los servidores remotos como + +``` sql +SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) +``` + +La subconsulta comenzará a ejecutarse en cada servidor remoto. Dado que la subconsulta utiliza una tabla distribuida, la subconsulta que se encuentra en cada servidor remoto se reenviará a cada servidor remoto como + +``` sql +SELECT UserID FROM local_table WHERE CounterID = 34 +``` + +Por ejemplo, si tiene un clúster de 100 servidores, la ejecución de toda la consulta requerirá 10.000 solicitudes elementales, lo que generalmente se considera inaceptable. + +En tales casos, siempre debe usar GLOBAL IN en lugar de IN. Veamos cómo funciona para la consulta + +``` sql +SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID GLOBAL IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) +``` + +El servidor del solicitante ejecutará la subconsulta + +``` sql +SELECT UserID FROM distributed_table WHERE CounterID = 34 +``` + +y el resultado se colocará en una tabla temporal en la RAM. A continuación, la solicitud se enviará a cada servidor remoto como + +``` sql +SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID GLOBAL IN _data1 +``` + +y la tabla temporal `_data1` se enviará a cada servidor remoto con la consulta (el nombre de la tabla temporal está definido por la implementación). + +Esto es más óptimo que usar el IN normal. Sin embargo, tenga en cuenta los siguientes puntos: + +1. 
Al crear una tabla temporal, los datos no se hacen únicos. Para reducir el volumen de datos transmitidos a través de la red, especifique DISTINCT en la subconsulta. (No necesita hacer esto para un IN normal.) +2. La tabla temporal se enviará a todos los servidores remotos. La transmisión no tiene en cuenta la topología de red. Por ejemplo, si 10 servidores remotos residen en un centro de datos que es muy remoto en relación con el servidor solicitante, los datos se enviarán 10 veces a través del canal al centro de datos remoto. Intente evitar grandes conjuntos de datos cuando use GLOBAL IN. +3. Al transmitir datos a servidores remotos, las restricciones en el ancho de banda de la red no son configurables. Puede sobrecargar la red. +4. Intente distribuir datos entre servidores para que no necesite usar GLOBAL IN de forma regular. +5. Si necesita utilizar GLOBAL IN con frecuencia, planifique la ubicación del clúster ClickHouse para que un único grupo de réplicas resida en no más de un centro de datos con una red rápida entre ellos, de modo que una consulta se pueda procesar completamente dentro de un único centro de datos. + +También tiene sentido especificar una tabla local en el `GLOBAL IN` cláusula, en caso de que esta tabla local solo esté disponible en el servidor solicitante y desee usar datos de ella en servidores remotos. + +### Valores extremos {#extreme-values} + +Además de los resultados, también puede obtener valores mínimos y máximos para las columnas de resultados. Para hacer esto, establezca el **extremo** a 1. Los mínimos y máximos se calculan para tipos numéricos, fechas y fechas con horas. Para otras columnas, se generan los valores predeterminados. + +Se calculan dos filas adicionales: los mínimos y los máximos, respectivamente. Estas dos filas adicionales se generan en `JSON*`, `TabSeparated*`, y `Pretty*` [Formato](../interfaces/formats.md), separado de las otras filas. No se emiten para otros formatos. + +En `JSON*` los valores extremos se emiten en un formato separado. ‘extremes’ campo. En `TabSeparated*` , la fila viene después del resultado principal, y después de ‘totals’ si está presente. Está precedido por una fila vacía (después de los otros datos). En `Pretty*` formatea, la fila se muestra como una tabla separada después del resultado principal, y después de `totals` si está presente. + +Los valores extremos se calculan para las filas anteriores `LIMIT`, pero después `LIMIT BY`. Sin embargo, cuando se usa `LIMIT offset, size`, las filas antes `offset` están incluidos en `extremes`. En las solicitudes de secuencia, el resultado también puede incluir un pequeño número de filas que pasaron por `LIMIT`. + +### Nota {#notes} + +El `GROUP BY` y `ORDER BY` las cláusulas no admiten argumentos posicionales. Esto contradice MySQL, pero se ajusta al SQL estándar. +Por ejemplo, `GROUP BY 1, 2` se interpretará como agrupación por constantes (es decir, agregación de todas las filas en una). + +Puedes usar sinónimos (`AS` Cualquier parte de una consulta. + +Puede poner un asterisco en cualquier parte de una consulta en lugar de una expresión. Cuando se analiza la consulta, el asterisco se expande a una lista de todas las columnas de la tabla `MATERIALIZED` y `ALIAS` columna). Solo hay unos pocos casos en los que se justifica el uso de un asterisco: + +- Al crear un volcado de tabla. +- Para tablas que contienen solo unas pocas columnas, como las tablas del sistema. +- Para obtener información sobre qué columnas están en una tabla. En este caso, establezca `LIMIT 1`. 
Pero es mejor usar el `DESC TABLE` consulta. +- Cuando hay una filtración fuerte en un pequeño número de columnas usando `PREWHERE`. +- En subconsultas (ya que las columnas que no son necesarias para la consulta externa se excluyen de las subconsultas). + +En todos los demás casos, no recomendamos usar el asterisco, ya que solo le da los inconvenientes de un DBMS columnar en lugar de las ventajas. En otras palabras, no se recomienda usar el asterisco. + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/select/) diff --git a/docs/es/query_language/show.md b/docs/es/query_language/show.md new file mode 100644 index 00000000000..0341f654035 --- /dev/null +++ b/docs/es/query_language/show.md @@ -0,0 +1,98 @@ +# MOSTRAR consultas {#show-queries} + +## MOSTRAR CREAR TABLA {#show-create-table} + +``` sql +SHOW CREATE [TEMPORARY] [TABLE|DICTIONARY] [db.]table [INTO OUTFILE filename] [FORMAT format] +``` + +Devuelve una sola `String`-tipo ‘statement’ columna, que contiene un único valor – el `CREATE` Consulta utilizada para crear el objeto especificado. + +## MOSTRAR BASAS DE DATOS {#show-databases} + +``` sql +SHOW DATABASES [INTO OUTFILE filename] [FORMAT format] +``` + +Imprime una lista de todas las bases de datos. +Esta consulta es idéntica a `SELECT name FROM system.databases [INTO OUTFILE filename] [FORMAT format]`. + +## MOSTRAR LISTA DE PROCESOS {#show-processlist} + +``` sql +SHOW PROCESSLIST [INTO OUTFILE filename] [FORMAT format] +``` + +Envía el contenido de la [sistema.proceso](../operations/system_tables.md#system_tables-processes) tabla, que contiene una lista de consultas que se están procesando en este momento, exceptuando `SHOW PROCESSLIST` consulta. + +El `SELECT * FROM system.processes` todas las consultas actuales. + +Consejo (ejecutar en la consola): + +``` bash +$ watch -n1 "clickhouse-client --query='SHOW PROCESSLIST'" +``` + +## MOSTRAR TABLAS {#show-tables} + +Muestra una lista de tablas. + +``` sql +SHOW [TEMPORARY] TABLES [{FROM | IN} ] [LIKE '' | WHERE expr] [LIMIT ] [INTO OUTFILE ] [FORMAT ] +``` + +Si el `FROM` no se especifica la cláusula, la consulta devuelve la lista de tablas de la base de datos actual. + +Puede obtener los mismos resultados que el `SHOW TABLES` Consulta de la siguiente manera: + +``` sql +SELECT name FROM system.tables WHERE database = [AND name LIKE ] [LIMIT ] [INTO OUTFILE ] [FORMAT ] +``` + +**Ejemplo** + +La siguiente consulta selecciona las dos primeras filas de la lista de tablas `system` base de datos, cuyos nombres contienen `co`. + +``` sql +SHOW TABLES FROM system LIKE '%co%' LIMIT 2 +``` + +``` text +┌─name───────────────────────────┐ +│ aggregate_function_combinators │ +│ collations │ +└────────────────────────────────┘ +``` + +## MOSTRAR DICCIONARIOS {#show-dictionaries} + +Muestra una lista de [diccionarios externos](dicts/external_dicts.md). + +``` sql +SHOW DICTIONARIES [FROM ] [LIKE ''] [LIMIT ] [INTO OUTFILE ] [FORMAT ] +``` + +Si el `FROM` no se especifica la cláusula, la consulta devuelve la lista de diccionarios de la base de datos actual. + +Puede obtener los mismos resultados que el `SHOW DICTIONARIES` Consulta de la siguiente manera: + +``` sql +SELECT name FROM system.dictionaries WHERE database = [AND name LIKE ] [LIMIT ] [INTO OUTFILE ] [FORMAT ] +``` + +**Ejemplo** + +La siguiente consulta selecciona las dos primeras filas de la lista de tablas `system` base de datos, cuyos nombres contienen `reg`. 
+ +``` sql +SHOW DICTIONARIES FROM db LIKE '%reg%' LIMIT 2 +``` + +``` text +┌─name─────────┐ +│ regions │ +│ region_names │ +└──────────────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/show/) diff --git a/docs/es/query_language/syntax.md b/docs/es/query_language/syntax.md new file mode 100644 index 00000000000..b0a60d8d846 --- /dev/null +++ b/docs/es/query_language/syntax.md @@ -0,0 +1,180 @@ +# Sintaxis {#syntax} + +Hay dos tipos de analizadores en el sistema: el analizador SQL completo (un analizador de descenso recursivo) y el analizador de formato de datos (un analizador de flujo rápido). +En todos los casos, excepto el `INSERT` consulta, sólo se utiliza el analizador SQL completo. +El `INSERT` Consulta utiliza ambos analizadores: + +``` sql +INSERT INTO t VALUES (1, 'Hello, world'), (2, 'abc'), (3, 'def') +``` + +El `INSERT INTO t VALUES` fragmento es analizado por el analizador completo, y los datos `(1, 'Hello, world'), (2, 'abc'), (3, 'def')` es analizado por el analizador de flujo rápido. También puede activar el analizador completo de los datos mediante el [input\_format\_values\_interpret\_expressions](../operations/settings/settings.md#settings-input_format_values_interpret_expressions) configuración. Cuando `input_format_values_interpret_expressions = 1`, ClickHouse primero intenta analizar valores con el analizador de flujo rápido. Si falla, ClickHouse intenta usar el analizador completo para los datos, tratándolo como un SQL [expresión](#syntax-expressions). + +Los datos pueden tener cualquier formato. Cuando se recibe una consulta, el servidor no calcula más de [max\_query\_size](../operations/settings/settings.md#settings-max_query_size) bytes de la solicitud en RAM (por defecto, 1 MB), y el resto se analiza la secuencia. +Esto significa que el sistema no tiene problemas con `INSERT` Consultas, como lo hace MySQL. + +Cuando se utiliza el `Values` formato en un `INSERT` consulta, puede parecer que los datos se analizan igual que las expresiones en un `SELECT` Consulta, pero esto no es cierto. El `Values` formato es mucho más limitado. + +A continuación cubriremos el analizador completo. Para obtener más información sobre los analizadores de formato, consulte [Formato](../interfaces/formats.md) apartado. + +## Espacio {#spaces} + +Puede haber cualquier número de símbolos de espacio entre las construcciones sintácticas (incluidos el principio y el final de una consulta). Los símbolos de espacio incluyen el espacio, tabulación, avance de línea, CR y avance de formulario. + +## Comentario {#comments} + +Se admiten comentarios de estilo SQL y de estilo C. +Comentarios de estilo SQL: desde `--` al final de la línea. El espacio después `--` se puede omitir. +Comentarios en estilo C: de `/*` un `*/`. Estos comentarios pueden ser multilínea. Tampoco se requieren espacios aquí. + +## Palabras clave {#syntax-keywords} + +Las palabras clave no distinguen entre mayúsculas y minúsculas cuando corresponden a: + +- Estándar SQL. Por ejemplo, `SELECT`, `select` y `SeLeCt` son todos válidos. +- Implementación en algunos DBMS populares (MySQL o Postgres). Por ejemplo, `DateTime` es lo mismo que `datetime`. + +Si el nombre del tipo de datos distingue entre mayúsculas y minúsculas `system.data_type_families` tabla. + +A diferencia del SQL estándar, todas las demás palabras clave (incluidos los nombres de las funciones) son **minúsculas**. + +Las palabras clave no están reservadas (simplemente se analizan como palabras clave en el contexto correspondiente). 
Si usa [identificadores](#syntax-identifiers) iguales a las palabras clave, enciérrelos entre comillas. Por ejemplo, la consulta `SELECT "FROM" FROM table_name` es válida si la tabla `table_name` tiene una columna con el nombre `"FROM"`.
+
+## Identificador {#syntax-identifiers}
+
+Los identificadores son:
+
+- Nombres de clúster, base de datos, tabla, partición y columna.
+- Funciones.
+- Tipos de datos.
+- [Alias de expresión](#syntax-expression_aliases).
+
+Los identificadores pueden ir entre comillas o sin comillas. Se recomienda utilizar identificadores sin comillas.
+
+Los identificadores sin comillas deben coincidir con la expresión regular `^[a-zA-Z_][0-9a-zA-Z_]*$` y no pueden ser iguales a las [palabras clave](#syntax-keywords). Ejemplos: `x, _1, X_y__Z123_.`
+
+Si desea utilizar identificadores iguales a las palabras clave o utilizar otros símbolos en los identificadores, cítelos con comillas dobles o acentos graves (backticks), por ejemplo, `"id"`, `` `id` ``.
+
+## Literal {#literals}
+
+Hay literales numéricos, de cadena, compuestos y `NULL`.
+
+### Numérico {#numeric}
+
+Un literal numérico se intenta analizar:
+
+- Primero, como un número de 64 bits con signo, usando la función [strtoull](https://en.cppreference.com/w/cpp/string/byte/strtoul).
+- Si no tiene éxito, como un número de 64 bits sin signo, usando la función [strtoll](https://en.cppreference.com/w/cpp/string/byte/strtol).
+- Si no tiene éxito, como un número de punto flotante, usando la función [strtod](https://en.cppreference.com/w/cpp/string/byte/strtof).
+- De lo contrario, se devuelve un error.
+
+El valor correspondiente tendrá el tipo más pequeño en el que quepa.
+Por ejemplo, 1 se analiza como `UInt8`, pero 256 se analiza como `UInt16`. Para obtener más información, consulte [Tipos de datos](../data_types/index.md).
+
+Ejemplos: `1`, `18446744073709551615`, `0xDEADBEEF`, `01`, `0.1`, `1e100`, `-1e-100`, `inf`, `nan`.
+
+### Cadena {#syntax-string-literal}
+
+Solo se admiten literales de cadena entre comillas simples. Los caracteres se pueden escapar con barra invertida. Las siguientes secuencias de escape tienen un valor especial correspondiente: `\b`, `\f`, `\r`, `\n`, `\t`, `\0`, `\a`, `\v`, `\xHH`. En todos los demás casos, las secuencias de escape con el formato `\c`, donde `c` es cualquier carácter, se convierten en `c`. Esto significa que puede usar las secuencias `\'` y `\\`. El valor tendrá el tipo [String](../data_types/string.md).
+
+El conjunto mínimo de caracteres que necesita escapar en literales de cadena: `'` y `\`. La comilla simple se puede escapar con otra comilla simple; los literales `'It\'s'` y `'It''s'` son iguales.
+
+### Compuesto {#compound}
+
+Se admiten construcciones para matrices: `[1, 2, 3]` y tuplas: `(1, 'Hello, world!', 2)`.
+En realidad, estos no son literales, sino expresiones con el operador de creación de matrices y el operador de creación de tuplas, respectivamente.
+Una matriz debe constar de al menos un elemento y una tupla debe tener al menos dos elementos.
+Las tuplas tienen un propósito especial: su uso en la cláusula `IN` de una consulta `SELECT`. Las tuplas se pueden obtener como resultado de una consulta, pero no se pueden guardar en una base de datos (con la excepción de las tablas [Memory](../operations/table_engines/memory.md)).
+
+### NULO {#null-literal}
+
+Indica que falta el valor.
+
+Para almacenar `NULL` en un campo de tabla, este debe ser del tipo [Nullable](../data_types/nullable.md).
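+
+Como ilustración, un esquema mínimo (el nombre de la tabla y de las columnas son hipotéticos) de una columna `Nullable` que admite el literal `NULL`:
+
+``` sql
+-- la columna y (Nullable) acepta NULL; la columna x (Int32) no
+CREATE TABLE ejemplo_nulos
+(
+    x Int32,
+    y Nullable(Int32)
+) ENGINE = TinyLog;
+
+INSERT INTO ejemplo_nulos VALUES (1, NULL), (2, 3);
+
+SELECT x, y, isNull(y) FROM ejemplo_nulos;
+```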
+ +Dependiendo del formato de datos (entrada o salida), `NULL` puede tener una representación diferente. Para obtener más información, consulte la documentación de [Formatos de datos](../interfaces/formats.md#formats). + +Hay muchos matices para el procesamiento `NULL`. Por ejemplo, si al menos uno de los argumentos de una operación de comparación es `NULL` el resultado de esta operación también se `NULL`. Lo mismo es cierto para la multiplicación, la suma y otras operaciones. Para obtener más información, lea la documentación de cada operación. + +En las consultas, puede verificar `NULL` utilizando el [ES NULO](operators.md#operator-is-null) y [NO ES NULO](operators.md) operadores y las funciones relacionadas `isNull` y `isNotNull`. + +## Función {#functions} + +Las funciones se escriben como un identificador con una lista de argumentos (posiblemente vacíos) entre paréntesis. A diferencia de SQL estándar, los corchetes son necesarios, incluso para una lista de argumentos vacía. Ejemplo: `now()`. +Hay funciones regulares y agregadas (ver la sección “Aggregate functions”). Algunas funciones agregadas pueden contener dos listas de argumentos entre paréntesis. Ejemplo: `quantile (0.9) (x)`. Estas funciones agregadas se llaman “parametric” funciones, y los argumentos en la primera lista se llaman “parameters”. La sintaxis de las funciones agregadas sin parámetros es la misma que para las funciones regulares. + +## Operador {#operators} + +Los operadores se convierten a sus funciones correspondientes durante el análisis de consultas, teniendo en cuenta su prioridad y asociatividad. +Por ejemplo, la expresión `1 + 2 * 3 + 4` se transforma a `plus(plus(1, multiply(2, 3)), 4)`. + +## Tipos de datos y motores de tabla de base de datos {#data-types-and-database-table-engines} + +Tipos de datos y motores de tablas en el `CREATE` las consultas se escriben de la misma manera que los identificadores o funciones. En otras palabras, pueden o no contener una lista de argumentos entre corchetes. Para obtener más información, consulte las secciones “Data types,” “Table engines,” y “CREATE”. + +## Alias de expresión {#syntax-expression-aliases} + +Un alias es un nombre definido por el usuario para una expresión en una consulta. + +``` sql +expr AS alias +``` + +- `AS` — The keyword for defining aliases. You can define the alias for a table name or a column name in a `SELECT` cláusula sin usar el `AS` palabra clave. + + For example, `SELECT table_name_alias.column_name FROM table_name table_name_alias`. + + In the [CAST](functions/type_conversion_functions.md#type_conversion_function-cast) function, the `AS` keyword has another meaning. See the description of the function. + +- `expr` — Any expression supported by ClickHouse. + + For example, `SELECT column_name * 2 AS double FROM some_table`. + +- `alias` — Name for `expr`. Los alias deben cumplir con el [identificador](#syntax-identifiers) sintaxis. + + For example, `SELECT "table t".column_name FROM table_name AS "table t"`. + +### Notas sobre el uso {#notes-on-usage} + +Los alias son globales para una consulta o subconsulta y puede definir un alias en cualquier parte de una consulta para cualquier expresión. Por ejemplo, `SELECT (1 AS n) + 2, n`. + +Los alias no son visibles en subconsultas y entre subconsultas. Por ejemplo, al ejecutar la consulta `SELECT (SELECT sum(b.a) + num FROM b) - a.a AS num FROM a` ClickHouse genera la excepción `Unknown identifier: num`. 
+ +Si se define un alias para las columnas de resultados `SELECT` cláusula de una subconsulta, estas columnas son visibles en la consulta externa. Por ejemplo, `SELECT n + m FROM (SELECT 1 AS n, 2 AS m)`. + +Tenga cuidado con los alias que son iguales a los nombres de columna o tabla. Consideremos el siguiente ejemplo: + +``` sql +CREATE TABLE t +( + a Int, + b Int +) +ENGINE = TinyLog() +``` + +``` sql +SELECT + argMax(a, b), + sum(b) AS b +FROM t +``` + +``` text +Received exception from server (version 18.14.17): +Code: 184. DB::Exception: Received from localhost:9000, 127.0.0.1. DB::Exception: Aggregate function sum(b) is found inside another aggregate function in query. +``` + +En este ejemplo, declaramos tabla `t` con columna `b`. Luego, al seleccionar los datos, definimos el `sum(b) AS b` apodo. Como los alias son globales, ClickHouse sustituyó el literal `b` en la expresión `argMax(a, b)` con la expresión `sum(b)`. Esta sustitución causó la excepción. + +## Asterisco {#asterisk} + +En un `SELECT` consulta, un asterisco puede reemplazar la expresión. Para obtener más información, consulte la sección “SELECT”. + +## Expresiones {#syntax-expressions} + +Una expresión es una función, identificador, literal, aplicación de un operador, expresión entre paréntesis, subconsulta o asterisco. También puede contener un alias. +Una lista de expresiones es una o más expresiones separadas por comas. +Las funciones y los operadores, a su vez, pueden tener expresiones como argumentos. + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/syntax/) diff --git a/docs/es/query_language/system.md b/docs/es/query_language/system.md new file mode 100644 index 00000000000..db23f06954c --- /dev/null +++ b/docs/es/query_language/system.md @@ -0,0 +1,106 @@ +# Consultas del sistema {#query-language-system} + +- [Cargar DICCIONARIOS](#query_language-system-reload-dictionaries) +- [Cargar DICCIONARIO](#query_language-system-reload-dictionary) +- [CATEGORÍA](#query_language-system-drop-dns-cache) +- [CACHÉ DE LA MARCA DE LA GOTA](#query_language-system-drop-mark-cache) +- [REGISTROS DE FLUSH](#query_language-system-flush_logs) +- [CONFIGURACIÓN DE Carga](#query_language-system-reload-config) +- [APAGADO](#query_language-system-shutdown) +- [MATAR](#query_language-system-kill) +- [PARADA DE SENTIDOS DISTRIBUIDOS](#query_language-system-stop-distributed-sends) +- [FLUSH DISTRIBUIDO](#query_language-system-flush-distributed) +- [COMIENZAR SENTIDOS DISTRIBUIDOS](#query_language-system-start-distributed-sends) +- [PARADA DE FUSIONES](#query_language-system-stop-merges) +- [COMIENZAR FUSIONES](#query_language-system-start-merges) + +## Cargar DICCIONARIOS {#query-language-system-reload-dictionaries} + +Vuelve a cargar todos los diccionarios que se han cargado correctamente antes. +De forma predeterminada, los diccionarios se cargan perezosamente (ver [Diccionarios\_lazy\_load](../operations/server_settings/settings.md#server_settings-dictionaries_lazy_load)), por lo que en lugar de cargarse automáticamente al inicio, se inicializan en el primer acceso a través de la función dictGet o SELECT desde tablas con ENGINE = Dictionary . El `SYSTEM RELOAD DICTIONARIES` Consulta vuelve a cargar dichos diccionarios (LOADED). +Siempre vuelve `Ok.` independientemente del resultado de la actualización del diccionario. 
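+
+Un ejemplo de uso (la consulta no acepta argumentos):
+
+``` sql
+SYSTEM RELOAD DICTIONARIES
+```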
+ +## RELOAD DICTIONARY dictionary\_name {#query-language-system-reload-dictionary} + +Recarga completamente un diccionario `dictionary_name`, independientemente del estado del diccionario (LOADED / NOT\_LOADED / FAILED). +Siempre vuelve `Ok.` independientemente del resultado de la actualización del diccionario. +El estado del diccionario se puede comprobar consultando el `system.dictionaries` tabla. + +``` sql +SELECT name, status FROM system.dictionaries; +``` + +## CATEGORÍA {#query-language-system-drop-dns-cache} + +Restablece la caché DNS interna de ClickHouse. A veces (para versiones anteriores de ClickHouse) es necesario usar este comando al cambiar la infraestructura (cambiar la dirección IP de otro servidor de ClickHouse o el servidor utilizado por los diccionarios). + +Para obtener una administración de caché más conveniente (automática), consulte disable\_internal\_dns\_cache, dns\_cache\_update\_period parameters. + +## CACHÉ DE LA MARCA DE LA GOTA {#query-language-system-drop-mark-cache} + +Restablece la caché de marcas. Utilizado en el desarrollo de ClickHouse y pruebas de rendimiento. + +## REGISTROS DE FLUSH {#query-language-system-flush-logs} + +Vuelca los búferes de los mensajes de registro a las tablas del sistema (por ejemplo, el sistema.query\_log). Le permite no esperar 7,5 segundos al depurar. + +## CONFIGURACIÓN DE Carga {#query-language-system-reload-config} + +Vuelve a cargar la configuración de ClickHouse. Se usa cuando la configuración se almacena en ZooKeeeper. + +## APAGADO {#query-language-system-shutdown} + +Normalmente se apaga ClickHouse (como `service clickhouse-server stop` / `kill {$pid_clickhouse-server}`) + +## MATAR {#query-language-system-kill} + +Anula el proceso de ClickHouse (como `kill -9 {$ pid_clickhouse-server}`) + +## Administración de tablas distribuidas {#query-language-system-distributed} + +ClickHouse puede administrar [distribuido](../operations/table_engines/distributed.md) tabla. Cuando un usuario inserta datos en estas tablas, ClickHouse primero crea una cola de los datos que se deben enviar a los nodos del clúster y, a continuación, los envía de forma asincrónica. Puede administrar el procesamiento de colas con el [PARADA DE SENTIDOS DISTRIBUIDOS](#query_language-system-stop-distributed-sends), [FLUSH DISTRIBUIDO](#query_language-system-flush-distributed), y [COMIENZAR SENTIDOS DISTRIBUIDOS](#query_language-system-start-distributed-sends) consulta. También puede insertar sincrónicamente datos distribuidos con el `insert_distributed_sync` configuración. + +### PARADA DE SENTIDOS DISTRIBUIDOS {#query-language-system-stop-distributed-sends} + +Deshabilita la distribución de datos en segundo plano al insertar datos en tablas distribuidas. + +``` sql +SYSTEM STOP DISTRIBUTED SENDS [db.] +``` + +### FLUSH DISTRIBUIDO {#query-language-system-flush-distributed} + +Obliga a ClickHouse a enviar datos a nodos de clúster de forma sincrónica. Si algún nodo no está disponible, ClickHouse produce una excepción y detiene la ejecución de la consulta. Puede volver a intentar la consulta hasta que tenga éxito, lo que sucederá cuando todos los nodos estén nuevamente en línea. + +``` sql +SYSTEM FLUSH DISTRIBUTED [db.] +``` + +### COMIENZAR SENTIDOS DISTRIBUIDOS {#query-language-system-start-distributed-sends} + +Habilita la distribución de datos en segundo plano al insertar datos en tablas distribuidas. + +``` sql +SYSTEM START DISTRIBUTED SENDS [db.] 
+``` + +### PARADA DE FUSIONES {#query-language-system-stop-merges} + +Proporciona la posibilidad de detener las fusiones en segundo plano para las tablas de la familia MergeTree: + +``` sql +SYSTEM STOP MERGES [[db.]merge_tree_family_table_name] +``` + +!!! note "Nota" + `DETACH / ATTACH` la tabla comenzará las fusiones de fondo para la tabla, incluso en caso de que las fusiones se hayan detenido para todas las tablas MergeTree antes. + +### COMIENZAR FUSIONES {#query-language-system-start-merges} + +Proporciona la posibilidad de iniciar fusiones en segundo plano para tablas de la familia MergeTree: + +``` sql +SYSTEM START MERGES [[db.]merge_tree_family_table_name] +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/system/) diff --git a/docs/es/query_language/table_functions/file.md b/docs/es/query_language/table_functions/file.md new file mode 100644 index 00000000000..c3744b62e74 --- /dev/null +++ b/docs/es/query_language/table_functions/file.md @@ -0,0 +1,114 @@ +# file {#file} + +Crea una tabla a partir de un archivo. Esta función de tabla es similar a [URL](url.md) y [Hdfs](hdfs.md) Aquel. + +``` sql +file(path, format, structure) +``` + +**Parámetros de entrada** + +- `path` — La ruta relativa al archivo desde [user\_files\_path](../../operations/server_settings/settings.md#server_settings-user_files_path). Soporte de ruta a archivo siguiendo globs en modo de solo lectura: `*`, `?`, `{abc,def}` y `{N..M}` donde `N`, `M` — numero, \``'abc', 'def'` — cadena. +- `format` — El [Formato](../../interfaces/formats.md#formats) del archivo. +- `structure` — Estructura de la mesa. Formato `'column1_name column1_type, column2_name column2_type, ...'`. + +**Valor devuelto** + +Una tabla con la estructura especificada para leer o escribir datos en el archivo especificado. + +**Ejemplo** + +Configuración `user_files_path` y el contenido del archivo `test.csv`: + +``` bash +$ grep user_files_path /etc/clickhouse-server/config.xml + /var/lib/clickhouse/user_files/ + +$ cat /var/lib/clickhouse/user_files/test.csv + 1,2,3 + 3,2,1 + 78,43,45 +``` + +Tabla de`test.csv` y selección de las dos primeras filas de ella: + +``` sql +SELECT * +FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') +LIMIT 2 +``` + +``` text +┌─column1─┬─column2─┬─column3─┐ +│ 1 │ 2 │ 3 │ +│ 3 │ 2 │ 1 │ +└─────────┴─────────┴─────────┘ +``` + +``` sql +-- getting the first 10 lines of a table that contains 3 columns of UInt32 type from a CSV file +SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 10 +``` + +**Globs en el camino** + +Múltiples componentes de ruta de acceso pueden tener globs. Para ser procesado, el archivo debe existir y coincidir con todo el patrón de ruta (no solo el sufijo o el prefijo). + +- `*` — Sustituye cualquier número de caracteres excepto `/` incluyendo cadena vacía. +- `?` — Sustituye a cualquier carácter individual. +- `{some_string,another_string,yet_another_one}` — Sustituye cualquiera de las cadenas `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Sustituye cualquier número en el intervalo de N a M, incluidas ambas fronteras. + +Construcciones con `{}` hijo similares a la [función de tabla remota](../../query_language/table_functions/remote.md)). + +**Ejemplo** + +1. 
Supongamos que tenemos varios archivos con las siguientes rutas relativas: + +- ‘some\_dir/some\_file\_1’ +- ‘some\_dir/some\_file\_2’ +- ‘some\_dir/some\_file\_3’ +- ‘another\_dir/some\_file\_1’ +- ‘another\_dir/some\_file\_2’ +- ‘another\_dir/some\_file\_3’ + +1. Consulta la cantidad de filas en estos archivos: + + + +``` sql +SELECT count(*) +FROM file('{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32') +``` + +1. Consulta la cantidad de filas en todos los archivos de estos dos directorios: + + + +``` sql +SELECT count(*) +FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32') +``` + +!!! warning "Advertencia" + Si su lista de archivos contiene rangos de números con ceros a la izquierda, use la construcción con llaves para cada dígito por separado o use `?`. + +**Ejemplo** + +Consultar los datos desde archivos nombrados `file000`, `file001`, … , `file999`: + +``` sql +SELECT count(*) +FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32') +``` + +## Columnas virtuales {#virtual-columns} + +- `_path` — Ruta de acceso al archivo. +- `_file` — Nombre del expediente. + +**Ver también** + +- [Columnas virtuales](https://clickhouse.tech/docs/es/operations/table_engines/#table_engines-virtual_columns) + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/table_functions/file/) diff --git a/docs/es/query_language/table_functions/generate.md b/docs/es/query_language/table_functions/generate.md new file mode 100644 index 00000000000..cfaf38ccf9e --- /dev/null +++ b/docs/es/query_language/table_functions/generate.md @@ -0,0 +1,38 @@ +# generateRandom {#generaterandom} + +Genera datos aleatorios con un esquema dado. +Permite rellenar tablas de prueba con datos. +Admite todos los tipos de datos que se pueden almacenar en la tabla, excepto `LowCardinality` y `AggregateFunction`. + +``` sql +generateRandom('name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_string_length'[, 'max_array_length']]]); +``` + +**Parámetros** + +- `name` — Nombre de la columna correspondiente. +- `TypeName` — Tipo de columna correspondiente. +- `limit` — Número de filas a generar. +- `max_array_length` — Longitud máxima de matriz para todas las matrices generadas. Por defecto `10`. +- `max_string_length` — Longitud máxima de cadena para todas las cadenas generadas. Por defecto `10`. +- `random_seed` — Especifique manualmente la semilla aleatoria para producir resultados estables. Si NULL — semilla se genera aleatoriamente. + +**Valor devuelto** + +Un objeto de tabla con el esquema solicitado. 
+ +## Ejemplo de uso {#usage-example} + +``` sql +SELECT * FROM generateRandom('a Array(Int8), d Decimal32(4), c Tuple(DateTime64(3), UUID)', 1, 10, 2); +``` + +``` text +┌─a────────┬────────────d─┬─c──────────────────────────────────────────────────────────────────┐ +│ [77] │ -124167.6723 │ ('2061-04-17 21:59:44.573','3f72f405-ec3e-13c8-44ca-66ef335f7835') │ +│ [32,110] │ -141397.7312 │ ('1979-02-09 03:43:48.526','982486d1-5a5d-a308-e525-7bd8b80ffa73') │ +│ [68] │ -67417.0770 │ ('2080-03-12 14:17:31.269','110425e5-413f-10a6-05ba-fa6b3e929f15') │ +└──────────┴──────────────┴────────────────────────────────────────────────────────────────────┘ +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/table_functions/generate/) diff --git a/docs/es/query_language/table_functions/hdfs.md b/docs/es/query_language/table_functions/hdfs.md new file mode 100644 index 00000000000..9a72d684039 --- /dev/null +++ b/docs/es/query_language/table_functions/hdfs.md @@ -0,0 +1,97 @@ +# Hdfs {#hdfs} + +Crea una tabla a partir de archivos en HDFS. Esta función de tabla es similar a [URL](url.md) y [file](file.md) Aquel. + +``` sql +hdfs(URI, format, structure) +``` + +**Parámetros de entrada** + +- `URI` — El URI relativo al archivo en HDFS. Soporte de ruta a archivo siguiendo globs en modo de solo lectura: `*`, `?`, `{abc,def}` y `{N..M}` donde `N`, `M` — numero, \``'abc', 'def'` — cadena. +- `format` — El [Formato](../../interfaces/formats.md#formats) del archivo. +- `structure` — Estructura de la mesa. Formato `'column1_name column1_type, column2_name column2_type, ...'`. + +**Valor devuelto** + +Una tabla con la estructura especificada para leer o escribir datos en el archivo especificado. + +**Ejemplo** + +Tabla de `hdfs://hdfs1:9000/test` y selección de las dos primeras filas de ella: + +``` sql +SELECT * +FROM hdfs('hdfs://hdfs1:9000/test', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32') +LIMIT 2 +``` + +``` text +┌─column1─┬─column2─┬─column3─┐ +│ 1 │ 2 │ 3 │ +│ 3 │ 2 │ 1 │ +└─────────┴─────────┴─────────┘ +``` + +**Globs en el camino** + +Múltiples componentes de ruta de acceso pueden tener globs. Para ser procesado, el archivo debe existir y coincidir con todo el patrón de ruta (no solo el sufijo o el prefijo). + +- `*` — Sustituye cualquier número de caracteres excepto `/` incluyendo cadena vacía. +- `?` — Sustituye a cualquier carácter individual. +- `{some_string,another_string,yet_another_one}` — Sustituye cualquiera de las cadenas `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Sustituye cualquier número en el intervalo de N a M, incluidas ambas fronteras. + +Construcciones con `{}` hijo similares a la [función de tabla remota](../../query_language/table_functions/remote.md)). + +**Ejemplo** + +1. Supongamos que tenemos varios archivos con los siguientes URI en HDFS: + +- ‘hdfs://hdfs1:9000/some\_dir/some\_file\_1’ +- ‘hdfs://hdfs1:9000/some\_dir/some\_file\_2’ +- ‘hdfs://hdfs1:9000/some\_dir/some\_file\_3’ +- ‘hdfs://hdfs1:9000/another\_dir/some\_file\_1’ +- ‘hdfs://hdfs1:9000/another\_dir/some\_file\_2’ +- ‘hdfs://hdfs1:9000/another\_dir/some\_file\_3’ + +1. Consulta la cantidad de filas en estos archivos: + + + +``` sql +SELECT count(*) +FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32') +``` + +1. 
Consulta la cantidad de filas en todos los archivos de estos dos directorios: + + + +``` sql +SELECT count(*) +FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV', 'name String, value UInt32') +``` + +!!! warning "Advertencia" + Si su lista de archivos contiene rangos de números con ceros a la izquierda, use la construcción con llaves para cada dígito por separado o use `?`. + +**Ejemplo** + +Consultar los datos desde archivos nombrados `file000`, `file001`, … , `file999`: + +``` sql +SELECT count(*) +FROM hdfs('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32') +``` + +## Columnas virtuales {#virtual-columns} + +- `_path` — Ruta de acceso al archivo. +- `_file` — Nombre del expediente. + +**Ver también** + +- [Columnas virtuales](https://clickhouse.tech/docs/es/operations/table_engines/#table_engines-virtual_columns) + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/table_functions/hdfs/) diff --git a/docs/es/query_language/table_functions/index.md b/docs/es/query_language/table_functions/index.md new file mode 100644 index 00000000000..7b297676d38 --- /dev/null +++ b/docs/es/query_language/table_functions/index.md @@ -0,0 +1,30 @@ +# Funciones de tabla {#table-functions} + +Las funciones de tabla son métodos para construir tablas. + +Puede usar funciones de tabla en: + +- [DE](../select.md#select-from) cláusula de la `SELECT` consulta. + + The method for creating a temporary table that is available only in the current query. The table is deleted when the query finishes. + +- [Crear TABLA COMO \](../create.md#create-table-query) consulta. + + It's one of the methods of creating a table. + +!!! warning "Advertencia" + No puede utilizar funciones de tabla si [Método de codificación de datos:](../../operations/settings/permissions_for_queries.md#settings_allow_ddl) la configuración está deshabilitada. + +| Función | Descripción | +|----------------------|-----------------------------------------------------------------------------------------------------------------------------------| +| [file](file.md) | Crea un [File](../../operations/table_engines/file.md)-mesa del motor. | +| [Fusionar](merge.md) | Crea un [Fusionar](../../operations/table_engines/merge.md)-mesa del motor. | +| [número](numbers.md) | Crea una tabla con una sola columna llena de números enteros. | +| [remoto](remote.md) | Le permite acceder a servidores remotos sin crear un [Distribuido](../../operations/table_engines/distributed.md)-mesa del motor. | +| [URL](url.md) | Crea un [URL](../../operations/table_engines/url.md)-mesa del motor. | +| [mysql](mysql.md) | Crea un [MySQL](../../operations/table_engines/mysql.md)-mesa del motor. | +| [jdbc](jdbc.md) | Crea un [JDBC](../../operations/table_engines/jdbc.md)-mesa del motor. | +| [Nosotros](odbc.md) | Crea un [ODBC](../../operations/table_engines/odbc.md)-mesa del motor. | +| [Hdfs](hdfs.md) | Crea un [HDFS](../../operations/table_engines/hdfs.md)-mesa del motor. | + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/table_functions/) diff --git a/docs/es/query_language/table_functions/input.md b/docs/es/query_language/table_functions/input.md new file mode 100644 index 00000000000..ab89c198c42 --- /dev/null +++ b/docs/es/query_language/table_functions/input.md @@ -0,0 +1,40 @@ +# entrada {#input} + +`input(structure)` - función de tabla que permite convertir e insertar efectivamente los datos enviados al +servidor con estructura dada a la tabla con otra estructura. 
+ +`structure` - estructura de los datos enviados al servidor en el siguiente formato `'column1_name column1_type, column2_name column2_type, ...'`. +Por ejemplo, `'id UInt32, name String'`. + +Esta función sólo se puede utilizar en `INSERT SELECT` consulta y sólo una vez, pero por lo demás se comporta como función de tabla ordinaria +(por ejemplo, se puede usar en subconsulta, etc.). + +Los datos se pueden enviar de cualquier manera como para ordinario `INSERT` Consulta y pasado en cualquier disponible [Formato](../../interfaces/formats.md#formats) +que debe especificarse al final de la consulta (a diferencia de lo ordinario `INSERT SELECT`). + +La característica principal de esta función es que cuando el servidor recibe datos del cliente, los convierte simultáneamente +según la lista de expresiones en el `SELECT` cláusula e inserta en la tabla de destino. Tabla temporal +con todos los datos transferidos no se crea. + +**Ejemplos** + +- Deje que el `test` tiene la siguiente estructura `(a String, b String)` + y datos en `data.csv` Tiene una estructura diferente `(col1 String, col2 Date, col3 Int32)`. Consulta de inserción + datos de la `data.csv` es el `test` con conversión simultánea se ve así: + + + +``` bash +$ cat data.csv | clickhouse-client --query="INSERT INTO test SELECT lower(col1), col3 * col3 FROM input('col1 String, col2 Date, col3 Int32') FORMAT CSV"; +``` + +- Si `data.csv` contiene datos de la misma estructura `test_structure` como la mesa `test` entonces estas dos consultas son iguales: + + + +``` bash +$ cat data.csv | clickhouse-client --query="INSERT INTO test FORMAT CSV" +$ cat data.csv | clickhouse-client --query="INSERT INTO test SELECT * FROM input('test_structure') FORMAT CSV" +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/table_functions/input/) diff --git a/docs/es/query_language/table_functions/jdbc.md b/docs/es/query_language/table_functions/jdbc.md new file mode 100644 index 00000000000..ed13151a713 --- /dev/null +++ b/docs/es/query_language/table_functions/jdbc.md @@ -0,0 +1,22 @@ +# jdbc {#table-function-jdbc} + +`jdbc(jdbc_connection_uri, schema, table)` - devuelve la tabla que está conectado a través del controlador JDBC. + +Esta función de tabla requiere `clickhouse-jdbc-bridge` programa para estar en ejecución. +Admite tipos Nullable (basados en DDL de la tabla remota que se consulta). + +**Ejemplos** + +``` sql +SELECT * FROM jdbc('jdbc:mysql://localhost:3306/?user=root&password=root', 'schema', 'table') +``` + +``` sql +SELECT * FROM jdbc('mysql://localhost:3306/?user=root&password=root', 'schema', 'table') +``` + +``` sql +SELECT * FROM jdbc('datasource://mysql-local', 'schema', 'table') +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/table_functions/jdbc/) diff --git a/docs/es/query_language/table_functions/merge.md b/docs/es/query_language/table_functions/merge.md new file mode 100644 index 00000000000..f1090deca81 --- /dev/null +++ b/docs/es/query_language/table_functions/merge.md @@ -0,0 +1,7 @@ +# Fusionar {#merge} + +`merge(db_name, 'tables_regexp')` – Crea una tabla de combinación temporal. Para obtener más información, consulte la sección “Table engines, Merge”. + +La estructura de la tabla se toma de la primera tabla encontrada que coincide con la expresión regular. 
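+
+Un esquema mínimo de uso (la expresión regular `'^visits'` es solo ilustrativa y se asume la función `currentDatabase()` para referirse a la base de datos actual):
+
+``` sql
+-- lee, como una sola tabla, todas las tablas de la base de datos actual
+-- cuyos nombres empiezan por 'visits'
+SELECT count() FROM merge(currentDatabase(), '^visits')
+```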
+ +[Artículo Original](https://clickhouse.tech/docs/es/query_language/table_functions/merge/) diff --git a/docs/es/query_language/table_functions/mysql.md b/docs/es/query_language/table_functions/mysql.md new file mode 100644 index 00000000000..9fa4d272b63 --- /dev/null +++ b/docs/es/query_language/table_functions/mysql.md @@ -0,0 +1,79 @@ +# mysql {#mysql}
+
+Permite ejecutar consultas `SELECT` sobre los datos almacenados en un servidor MySQL remoto.
+
+``` sql
+mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']);
+```
+
+**Parámetros**
+
+- `host:port` — Dirección del servidor MySQL.
+
+- `database` — Nombre de la base de datos remota.
+
+- `table` — Nombre de la tabla remota.
+
+- `user` — Usuario de MySQL.
+
+- `password` — Contraseña del usuario.
+
+- `replace_query` — Indicador que convierte las consultas `INSERT INTO` en `REPLACE INTO`. Si `replace_query=1`, la consulta se reemplaza.
+
+- `on_duplicate_clause` — Expresión `ON DUPLICATE KEY on_duplicate_clause` que se añade a la consulta `INSERT`.
+
+    Example: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, where `on_duplicate_clause` is `UPDATE c2 = c2 + 1`. See the MySQL documentation to find which `on_duplicate_clause` you can use with the `ON DUPLICATE KEY` clause.
+
+    To specify `on_duplicate_clause` you need to pass `0` to the `replace_query` parameter. If you simultaneously pass `replace_query = 1` and `on_duplicate_clause`, ClickHouse generates an exception.
+
+Las cláusulas `WHERE` simples, tales como `=, !=, >, >=, <, <=`, se ejecutan actualmente en el servidor MySQL.
+
+El resto de las condiciones y la restricción `LIMIT` se ejecutan en ClickHouse solo después de que finaliza la consulta a MySQL.
+
+**Valor devuelto**
+
+Un objeto de tabla con las mismas columnas que la tabla MySQL original.
+ +## Ejemplo de uso {#usage-example} + +Tabla en MySQL: + +``` text +mysql> CREATE TABLE `test`.`test` ( + -> `int_id` INT NOT NULL AUTO_INCREMENT, + -> `int_nullable` INT NULL DEFAULT NULL, + -> `float` FLOAT NOT NULL, + -> `float_nullable` FLOAT NULL DEFAULT NULL, + -> PRIMARY KEY (`int_id`)); +Query OK, 0 rows affected (0,09 sec) + +mysql> insert into test (`int_id`, `float`) VALUES (1,2); +Query OK, 1 row affected (0,00 sec) + +mysql> select * from test; ++--------+--------------+-------+----------------+ +| int_id | int_nullable | float | float_nullable | ++--------+--------------+-------+----------------+ +| 1 | NULL | 2 | NULL | ++--------+--------------+-------+----------------+ +1 row in set (0,00 sec) +``` + +Selección de datos de ClickHouse: + +``` sql +SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123') +``` + +``` text +┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ +│ 1 │ ᴺᵁᴸᴸ │ 2 │ ᴺᵁᴸᴸ │ +└────────┴──────────────┴───────┴────────────────┘ +``` + +## Ver también {#see-also} + +- [El ‘MySQL’ motor de mesa](../../operations/table_engines/mysql.md) +- [Uso de MySQL como fuente de diccionario externo](../dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-mysql) + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/table_functions/mysql/) diff --git a/docs/es/query_language/table_functions/numbers.md b/docs/es/query_language/table_functions/numbers.md new file mode 100644 index 00000000000..59d67cc6ff8 --- /dev/null +++ b/docs/es/query_language/table_functions/numbers.md @@ -0,0 +1,23 @@ +# número {#numbers} + +`numbers(N)` – Devuelve una tabla con el único ‘number’ columna (UInt64) que contiene enteros de 0 a N-1. +`numbers(N, M)` - Devuelve una tabla con el único ‘number’ columna (UInt64) que contiene enteros de N a (N + M - 1). + +Similar a la `system.numbers` tabla, puede ser utilizado para probar y generar valores sucesivos, `numbers(N, M)` más eficiente que `system.numbers`. + +Las siguientes consultas son equivalentes: + +``` sql +SELECT * FROM numbers(10); +SELECT * FROM numbers(0, 10); +SELECT * FROM system.numbers LIMIT 10; +``` + +Ejemplos: + +``` sql +-- Generate a sequence of dates from 2010-01-01 to 2010-12-31 +select toDate('2010-01-01') + number as d FROM numbers(365); +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/table_functions/numbers/) diff --git a/docs/es/query_language/table_functions/odbc.md b/docs/es/query_language/table_functions/odbc.md new file mode 100644 index 00000000000..13a77aa90d5 --- /dev/null +++ b/docs/es/query_language/table_functions/odbc.md @@ -0,0 +1,101 @@ +# Nosotros {#table-functions-odbc} + +Devuelve la tabla que está conectada a través de [ODBC](https://en.wikipedia.org/wiki/Open_Database_Connectivity). + +``` sql +odbc(connection_settings, external_database, external_table) +``` + +Parámetros: + +- `connection_settings` — Nombre de la sección con ajustes de conexión en el `odbc.ini` file. +- `external_database` — Nombre de una base de datos en un DBMS externo. +- `external_table` — Nombre de una tabla en el `external_database`. + +Para implementar con seguridad conexiones ODBC, ClickHouse usa un programa separado `clickhouse-odbc-bridge`. Si el controlador ODBC se carga directamente desde `clickhouse-server`, problemas de controlador pueden bloquear el servidor ClickHouse. ClickHouse se inicia automáticamente `clickhouse-odbc-bridge` cuando se requiere. El programa de puente ODBC se instala desde el mismo paquete que el `clickhouse-server`. 
+ +Los campos con el `NULL` Los valores de la tabla externa se convierten en los valores predeterminados para el tipo de datos base. Por ejemplo, si un campo de tabla MySQL remoto tiene `INT NULL` tipo se convierte a 0 (el valor predeterminado para ClickHouse `Int32` tipo de datos). + +## Ejemplo de uso {#usage-example} + +**Obtener datos de la instalación local de MySQL a través de ODBC** + +Este ejemplo se comprueba para Ubuntu Linux 18.04 y el servidor MySQL 5.7. + +Asegúrese de que unixODBC y MySQL Connector están instalados. + +De forma predeterminada (si se instala desde paquetes), ClickHouse comienza como usuario `clickhouse`. Por lo tanto, debe crear y configurar este usuario en el servidor MySQL. + +``` bash +$ sudo mysql +``` + +``` sql +mysql> CREATE USER 'clickhouse'@'localhost' IDENTIFIED BY 'clickhouse'; +mysql> GRANT ALL PRIVILEGES ON *.* TO 'clickhouse'@'clickhouse' WITH GRANT OPTION; +``` + +A continuación, configure la conexión en `/etc/odbc.ini`. + +``` bash +$ cat /etc/odbc.ini +[mysqlconn] +DRIVER = /usr/local/lib/libmyodbc5w.so +SERVER = 127.0.0.1 +PORT = 3306 +DATABASE = test +USERNAME = clickhouse +PASSWORD = clickhouse +``` + +Puede verificar la conexión usando el `isql` utilidad desde la instalación de unixODBC. + +``` bash +$ isql -v mysqlconn ++---------------------------------------+ +| Connected! | +| | +... +``` + +Tabla en MySQL: + +``` text +mysql> CREATE TABLE `test`.`test` ( + -> `int_id` INT NOT NULL AUTO_INCREMENT, + -> `int_nullable` INT NULL DEFAULT NULL, + -> `float` FLOAT NOT NULL, + -> `float_nullable` FLOAT NULL DEFAULT NULL, + -> PRIMARY KEY (`int_id`)); +Query OK, 0 rows affected (0,09 sec) + +mysql> insert into test (`int_id`, `float`) VALUES (1,2); +Query OK, 1 row affected (0,00 sec) + +mysql> select * from test; ++--------+--------------+-------+----------------+ +| int_id | int_nullable | float | float_nullable | ++--------+--------------+-------+----------------+ +| 1 | NULL | 2 | NULL | ++--------+--------------+-------+----------------+ +1 row in set (0,00 sec) +``` + +Recuperación de datos de la tabla MySQL en ClickHouse: + +``` sql +SELECT * FROM odbc('DSN=mysqlconn', 'test', 'test') +``` + +``` text +┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ +│ 1 │ 0 │ 2 │ 0 │ +└────────┴──────────────┴───────┴────────────────┘ +``` + +## Ver también {#see-also} + +- [Diccionarios externos ODBC](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-odbc) +- [Motor de tabla ODBC](../../operations/table_engines/odbc.md). + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/table_functions/jdbc/) diff --git a/docs/es/query_language/table_functions/remote.md b/docs/es/query_language/table_functions/remote.md new file mode 100644 index 00000000000..922c038b1e2 --- /dev/null +++ b/docs/es/query_language/table_functions/remote.md @@ -0,0 +1,76 @@ +# Remoto, remoteSecure {#remote-remotesecure} + +Le permite acceder a servidores remotos sin crear un `Distributed` tabla. + +Firma: + +``` sql +remote('addresses_expr', db, table[, 'user'[, 'password']]) +remote('addresses_expr', db.table[, 'user'[, 'password']]) +``` + +`addresses_expr` – Una expresión que genera direcciones de servidores remotos. Esta puede ser solo una dirección de servidor. La dirección del servidor es `host:port` O simplemente `host`. El host se puede especificar como nombre de servidor o como dirección IPv4 o IPv6. Una dirección IPv6 se especifica entre corchetes. El puerto es el puerto TCP del servidor remoto. 
Si se omite el puerto, utiliza `tcp_port` del archivo de configuración del servidor (por defecto, 9000). + +!!! important "Importante" + El puerto es necesario para una dirección IPv6. + +Ejemplos: + +``` text +example01-01-1 +example01-01-1:9000 +localhost +127.0.0.1 +[::]:9000 +[2a02:6b8:0:1111::11]:9000 +``` + +Se pueden separar varias direcciones por comas. En este caso, ClickHouse usará procesamiento distribuido, por lo que enviará la consulta a todas las direcciones especificadas (como a fragmentos con datos diferentes). + +Ejemplo: + +``` text +example01-01-1,example01-02-1 +``` + +Parte de la expresión se puede especificar entre llaves. El ejemplo anterior se puede escribir de la siguiente manera: + +``` text +example01-0{1,2}-1 +``` + +Los corchetes rizados pueden contener un rango de números separados por dos puntos (enteros no negativos). En este caso, el rango se expande a un conjunto de valores que generan direcciones de fragmentos. Si el primer número comienza con cero, los valores se forman con la misma alineación cero. El ejemplo anterior se puede escribir de la siguiente manera: + +``` text +example01-{01..02}-1 +``` + +Si tiene varios pares de llaves, genera el producto directo de los conjuntos correspondientes. + +Las direcciones y partes de las direcciones entre llaves se pueden separar mediante el símbolo de tubería (\|). En este caso, los conjuntos de direcciones correspondientes se interpretan como réplicas y la consulta se enviará a la primera réplica en buen estado. Sin embargo, las réplicas se iteran en el orden establecido actualmente en el [load\_balancing](../../operations/settings/settings.md) configuración. + +Ejemplo: + +``` text +example01-{01..02}-{1|2} +``` + +En este ejemplo se especifican dos fragmentos que tienen dos réplicas cada uno. + +El número de direcciones generadas está limitado por una constante. En este momento esto es 1000 direcciones. + +Uso de la `remote` función de la tabla es menos óptima que la creación de un `Distributed` mesa, porque en este caso, la conexión del servidor se restablece para cada solicitud. Además, si se establecen nombres de host, los nombres se resuelven y los errores no se cuentan cuando se trabaja con varias réplicas. Cuando procese un gran número de consultas, cree siempre el `Distributed` mesa antes de tiempo, y no utilice el `remote` función de la tabla. + +El `remote` puede ser útil en los siguientes casos: + +- Acceder a un servidor específico para la comparación de datos, la depuración y las pruebas. +- Consultas entre varios clústeres de ClickHouse con fines de investigación. +- Solicitudes distribuidas poco frecuentes que se realizan manualmente. +- Solicitudes distribuidas donde el conjunto de servidores se redefine cada vez. + +Si el usuario no está especificado, `default` se utiliza. +Si no se especifica la contraseña, se utiliza una contraseña vacía. + +`remoteSecure` - igual que `remote` pero con conexión segura. Puerto predeterminado — [Tcp\_port\_secure](../../operations/server_settings/settings.md#server_settings-tcp_port_secure) de configuración o 9440. 
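+
+Un par de ejemplos mínimos (las direcciones siguen el patrón de los ejemplos anteriores; la base de datos y la tabla `default.hits` son hipotéticas):
+
+``` sql
+-- consulta directa a un servidor remoto con el usuario por defecto
+SELECT count() FROM remote('example01-01-1', default.hits, 'default', '')
+
+-- la misma consulta sobre dos fragmentos, con conexión segura (puerto 9440)
+SELECT count() FROM remoteSecure('example01-0{1,2}-1:9440', default.hits)
+```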
+ +[Artículo Original](https://clickhouse.tech/docs/es/query_language/table_functions/remote/) diff --git a/docs/es/query_language/table_functions/url.md b/docs/es/query_language/table_functions/url.md new file mode 100644 index 00000000000..fb7cee98f9a --- /dev/null +++ b/docs/es/query_language/table_functions/url.md @@ -0,0 +1,19 @@ +# URL {#url} + +`url(URL, format, structure)` - devuelve una tabla creada a partir del `URL` con dado +`format` y `structure`. + +URL - Dirección de servidor HTTP o HTTPS, que puede aceptar `GET` y/o `POST` peticiones. + +Formato - [Formato](../../interfaces/formats.md#formats) de los datos. + +estructura - estructura de la tabla en `'UserID UInt64, Name String'` formato. Determina los nombres y tipos de columna. + +**Ejemplo** + +``` sql +-- getting the first 3 lines of a table that contains columns of String and UInt32 type from HTTP-server which answers in CSV format. +SELECT * FROM url('http://127.0.0.1:12345/', CSV, 'column1 String, column2 UInt32') LIMIT 3 +``` + +[Artículo Original](https://clickhouse.tech/docs/es/query_language/table_functions/url/) diff --git a/docs/es/roadmap.md b/docs/es/roadmap.md new file mode 100644 index 00000000000..6bd63d64a8c --- /dev/null +++ b/docs/es/roadmap.md @@ -0,0 +1,12 @@ +# Hoja de ruta {#roadmap} + +## Q1 2020 {#q1-2020} + +- Control de acceso basado en roles + +## Q2 2020 {#q2-2020} + +- Integración con servicios de autenticación externos +- Grupos de recursos para una distribución más precisa de la capacidad del clúster entre los usuarios + +{## [Artículo Original](https://clickhouse.tech/docs/es/roadmap/) ##} diff --git a/docs/es/security_changelog.md b/docs/es/security_changelog.md new file mode 100644 index 00000000000..1658c348279 --- /dev/null +++ b/docs/es/security_changelog.md @@ -0,0 +1,69 @@ +## Corregido en la versión de ClickHouse 19.14.3.3, 2019-09-10 {#fixed-in-clickhouse-release-19-14-3-3-2019-09-10} + +### ¿Qué puedes encontrar en Neodigit {#cve-2019-15024} + +Un atacante que tenga acceso de escritura a ZooKeeper y que pueda ejecutar un servidor personalizado disponible desde la red donde se ejecuta ClickHouse, puede crear un servidor malicioso personalizado que actuará como una réplica de ClickHouse y lo registrará en ZooKeeper. Cuando otra réplica recuperará la parte de datos de la réplica maliciosa, puede forzar a clickhouse-server a escribir en una ruta arbitraria en el sistema de archivos. + +Créditos: Eldar Zaitov del equipo de seguridad de la información de Yandex + +### ¿Qué puedes encontrar en Neodigit {#cve-2019-16535} + +Аn La lectura OOB, la escritura OOB y el desbordamiento de enteros en los algoritmos de descompresión se pueden usar para lograr RCE o DoS a través del protocolo nativo. + +Créditos: Eldar Zaitov del equipo de seguridad de la información de Yandex + +### ¿Qué puedes encontrar en Neodigit {#cve-2019-16536} + +Un cliente autenticado malintencionado puede desencadenar el desbordamiento de pila que conduce a DoS. + +Créditos: Eldar Zaitov del equipo de seguridad de la información de Yandex + +## Corregido en la versión de ClickHouse 19.13.6.1, 2019-09-20 {#fixed-in-clickhouse-release-19-13-6-1-2019-09-20} + +### ¿Qué puedes encontrar en Neodigit {#cve-2019-18657} + +Función de la tabla `url` la vulnerabilidad permitió al atacante inyectar encabezados HTTP arbitrarios en la solicitud. 
+ +Crédito: [Nikita Tikhomirov](https://github.com/NSTikhomirov) + +## Corregido en la versión de ClickHouse 18.12.13, 2018-09-10 {#fixed-in-clickhouse-release-18-12-13-2018-09-10} + +### ¿Qué puedes encontrar en Neodigit {#cve-2018-14672} + +Las funciones para cargar modelos CatBoost permitieron el recorrido de ruta y la lectura de archivos arbitrarios a través de mensajes de error. + +Créditos: Andrey Krasichkov del equipo de seguridad de la información de Yandex + +## Corregido en la versión de ClickHouse 18.10.3, 2018-08-13 {#fixed-in-clickhouse-release-18-10-3-2018-08-13} + +### ¿Qué puedes encontrar en Neodigit {#cve-2018-14671} + +unixODBC permitía cargar objetos compartidos arbitrarios desde el sistema de archivos, lo que provocó una vulnerabilidad de ejecución remota de código. + +Créditos: Andrey Krasichkov y Evgeny Sidorov del equipo de seguridad de la información de Yandex + +## Corregido en la versión de ClickHouse 1.1.54388, 2018-06-28 {#fixed-in-clickhouse-release-1-1-54388-2018-06-28} + +### ¿Qué puedes encontrar en Neodigit {#cve-2018-14668} + +“remote” función de tabla permitió símbolos arbitrarios en “user”, “password” y “default\_database” campos que llevaron a ataques de falsificación de solicitudes de protocolo cruzado. + +Créditos: Andrey Krasichkov del equipo de seguridad de la información de Yandex + +## Corregido en la versión de ClickHouse 1.1.54390, 2018-07-06 {#fixed-in-clickhouse-release-1-1-54390-2018-07-06} + +### ¿Qué puedes encontrar en Neodigit {#cve-2018-14669} + +ClickHouse cliente MySQL tenía “LOAD DATA LOCAL INFILE” funcionalidad habilitada que permitió a una base de datos MySQL maliciosa leer archivos arbitrarios desde el servidor ClickHouse conectado. + +Créditos: Andrey Krasichkov y Evgeny Sidorov del equipo de seguridad de la información de Yandex + +## Corregido en la versión de ClickHouse 1.1.54131, 2017-01-10 {#fixed-in-clickhouse-release-1-1-54131-2017-01-10} + +### ¿Qué puedes encontrar en Neodigit {#cve-2018-14670} + +Una configuración incorrecta en el paquete deb podría conducir al uso no autorizado de la base de datos. + +Créditos: Centro Nacional de Seguridad Cibernética del Reino Unido (NCSC) + +[Artículo Original](https://clickhouse.tech/docs/es/security_changelog/) diff --git a/docs/fa/commercial/cloud.md b/docs/fa/commercial/cloud.md new file mode 120000 index 00000000000..eb58e4a90be --- /dev/null +++ b/docs/fa/commercial/cloud.md @@ -0,0 +1 @@ +../../en/commercial/cloud.md \ No newline at end of file diff --git a/docs/fa/data_types/array.md b/docs/fa/data_types/array.md index c4c5d409860..d134005c255 100644 --- a/docs/fa/data_types/array.md +++ b/docs/fa/data_types/array.md @@ -1,8 +1,9 @@ -
+
-# Array(T) +# Array(T) {#arrayt} آرایه ای از عناصر با تایپ T. تایپ T می تواند هر Type باشد، از جمله یک آرایه. ما توصیه به استفاده از آرایه های multidimensional نمی کنیم، چون آنها به خوبی پشتیبانی نمی شوند (برای مثال، شما نمی تونید در جداولی که موتور آنها MergeTree است، آرایه های multidimensional ذخیره سازی کنید).
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/data_types/array/) diff --git a/docs/fa/data_types/boolean.md b/docs/fa/data_types/boolean.md index c4ac8496383..4b4338f6313 100644 --- a/docs/fa/data_types/boolean.md +++ b/docs/fa/data_types/boolean.md @@ -1,6 +1,6 @@ -
+
-# مقادیر Boolean +# مقادیر Boolean {#mqdyr-boolean} type مخصوص مقادیر boolean وجود ندارد. از Uint8 و محدود شده به 0 و 1 می توان استفاده کرد. diff --git a/docs/fa/data_types/date.md b/docs/fa/data_types/date.md index 21e84f2be7b..6daf1574b6d 100644 --- a/docs/fa/data_types/date.md +++ b/docs/fa/data_types/date.md @@ -1,10 +1,11 @@ -
+
-# Date +# Date {#date} Date، دو بایت به ازای هر تاریخ که به صورت عددی و از تاریخ 01-01-1970 می باشد ذخیره می کند (unsigned).این type به شما اجازه ی ذخیره سازی تاریخ های از ابتدای Unix Epoch تا بالاترین مقدار قابل پشتیبانی توسط این استاندارد را می دهد (در حال حاضر بالاترین مقدار این روش سال 2106 می باشد، اما سال آخری که به طور کامل پشتیبانی می شود سال 2105 است). کمترین مقدار این type در خروجی 0000-00-00 می باشد. Date بدون time zone ذخیره می شود.
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/data_types/date/) diff --git a/docs/fa/data_types/datetime.md b/docs/fa/data_types/datetime.md index 79c2166274a..eeccd3dcc60 100644 --- a/docs/fa/data_types/datetime.md +++ b/docs/fa/data_types/datetime.md @@ -1,10 +1,10 @@ -
+
-# DateTime {#data_type-datetime} +# DateTime {#data-type-datetime} تاریخ با ساعت 4 بایت به صورت Unix timestamp ذخیره می کند (unsigned). به شما اجازه ی ذخیره سازی در محدوده ی تایپ Date را می دهد. حداقل مقدار در خروجی 0000-00-00 00:00:00 می باشد. زمان با دقت تا یک ثانیه ذخیره می شود. -## Time zones +## Time zones {#time-zones} این type از text به باینری تبدیل می شود، و در هنگام برگشت با توجه به time zone سرور، در زمانی که کلاینت یا سرور شروع به کار می کند تبدیل می شود. در فرمت text، اطلاعات DST از دست می رود. diff --git a/docs/fa/data_types/enum.md b/docs/fa/data_types/enum.md index dddd6ace97b..80e5bdba7a5 100644 --- a/docs/fa/data_types/enum.md +++ b/docs/fa/data_types/enum.md @@ -1,6 +1,6 @@ -
+
-# Enum +# Enum {#enum} Enum8 یا Enum16، به شما اجازه ی ذخیره سازی مجموعه ای محدود از رشته ها را می دهد که کارآمادتر از String می باشد. @@ -8,19 +8,17 @@ Enum8 یا Enum16، به شما اجازه ی ذخیره سازی مجموعه
-``` -Enum8('hello' = 1, 'world' = 2) -``` + Enum8('hello' = 1, 'world' = 2) -
+
-- مقدار داخل این ستون می تواند یکی از دو مقدار 'hello' یا 'world' باشد. +- مقدار داخل این ستون می تواند یکی از دو مقدار ‘hello’ یا ‘world’ باشد. هرکدام از مقادیر یک عدد در محدوده ی `-128 ... 127` برتی `Enum8` و در محدوده ی `-32768 ... 32767` برای `Enum16` می باشد. تمام رشته ها و اعداد باید متفاوت باشند. رشته ی خالی مجاز است. اگر این type مشخص شده باشد (در تعریف جدول)، اعداد می توانند به صورت دلخواه مرتب شوند. با این حال، ترتیب در اینجا مهم نیست. در RAM، این type مشابه `Int8` یا `Int16` ذخیره می شود. هنگام خواندن در فرم متنی، ClickHouse مقدار را به صورت String پارس می کند و رشته ی مربوطه را در مقادیر Enum جستجو می کند. اگر رشته را پیدا کند یک expection پرتاب می شود. در هنگام خواندن در فرمت text، رشته خواند می شود و مقدار عددی مربوطه مورد بررسی قرار می گیرد. اگر مقدار پیدا نشود expection پرتاب می شود. هنگام نوشتن، مقدار با رشته ی مربوط نوشته می شود. اگر داده ی ستون دارای garbage باشد، (اعدادی که از مجموعه ی معتبرنیستند)، یک expection پرتاب می شود. هنگام خواندن و نوشتن به صورت باینری، این type شبیه به Int8 و Int16 کار می کند. The implicit default value is the value with the lowest number. -در حین `ORDER BY`, `GROUP BY`, `IN`, `DISTINCT` و...، این type رفتاری مشابه با اعداد دارد. برای مثال ORDER BY به صورت عددی اینها رو مرتب می کند.اپراتور های مقایسه ای و مساوی عمل مشابهی در Enums دارند. +در حین `ORDER BY`, `GROUP BY`, `IN`, `DISTINCT` و…، این type رفتاری مشابه با اعداد دارد. برای مثال ORDER BY به صورت عددی اینها رو مرتب می کند.اپراتور های مقایسه ای و مساوی عمل مشابهی در Enums دارند. مقادیر Enum نمیتوانند با اعداد مقایسه شوند. مقادیر Enum را می توان با رشته ی ثابت مقایسه کرد. اگر رشته ی مقایسه، مقدار معتبری برای مقایس به Enum نداشته باشد، یک expetion رخ می دهد. اپراتور IN در Enum پشتیبانی می شود؛ به این صورت که ستون enum در سمت چپ و مجموعه از رشته ها در سمت راست قرار می گیرند. رشته ها مقادیر مربوط به Enum هستند. @@ -31,4 +29,5 @@ Enum8('hello' = 1, 'world' = 2) استفاده از ALTER برای تبدیل Enum8 به Enum16 یا برعکس، ممکن است، دقیقا شبیه به Int8 به Int16.
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/data_types/enum/) diff --git a/docs/fa/data_types/fixedstring.md b/docs/fa/data_types/fixedstring.md index f006834d9bb..e89b4fb0ad5 100644 --- a/docs/fa/data_types/fixedstring.md +++ b/docs/fa/data_types/fixedstring.md @@ -1,10 +1,11 @@ -
+
-# FixedString(N) +# FixedString(N) {#fixedstringn} یک رشته با طول ثابت N بایت (fixed-length) (نه تعداد کاراکتر یا code point). N باید یک عدد طبیعی مثبت باشد. هنگامی که سرور رشته ای با اندازه ی کمتر از N میخواند (مثل زمان پارس کردن برای INSERT داده ها)، سمت راست رشته به اندازه فضای خالی باقی مانده به بایت، null درج می شود. زمانی که سرور رشته ای بزرگتر از N میخواند، پیغام خطا بر میگردد. زمانی که سرور یک رشته با طول ثابت را می نویسد (مثلا در زمانی که خروجی یک SELECT را برمیگرداند)، بایت های null از انتهای رشته حذف نمی شوند، و در خروجی می آیند. توجه داشته باشید که این رفتار متفاوت از رفتار MySQL برای Char می باشد (زمانی که رشته با space پر می شود و در خروجی space ها حذف می شود). توابع کمتری نسبت به String برای FixedString(N) وجود دارد، و برای استفاده کمتر مناسب است.
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/data_types/fixedstring/) diff --git a/docs/fa/data_types/float.md b/docs/fa/data_types/float.md index 8c4466663a8..7c156f2d976 100644 --- a/docs/fa/data_types/float.md +++ b/docs/fa/data_types/float.md @@ -1,42 +1,41 @@ -
+
-# Float32, Float64 +# Float32, Float64 {#float32-float64} [اعداد Float](https://en.wikipedia.org/wiki/IEEE_754). Type های float در ClickHouse مشابه C می باشد: -- `Float32` - `float` -- `Float64` - `double` +- `Float32` - `float` +- `Float64` - `double` توصیه می کنیم که داده ها را هرزمان که امکان پذیره است به جای float به صورت int ذخیره کنید. برای مثال: تبدیل دقت اعداد به یک مقدار int، مثل سرعت page load در قالب میلی ثانیه. -## استفاده از اعداد Float +## استفاده از اعداد Float {#stfdh-z-dd-float} -- محاسبات با اعداد با Float ممکن است خطای round شدن را ایجاد کنند. +- محاسبات با اعداد با Float ممکن است خطای round شدن را ایجاد کنند.
``` sql SELECT 1 - 0.9 ``` -``` -┌───────minus(1, 0.9)─┐ -│ 0.09999999999999998 │ -└─────────────────────┘ -``` -
+ ┌───────minus(1, 0.9)─┐ + │ 0.09999999999999998 │ + └─────────────────────┘ -- نتایج محاسبات بسته به متد محاسباتی می باشد (نوع processor و معماری سیستم). -- محاسبات Float ممکن اسن نتایجی مثل infinity (`inf`) و "Not-a-number" (`Nan`) داشته باشد. این در هنگام پردازش نتایج محاسبات باید مورد توجه قرار گیرد. -- هنگام خواندن اعداد float از سطر ها، نتایج ممکن است نزدیک به اعداد machine-representable نباشد. +
-## NaN و Inf {#data_type-float-nan-inf} +- نتایج محاسبات بسته به متد محاسباتی می باشد (نوع processor و معماری سیستم). +- محاسبات Float ممکن اسن نتایجی مثل infinity (`inf`) و «Not-a-number» (`Nan`) داشته باشد. این در هنگام پردازش نتایج محاسبات باید مورد توجه قرار گیرد. +- هنگام خواندن اعداد float از سطر ها، نتایج ممکن است نزدیک به اعداد machine-representable نباشد. + +## NaN و Inf {#data-type-float-nan-inf} در مقابل استاندارد SQL، ClickHouse از موارد زیر مربوط به اعداد float پشتیبانی می کند: -- `Inf` – Infinity. +- `Inf` – Infinity.
@@ -44,15 +43,13 @@ SELECT 1 - 0.9 SELECT 0.5 / 0 ``` -``` -┌─divide(0.5, 0)─┐ -│ inf │ -└────────────────┘ -``` + ┌─divide(0.5, 0)─┐ + │ inf │ + └────────────────┘ -
+
-- `-Inf` – Negative infinity. +- `-Inf` – Negative infinity.
@@ -60,31 +57,26 @@ SELECT 0.5 / 0 SELECT -0.5 / 0 ``` -``` -┌─divide(-0.5, 0)─┐ -│ -inf │ -└─────────────────┘ -``` + ┌─divide(-0.5, 0)─┐ + │ -inf │ + └─────────────────┘ -
+
-- `NaN` – Not a number. +- `NaN` – Not a number.
-``` -SELECT 0 / 0 -``` + SELECT 0 / 0 -``` -┌─divide(0, 0)─┐ -│ nan │ -└──────────────┘ -``` + ┌─divide(0, 0)─┐ + │ nan │ + └──────────────┘ -
+
-قوانین مربوط به مرتب سازی ` Nan ` را در بخش [ORDER BY clause](../query_language/select.md) ببینید. +قوانین مربوط به مرتب سازی `Nan` را در بخش [ORDER BY clause](../query_language/select.md) ببینید.
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/data_types/float/) diff --git a/docs/fa/data_types/index.md b/docs/fa/data_types/index.md index d8910be367c..b0d15860d87 100644 --- a/docs/fa/data_types/index.md +++ b/docs/fa/data_types/index.md @@ -1,6 +1,6 @@ -
+
-# Data types {#data_types} +# Data types {#data-types} ClickHouse قابلیت ذخیره سازی انواع type های مختلف برای ذخیره داده ها در جداول را دارا می باشد. diff --git a/docs/fa/data_types/int_uint.md b/docs/fa/data_types/int_uint.md index 383eb978a4b..caa0204a9e8 100644 --- a/docs/fa/data_types/int_uint.md +++ b/docs/fa/data_types/int_uint.md @@ -1,23 +1,23 @@ -
+
-# UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64 +# UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64 {#uint8-uint16-uint32-uint64-int8-int16-int32-int64} اعداد با طول مشخص (Fixed-length) با یا بدون sign +## محدوده ی Int {#mhdwdh-y-int} -## محدوده ی Int +- Int8 - \[-128 : 127\] +- Int16 - \[-32768 : 32767\] +- Int32 - \[-2147483648 : 2147483647\] +- Int64 - \[-9223372036854775808 : 9223372036854775807\] -- Int8 - [-128 : 127] -- Int16 - [-32768 : 32767] -- Int32 - [-2147483648 : 2147483647] -- Int64 - [-9223372036854775808 : 9223372036854775807] +## محدوده ی Uint {#mhdwdh-y-uint} -## محدوده ی Uint - -- UInt8 - [0 : 255] -- UInt16 - [0 : 65535] -- UInt32 - [0 : 4294967295] -- UInt64 - [0 : 18446744073709551615] +- UInt8 - \[0 : 255\] +- UInt16 - \[0 : 65535\] +- UInt32 - \[0 : 4294967295\] +- UInt64 - \[0 : 18446744073709551615\]
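One way to see these ranges in practice is to ask the server which type it infers for a boundary literal; a minimal sketch (the column aliases are arbitrary):

``` sql
SELECT
    toTypeName(255)                  AS t1,  -- UInt8
    toTypeName(256)                  AS t2,  -- UInt16
    toTypeName(-1)                   AS t3,  -- Int8
    toTypeName(18446744073709551615) AS t4   -- UInt64
```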
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/data_types/int_uint/) diff --git a/docs/fa/data_types/nested_data_structures/aggregatefunction.md b/docs/fa/data_types/nested_data_structures/aggregatefunction.md index e0345142080..c3de6f586fd 100644 --- a/docs/fa/data_types/nested_data_structures/aggregatefunction.md +++ b/docs/fa/data_types/nested_data_structures/aggregatefunction.md @@ -1,8 +1,9 @@ -
+
-# AggregateFunction(name, types_of_arguments...) +# AggregateFunction(name, types\_of\_arguments…) {#aggregatefunctionname-types-of-arguments} -حالت متوسط از توابع aggregate. برای دریافت آن، از توابع aggregate به همراه پسوند '-State' استفاده کنید. برای اطلاعات بیشتر قسمت "AggregatingMergeTree" را ببینید. +حالت میانی از توابع aggregate. برای دریافت آن، از توابع aggregate به همراه پسوند ‘-State’ استفاده کنید. برای اطلاعات بیشتر قسمت «AggregatingMergeTree» را ببینید.
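A minimal sketch of the ‘-State’ / ‘-Merge’ round trip described above; the table name `agg_demo` and the newer `ENGINE = AggregatingMergeTree() ORDER BY ...` syntax are assumptions made for this example:

``` sql
CREATE TABLE agg_demo
(
    k UInt8,
    s AggregateFunction(uniq, UInt64)
)
ENGINE = AggregatingMergeTree() ORDER BY k;

-- store the intermediate state produced by the -State suffix
INSERT INTO agg_demo
SELECT toUInt8(intDiv(number, 50)) AS k, uniqState(number)
FROM numbers(100)
GROUP BY k;

-- read the states back and finalize them with the -Merge suffix
SELECT k, uniqMerge(s) FROM agg_demo GROUP BY k;
```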
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/data_types/nested_data_structures/aggregatefunction/) diff --git a/docs/fa/data_types/nested_data_structures/index.md b/docs/fa/data_types/nested_data_structures/index.md index ed3d9119815..419af3b9294 100644 --- a/docs/fa/data_types/nested_data_structures/index.md +++ b/docs/fa/data_types/nested_data_structures/index.md @@ -1,6 +1,6 @@ -
+
-# Nested data structures +# Nested data structures {#nested-data-structures}
diff --git a/docs/fa/data_types/nested_data_structures/nested.md b/docs/fa/data_types/nested_data_structures/nested.md index b97096b49c8..dc1b1ffefab 100644 --- a/docs/fa/data_types/nested_data_structures/nested.md +++ b/docs/fa/data_types/nested_data_structures/nested.md @@ -1,6 +1,6 @@ -
+
-# Nested(Name1 Type1, Name2 Type2, ...) +# Nested(Name1 Type1, Name2 Type2, …) {#nestedname1-type1-name2-type2} ساختار داده ی nested شبیه به یک جدول nested می باشد. پارامترهای ساختار داده nested، نام ستون ها و type های آنها مشابه دستور CREATE، مشخص می شود. هر سطر از جدول می تواند به هر تعداد سطر در ساختار داده nested مربوط شود. @@ -31,7 +31,7 @@ CREATE TABLE test.visits ) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign) ``` -
+
این مثال `Goals` را به عنوان یک ساختار داده nested تعریف می کند، که می تواند شامل داده های مربوط به conversion (اهداف رسیده) باشد. هر سطر در جدول `visit` می تواند با صفر یا چند coversion ارتباط داشته باشد. @@ -52,26 +52,24 @@ WHERE CounterID = 101500 AND length(Goals.ID) < 5 LIMIT 10 ``` -``` -┌─Goals.ID───────────────────────┬─Goals.EventTime───────────────────────────────────────────────────────────────────────────┐ -│ [1073752,591325,591325] │ ['2014-03-17 16:38:10','2014-03-17 16:38:48','2014-03-17 16:42:27'] │ -│ [1073752] │ ['2014-03-17 00:28:25'] │ -│ [1073752] │ ['2014-03-17 10:46:20'] │ -│ [1073752,591325,591325,591325] │ ['2014-03-17 13:59:20','2014-03-17 22:17:55','2014-03-17 22:18:07','2014-03-17 22:18:51'] │ -│ [] │ [] │ -│ [1073752,591325,591325] │ ['2014-03-17 11:37:06','2014-03-17 14:07:47','2014-03-17 14:36:21'] │ -│ [] │ [] │ -│ [] │ [] │ -│ [591325,1073752] │ ['2014-03-17 00:46:05','2014-03-17 00:46:05'] │ -│ [1073752,591325,591325,591325] │ ['2014-03-17 13:28:33','2014-03-17 13:30:26','2014-03-17 18:51:21','2014-03-17 18:51:45'] │ -└────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────────┘ -``` + ┌─Goals.ID───────────────────────┬─Goals.EventTime───────────────────────────────────────────────────────────────────────────┐ + │ [1073752,591325,591325] │ ['2014-03-17 16:38:10','2014-03-17 16:38:48','2014-03-17 16:42:27'] │ + │ [1073752] │ ['2014-03-17 00:28:25'] │ + │ [1073752] │ ['2014-03-17 10:46:20'] │ + │ [1073752,591325,591325,591325] │ ['2014-03-17 13:59:20','2014-03-17 22:17:55','2014-03-17 22:18:07','2014-03-17 22:18:51'] │ + │ [] │ [] │ + │ [1073752,591325,591325] │ ['2014-03-17 11:37:06','2014-03-17 14:07:47','2014-03-17 14:36:21'] │ + │ [] │ [] │ + │ [] │ [] │ + │ [591325,1073752] │ ['2014-03-17 00:46:05','2014-03-17 00:46:05'] │ + │ [1073752,591325,591325,591325] │ ['2014-03-17 13:28:33','2014-03-17 13:30:26','2014-03-17 18:51:21','2014-03-17 18:51:45'] │ + └────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────────┘ -
+
ساده ترین راه برای فکر کردن به یک ساختار داده nested این است که، یک nested مجموعه ای از آرایه های چند ستونی با طول ثابت است. -تنها جایی که یک دستور SELECT می تواند کل ساختار داده ی nested را به جای مشخص کردن ستون های آن قرار دهد، عبارت ARRAY JOIN است. برای اطلاعات بیشتر "ARRAY JOIN clouse" را ببینید. مثال: +تنها جایی که یک دستور SELECT می تواند کل ساختار داده ی nested را به جای مشخص کردن ستون های آن قرار دهد، عبارت ARRAY JOIN است. برای اطلاعات بیشتر «ARRAY JOIN clause» را ببینید. مثال:
@@ -85,22 +83,20 @@ WHERE CounterID = 101500 AND length(Goals.ID) < 5 LIMIT 10 ``` -``` -┌─Goal.ID─┬──────Goal.EventTime─┐ -│ 1073752 │ 2014-03-17 16:38:10 │ -│ 591325 │ 2014-03-17 16:38:48 │ -│ 591325 │ 2014-03-17 16:42:27 │ -│ 1073752 │ 2014-03-17 00:28:25 │ -│ 1073752 │ 2014-03-17 10:46:20 │ -│ 1073752 │ 2014-03-17 13:59:20 │ -│ 591325 │ 2014-03-17 22:17:55 │ -│ 591325 │ 2014-03-17 22:18:07 │ -│ 591325 │ 2014-03-17 22:18:51 │ -│ 1073752 │ 2014-03-17 11:37:06 │ -└─────────┴─────────────────────┘ -``` + ┌─Goal.ID─┬──────Goal.EventTime─┐ + │ 1073752 │ 2014-03-17 16:38:10 │ + │ 591325 │ 2014-03-17 16:38:48 │ + │ 591325 │ 2014-03-17 16:42:27 │ + │ 1073752 │ 2014-03-17 00:28:25 │ + │ 1073752 │ 2014-03-17 10:46:20 │ + │ 1073752 │ 2014-03-17 13:59:20 │ + │ 591325 │ 2014-03-17 22:17:55 │ + │ 591325 │ 2014-03-17 22:18:07 │ + │ 591325 │ 2014-03-17 22:18:51 │ + │ 1073752 │ 2014-03-17 11:37:06 │ + └─────────┴─────────────────────┘ -
+
شما نمیتوانید در قسمت SELECT تمام ساختار داده ی nested را قرار دهید. شما فقط می توانید ستون های فردی که هر کدام بخشی از این ساختار داده هستند را لیست کنید. @@ -111,4 +107,5 @@ LIMIT 10 دستور ALTER برای عناصر داخل nested بسیار محدود است.
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/data_types/nested_data_structures/nested/) diff --git a/docs/fa/data_types/special_data_types/expression.md b/docs/fa/data_types/special_data_types/expression.md index 07de3db7201..3c33fad95a6 100644 --- a/docs/fa/data_types/special_data_types/expression.md +++ b/docs/fa/data_types/special_data_types/expression.md @@ -1,8 +1,9 @@ -
+
-# Expression +# Expression {#expression} برای نشان دادن توابع لامبدا در توابع higher-order استفاده می شود.
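For instance, in a higher-order function call like the hypothetical one below, the lambda `x -> x * 2` is what the server internally treats as an Expression:

``` sql
SELECT arrayMap(x -> x * 2, [1, 2, 3]) AS doubled   -- returns [2,4,6]
```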
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/data_types/special_data_types/expression/) diff --git a/docs/fa/data_types/special_data_types/index.md b/docs/fa/data_types/special_data_types/index.md index 8b71fa9e124..a95f5b1b49c 100644 --- a/docs/fa/data_types/special_data_types/index.md +++ b/docs/fa/data_types/special_data_types/index.md @@ -1,8 +1,9 @@ -
+
-# Special data types +# Special data types {#special-data-types} مقادیر نوع داده special، نمیتوانند در جدول ذخیره و یا در نتایج خروجی قرار بگیرند، اما در نتایج میانی (intermediate) یک query در حال اجرا استفاده می شوند.
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/data_types/special_data_types/) diff --git a/docs/fa/data_types/special_data_types/set.md b/docs/fa/data_types/special_data_types/set.md index 376c9615514..1d1a1a535ae 100644 --- a/docs/fa/data_types/special_data_types/set.md +++ b/docs/fa/data_types/special_data_types/set.md @@ -1,8 +1,9 @@ -
+
-# Set +# Set {#set} برای نصف سمت راست IN استفاده می شود.
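For example, in the sketch below the constant list on the right-hand side of `IN` is what the server represents as a Set; the values themselves are arbitrary:

``` sql
SELECT number FROM numbers(10) WHERE number IN (1, 3, 5)
```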
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/data_types/special_data_types/set/) diff --git a/docs/fa/data_types/string.md b/docs/fa/data_types/string.md index 87500a3c882..8142ab0d861 100644 --- a/docs/fa/data_types/string.md +++ b/docs/fa/data_types/string.md @@ -1,12 +1,13 @@ -
+
-# String +# String {#string} -String یک type برای قرار دادن رشته با طول دلخواه می باشد. این طول محدود نمی باشد. این مقدار می تواند شامل مجموعه ای دلخواه از بایت ها، از جمله null byte باشد. String type جایگزین type های VARCHAR, BLOB, CLOB و ... دیگر DBMS ها می باشد. +String یک type برای قرار دادن رشته با طول دلخواه می باشد. این طول محدود نمی باشد. این مقدار می تواند شامل مجموعه ای دلخواه از بایت ها، از جمله null byte باشد. String type جایگزین type های VARCHAR, BLOB, CLOB و … دیگر DBMS ها می باشد. -## Encodings +## Encodings {#encodings} -ClickHouse مفهومی به نام encoding ندارد. String ها می توانند شامل مجموعه ای بایت ها باشند که با همان شکل که نوشته می شوند به همان شکل هم در خروجی دیده شوند. اگر شما نیاز به ذخیره سازی متن دارید، توصیه می کنیم از UTF-8 استفاده کنید. حداقل اگر ترمینال شما از UTF-8 (پیشنهاد شده)، استفاده می کند، شما می توانید به راحتی مقادیر خود را نوشته و بخوانید.به طور مشابه توابع خاصی برای کار با رشته های متنوع وجود دارند که تخل این فرضیه عمل می کنند که رشته شامل مجوعه ای از بایت ها می باشند که نماینده ی متن های UTF-8 هستند. برای مثال تابع 'length' برای محاسبه طول رشته براساس بایت است، در حالی که تابع 'lengthUTF8' برای محاسبه طول رشته بر اساس UNICODE می باشد. +ClickHouse مفهومی به نام encoding ندارد. String ها می توانند شامل مجموعه ای بایت ها باشند که با همان شکل که نوشته می شوند به همان شکل هم در خروجی دیده شوند. اگر شما نیاز به ذخیره سازی متن دارید، توصیه می کنیم از UTF-8 استفاده کنید. حداقل اگر ترمینال شما از UTF-8 (پیشنهاد شده)، استفاده می کند، شما می توانید به راحتی مقادیر خود را نوشته و بخوانید. به طور مشابه توابع خاصی برای کار با رشته های متنوع وجود دارند که تحت این فرضیه عمل می کنند که رشته شامل مجموعه ای از بایت ها می باشند که نماینده ی متن های UTF-8 هستند. برای مثال تابع ‘length’ برای محاسبه طول رشته براساس بایت است، در حالی که تابع ‘lengthUTF8’ برای محاسبه طول رشته بر اساس UNICODE می باشد.
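A small sketch of the byte-based versus code-point-based length functions mentioned above; the sample string is arbitrary UTF-8 text:

``` sql
SELECT length('привет') AS bytes, lengthUTF8('привет') AS code_points   -- 12 bytes, 6 code points
```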
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/data_types/string/) diff --git a/docs/fa/data_types/tuple.md b/docs/fa/data_types/tuple.md index 561180c6804..3935416b7d1 100644 --- a/docs/fa/data_types/tuple.md +++ b/docs/fa/data_types/tuple.md @@ -1,10 +1,11 @@ -
+
-# Tuple(T1, T2, ...) +# Tuple(T1, T2, …) {#tuplet1-t2} -Tuple ها نمیتوانند در جدول نوشته شوند (به غیر جداول Memory). آنها برای گروه بندی موقت ستون ها مورد استفاده قرار می گیرند. ستون های می توانند گروه بندی شوند اگر از عبارت In در query استفاده کنیم، و برای تعیین فرمت پارامتر های خاصی از توابع لامبدا. برای اطلاعات بیشتر "اپراتور IN" , "توابع Higher order" را ببینید. +Tuple ها نمیتوانند در جدول نوشته شوند (به غیر جداول Memory). آنها برای گروه بندی موقت ستون ها مورد استفاده قرار می گیرند. ستون ها می توانند گروه بندی شوند اگر از عبارت IN در query استفاده کنیم، و برای تعیین فرمت پارامتر های خاصی از توابع لامبدا. برای اطلاعات بیشتر «اپراتور IN» , «توابع Higher order» را ببینید. Tuple می توانند در خروجی نتیجه query در حال اجرا باشند. در این مورد، برای فرمت های text به غیر از JSON\*، مقادیر به صورت comma-separated داخل براکت قرار میگیرند. در فرمت های JSON\* مقادیر tuple به صورت آرایه در خروجی می آیند.
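A minimal sketch of how a tuple appears in a query result; the alias `t` is arbitrary:

``` sql
SELECT tuple(1, 'a') AS t, toTypeName(t)   -- (1,'a') with type Tuple(UInt8, String)
```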
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/data_types/tuple/) diff --git a/docs/fa/getting_started/example_datasets/amplab_benchmark.md b/docs/fa/getting_started/example_datasets/amplab_benchmark.md index 991622a9064..f5c8ff2e9d0 100644 --- a/docs/fa/getting_started/example_datasets/amplab_benchmark.md +++ b/docs/fa/getting_started/example_datasets/amplab_benchmark.md @@ -1,16 +1,16 @@ -
+
-# بنچمارک AMPLab Big Data +# بنچمارک AMPLab Big Data {#bnchmrkh-amplab-big-data} -ببینید +ببینید https://amplab.cs.berkeley.edu/benchmark/ -با یک اکانت مجانی در ثبت نام کنید. شما نیاز به ایمیل، شماره تلفن و credit card دارید. یک Access key جدید از دریافت کنید. +با یک اکانت مجانی در https://aws.amazon.com ثبت نام کنید. شما نیاز به ایمیل، شماره تلفن و credit card دارید. یک Access key جدید از https://console.aws.amazon.com/iam/home?nc2=h\_m\_sc\#security\_credential دریافت کنید. در کنسول این دستورات را وارد کنید:
-```bash +``` bash sudo apt-get install s3cmd mkdir tiny; cd tiny; s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ . @@ -23,7 +23,7 @@ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ . cd .. ``` -
+
این query های ClickHouse را اجرا کنید: @@ -91,13 +91,13 @@ CREATE TABLE uservisits_5nodes_on_single ) ENGINE = MergeTree(visitDate, visitDate, 8192); ``` -
+
به کنسول برگردید و دستورات زیر را مجددا اجرا کنید:
-```bash +``` bash for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done @@ -106,7 +106,7 @@ for i in 5nodes/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | cl for i in 5nodes/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done ``` -
+
query های گرفتن data sample diff --git a/docs/fa/getting_started/example_datasets/criteo.md b/docs/fa/getting_started/example_datasets/criteo.md index 5925776a58b..de5046e1c76 100644 --- a/docs/fa/getting_started/example_datasets/criteo.md +++ b/docs/fa/getting_started/example_datasets/criteo.md @@ -1,8 +1,8 @@ -
+
-# ترابایت از لاگ های کلیک از سرویس Criteo +# ترابایت از لاگ های کلیک از سرویس Criteo {#trbyt-z-lg-hy-khlykh-z-srwys-criteo} -داده ها را از دانلود کنید. +داده ها را از http://labs.criteo.com/downloads/download-terabyte-click-logs/ دانلود کنید. جدول را برای import لاگ ها ایجاد کنید: @@ -12,17 +12,17 @@ CREATE TABLE criteo_log (date Date, clicked UInt8, int1 Int32, int2 Int32, int3 Int32, int4 Int32, int5 Int32, int6 Int32, int7 Int32, int8 Int32, int9 Int32, int10 Int32, int11 Int32, int12 Int32, int13 Int32, cat1 String, cat2 String, cat3 String, cat4 String, cat5 String, cat6 String, cat7 String, cat8 String, cat9 String, cat10 String, cat11 String, cat12 String, cat13 String, cat14 String, cat15 String, cat16 String, cat17 String, cat18 String, cat19 String, cat20 String, cat21 String, cat22 String, cat23 String, cat24 String, cat25 String, cat26 String) ENGINE = Log ``` -
+
داده ها را دانلود کنید:
-```bash +``` bash for i in {00..23}; do echo $i; zcat datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done ``` -
+
یک جدول برای داده های تبدیل شده ایجاد کنید: @@ -75,7 +75,7 @@ CREATE TABLE criteo ) ENGINE = MergeTree(date, intHash32(icat1), (date, intHash32(icat1)), 8192) ``` -
+
داده ها را از لاگ raw انتقال و به جدول دوم وارد کنید: @@ -87,5 +87,4 @@ INSERT INTO criteo SELECT date, clicked, int1, int2, int3, int4, int5, int6, int DROP TABLE criteo_log; ``` - [مقاله اصلی](https://clickhouse.tech/docs/fa/getting_started/example_datasets/criteo/) diff --git a/docs/fa/getting_started/example_datasets/metrica.md b/docs/fa/getting_started/example_datasets/metrica.md deleted file mode 120000 index 984023973eb..00000000000 --- a/docs/fa/getting_started/example_datasets/metrica.md +++ /dev/null @@ -1 +0,0 @@ -../../../en/getting_started/example_datasets/metrica.md \ No newline at end of file diff --git a/docs/fa/getting_started/example_datasets/metrica.md b/docs/fa/getting_started/example_datasets/metrica.md new file mode 100644 index 00000000000..47b9751d513 --- /dev/null +++ b/docs/fa/getting_started/example_datasets/metrica.md @@ -0,0 +1,63 @@ +# ناشناس یاندکس.اطلاعات متریکا {#nshns-yndkhs-tl-t-mtrykh} + +مجموعه داده شامل دو جدول حاوی داده های ناشناس در مورد بازدید (`hits_v1`) و بازدیدکننده داشته است (`visits_v1`) یاندکس . متریکا شما می توانید اطلاعات بیشتر در مورد یاندکس به عنوان خوانده شده.متریکا در [تاریخچه کلیک](../../introduction/history.md) بخش. + +مجموعه داده ها شامل دو جدول است که هر کدام می توانند به عنوان یک فشرده دانلود شوند `tsv.xz` فایل و یا به عنوان پارتیشن تهیه شده است. علاوه بر این, یک نسخه طولانی از `hits` جدول حاوی 100 میلیون ردیف به عنوان تسو در دسترس است https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits\_100m\_obfuscated\_v1.tsv.xz و به عنوان پارتیشن تهیه شده در https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits\_100m\_obfuscated\_v1.tar.xz. + +## اخذ جداول از پارتیشن های تهیه شده {#khdh-jdwl-z-prtyshn-hy-thyh-shdh} + +دانلود و وارد کردن جدول بازدید: + +``` bash +curl -O https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_v1.tar +tar xvf hits_v1.tar -C /var/lib/clickhouse # path to ClickHouse data directory +# check permissions on unpacked data, fix if required +sudo service clickhouse-server restart +clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1" +``` + +دانلود و وارد کردن بازدیدکننده داشته است: + +``` bash +curl -O https://clickhouse-datasets.s3.yandex.net/visits/partitions/visits_v1.tar +tar xvf visits_v1.tar -C /var/lib/clickhouse # path to ClickHouse data directory +# check permissions on unpacked data, fix if required +sudo service clickhouse-server restart +clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" +``` + +## اخذ جداول از فایل تسو فشرده {#khdh-jdwl-z-fyl-tsw-fshrdh} + +دانلود و وارد کردن بازدید از فایل تسو فشرده: + +``` bash +curl https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv +# now create table +clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" +clickhouse-client --query "CREATE TABLE datasets.hits_v1 ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, 
JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192" +# import data +cat hits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.hits_v1 FORMAT TSV" --max_insert_block_size=100000 +# optionally you can optimize table +clickhouse-client --query "OPTIMIZE TABLE datasets.hits_v1 FINAL" +clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1" +``` + +دانلود و واردات بازدیدکننده داشته است از فشرده فایل: + +``` bash +curl https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv +# now create table +clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" +clickhouse-client --query "CREATE TABLE datasets.visits_v1 ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage 
String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), Goals Nested(ID UInt32, Serial UInt32, EventTime DateTime, Price Int64, OrderID String, CurrencyID UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, TraficSource Nested(ID Int8, SearchEngineID UInt16, AdvEngineID UInt8, PlaceID UInt16, SocialSourceNetworkID UInt8, Domain String, SearchPhrase String, SocialSourcePage String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), Market Nested(Type UInt8, GoalID UInt32, OrderID String, OrderPrice Int64, PP UInt32, DirectPlaceID UInt32, DirectOrderID UInt32, DirectBannerID UInt32, GoodID String, GoodName String, GoodQuantity Int32, GoodPrice Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign)" +# import data +cat visits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.visits_v1 FORMAT TSV" --max_insert_block_size=100000 +# optionally you can optimize table +clickhouse-client --query "OPTIMIZE TABLE datasets.visits_v1 FINAL" 
+clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" +``` + +## به عنوان مثال نمایش داده شد {#bh-nwn-mthl-nmysh-ddh-shd} + +[اموزش کلیک](../../getting_started/tutorial.md) است در یاندکس بر اساس.مجموعه داده های متریکا و راه توصیه شده برای شروع این مجموعه داده ها فقط از طریق تدریس خصوصی است. + +نمونه های اضافی از نمایش داده شد به این جداول را می توان در میان یافت [تست های نفرت انگیز](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/tests/queries/1_stateful) از کلیک هاوس (به نام `test.hists` و `test.visits` وجود دارد). diff --git a/docs/fa/getting_started/example_datasets/nyc_taxi.md b/docs/fa/getting_started/example_datasets/nyc_taxi.md index 70101930c05..282b52ebcdb 100644 --- a/docs/fa/getting_started/example_datasets/nyc_taxi.md +++ b/docs/fa/getting_started/example_datasets/nyc_taxi.md @@ -1,12 +1,10 @@ -
+
-# داده های تاکسی New York +# داده های تاکسی New York {#ddh-hy-tkhsy-new-york} -## چطور داده های raw را import کنیم +## چطور داده های raw را import کنیم {#chtwr-ddh-hy-raw-r-import-khnym} - - -برای توضیحات بیشتر در ارتباط با دیتاست و موارد مربوط به دانلود به دو لینک و مراجعه کنید. +برای توضیحات بیشتر در ارتباط با دیتاست و موارد مربوط به دانلود به دو لینک https://github.com/toddwschneider/nyc-taxi-data و http://tech.marksblogg.com/billion-nyc-taxi-rides-redshift.html مراجعه کنید. دانلود فایل ها حدود 277 گیگابایت داده ی غیرفشرده در قالب فایل های CSV می باشد. دانلود با استفاده ازبیش از یک کانکشن 1 Gbit نزدیک 1 ساعت طول می کشد (دانلود موازی از s3.amazonaws.com حداقل نصف کانال 1 Gbit رو جبران می کند). بعضی از فایل ها ممکن است به طول کامل دانلود نشوند. اندازه فایل ها را بررسی کنید و اگر فایلی مشکوک بود، مجددا دانلود کنید. @@ -14,14 +12,14 @@
-```bash +``` bash sed -E '/(.*,){18,}/d' data/yellow_tripdata_2010-02.csv > data/yellow_tripdata_2010-02.csv_ sed -E '/(.*,){18,}/d' data/yellow_tripdata_2010-03.csv > data/yellow_tripdata_2010-03.csv_ mv data/yellow_tripdata_2010-02.csv_ data/yellow_tripdata_2010-02.csv mv data/yellow_tripdata_2010-03.csv_ data/yellow_tripdata_2010-03.csv ``` -
+
سپس داده ها باید در PostgreSQL پیش پردازش شوند. این کار نقاط انتخابی چند ضلعی را ایجاد می کند (برای مطابقت با نقاط بر روی نقشه با مناطق شهر نیویورک) و تمام داده ها را با استفاده از JOIN در یک جدول flat و denormal ترکیب می کند. برای این کار شما نیاز به نصب PostgreSQL با پشتیبانی از PostGIS دارید. @@ -33,16 +31,14 @@ PostgreSQL تقریبا 20 تا 30 دقیقه برای پردازش هر ماه
-``` -time psql nyc-taxi-data -c "SELECT count(*) FROM trips;" -## count - 1298979494 -(1 row) + time psql nyc-taxi-data -c "SELECT count(*) FROM trips;" + ## count + 1298979494 + (1 row) -real 7m9.164s -``` + real 7m9.164s -
+
(در یکی از پست های مقالات Mark Litwintschik این کمی بیشتر از 1.1 میلیارد سطر گزارش شده است.) @@ -122,7 +118,7 @@ COPY ) TO '/opt/milovidov/nyc-taxi-data/trips.tsv'; ``` -
+
snapshot از داده ها با سرعت 50 مگابایت در ثانیه انجام می شود. در هنگام ایجاد snapshot، PostgreSQL داده ها را با سرعت 28 مگابایت در ثانیه از روی می خواند. این کار حدود 5 ساعت زمان میبرد. نتیجه کار فایل TSV با حجم 590612904969 بایت می باشد. @@ -187,25 +183,23 @@ dropoff_puma Nullable(String) ) ENGINE = Log; ``` -
+
برای تبدیل فیلد ها به data type های صحیح تر و در صورت امکان، حذف NULL ها لازم است.
-``` -time clickhouse-client --query="INSERT INTO trips FORMAT TabSeparated" < trips.tsv + time clickhouse-client --query="INSERT INTO trips FORMAT TabSeparated" < trips.tsv -real 75m56.214s -``` + real 75m56.214s -
+
داده ها با سرعت 112 تا 140 مگابیت در ثانیه خوانده می شوند. load کردن داده ها در جدول Log Type در یک Stream، 76 دقیقه زمان کشید. این داده ها در این جدول 142 گیگابایت فضا اشغال می کنند. -(import کردن داده ها به صورت مستقیم از Postgres با استفاده از ` COPY ... TO PROGRAM` هم امکان پذیر است.) +(import کردن داده ها به صورت مستقیم از Postgres با استفاده از `COPY ... TO PROGRAM` هم امکان پذیر است.) -متاسفانه، تمام فیلد های مرتبط با آب و هوا (precipitation...average_wind_speed) با Null پر شدند. به خاطر همین، ما از دیتاست نهایی اینها رو حذف کردیم. +متاسفانه، تمام فیلد های مرتبط با آب و هوا (precipitation…average\_wind\_speed) با Null پر شدند. به خاطر همین، ما از دیتاست نهایی اینها رو حذف کردیم. برای شروع، ما یک جدول در یک سرور ایجاد کردیم. بعدا ما یک جدول توزیع شده می سازیم. @@ -213,69 +207,67 @@ real 75m56.214s
-``` -CREATE TABLE trips_mergetree -ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) -AS SELECT + CREATE TABLE trips_mergetree + ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) + AS SELECT -trip_id, -CAST(vendor_id AS Enum8('1' = 1, '2' = 2, 'CMT' = 3, 'VTS' = 4, 'DDS' = 5, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14)) AS vendor_id, -toDate(pickup_datetime) AS pickup_date, -ifNull(pickup_datetime, toDateTime(0)) AS pickup_datetime, -toDate(dropoff_datetime) AS dropoff_date, -ifNull(dropoff_datetime, toDateTime(0)) AS dropoff_datetime, -assumeNotNull(store_and_fwd_flag) IN ('Y', '1', '2') AS store_and_fwd_flag, -assumeNotNull(rate_code_id) AS rate_code_id, -assumeNotNull(pickup_longitude) AS pickup_longitude, -assumeNotNull(pickup_latitude) AS pickup_latitude, -assumeNotNull(dropoff_longitude) AS dropoff_longitude, -assumeNotNull(dropoff_latitude) AS dropoff_latitude, -assumeNotNull(passenger_count) AS passenger_count, -assumeNotNull(trip_distance) AS trip_distance, -assumeNotNull(fare_amount) AS fare_amount, -assumeNotNull(extra) AS extra, -assumeNotNull(mta_tax) AS mta_tax, -assumeNotNull(tip_amount) AS tip_amount, -assumeNotNull(tolls_amount) AS tolls_amount, -assumeNotNull(ehail_fee) AS ehail_fee, -assumeNotNull(improvement_surcharge) AS improvement_surcharge, -assumeNotNull(total_amount) AS total_amount, -CAST((assumeNotNull(payment_type) AS pt) IN ('CSH', 'CASH', 'Cash', 'CAS', 'Cas', '1') ? 'CSH' : (pt IN ('CRD', 'Credit', 'Cre', 'CRE', 'CREDIT', '2') ? 'CRE' : (pt IN ('NOC', 'No Charge', 'No', '3') ? 'NOC' : (pt IN ('DIS', 'Dispute', 'Dis', '4') ? 'DIS' : 'UNK'))) AS Enum8('CSH' = 1, 'CRE' = 2, 'UNK' = 0, 'NOC' = 3, 'DIS' = 4)) AS payment_type_, -assumeNotNull(trip_type) AS trip_type, -ifNull(toFixedString(unhex(pickup), 25), toFixedString('', 25)) AS pickup, -ifNull(toFixedString(unhex(dropoff), 25), toFixedString('', 25)) AS dropoff, -CAST(assumeNotNull(cab_type) AS Enum8('yellow' = 1, 'green' = 2, 'uber' = 3)) AS cab_type, + trip_id, + CAST(vendor_id AS Enum8('1' = 1, '2' = 2, 'CMT' = 3, 'VTS' = 4, 'DDS' = 5, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14)) AS vendor_id, + toDate(pickup_datetime) AS pickup_date, + ifNull(pickup_datetime, toDateTime(0)) AS pickup_datetime, + toDate(dropoff_datetime) AS dropoff_date, + ifNull(dropoff_datetime, toDateTime(0)) AS dropoff_datetime, + assumeNotNull(store_and_fwd_flag) IN ('Y', '1', '2') AS store_and_fwd_flag, + assumeNotNull(rate_code_id) AS rate_code_id, + assumeNotNull(pickup_longitude) AS pickup_longitude, + assumeNotNull(pickup_latitude) AS pickup_latitude, + assumeNotNull(dropoff_longitude) AS dropoff_longitude, + assumeNotNull(dropoff_latitude) AS dropoff_latitude, + assumeNotNull(passenger_count) AS passenger_count, + assumeNotNull(trip_distance) AS trip_distance, + assumeNotNull(fare_amount) AS fare_amount, + assumeNotNull(extra) AS extra, + assumeNotNull(mta_tax) AS mta_tax, + assumeNotNull(tip_amount) AS tip_amount, + assumeNotNull(tolls_amount) AS tolls_amount, + assumeNotNull(ehail_fee) AS ehail_fee, + assumeNotNull(improvement_surcharge) AS improvement_surcharge, + assumeNotNull(total_amount) AS total_amount, + CAST((assumeNotNull(payment_type) AS pt) IN ('CSH', 'CASH', 'Cash', 'CAS', 'Cas', '1') ? 'CSH' : (pt IN ('CRD', 'Credit', 'Cre', 'CRE', 'CREDIT', '2') ? 'CRE' : (pt IN ('NOC', 'No Charge', 'No', '3') ? 'NOC' : (pt IN ('DIS', 'Dispute', 'Dis', '4') ? 
'DIS' : 'UNK'))) AS Enum8('CSH' = 1, 'CRE' = 2, 'UNK' = 0, 'NOC' = 3, 'DIS' = 4)) AS payment_type_, + assumeNotNull(trip_type) AS trip_type, + ifNull(toFixedString(unhex(pickup), 25), toFixedString('', 25)) AS pickup, + ifNull(toFixedString(unhex(dropoff), 25), toFixedString('', 25)) AS dropoff, + CAST(assumeNotNull(cab_type) AS Enum8('yellow' = 1, 'green' = 2, 'uber' = 3)) AS cab_type, -assumeNotNull(pickup_nyct2010_gid) AS pickup_nyct2010_gid, -toFloat32(ifNull(pickup_ctlabel, '0')) AS pickup_ctlabel, -assumeNotNull(pickup_borocode) AS pickup_borocode, -CAST(assumeNotNull(pickup_boroname) AS Enum8('Manhattan' = 1, 'Queens' = 4, 'Brooklyn' = 3, '' = 0, 'Bronx' = 2, 'Staten Island' = 5)) AS pickup_boroname, -toFixedString(ifNull(pickup_ct2010, '000000'), 6) AS pickup_ct2010, -toFixedString(ifNull(pickup_boroct2010, '0000000'), 7) AS pickup_boroct2010, -CAST(assumeNotNull(ifNull(pickup_cdeligibil, ' ')) AS Enum8(' ' = 0, 'E' = 1, 'I' = 2)) AS pickup_cdeligibil, -toFixedString(ifNull(pickup_ntacode, '0000'), 4) AS pickup_ntacode, + assumeNotNull(pickup_nyct2010_gid) AS pickup_nyct2010_gid, + toFloat32(ifNull(pickup_ctlabel, '0')) AS pickup_ctlabel, + assumeNotNull(pickup_borocode) AS pickup_borocode, + CAST(assumeNotNull(pickup_boroname) AS Enum8('Manhattan' = 1, 'Queens' = 4, 'Brooklyn' = 3, '' = 0, 'Bronx' = 2, 'Staten Island' = 5)) AS pickup_boroname, + toFixedString(ifNull(pickup_ct2010, '000000'), 6) AS pickup_ct2010, + toFixedString(ifNull(pickup_boroct2010, '0000000'), 7) AS pickup_boroct2010, + CAST(assumeNotNull(ifNull(pickup_cdeligibil, ' ')) AS Enum8(' ' = 0, 'E' = 1, 'I' = 2)) AS pickup_cdeligibil, + toFixedString(ifNull(pickup_ntacode, '0000'), 4) AS pickup_ntacode, -CAST(assumeNotNull(pickup_ntaname) AS Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 
68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195)) AS pickup_ntaname, + CAST(assumeNotNull(pickup_ntaname) AS Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195)) AS pickup_ntaname, -toUInt16(ifNull(pickup_puma, '0')) AS pickup_puma, + toUInt16(ifNull(pickup_puma, '0')) AS pickup_puma, -assumeNotNull(dropoff_nyct2010_gid) AS dropoff_nyct2010_gid, -toFloat32(ifNull(dropoff_ctlabel, '0')) AS dropoff_ctlabel, -assumeNotNull(dropoff_borocode) AS dropoff_borocode, -CAST(assumeNotNull(dropoff_boroname) AS Enum8('Manhattan' = 1, 'Queens' = 4, 'Brooklyn' = 3, '' = 0, 'Bronx' = 2, 'Staten Island' = 5)) AS dropoff_boroname, -toFixedString(ifNull(dropoff_ct2010, '000000'), 6) AS dropoff_ct2010, -toFixedString(ifNull(dropoff_boroct2010, '0000000'), 7) AS dropoff_boroct2010, -CAST(assumeNotNull(ifNull(dropoff_cdeligibil, ' ')) AS Enum8(' ' = 0, 'E' = 1, 'I' = 2)) AS dropoff_cdeligibil, -toFixedString(ifNull(dropoff_ntacode, '0000'), 4) AS dropoff_ntacode, + assumeNotNull(dropoff_nyct2010_gid) AS dropoff_nyct2010_gid, + toFloat32(ifNull(dropoff_ctlabel, '0')) AS dropoff_ctlabel, + assumeNotNull(dropoff_borocode) AS dropoff_borocode, + CAST(assumeNotNull(dropoff_boroname) AS Enum8('Manhattan' = 1, 'Queens' = 4, 'Brooklyn' = 3, '' = 0, 'Bronx' = 2, 'Staten Island' = 5)) AS dropoff_boroname, + toFixedString(ifNull(dropoff_ct2010, '000000'), 6) AS dropoff_ct2010, + toFixedString(ifNull(dropoff_boroct2010, '0000000'), 7) AS dropoff_boroct2010, + CAST(assumeNotNull(ifNull(dropoff_cdeligibil, ' ')) AS Enum8(' ' = 0, 'E' = 1, 'I' = 2)) AS dropoff_cdeligibil, + toFixedString(ifNull(dropoff_ntacode, '0000'), 4) AS dropoff_ntacode, -CAST(assumeNotNull(dropoff_ntaname) AS Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 
'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. 
Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195)) AS dropoff_ntaname, + CAST(assumeNotNull(dropoff_ntaname) AS Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195)) AS dropoff_ntaname, -toUInt16(ifNull(dropoff_puma, '0')) AS dropoff_puma + toUInt16(ifNull(dropoff_puma, '0')) AS dropoff_puma -FROM trips -``` + FROM trips -
+
This takes 3030 seconds, at a rate of about 428,000 rows per second. To load the data faster, you can create the table with the `Log` engine instead of `MergeTree`. In that case the load completes in under 200 seconds. @@ -283,23 +275,21 @@ FROM trips
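The `Log`-engine alternative is not spelled out in the original text; as a rough, editor-added sketch (assuming the `trips_mergetree` table from the step above already exists, and using the illustrative name `trips_log`), it could look like this:

``` sql
-- Editor-added sketch: copy the column definitions of the MergeTree table and only swap
-- the storage engine; the same INSERT ... SELECT as above would then target trips_log.
CREATE TABLE trips_log AS trips_mergetree
ENGINE = Log;
```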
-``` -:) SELECT formatReadableSize(sum(bytes)) FROM system.parts WHERE table = 'trips_mergetree' AND active + :) SELECT formatReadableSize(sum(bytes)) FROM system.parts WHERE table = 'trips_mergetree' AND active -SELECT formatReadableSize(sum(bytes)) -FROM system.parts -WHERE (table = 'trips_mergetree') AND active + SELECT formatReadableSize(sum(bytes)) + FROM system.parts + WHERE (table = 'trips_mergetree') AND active -┌─formatReadableSize(sum(bytes))─┐ -│ 126.18 GiB │ -└────────────────────────────────┘ -``` + ┌─formatReadableSize(sum(bytes))─┐ + │ 126.18 GiB │ + └────────────────────────────────┘ -
+
Among other things, you can run the OPTIMIZE query on MergeTree. But since everything is fine without it, running it is not necessary (a short sketch of this optional step is shown right after the heading below). -## Results on a single server +## Results on a single server {#ntyj-br-rwy-ykh-srwr}
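For reference only, the optional step mentioned above could be written as follows; this is an editor-added sketch rather than part of the original walkthrough:

``` sql
-- Optional: force pending background merges to run immediately. Not required for this guide.
OPTIMIZE TABLE trips_mergetree FINAL;
```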
@@ -338,7 +328,7 @@ ORDER BY year, count(*) DESC 3.593 seconds. -
+
The server configuration was as follows: @@ -354,11 +344,9 @@ Two Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz, 16 physical cores total,
-``` -CREATE TABLE default.trips_mergetree_third ( trip_id UInt32, vendor_id Enum8('1' = 1, '2' = 2, 'CMT' = 3, 'VTS' = 4, 'DDS' = 5, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14), pickup_date Date, pickup_datetime DateTime, dropoff_date Date, dropoff_datetime DateTime, store_and_fwd_flag UInt8, rate_code_id UInt8, pickup_longitude Float64, pickup_latitude Float64, dropoff_longitude Float64, dropoff_latitude Float64, passenger_count UInt8, trip_distance Float64, fare_amount Float32, extra Float32, mta_tax Float32, tip_amount Float32, tolls_amount Float32, ehail_fee Float32, improvement_surcharge Float32, total_amount Float32, payment_type_ Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4), trip_type UInt8, pickup FixedString(25), dropoff FixedString(25), cab_type Enum8('yellow' = 1, 'green' = 2, 'uber' = 3), pickup_nyct2010_gid UInt8, pickup_ctlabel Float32, pickup_borocode UInt8, pickup_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), pickup_ct2010 FixedString(6), pickup_boroct2010 FixedString(7), pickup_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), pickup_ntacode FixedString(4), pickup_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), pickup_puma UInt16, dropoff_nyct2010_gid UInt8, dropoff_ctlabel Float32, dropoff_borocode UInt8, dropoff_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), dropoff_ct2010 FixedString(6), dropoff_boroct2010 FixedString(7), dropoff_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), dropoff_ntacode FixedString(4), dropoff_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), dropoff_puma UInt16) ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) -``` + CREATE TABLE default.trips_mergetree_third ( trip_id UInt32, vendor_id Enum8('1' = 1, '2' = 2, 'CMT' = 3, 'VTS' = 4, 'DDS' = 5, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14), pickup_date Date, pickup_datetime DateTime, dropoff_date Date, dropoff_datetime DateTime, store_and_fwd_flag UInt8, rate_code_id UInt8, pickup_longitude Float64, pickup_latitude Float64, dropoff_longitude Float64, dropoff_latitude Float64, passenger_count UInt8, trip_distance Float64, fare_amount Float32, extra Float32, mta_tax Float32, tip_amount Float32, tolls_amount Float32, ehail_fee Float32, improvement_surcharge Float32, total_amount Float32, payment_type_ Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4), trip_type UInt8, pickup FixedString(25), dropoff FixedString(25), cab_type Enum8('yellow' = 1, 'green' = 2, 'uber' = 3), pickup_nyct2010_gid UInt8, pickup_ctlabel Float32, pickup_borocode UInt8, pickup_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), pickup_ct2010 FixedString(6), pickup_boroct2010 FixedString(7), pickup_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), pickup_ntacode FixedString(4), pickup_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 
'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), pickup_puma UInt16, dropoff_nyct2010_gid UInt8, dropoff_ctlabel Float32, dropoff_borocode UInt8, dropoff_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), dropoff_ct2010 FixedString(6), dropoff_boroct2010 FixedString(7), dropoff_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), dropoff_ntacode FixedString(4), dropoff_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), dropoff_puma UInt16) ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) -
+
On the source server, run the following command: @@ -368,7 +356,7 @@ CREATE TABLE default.trips_mergetree_third ( trip_id UInt32, vendor_id Enum8('1' CREATE TABLE trips_mergetree_x3 AS trips_mergetree_third ENGINE = Distributed(perftest, default, trips_mergetree_third, rand()) ``` -
+
The following query redistributes the data: @@ -378,7 +366,7 @@ The following query redistributes the data: INSERT INTO trips_mergetree_x3 SELECT * FROM trips_mergetree ``` -
+
This query takes 2454 seconds. @@ -400,13 +388,14 @@ Q4: 0.072 seconds. In this case, query processing time is determined above all by network latency. We ran the queries from a client located in a Yandex datacenter in Finland against a cluster in Russia, which added about 20 ms of latency to the results. -## Results +## Results {#ntyj} | Nodes | Q1 | Q2 | Q3 | Q4 | -| ----- | ----- | ----- | ----- | ----- | -| 1 | 0.490 | 1.224 | 2.104 | 3.593 | -| 3 | 0.212 | 0.438 | 0.733 | 1.241 | -| 140 | 0.028 | 0.043 | 0.051 | 0.072 | +|-------|-------|-------|-------|-------| +| 1 | 0.490 | 1.224 | 2.104 | 3.593 | +| 3 | 0.212 | 0.438 | 0.733 | 1.241 | +| 140 | 0.028 | 0.043 | 0.051 | 0.072 |
+ [Original article](https://clickhouse.tech/docs/fa/getting_started/example_datasets/nyc_taxi/) diff --git a/docs/fa/getting_started/example_datasets/ontime.md index 89145bfb1af..9f8387316ce 100644 --- a/docs/fa/getting_started/example_datasets/ontime.md +++ b/docs/fa/getting_started/example_datasets/ontime.md @@ -1,13 +1,12 @@ +
-
- -# OnTime +# OnTime {#ontime} Downloading the data:
-```bash +``` bash for s in `seq 1987 2018` do for m in `seq 1 12` @@ -17,9 +16,9 @@ done done ``` -
+
-(from ) +(from https://github.com/Percona-Lab/ontime-airline-performance/blob/master/download.sh ) Creating the table: @@ -136,23 +135,24 @@ CREATE TABLE `ontime` ( `Div5LongestGTime` String, `Div5WheelsOff` String, `Div5TailNum` String -) ENGINE = MergeTree -PARTITION BY Year -ORDER BY (Carrier, FlightDate) +) ENGINE = MergeTree +PARTITION BY Year +ORDER BY (Carrier, FlightDate) SETTINGS index_granularity = 8192; ``` -
+
Loading the data:
-```bash +``` bash for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done ``` -
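After the load finishes, a quick sanity check (an editor-added sketch, not part of the original instructions) is to confirm that rows actually landed in the table:

``` sql
-- Editor-added sketch: confirm that the ontime table was populated by the import.
SELECT count() FROM ontime;
```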
+
+ Queries: Q0. @@ -163,7 +163,7 @@ Q0. select avg(c1) from (select Year, Month, count(*) as c1 from ontime group by Year, Month); ``` -
+
Q1. The number of flights per day from 2000 to 2008 @@ -173,7 +173,7 @@ Q1. The number of flights per day from 2000 to SELECT DayOfWeek, count(*) AS c FROM ontime WHERE Year >= 2000 AND Year <= 2008 GROUP BY DayOfWeek ORDER BY c DESC; ``` -
+
Q2. The number of flights delayed by more than 10 minutes, grouped by day of week, for 2000 to 2008 @@ -183,7 +183,7 @@ Q2. The number of flights delayed by more than 10 minutes SELECT DayOfWeek, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year >= 2000 AND Year <= 2008 GROUP BY DayOfWeek ORDER BY c DESC ``` -
+
Q3. The number of delays by airport for 2000 to 2008 @@ -193,7 +193,7 @@ Q3. The number of delays by airport for 2000 to 2008 SELECT Origin, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year >= 2000 AND Year <= 2008 GROUP BY Origin ORDER BY c DESC LIMIT 10 ``` -
+
Q4. The number of delays by carrier for 2007 @@ -203,7 +203,7 @@ Q4. The number of delays by carrier for 2007 SELECT Carrier, count(*) FROM ontime WHERE DepDelay>10 AND Year = 2007 GROUP BY Carrier ORDER BY count(*) DESC ``` -
+
Q5. The percentage of delays by carrier for 2007 @@ -233,7 +233,7 @@ JOIN ORDER BY c3 DESC; ``` -
+
A better version of the same query: @@ -243,7 +243,7 @@ ORDER BY c3 DESC; SELECT Carrier, avg(DepDelay > 10) * 100 AS c3 FROM ontime WHERE Year = 2007 GROUP BY Carrier ORDER BY c3 DESC ``` -
+
Q6. Like the previous query, but for the broader range of years 2000 to 2008 @@ -273,7 +273,7 @@ JOIN ORDER BY c3 DESC; ``` -
+
A better version of the same query: @@ -283,7 +283,7 @@ ORDER BY c3 DESC; SELECT Carrier, avg(DepDelay > 10) * 100 AS c3 FROM ontime WHERE Year >= 2000 AND Year <= 2008 GROUP BY Carrier ORDER BY c3 DESC ``` -
+
Q7. The percentage of flights delayed by more than 10 minutes, by year @@ -311,7 +311,7 @@ JOIN ORDER BY Year ``` -
+
A better version of the same query: @@ -321,7 +321,7 @@ ORDER BY Year SELECT Year, avg(DepDelay > 10)*100 FROM ontime GROUP BY Year ORDER BY Year ``` -
+
Q8. The most popular destinations by the number of directly connected cities, for 2000 to 2010 @@ -331,7 +331,7 @@ Q8. The most popular destinations by the number of directly SELECT DestCityName, uniqExact(OriginCityName) AS u FROM ontime WHERE Year >= 2000 and Year <= 2010 GROUP BY DestCityName ORDER BY u DESC LIMIT 10; ``` -
+
Q9. @@ -341,7 +341,7 @@ Q9. select Year, count(*) as c1 from ontime group by Year; ``` -
+
Q10. @@ -363,7 +363,7 @@ ORDER by rate DESC LIMIT 1000; ``` -
+
More queries: @@ -381,16 +381,16 @@ SELECT OriginCityName, DestCityName, count() AS c FROM ontime GROUP BY OriginCit SELECT OriginCityName, count() AS c FROM ontime GROUP BY OriginCityName ORDER BY c DESC LIMIT 10; ``` -
+
These performance tests were carried out by Vadim Tkachenko. For more information, see the following links: -- -- -- -- -- -- +- https://www.percona.com/blog/2009/10/02/analyzing-air-traffic-performance-with-infobright-and-monetdb/ +- https://www.percona.com/blog/2009/10/26/air-traffic-queries-in-luciddb/ +- https://www.percona.com/blog/2009/11/02/air-traffic-queries-in-infinidb-early-alpha/ +- https://www.percona.com/blog/2014/04/21/using-apache-hadoop-and-impala-together-with-mysql-for-data-analysis/ +- https://www.percona.com/blog/2016/01/07/apache-spark-with-air-ontime-performance-data/ +- http://nickmakos.blogspot.ru/2012/08/analyzing-air-traffic-performance-with.html
diff --git a/docs/fa/getting_started/example_datasets/star_schema.md b/docs/fa/getting_started/example_datasets/star_schema.md index f2cd4978608..9b699e59f50 100644 --- a/docs/fa/getting_started/example_datasets/star_schema.md +++ b/docs/fa/getting_started/example_datasets/star_schema.md @@ -1,33 +1,33 @@ -
+
-# Star Schema Benchmark +# Star Schema Benchmark {#bnchmrkh-star-schema} -Compile dbgen from the following link. +Compile dbgen from the following link. https://github.com/vadimtk/ssb-dbgen
-```bash +``` bash git clone git@github.com:vadimtk/ssb-dbgen.git cd ssb-dbgen make ``` -
+
A few warnings are displayed during processing; this is normal and not a problem. -Place `dbgen` and ` dists.dss` somewhere with 800 GB of free disk space. +Place `dbgen` and `dists.dss` somewhere with 800 GB of free disk space. Generating the data:
-```bash +``` bash ./dbgen -s 1000 -T c ./dbgen -s 1000 -T l ``` -
+
Creating the tables in ClickHouse @@ -84,7 +84,7 @@ CREATE TABLE customerd AS customer ENGINE = Distributed(perftest_3shards_1replic CREATE TABLE partd AS part ENGINE = Distributed(perftest_3shards_1replicas, default, part, rand()); ``` -
+
For testing on a single server, just use the MergeTree tables. For distributed testing, you need the `perftest_3shards_1replicas` cluster configured in the config file. Next, create the MergeTree tables on each server and then the Distributed tables above (a minimal sketch of this two-step pattern follows below). @@ -92,7 +92,7 @@
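The sketch below is editor-added and deliberately reduced: the table and column names are illustrative rather than the full benchmark schema, and it assumes the `perftest_3shards_1replicas` cluster from the config file.

``` sql
-- Hypothetical, reduced example: a local MergeTree table created on every server of the
-- cluster, plus a Distributed table layered on top of it to fan queries out.
CREATE TABLE lineorder_local
(
    LO_ORDERKEY  UInt32,
    LO_ORDERDATE Date,
    LO_REVENUE   UInt32
)
ENGINE = MergeTree()
PARTITION BY toYear(LO_ORDERDATE)
ORDER BY (LO_ORDERDATE, LO_ORDERKEY);

CREATE TABLE lineorder_distributed AS lineorder_local
ENGINE = Distributed(perftest_3shards_1replicas, default, lineorder_local, rand());
```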
-```bash +``` bash cat customer.tbl | sed 's/$/2000-01-01/' | clickhouse-client --query "INSERT INTO customer FORMAT CSV" cat lineorder.tbl | clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" ``` diff --git a/docs/fa/getting_started/example_datasets/wikistat.md b/docs/fa/getting_started/example_datasets/wikistat.md index 4232815d794..6178cbae243 100644 --- a/docs/fa/getting_started/example_datasets/wikistat.md +++ b/docs/fa/getting_started/example_datasets/wikistat.md @@ -1,8 +1,8 @@ -
+
-# WikiStat +# WikiStat {#wikistat} -See: +See: http://dumps.wikimedia.org/other/pagecounts-raw/ Creating the table: @@ -21,14 +21,13 @@ CREATE TABLE wikistat ) ENGINE = MergeTree(date, (path, time), 8192); ``` - -
+
Loading the data
-```bash +``` bash for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done diff --git a/docs/fa/getting_started/index.md b/docs/fa/getting_started/index.md index f5492a490fc..6556609c10d 100644 --- a/docs/fa/getting_started/index.md +++ b/docs/fa/getting_started/index.md @@ -1,11 +1,13 @@ -
+
+ # Getting Started If you are new to ClickHouse and want to get a hands-on feel for its performance, first of all you need to go through the [installation process](install.md). After that you can choose one of the following options: -* [Go through the detailed tutorial](tutorial.md) -* [Experiment with example datasets](example_datasets/ontime.md) +- [Go through the detailed tutorial](tutorial.md) +- [Experiment with example datasets](example_datasets/ontime.md) [Original article](https://clickhouse.tech/docs/fa/getting_started/) +
diff --git a/docs/fa/getting_started/install.md b/docs/fa/getting_started/install.md index d4a9ff1810f..11a59d26395 100644 --- a/docs/fa/getting_started/install.md +++ b/docs/fa/getting_started/install.md @@ -1,8 +1,8 @@ -
+
# Installation and Setup -## System requirements +## System requirements {#nyzmndy-hy-systm} ClickHouse can run on any Linux, FreeBSD, or Mac OS X with x86\_64 CPU architecture @@ -10,74 +10,71 @@ ClickHouse can run on any Linux, FreeBSD, or Mac OS X
-```bash +``` bash grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported" ``` -
+
Users of systems that do not support SSE 4.2, or that have AArch64 or PowerPC64LE architecture, should [build ClickHouse from sources](#from-sources) with the appropriate settings. -##Available installation options +## Available installation options -### Installation from Debian/Ubuntu packages {#from-deb-packages} +\#\#\# Installation from Debian/Ubuntu packages {\#from-deb-packages} In `/etc/apt/sources.list` (or in a separate `/etc/apt/sources.list.d/clickhouse.list` file), add the following repository:
-``` -deb http://repo.yandex.ru/clickhouse/deb/stable/ main/ -``` + deb http://repo.yandex.ru/clickhouse/deb/stable/ main/ -
+
-If you want to use the most recent testing version, change 'stable' to 'testing'. +If you want to use the most recent testing version, change ‘stable’ to ‘testing’. Then run the following commands:
-```bash +``` bash sudo apt-get install dirmngr # optional sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 # optional sudo apt-get update sudo apt-get install clickhouse-client clickhouse-server ``` -
+
-شما همچنین می توانید از طریق لینک زیر پکیج ClickHouse را به صورت دستی دانلود و نصب کنید: . +شما همچنین می توانید از طریق لینک زیر پکیج ClickHouse را به صورت دستی دانلود و نصب کنید: https://repo.yandex.ru/clickhouse/deb/stable/main/. -ClickHouse دارای تنظیمات محدودیت دسترسی می باشد. این تنظیمات در فایل 'users.xml' (کنار 'config.xml') می باشد. به صورت پیش فرض دسترسی برای کاربر 'default' از همه جا بدون نیاز به پسورد وجود دارد. 'user/default/networks' را مشاهده کنید. برای اطلاعات بیشتر قسمت "تنظیمات فایل ها" را مشاهده کنید. +ClickHouse دارای تنظیمات محدودیت دسترسی می باشد. این تنظیمات در فایل ‘users.xml’ (کنار ‘config.xml’) می باشد. به صورت پیش فرض دسترسی برای کاربر ‘default’ از همه جا بدون نیاز به پسورد وجود دارد. ‘user/default/networks’ را مشاهده کنید. برای اطلاعات بیشتر قسمت «تنظیمات فایل ها» را مشاهده کنید. ### RPM ﯼﺎﻫ ﻪﺘﺴﺑ ﺯﺍ {#from-rpm-packages} .ﺪﻨﮐ ﯽﻣ ﻪﯿﺻﻮﺗ ﺲﮐﻮﻨﯿﻟ ﺮﺑ ﯽﻨﺘﺒﻣ rpm ﺮﺑ ﯽﻨﺘﺒﻣ ﯼﺎﻫ ﻊﯾﺯﻮﺗ ﺮﯾﺎﺳ ﻭ CentOS ، RedHat ﯼﺍ - :ﺪﯿﻨﮐ ﻪﻓﺎﺿﺍ ﺍﺭ ﯽﻤﺳﺭ ﻥﺰﺨﻣ ﺪﯾﺎﺑ ﺍﺪﺘﺑﺍ + :ﺪﯿﻨﮐ ﻪﻓﺎﺿﺍ ﺍﺭ ﯽﻤﺳﺭ ﻥﺰﺨﻣ ﺪﯾﺎﺑ ﺍﺪﺘﺑﺍ -```bash +``` bash sudo yum install yum-utils sudo rpm --import https://repo.yandex.ru/clickhouse/CLICKHOUSE-KEY.GPG sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/x86_64 ``` -.(ﺩﻮﺷ ﯽﻣ ﻪﯿﺻﻮﺗ ﺎﻤﺷ ﺶﯾﺎﻣﺯﺁ ﯼﺎﻫ ﻂﯿﺤﻣ ﯼﺍﺮﺑ ﻦﯾﺍ) ﺪﯿﻨﮐ ﻦﯾﺰﮕﯾﺎﺟ "ﺖﺴﺗ" ﺎﺑ ﺍﺭ "ﺭﺍﺪﯾﺎﭘ" +.(ﺩﻮﺷ ﯽﻣ ﻪﯿﺻﻮﺗ ﺎﻤﺷ ﺶﯾﺎﻣﺯﺁ ﯼﺎﻫ ﻂﯿﺤﻣ ﯼﺍﺮﺑ ﻦﯾﺍ) ﺪﯿﻨﮐ ﻦﯾﺰﮕﯾﺎﺟ «ﺖﺴﺗ» ﺎﺑ ﺍﺭ «ﺭﺍﺪﯾﺎﭘ» - :ﺪﯿﻨﮐ ﺐﺼﻧ ﺍﺭ ﺎﻫ ﻪﺘﺴﺑ ﻊﻗﺍﻭ ﺭﺩ ﺎﺗ ﺪﯿﻨﮐ ﺍﺮﺟﺍ ﺍﺭ ﺕﺍﺭﻮﺘﺳﺩ ﻦﯾﺍ ﺲﭙﺳ + :ﺪﯿﻨﮐ ﺐﺼﻧ ﺍﺭ ﺎﻫ ﻪﺘﺴﺑ ﻊﻗﺍﻭ ﺭﺩ ﺎﺗ ﺪﯿﻨﮐ ﺍﺮﺟﺍ ﺍﺭ ﺕﺍﺭﻮﺘﺳﺩ ﻦﯾﺍ ﺲﭙﺳ -```bash +``` bash sudo yum install clickhouse-server clickhouse-client ``` -. :ﺪﯿﻨﮐ ﺐﺼﻧ ﻭ ﯼﺮﯿﮔﺭﺎﺑ ﺎﺠﻨ +.https://repo.yandex.ru/clickhouse/rpm/stable/x86\_64 :ﺪﯿﻨﮐ ﺐﺼﻧ ﻭ ﯼﺮﯿﮔﺭﺎﺑ ﺎﺠﻨ - Docker Image ﺯﺍ ### - -.ﺪﻨﻨﮐ ﯽﻣ ﻩﺩﺎﻔﺘﺳﺍ ﻞﺧﺍﺩ ﺭﺩ "deb" ﯽﻤﺳﺭ ﯼﺎﻫ ﻪﺘﺴﺑ ﺯﺍ ﺮﯾﻭﺎﺼﺗ ﻦﯾﺍ .ﺪﯿﻨﮐ ﻝﺎﺒﻧﺩ ﺍﺭ (/ht + Docker Image ﺯﺍ ### +.ﺪﻨﻨﮐ ﯽﻣ ﻩﺩﺎﻔﺘﺳﺍ ﻞﺧﺍﺩ ﺭﺩ «deb» ﯽﻤﺳﺭ ﯼﺎﻫ ﻪﺘﺴﺑ ﺯﺍ ﺮﯾﻭﺎﺼﺗ ﻦﯾﺍ .ﺪﯿﻨﮐ ﻝﺎﺒﻧﺩ ﺍﺭ (/ht ### نصب از طریق Source {#from-sources} @@ -87,47 +84,43 @@ sudo yum install clickhouse-server clickhouse-client
-``` -Client: dbms/programs/clickhouse-client -Server: dbms/programs/clickhouse-server -``` + Client: dbms/programs/clickhouse-client + Server: dbms/programs/clickhouse-server -
+
For the server, create a directory for the data, such as
-``` -/opt/clickhouse/data/default/ -/opt/clickhouse/metadata/default/ -``` + /opt/clickhouse/data/default/ + /opt/clickhouse/metadata/default/ -
+
-(قابل تنظیم در تنظیمات سرور). 'chown' را برای کاربر دلخواه اجرا کنید. +(قابل تنظیم در تنظیمات سرور). ‘chown’ را برای کاربر دلخواه اجرا کنید. به مسیر لاگ ها در تنظیمات سرور توجه کنید (src/dbms/programs/config.xml). ### روش های دیگر نصب {#from-docker-image} -Docker image: +Docker image: https://hub.docker.com/r/yandex/clickhouse-server/ -پکیج RPM برای CentOS یا RHEL: +پکیج RPM برای CentOS یا RHEL: https://github.com/Altinity/clickhouse-rpm-install Gentoo: `emerge clickhouse` -## راه اندازی +## راه اندازی {#rh-ndzy} برای استارت سرور (به صورت daemon)، دستور زیر را اجرا کنید:
-```bash +``` bash sudo service clickhouse-server start ``` -
+
Check the logs in the `/var/log/clickhouse-server/` directory. @@ -137,41 +130,41 @@ sudo service clickhouse-server start
-```bash +``` bash clickhouse-server --config-file=/etc/clickhouse-server/config.xml ``` -
+
-در این مورد که مناسب زمان توسعه می باشد، لاگ ها در کنسول پرینت می شوند. اگر فایل تنظیمات در دایرکتوری جاری باشد، نیازی به مشخص کردن '--config-file' نمی باشد. به صورت پیش فرض از './config.xml' استفاده می شود. +در این مورد که مناسب زمان توسعه می باشد، لاگ ها در کنسول پرینت می شوند. اگر فایل تنظیمات در دایرکتوری جاری باشد، نیازی به مشخص کردن ‘–config-file’ نمی باشد. به صورت پیش فرض از ‘./config.xml’ استفاده می شود. شما می توانید از کلاینت command-line برای اتصال به سرور استفاده کنید:
-```bash +``` bash clickhouse-client ``` -
+
-پارامترهای پیش فرض، نشان از اتصال به localhost:9000 از طرف کاربر 'default' بدون پسورد را می دهد. از کلاینت میتوان برای اتصال به یک سرور remote استفاده کرد. مثال: +پارامترهای پیش فرض، نشان از اتصال به localhost:9000 از طرف کاربر ‘default’ بدون پسورد را می دهد. از کلاینت میتوان برای اتصال به یک سرور remote استفاده کرد. مثال:
-```bash +``` bash clickhouse-client --host=example.com ``` -
+
-For more information, see the "Command-line client" section. +For more information, see the «Command-line client» section. Checking the system:
-```bash +``` bash milovidov@hostname:~/work/metrica/src/dbms/src/Client$ ./clickhouse-client ClickHouse client version 0.0.18749. Connecting to localhost:9000. @@ -190,11 +183,12 @@ SELECT 1 :) ``` -
+
**Congratulations, the system works!** To continue experimenting, you can download one of the test datasets and try them out.
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/getting_started/install/) diff --git a/docs/fa/getting_started/tutorial.md b/docs/fa/getting_started/tutorial.md deleted file mode 120000 index 8bc40816ab2..00000000000 --- a/docs/fa/getting_started/tutorial.md +++ /dev/null @@ -1 +0,0 @@ -../../en/getting_started/tutorial.md \ No newline at end of file diff --git a/docs/fa/getting_started/tutorial.md b/docs/fa/getting_started/tutorial.md new file mode 100644 index 00000000000..2c0bcf086df --- /dev/null +++ b/docs/fa/getting_started/tutorial.md @@ -0,0 +1,662 @@ +# اموزش کلیک {#mwzsh-khlykh} + +## چه انتظار از این مقاله? {#chh-ntzr-z-yn-mqlh} + +با رفتن را از طریق این مقاله شما یاد بگیرند که چگونه به راه اندازی پایه خوشه خانه رعیتی, این کوچک, اما مقاوم در برابر خطا و مقیاس پذیر. ما از یکی از مجموعه داده های نمونه برای پر کردن داده ها و اجرای برخی از نمایش های نسخه ی نمایشی استفاده خواهیم کرد. + +## راه اندازی تک گره {#rh-ndzy-tkh-grh} + +برای به تعویق انداختن پیچیدگی از محیط توزیع, ما با استقرار کلیک بر روی یک سرور و یا ماشین مجازی شروع. خانه کلیک است که معمولا از نصب [دب](index.md#install-from-deb-packages) یا [دور در دقیقه](index.md#from-rpm-packages) بسته, اما وجود دارد [جایگزین ها](index.md#from-docker-image) برای سیستم عامل هایی که هیچ پشتیبانی نمی کنند. + +مثلا, شما را انتخاب کرده اند `deb` بسته ها و اعدام: + +``` bash +sudo apt-get install dirmngr +sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 + +echo "deb http://repo.clickhouse.tech/deb/stable/ main/" | sudo tee /etc/apt/sources.list.d/clickhouse.list +sudo apt-get update + +sudo apt-get install -y clickhouse-server clickhouse-client +``` + +در بسته هایی که نصب شده اند چه چیزی داریم: + +- `clickhouse-client` بسته شامل [کلیک مشتری](../interfaces/cli.md) کاربرد, تعاملی مشتری کنسول تاتر. +- `clickhouse-common` بسته شامل یک فایل اجرایی کلیک. +- `clickhouse-server` بسته شامل فایل های پیکربندی برای اجرای تاتر به عنوان یک سرور. + +فایل های پیکربندی سرور در واقع `/etc/clickhouse-server/`. قبل از رفتن بیشتر لطفا توجه کنید `` عنصر در `config.xml`. مسیر تعیین محل ذخیره سازی داده ها, بنابراین باید در حجم با ظرفیت دیسک بزرگ واقع, مقدار پیش فرض است `/var/lib/clickhouse/`. اگر شما می خواهید برای تنظیم پیکربندی دستی به طور مستقیم ویرایش نیست `config.xml` فایل, با توجه به اینکه ممکن است در به روز رسانی بسته های بعدی بازنویسی. راه توصیه می شود به نادیده گرفتن عناصر پیکربندی است که برای ایجاد [فایل ها در پیکربندی.فهرست راهنما](../operations/configuration_files.md) که به عنوان خدمت می کنند «patches» برای پیکربندی.. + +همانطور که شما ممکن است متوجه, `clickhouse-server` به طور خودکار پس از نصب بسته راه اندازی نشده است. این به طور خودکار پس از به روز رسانی هم دوباره راه اندازی نخواهد شد. راه شما شروع به سرور بستگی به سیستم اینیت خود را, معمولا, این: + +``` bash +sudo service clickhouse-server start +``` + +یا + +``` bash +sudo /etc/init.d/clickhouse-server start +``` + +محل پیش فرض برای سیاهههای مربوط به سرور است `/var/log/clickhouse-server/`. سرور خواهد بود برای رسیدگی به اتصالات مشتری یک بار `Ready for connections` پیام وارد شد. + +هنگامی که `clickhouse-server` است و در حال اجرا, ما می توانیم با استفاده از `clickhouse-client` برای اتصال به سرور و اجرای برخی از نمایش داده شد تست مانند `SELECT "Hello, world!";`. + +
+ +راهنمایی سریع برای کلیک-مشتری حالت تعاملی: + +``` bash +clickhouse-client +clickhouse-client --host=... --port=... --user=... --password=... +``` + +فعالسازی پرسشهای چند خطی: + +``` bash +clickhouse-client -m +clickhouse-client --multiline +``` + +نمایش داده شد اجرا در دسته حالت: + +``` bash +clickhouse-client --query='SELECT 1' +echo 'SELECT 1' | clickhouse-client +clickhouse-client <<< 'SELECT 1' +``` + +درج داده از یک پرونده در قالب مشخص شده: + +``` bash +clickhouse-client --query='INSERT INTO table VALUES' < data.txt +clickhouse-client --query='INSERT INTO table FORMAT TabSeparated' < data.tsv +``` + +
+ +## واردات مجموعه داده نمونه {#wrdt-mjmw-h-ddh-nmwnh} + +در حال حاضر زمان برای پر کردن سرور کلیک ما با برخی از داده های نمونه است. در این مقاله ما داده های ناشناس یاندکس را استفاده خواهیم کرد.متریکا, اولین سرویس اجرا می شود که کلیک در راه تولید قبل از منبع باز شد (بیشتر در که در [بخش تاریخچه](../introduction/history.md)). وجود دارد [راه های متعدد برای وارد کردن یاندکس.مجموعه داده های متریکا](example_datasets/metrica.md) و به خاطر اموزش, ما با یکی از واقع بینانه ترین رفتن. + +### دانلود و استخراج داده های جدول {#dnlwd-w-stkhrj-ddh-hy-jdwl} + +``` bash +curl https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv +curl https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv +``` + +فایل های استخراج شده حدود 10 گیگابایت است. + +### ایجاد جداول {#yjd-jdwl} + +جداول منطقی به گروه بندی می شوند «databases». یک `default` پایگاه داده, اما ما یکی از جدید به نام ایجاد `tutorial`: + +``` bash +clickhouse-client --query "CREATE DATABASE IF NOT EXISTS tutorial" +``` + +نحو برای ایجاد جداول راه پیچیده تر در مقایسه با پایگاه داده است (نگاه کنید به [مرجع](../query_language/create.md). به طور کلی `CREATE TABLE` بیانیه باید سه چیز کلیدی را مشخص کند: + +1. نام جدول برای ایجاد. +2. طرحواره جدول, به عنوان مثال لیستی از ستون ها و خود [انواع داده ها](../data_types/index.md). +3. [موتور جدول](../operations/table_engines/index.md) و این تنظیمات است, که تعیین تمام اطلاعات در مورد نحوه نمایش داده شد به این جدول خواهد شد از لحاظ جسمی اجرا. + +یاندکسمتریکا یک سرویس تجزیه و تحلیل وب است و مجموعه داده نمونه قابلیت های کامل خود را پوشش نمی دهد بنابراین تنها دو جدول برای ایجاد وجود دارد: + +- `hits` یک جدول با هر عمل انجام شده توسط همه کاربران در تمام وب سایت های تحت پوشش این سرویس است. +- `visits` یک جدول است که شامل جلسات از پیش ساخته شده به جای اقدامات فردی است. 
+ +بیایید ببینید و اجرای واقعی ایجاد نمایش داده شد جدول برای این جداول: + +``` sql +CREATE TABLE tutorial.hits_v1 +( + `WatchID` UInt64, + `JavaEnable` UInt8, + `Title` String, + `GoodEvent` Int16, + `EventTime` DateTime, + `EventDate` Date, + `CounterID` UInt32, + `ClientIP` UInt32, + `ClientIP6` FixedString(16), + `RegionID` UInt32, + `UserID` UInt64, + `CounterClass` Int8, + `OS` UInt8, + `UserAgent` UInt8, + `URL` String, + `Referer` String, + `URLDomain` String, + `RefererDomain` String, + `Refresh` UInt8, + `IsRobot` UInt8, + `RefererCategories` Array(UInt16), + `URLCategories` Array(UInt16), + `URLRegions` Array(UInt32), + `RefererRegions` Array(UInt32), + `ResolutionWidth` UInt16, + `ResolutionHeight` UInt16, + `ResolutionDepth` UInt8, + `FlashMajor` UInt8, + `FlashMinor` UInt8, + `FlashMinor2` String, + `NetMajor` UInt8, + `NetMinor` UInt8, + `UserAgentMajor` UInt16, + `UserAgentMinor` FixedString(2), + `CookieEnable` UInt8, + `JavascriptEnable` UInt8, + `IsMobile` UInt8, + `MobilePhone` UInt8, + `MobilePhoneModel` String, + `Params` String, + `IPNetworkID` UInt32, + `TraficSourceID` Int8, + `SearchEngineID` UInt16, + `SearchPhrase` String, + `AdvEngineID` UInt8, + `IsArtifical` UInt8, + `WindowClientWidth` UInt16, + `WindowClientHeight` UInt16, + `ClientTimeZone` Int16, + `ClientEventTime` DateTime, + `SilverlightVersion1` UInt8, + `SilverlightVersion2` UInt8, + `SilverlightVersion3` UInt32, + `SilverlightVersion4` UInt16, + `PageCharset` String, + `CodeVersion` UInt32, + `IsLink` UInt8, + `IsDownload` UInt8, + `IsNotBounce` UInt8, + `FUniqID` UInt64, + `HID` UInt32, + `IsOldCounter` UInt8, + `IsEvent` UInt8, + `IsParameter` UInt8, + `DontCountHits` UInt8, + `WithHash` UInt8, + `HitColor` FixedString(1), + `UTCEventTime` DateTime, + `Age` UInt8, + `Sex` UInt8, + `Income` UInt8, + `Interests` UInt16, + `Robotness` UInt8, + `GeneralInterests` Array(UInt16), + `RemoteIP` UInt32, + `RemoteIP6` FixedString(16), + `WindowName` Int32, + `OpenerName` Int32, + `HistoryLength` Int16, + `BrowserLanguage` FixedString(2), + `BrowserCountry` FixedString(2), + `SocialNetwork` String, + `SocialAction` String, + `HTTPError` UInt16, + `SendTiming` Int32, + `DNSTiming` Int32, + `ConnectTiming` Int32, + `ResponseStartTiming` Int32, + `ResponseEndTiming` Int32, + `FetchTiming` Int32, + `RedirectTiming` Int32, + `DOMInteractiveTiming` Int32, + `DOMContentLoadedTiming` Int32, + `DOMCompleteTiming` Int32, + `LoadEventStartTiming` Int32, + `LoadEventEndTiming` Int32, + `NSToDOMContentLoadedTiming` Int32, + `FirstPaintTiming` Int32, + `RedirectCount` Int8, + `SocialSourceNetworkID` UInt8, + `SocialSourcePage` String, + `ParamPrice` Int64, + `ParamOrderID` String, + `ParamCurrency` FixedString(3), + `ParamCurrencyID` UInt16, + `GoalsReached` Array(UInt32), + `OpenstatServiceName` String, + `OpenstatCampaignID` String, + `OpenstatAdID` String, + `OpenstatSourceID` String, + `UTMSource` String, + `UTMMedium` String, + `UTMCampaign` String, + `UTMContent` String, + `UTMTerm` String, + `FromTag` String, + `HasGCLID` UInt8, + `RefererHash` UInt64, + `URLHash` UInt64, + `CLID` UInt32, + `YCLID` UInt64, + `ShareService` String, + `ShareURL` String, + `ShareTitle` String, + `ParsedParams` Nested( + Key1 String, + Key2 String, + Key3 String, + Key4 String, + Key5 String, + ValueDouble Float64), + `IslandID` FixedString(16), + `RequestNum` UInt32, + `RequestTry` UInt8 +) +ENGINE = MergeTree() +PARTITION BY toYYYYMM(EventDate) +ORDER BY (CounterID, EventDate, intHash32(UserID)) +SAMPLE BY intHash32(UserID) +SETTINGS 
index_granularity = 8192 +``` + +``` sql +CREATE TABLE tutorial.visits_v1 +( + `CounterID` UInt32, + `StartDate` Date, + `Sign` Int8, + `IsNew` UInt8, + `VisitID` UInt64, + `UserID` UInt64, + `StartTime` DateTime, + `Duration` UInt32, + `UTCStartTime` DateTime, + `PageViews` Int32, + `Hits` Int32, + `IsBounce` UInt8, + `Referer` String, + `StartURL` String, + `RefererDomain` String, + `StartURLDomain` String, + `EndURL` String, + `LinkURL` String, + `IsDownload` UInt8, + `TraficSourceID` Int8, + `SearchEngineID` UInt16, + `SearchPhrase` String, + `AdvEngineID` UInt8, + `PlaceID` Int32, + `RefererCategories` Array(UInt16), + `URLCategories` Array(UInt16), + `URLRegions` Array(UInt32), + `RefererRegions` Array(UInt32), + `IsYandex` UInt8, + `GoalReachesDepth` Int32, + `GoalReachesURL` Int32, + `GoalReachesAny` Int32, + `SocialSourceNetworkID` UInt8, + `SocialSourcePage` String, + `MobilePhoneModel` String, + `ClientEventTime` DateTime, + `RegionID` UInt32, + `ClientIP` UInt32, + `ClientIP6` FixedString(16), + `RemoteIP` UInt32, + `RemoteIP6` FixedString(16), + `IPNetworkID` UInt32, + `SilverlightVersion3` UInt32, + `CodeVersion` UInt32, + `ResolutionWidth` UInt16, + `ResolutionHeight` UInt16, + `UserAgentMajor` UInt16, + `UserAgentMinor` UInt16, + `WindowClientWidth` UInt16, + `WindowClientHeight` UInt16, + `SilverlightVersion2` UInt8, + `SilverlightVersion4` UInt16, + `FlashVersion3` UInt16, + `FlashVersion4` UInt16, + `ClientTimeZone` Int16, + `OS` UInt8, + `UserAgent` UInt8, + `ResolutionDepth` UInt8, + `FlashMajor` UInt8, + `FlashMinor` UInt8, + `NetMajor` UInt8, + `NetMinor` UInt8, + `MobilePhone` UInt8, + `SilverlightVersion1` UInt8, + `Age` UInt8, + `Sex` UInt8, + `Income` UInt8, + `JavaEnable` UInt8, + `CookieEnable` UInt8, + `JavascriptEnable` UInt8, + `IsMobile` UInt8, + `BrowserLanguage` UInt16, + `BrowserCountry` UInt16, + `Interests` UInt16, + `Robotness` UInt8, + `GeneralInterests` Array(UInt16), + `Params` Array(String), + `Goals` Nested( + ID UInt32, + Serial UInt32, + EventTime DateTime, + Price Int64, + OrderID String, + CurrencyID UInt32), + `WatchIDs` Array(UInt64), + `ParamSumPrice` Int64, + `ParamCurrency` FixedString(3), + `ParamCurrencyID` UInt16, + `ClickLogID` UInt64, + `ClickEventID` Int32, + `ClickGoodEvent` Int32, + `ClickEventTime` DateTime, + `ClickPriorityID` Int32, + `ClickPhraseID` Int32, + `ClickPageID` Int32, + `ClickPlaceID` Int32, + `ClickTypeID` Int32, + `ClickResourceID` Int32, + `ClickCost` UInt32, + `ClickClientIP` UInt32, + `ClickDomainID` UInt32, + `ClickURL` String, + `ClickAttempt` UInt8, + `ClickOrderID` UInt32, + `ClickBannerID` UInt32, + `ClickMarketCategoryID` UInt32, + `ClickMarketPP` UInt32, + `ClickMarketCategoryName` String, + `ClickMarketPPName` String, + `ClickAWAPSCampaignName` String, + `ClickPageName` String, + `ClickTargetType` UInt16, + `ClickTargetPhraseID` UInt64, + `ClickContextType` UInt8, + `ClickSelectType` Int8, + `ClickOptions` String, + `ClickGroupBannerID` Int32, + `OpenstatServiceName` String, + `OpenstatCampaignID` String, + `OpenstatAdID` String, + `OpenstatSourceID` String, + `UTMSource` String, + `UTMMedium` String, + `UTMCampaign` String, + `UTMContent` String, + `UTMTerm` String, + `FromTag` String, + `HasGCLID` UInt8, + `FirstVisit` DateTime, + `PredLastVisit` Date, + `LastVisit` Date, + `TotalVisits` UInt32, + `TraficSource` Nested( + ID Int8, + SearchEngineID UInt16, + AdvEngineID UInt8, + PlaceID UInt16, + SocialSourceNetworkID UInt8, + Domain String, + SearchPhrase String, + SocialSourcePage String), + 
`Attendance` FixedString(16), + `CLID` UInt32, + `YCLID` UInt64, + `NormalizedRefererHash` UInt64, + `SearchPhraseHash` UInt64, + `RefererDomainHash` UInt64, + `NormalizedStartURLHash` UInt64, + `StartURLDomainHash` UInt64, + `NormalizedEndURLHash` UInt64, + `TopLevelDomain` UInt64, + `URLScheme` UInt64, + `OpenstatServiceNameHash` UInt64, + `OpenstatCampaignIDHash` UInt64, + `OpenstatAdIDHash` UInt64, + `OpenstatSourceIDHash` UInt64, + `UTMSourceHash` UInt64, + `UTMMediumHash` UInt64, + `UTMCampaignHash` UInt64, + `UTMContentHash` UInt64, + `UTMTermHash` UInt64, + `FromHash` UInt64, + `WebVisorEnabled` UInt8, + `WebVisorActivity` UInt32, + `ParsedParams` Nested( + Key1 String, + Key2 String, + Key3 String, + Key4 String, + Key5 String, + ValueDouble Float64), + `Market` Nested( + Type UInt8, + GoalID UInt32, + OrderID String, + OrderPrice Int64, + PP UInt32, + DirectPlaceID UInt32, + DirectOrderID UInt32, + DirectBannerID UInt32, + GoodID String, + GoodName String, + GoodQuantity Int32, + GoodPrice Int64), + `IslandID` FixedString(16) +) +ENGINE = CollapsingMergeTree(Sign) +PARTITION BY toYYYYMM(StartDate) +ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID) +SAMPLE BY intHash32(UserID) +SETTINGS index_granularity = 8192 +``` + +شما می توانید این پرسش ها را با استفاده از حالت تعاملی اجرا کنید `clickhouse-client` (فقط در یک ترمینال راه اندازی بدون مشخص کردن یک پرس و جو در پیش) و یا سعی کنید برخی از [رابط جایگزین](../interfaces/index.md) اگر شما می خواهید. + +همانطور که می بینیم, `hits_v1` با استفاده از [موتور ادغام عمومی](../operations/table_engines/mergetree.md) در حالی که `visits_v1` با استفاده از [سقوط](../operations/table_engines/collapsingmergetree.md) گزینه. + +### وارد کردن داده {#wrd-khrdn-ddh} + +وارد کردن داده ها به تاتر از طریق انجام می شود [وارد](../query_language/insert_into.md) پرس و جو مانند در بسیاری از پایگاه داده های دیگر گذاشتن. با این حال داده ها معمولا در یکی از [فرمت های پشتیبانی شده](../interfaces/formats.md) به جای `VALUES` بند (که همچنین پشتیبانی). + +فایل هایی که قبلا دانلود کردیم در قالب تب جدا شده اند بنابراین در اینجا نحوه وارد کردن از طریق مشتری کنسول است: + +``` bash +clickhouse-client --query "INSERT INTO tutorial.hits_v1 FORMAT TSV" --max_insert_block_size=100000 < hits_v1.tsv +clickhouse-client --query "INSERT INTO tutorial.visits_v1 FORMAT TSV" --max_insert_block_size=100000 < visits_v1.tsv +``` + +تاتر است که بسیاری از [تنظیمات برای تنظیم](../operations/settings/index.md) و یک راه برای مشخص کردن انها در کنسول مشتری از طریق استدلال است همانطور که ما می توانید ببینید با `--max_insert_block_size`. ساده ترین راه برای کشف کردن چه تنظیمات در دسترس هستند, چه معنی می دهند و چه پیش فرض است به پرس و جو `system.settings` جدول: + +``` sql +SELECT name, value, changed, description +FROM system.settings +WHERE name LIKE '%max_insert_b%' +FORMAT TSV + +max_insert_block_size 1048576 0 "The maximum block size for insertion, if we control the creation of blocks for insertion." +``` + +در صورت تمایل شما می توانید [بهینه سازی](../query_language/misc/#misc_operations-optimize) جداول پس از واردات. جداول که با ادغام پیکربندی-موتور خانواده همیشه ادغام قطعات داده ها در پس زمینه برای بهینه سازی ذخیره سازی داده ها (یا حداقل چک کنید اگر حس می کند). 
این پرس و جوها موتور جدول را مجبور می کنند که بهینه سازی ذخیره سازی را همین حالا انجام دهد، نه در زمانی دیگر:
+
+``` bash
+clickhouse-client --query "OPTIMIZE TABLE tutorial.hits_v1 FINAL"
+clickhouse-client --query "OPTIMIZE TABLE tutorial.visits_v1 FINAL"
+```
+
+این عملیات از نظر I/O و پردازنده سنگین است، بنابراین اگر جدول به طور مداوم داده ی جدید دریافت می کند بهتر است کاری به آن نداشته باشید و اجازه دهید ادغام ها در پس زمینه اجرا شوند.
+
+حالا می توانیم بررسی کنیم که جداول با موفقیت وارد شده اند:
+
+``` bash
+clickhouse-client --query "SELECT COUNT(*) FROM tutorial.hits_v1"
+clickhouse-client --query "SELECT COUNT(*) FROM tutorial.visits_v1"
+```
+
+## پرس و جوهای نمونه {#bh-nwn-mthl-nmysh-ddh-shd}
+
+``` sql
+SELECT
+    StartURL AS URL,
+    AVG(Duration) AS AvgDuration
+FROM tutorial.visits_v1
+WHERE StartDate BETWEEN '2014-03-23' AND '2014-03-30'
+GROUP BY URL
+ORDER BY AvgDuration DESC
+LIMIT 10
+```
+
+``` sql
+SELECT
+    sum(Sign) AS visits,
+    sumIf(Sign, has(Goals.ID, 1105530)) AS goal_visits,
+    (100. * goal_visits) / visits AS goal_percent
+FROM tutorial.visits_v1
+WHERE (CounterID = 912887) AND (toYYYYMM(StartDate) = 201403) AND (domain(StartURL) = 'yandex.ru')
+```
+
+## استقرار خوشه {#stqrr-khwshh}
+
+خوشه ClickHouse یک خوشه همگن است. مراحل راه اندازی:
+
+1. نصب سرور ClickHouse روی تمام ماشین های خوشه
+2. تنظیم پیکربندی خوشه در فایل های پیکربندی
+3. ایجاد جداول محلی روی هر نمونه
+4. ایجاد یک [جدول توزیع شده](../operations/table_engines/distributed.md)
+
+[جدول توزیع شده](../operations/table_engines/distributed.md) در واقع نوعی «view» روی جداول محلی خوشه ClickHouse است. پرس و جوی SELECT روی یک جدول توزیع شده با استفاده از منابع همه ی شاردهای خوشه اجرا می شود. می توانید پیکربندی چند خوشه را مشخص کنید و چندین جدول توزیع شده بسازید که هر کدام دیدی به یک خوشه ی متفاوت می دهند.
+
+نمونه ی پیکربندی برای یک خوشه با سه شارد که هر کدام یک رپلیکا دارند:
+
+``` xml
+<remote_servers>
+    <perftest_3shards_1replicas>
+        <shard>
+            <replica>
+                <host>example-perftest01j.yandex.ru</host>
+                <port>9000</port>
+            </replica>
+        </shard>
+        <shard>
+            <replica>
+                <host>example-perftest02j.yandex.ru</host>
+                <port>9000</port>
+            </replica>
+        </shard>
+        <shard>
+            <replica>
+                <host>example-perftest03j.yandex.ru</host>
+                <port>9000</port>
+            </replica>
+        </shard>
+    </perftest_3shards_1replicas>
+</remote_servers>
+```
+
+برای نمایش بهتر، بیایید یک جدول محلی جدید با همان پرس و جوی `CREATE TABLE` که برای `hits_v1` استفاده کردیم، اما با نامی متفاوت، بسازیم:
+
+``` sql
+CREATE TABLE tutorial.hits_local (...) ENGINE = MergeTree() ...
+```
+
+ایجاد یک جدول توزیع شده که دیدی به جداول محلی خوشه می دهد:
+
+``` sql
+CREATE TABLE tutorial.hits_all AS tutorial.hits_local
+ENGINE = Distributed(perftest_3shards_1replicas, tutorial, hits_local, rand());
+```
+
+یک روش معمول این است که جدول توزیع شده ی مشابهی را روی تمام ماشین های خوشه ایجاد کنید. این کار اجازه می دهد پرس و جوهای توزیع شده روی هر ماشین خوشه اجرا شوند. گزینه ی جایگزین این است که برای یک پرس و جوی SELECT مشخص، یک جدول توزیع شده ی موقت با استفاده از تابع جدول [remote](../query_language/table_functions/remote.md) ساخته شود.
+
+بیایید یک [INSERT SELECT](../query_language/insert_into.md) روی جدول توزیع شده اجرا کنیم تا جدول روی چندین سرور پخش شود.
+
+``` sql
+INSERT INTO tutorial.hits_all SELECT * FROM tutorial.hits_v1;
+```
+
+!!! warning "توجه"
+    این روش برای شاردینگ جداول بزرگ مناسب نیست. ابزار جداگانه ای به نام [clickhouse-copier](../operations/utils/clickhouse-copier.md) وجود دارد که می تواند جداول دلخواه بزرگ را دوباره شارد کند.
+
+همان طور که انتظار می رود، پرس و جوهای محاسباتی سنگین وقتی روی 3 سرور اجرا شوند N برابر سریع تر از اجرا روی یک سرور خواهند بود.
+
+در این مورد از خوشه ای با 3 شارد استفاده کرده ایم که هر کدام شامل یک رپلیکا هستند.
+
+برای انعطاف پذیری در محیط تولید توصیه می کنیم هر شارد شامل 2-3 رپلیکا باشد که بین چند مرکز داده توزیع شده اند.
توجه داشته باشید که ClickHouse از تعداد نامحدودی رپلیکا پشتیبانی می کند.
+
+نمونه ی پیکربندی برای یک خوشه با یک شارد حاوی سه رپلیکا:
+
+``` xml
+<remote_servers>
+    ...
+    <perftest_1shards_3replicas>
+        <shard>
+            <replica>
+                <host>example-perftest01j.yandex.ru</host>
+                <port>9000</port>
+            </replica>
+            <replica>
+                <host>example-perftest02j.yandex.ru</host>
+                <port>9000</port>
+            </replica>
+            <replica>
+                <host>example-perftest03j.yandex.ru</host>
+                <port>9000</port>
+            </replica>
+        </shard>
+    </perftest_1shards_3replicas>
+</remote_servers>
+```
+
+برای فعال شدن تکثیر (replication) بومی، وجود ZooKeeper الزامی است. ClickHouse سازگاری داده ها را روی همه ی رپلیکاها حفظ می کند و روال بازیابی پس از خرابی را به صورت خودکار اجرا می کند. توصیه می شود خوشه ی ZooKeeper روی سرورهای جداگانه مستقر شود.
+
+ZooKeeper یک نیاز قطعی نیست: در برخی موارد ساده می توانید داده ها را با نوشتن روی همه ی رپلیکاها از کد برنامه ی خود تکثیر کنید. این رویکرد **توصیه نمی شود**؛ در این حالت ClickHouse نمی تواند سازگاری داده ها را روی همه ی رپلیکاها تضمین کند و این وظیفه بر عهده ی برنامه ی شما می ماند.
+
+آدرس های ZooKeeper باید در فایل پیکربندی مشخص شوند:
+
+``` xml
+<zookeeper>
+    <node>
+        <host>zoo01.yandex.ru</host>
+        <port>2181</port>
+    </node>
+    <node>
+        <host>zoo02.yandex.ru</host>
+        <port>2181</port>
+    </node>
+    <node>
+        <host>zoo03.yandex.ru</host>
+        <port>2181</port>
+    </node>
+</zookeeper>
+```
+
+همچنین باید ماکروهایی برای شناسایی هر شارد و رپلیکا تنظیم کنیم که در ایجاد جدول استفاده خواهند شد:
+
+``` xml
+<macros>
+    <shard>01</shard>
+    <replica>01</replica>
+</macros>
+```
+
+اگر در لحظه ی ایجاد جدول تکرارشونده هیچ رپلیکایی وجود نداشته باشد، اولین رپلیکای جدید نمونه سازی می شود. اگر رپلیکاهای زنده ای وجود داشته باشند، رپلیکای جدید داده ها را از رپلیکاهای موجود کلون می کند. می توانید ابتدا همه ی جداول تکرارشونده را بسازید و سپس داده ها را وارد کنید. گزینه ی دیگر این است که بعضی رپلیکاها را بسازید و بقیه را بعدا یا در حین درج داده اضافه کنید.
+
+``` sql
+CREATE TABLE tutorial.hits_replica (...)
+ENGINE = ReplicatedMergeTree(
+    '/clickhouse_perftest/tables/{shard}/hits',
+    '{replica}'
+)
+...
+```
+
+در اینجا از موتور جدول [ReplicatedMergeTree](../operations/table_engines/replication.md) استفاده می کنیم. در پارامترها مسیر ZooKeeper شامل شناسه ی شارد و رپلیکا را مشخص می کنیم.
+
+``` sql
+INSERT INTO tutorial.hits_replica SELECT * FROM tutorial.hits_local;
+```
+
+تکثیر در حالت multi-master کار می کند. داده ها را می توان در هر رپلیکایی بارگذاری کرد و به صورت خودکار با بقیه همگام می شوند. تکثیر ناهمزمان است، بنابراین در یک لحظه ی معین ممکن است همه ی رپلیکاها داده های تازه درج شده را نداشته باشند. برای این که درج داده ممکن باشد دست کم یک رپلیکا باید فعال باشد؛ بقیه پس از فعال شدن دوباره، داده ها را همگام و سازگاری را ترمیم می کنند. توجه داشته باشید که این رویکرد احتمال کمِ از دست رفتن داده های تازه درج شده را می پذیرد.
+
+[مقاله اصلی](https://clickhouse.tech/docs/en/getting_started/tutorial/) 

diff --git a/docs/fa/images/logo.svg b/docs/fa/images/logo.svg
index 865b96d98c7..b5ab923ff65 100644
--- a/docs/fa/images/logo.svg
+++ b/docs/fa/images/logo.svg
@@ -1 +1 @@
- \ No newline at end of file
+ \ No newline at end of file
diff --git a/docs/fa/index.md b/docs/fa/index.md
index 51eca317237..a8f1c201e37 100644
--- a/docs/fa/index.md
+++ b/docs/fa/index.md
@@ -1,19 +1,17 @@
-
+
-# ClickHouse چیست؟ +# ClickHouse چیست؟ {#clickhouse-chyst} ClickHouse یک مدیریت دیتابیس (DBMS) ستون گرا برای پردازش تحلیلی آنلاین (OLAP) می باشد. - در یک مدیریت دیتابیس ردیف گرا، داده ها به فرم زیر ذخیره سازی می شوند: - | Row | WatchID | JavaEnable | Title | GoodEvent | EventTime | -| --- | ------------------- | ---------- | ------------------ | --------- | ------------------- | -| #0 | 5385521489354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | -| #1 | 5385521490329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | -| #2 | 5385521489953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | -| #N | ... | ... | ... | ... | ... | +|-----|---------------------|------------|--------------------|-----------|---------------------| +| \#0 | 5385521489354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | +| \#1 | 5385521490329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | +| \#2 | 5385521489953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | +| \#N | … | … | … | … | … | به این صورت، تمام مقادیر مربوط به یک سطر (رکورد) به صورت فیزیکی و در کنار یکدگیر ذخیره سازی می شوند. @@ -22,15 +20,13 @@ ClickHouse یک مدیریت دیتابیس (DBMS) ستون گرا برای پر در یک دیتابیس ستون گرا، داده ها به شکل زیر ذخیره سازی می شوند: - -| Row: | #0 | #1 | #2 | #N | -| ----------- | ------------------- | ------------------- | ------------------- | ------------------- | -| WatchID: | 5385521489354350662 | 5385521490329509958 | 5385521489953706054 | ... | -| JavaEnable: | 1 | 0 | 1 | ... | -| Title: | Investor Relations | Contact us | Mission | ... | -| GoodEvent: | 1 | 1 | 1 | ... | -| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | ... | - +| Row: | \#0 | \#1 | \#2 | \#N | +|-------------|---------------------|---------------------|---------------------|-----| +| WatchID: | 5385521489354350662 | 5385521490329509958 | 5385521489953706054 | … | +| JavaEnable: | 1 | 0 | 1 | … | +| Title: | Investor Relations | Contact us | Mission | … | +| GoodEvent: | 1 | 1 | 1 | … | +| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | … | این مثال ها تنها نشان می دهند که داده ها منظم شده اند. مقادیر ستون های مختلف به صورت جدا، و داده های مربوط به یک ستون در کنار یکدیگر ذخیره می شوند. @@ -38,33 +34,32 @@ ClickHouse یک مدیریت دیتابیس (DBMS) ستون گرا برای پر مثال های از دیتابیس های ستون گرا: Vertica, Paraccel (Actian Matrix, Amazon Redshift), Sybase IQ, Exasol, Infobright, InfiniDB, MonetDB (VectorWise, Actian Vector), LucidDB, SAP HANA, Google Dremel, Google PowerDrill, Druid, kdb+. {: .grey } -ترتیب های مختلف برای ذخیره سازی داده ها، مناسب سناریو های مختلف هستند. سناریو دسترسی به داده اشاره دارد به، چه query هایی ساخته شده اند، چند وقت به چند وقت، در چه مقداری، چقدر داده در هنگام اجرای هر query خوانده می شود، چند رکورد، چند ستون و چند بایت؛ رابطه ی بین خوانده و نوشتن داده؛ سایز دیتاسی فعال مورد استفاده و نحوه ی استفاده آن به صورت محلی؛ آیا از تراکنش استفاده می شود؛ چگونه داده ها جدا می شوند؛ نیازمندی ها برای replication داده ها و یکپارچگی منطقی داده ها؛ نیازمندی ها برای latency و throughput برای هر نوع از query، و... +ترتیب های مختلف برای ذخیره سازی داده ها، مناسب سناریو های مختلف هستند. 
سناریو دسترسی به داده اشاره دارد به، چه query هایی ساخته شده اند، چند وقت به چند وقت، در چه مقداری، چقدر داده در هنگام اجرای هر query خوانده می شود، چند رکورد، چند ستون و چند بایت؛ رابطه ی بین خوانده و نوشتن داده؛ سایز دیتاسی فعال مورد استفاده و نحوه ی استفاده آن به صورت محلی؛ آیا از تراکنش استفاده می شود؛ چگونه داده ها جدا می شوند؛ نیازمندی ها برای replication داده ها و یکپارچگی منطقی داده ها؛ نیازمندی ها برای latency و throughput برای هر نوع از query، و… مهمتر از بالا بودن لود سیستم، سفارشی کردن سیستم مطابق با نیازمندی های سناریو می باشد، و این سفارشی سازی در ادامه دقیق تر می شود. هیج سیستمی وجود ندارد که مناسب انجام سناریو های متفاوت(بسیار متفاوت) باشد. اگر یک سیستم برای اجرای سناریو های مختلف آداپته شده باشد، در زمان بالا بودن لود، سیستم تمام سناریوها را به صورت ضعیف handle می کند. -## ویژگی های کلیدی یک سناریو OLAP +## ویژگی های کلیدی یک سناریو OLAP {#wyjgy-hy-khlydy-ykh-snryw-olap} -- اکثریت درخواست های برای خواندن می باشد. -- داده ها به صورت batch های بزرگ (< 1000 رکورد) وارد می شوند، نه به صورت تکی؛ یا اینکه اصلا بروز نمی شوند. -- داده ها به دیتابیس اضافه می شوند و تغییر پیدا نمی کنند. -- برای خواندن، تعداد زیادی از رکورد ها از دیتابیس استخراج می شوند، اما فقط چند ستون از رکورد ها. -- جداول "wide" هستند، به این معنی تعداد زیادی ستون دارند. -- query ها نسبتا کم هستند (معمولا صدها query در ثانیه به ازای هر سرور یا کمتر) -- برای query های ساده، زمان تاخیر 50 میلی ثانیه مجاز باشد. -- مقادیر ستون ها کوچک باشد: اعداد و رشته های کوتاه (برای مثال 60 بایت به ازای هر url) -- نیازمند throughput بالا در هنگام اجرای یک query (بالای یک میلیارد رکورد در هر ثانیه به ازای هر سرور) -- تراکنش واجب نیست. -- نیازمندی کم برای consistency بودن داده ها. -- فقط یک جدول بزرگ به ازای هر query وجود دارد. تمام جداول کوچک هستند، به جز یکی. -- نتیجه query به طول قابل توجهی کوچکتر از source داده ها می باشد. به عبارتی دیگر در یک query، داده ها فیلتر یا تجمیع می شوند، پس نتایج در RAM یک سرور فیت می شوند. +- اکثریت درخواست های برای خواندن می باشد. +- داده ها به صورت batch های بزرگ (\< 1000 رکورد) وارد می شوند، نه به صورت تکی؛ یا اینکه اصلا بروز نمی شوند. +- داده ها به دیتابیس اضافه می شوند و تغییر پیدا نمی کنند. +- برای خواندن، تعداد زیادی از رکورد ها از دیتابیس استخراج می شوند، اما فقط چند ستون از رکورد ها. +- جداول «wide» هستند، به این معنی تعداد زیادی ستون دارند. +- query ها نسبتا کم هستند (معمولا صدها query در ثانیه به ازای هر سرور یا کمتر) +- برای query های ساده، زمان تاخیر 50 میلی ثانیه مجاز باشد. +- مقادیر ستون ها کوچک باشد: اعداد و رشته های کوتاه (برای مثال 60 بایت به ازای هر url) +- نیازمند throughput بالا در هنگام اجرای یک query (بالای یک میلیارد رکورد در هر ثانیه به ازای هر سرور) +- تراکنش واجب نیست. +- نیازمندی کم برای consistency بودن داده ها. +- فقط یک جدول بزرگ به ازای هر query وجود دارد. تمام جداول کوچک هستند، به جز یکی. +- نتیجه query به طول قابل توجهی کوچکتر از source داده ها می باشد. به عبارتی دیگر در یک query، داده ها فیلتر یا تجمیع می شوند، پس نتایج در RAM یک سرور فیت می شوند. خوب خیلی ساده می توان دید که سناریو های OLAP خیلی متفاوت تر از دیگر سناریو های محبوب هستند (مثل OLTP یا Key-Value). پس اگر میخواهید performance مناسب داشته باشید، استفاده از دیتابیس های OLTP یا Key-Value برای اجرای query های OLAP معنی ندارد. برای مثال، اگر شما از دیتابیس MongoDB یا Redis برای آنالیز استفاده کنید، قطعا performance بسیار ضعیف تری نسبت به دیتابیس های OLAP خواهید داشت. 
-## دلایل برتری دیتابیس های ستون گرا برای سناریو های OLAP - +## دلایل برتری دیتابیس های ستون گرا برای سناریو های OLAP {#dlyl-brtry-dytbys-hy-stwn-gr-bry-snryw-hy-olap} دیتابیس های ستون گرا مناسب سناریو های OLAP هستند - (حداقل 100 برابر در بیشتر query ها سرعت پردازش آنها بهتر است). دلایل این برتری در پایین شرح داده شده است، اما آسانترش این هست که به صورت visually این تفاوت را ببینیم: +(حداقل 100 برابر در بیشتر query ها سرعت پردازش آنها بهتر است). دلایل این برتری در پایین شرح داده شده است، اما آسانترش این هست که به صورت visually این تفاوت را ببینیم: **ردیف گرا** @@ -76,69 +71,71 @@ ClickHouse یک مدیریت دیتابیس (DBMS) ستون گرا برای پر تفاوت را دیدید؟ بیشتر بخوانید تا یاد بگیرید چرا این اتفاق رخ میدهد. -### Input/output +### Input/output {#inputoutput} -1. برای query های تحلیلی، تنها چند ستون از تمام ستون های جدول نیاز به خواندن دارد. در یک دیتابیس ستون گرا، شما فقط داده ی مورد نیاز را می خوانید. برای مثال، اگر شما نیاز به 5 ستون از 100 ستون را دارید، شما می توانید انتظار 20 برابر کاهش I/O را داشته باشید. -2. از آنجایی که داده در بسته ها خوانده می شوند، فشرده سازی ساده می باشد. همچنین داده های ستون ها برای فشرده سازی ساده می باشند. این باعث کاهش نرخ I/O در ادامه می شود. -3. با توجه به کاهش I/O، داده های بیشتری در system cache قرار می گیرند. +1. برای query های تحلیلی، تنها چند ستون از تمام ستون های جدول نیاز به خواندن دارد. در یک دیتابیس ستون گرا، شما فقط داده ی مورد نیاز را می خوانید. برای مثال، اگر شما نیاز به 5 ستون از 100 ستون را دارید، شما می توانید انتظار 20 برابر کاهش I/O را داشته باشید. +2. از آنجایی که داده در بسته ها خوانده می شوند، فشرده سازی ساده می باشد. همچنین داده های ستون ها برای فشرده سازی ساده می باشند. این باعث کاهش نرخ I/O در ادامه می شود. +3. با توجه به کاهش I/O، داده های بیشتری در system cache قرار می گیرند. - برای مثال، query "تعداد رکوردها به ازای هر بستر نیازمندی" نیازمند خواندن ستون "آیدی بستر آگهی"، که 1 بایت بدون فشرده طول می کشد، خواهد بود. اگر بیشتر ترافیک مربوط به بستر های نیازمندی نبود، شما می توانید انتظار حداقل 10 برابر فشرده سازی این ستون را داشته باشید. زمانی که از الگوریتم فشرده سازی quick استفاده می کنید، عملیات decompression داده ها با سرعت حداقل چندین گیگابایت در ثانیه انجام می شود. به عبارت دیگر، این query توانایی پردازش تقریبا چندین میلیارد رکورد در ثانیه به ازای یک سرور را دارد. این سرعت در عمل واقعی و دست یافتنی است. +برای مثال، query «تعداد رکوردها به ازای هر بستر نیازمندی» نیازمند خواندن ستون «آیدی بستر آگهی»، که 1 بایت بدون فشرده طول می کشد، خواهد بود. اگر بیشتر ترافیک مربوط به بستر های نیازمندی نبود، شما می توانید انتظار حداقل 10 برابر فشرده سازی این ستون را داشته باشید. زمانی که از الگوریتم فشرده سازی quick استفاده می کنید، عملیات decompression داده ها با سرعت حداقل چندین گیگابایت در ثانیه انجام می شود. به عبارت دیگر، این query توانایی پردازش تقریبا چندین میلیارد رکورد در ثانیه به ازای یک سرور را دارد. این سرعت در عمل واقعی و دست یافتنی است. -
مثال -``` -$ clickhouse-client -ClickHouse client version 0.0.52053. -Connecting to localhost:9000. -Connected to ClickHouse server version 0.0.52053. +
-:) SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 +مثال -SELECT - CounterID, - count() -FROM hits -GROUP BY CounterID -ORDER BY count() DESC -LIMIT 20 + $ clickhouse-client + ClickHouse client version 0.0.52053. + Connecting to localhost:9000. + Connected to ClickHouse server version 0.0.52053. -┌─CounterID─┬──count()─┐ -│ 114208 │ 56057344 │ -│ 115080 │ 51619590 │ -│ 3228 │ 44658301 │ -│ 38230 │ 42045932 │ -│ 145263 │ 42042158 │ -│ 91244 │ 38297270 │ -│ 154139 │ 26647572 │ -│ 150748 │ 24112755 │ -│ 242232 │ 21302571 │ -│ 338158 │ 13507087 │ -│ 62180 │ 12229491 │ -│ 82264 │ 12187441 │ -│ 232261 │ 12148031 │ -│ 146272 │ 11438516 │ -│ 168777 │ 11403636 │ -│ 4120072 │ 11227824 │ -│ 10938808 │ 10519739 │ -│ 74088 │ 9047015 │ -│ 115079 │ 8837972 │ -│ 337234 │ 8205961 │ -└───────────┴──────────┘ + :) SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 -20 rows in set. Elapsed: 0.153 sec. Processed 1.00 billion rows, 4.00 GB (6.53 billion rows/s., 26.10 GB/s.) + SELECT + CounterID, + count() + FROM hits + GROUP BY CounterID + ORDER BY count() DESC + LIMIT 20 + + ┌─CounterID─┬──count()─┐ + │ 114208 │ 56057344 │ + │ 115080 │ 51619590 │ + │ 3228 │ 44658301 │ + │ 38230 │ 42045932 │ + │ 145263 │ 42042158 │ + │ 91244 │ 38297270 │ + │ 154139 │ 26647572 │ + │ 150748 │ 24112755 │ + │ 242232 │ 21302571 │ + │ 338158 │ 13507087 │ + │ 62180 │ 12229491 │ + │ 82264 │ 12187441 │ + │ 232261 │ 12148031 │ + │ 146272 │ 11438516 │ + │ 168777 │ 11403636 │ + │ 4120072 │ 11227824 │ + │ 10938808 │ 10519739 │ + │ 74088 │ 9047015 │ + │ 115079 │ 8837972 │ + │ 337234 │ 8205961 │ + └───────────┴──────────┘ + + 20 rows in set. Elapsed: 0.153 sec. Processed 1.00 billion rows, 4.00 GB (6.53 billion rows/s., 26.10 GB/s.) + + :) -:) -```
-### CPU +### CPU {#cpu} از آنجایی که اجرای یک query نیازمند پردازش تعداد زیادی سطر می باشد، این کمک می کند تا تمام عملیات ها به جای ارسال به سطرهای جداگانه، برای کل بردار ارسال شود، یا برای ترکیب query engine به طوری که هیچ هزینه ی ارسالی وجود ندارد. اگر این کار رو نکنید، با هر half-decent disk subsystem، تفسیرگر query ناگزیر است که CPU را متوقف کند. این منطقی است که که در صورت امکان هر دو کار ذخیره سازی داده در ستون ها و پردازش ستون ها با هم انجام شود. دو راه برای انجام این کار وجود دارد: -1. یک موتور بردار. تمام عملیات ها به جای مقادیر جداگانه، برای بردارها نوشته شوند. این به این معنیست که شما خیلی از مواقع نیازی به صدا کردن عملیات ها ندارید، و هزینه انتقال ناچیز است. کد عملیاتی شامل یک چرخه داخلی بهینه شده است. +1. یک موتور بردار. تمام عملیات ها به جای مقادیر جداگانه، برای بردارها نوشته شوند. این به این معنیست که شما خیلی از مواقع نیازی به صدا کردن عملیات ها ندارید، و هزینه انتقال ناچیز است. کد عملیاتی شامل یک چرخه داخلی بهینه شده است. -2. Code generation. کد تولید شده برای query دارای تمام تماس های غیرمستقیم در آن است. +2. Code generation. کد تولید شده برای query دارای تمام تماس های غیرمستقیم در آن است. این در یک دیتابیس نرمال انجام نمی شود، چرا که برای اجرای query های ساده این کارها منطقی نیست. هرچند، استثناهاتی هم وجود دارد. برای مثال، MemSQL از code generation برای کاهش latency در هنگام پردازش query های SQL استفاده می کند. (برای مقایسه، مدیریت دیتابیس های آنالیزی نیازمند بهینه سازی توان عملیاتی (throughput) هستند نه latency.) diff --git a/docs/fa/interfaces/cli.md b/docs/fa/interfaces/cli.md index 61588744728..37fc1e3060c 100644 --- a/docs/fa/interfaces/cli.md +++ b/docs/fa/interfaces/cli.md @@ -1,12 +1,12 @@ -
+
-# کلاینت Command-line +# کلاینت Command-line {#khlynt-command-line} -برای کار از طریق محیط ترمینال میتوانید از دستور ` clickhouse-client` استفاده کنید +برای کار از طریق محیط ترمینال میتوانید از دستور `clickhouse-client` استفاده کنید
-```bash +``` bash $ clickhouse-client ClickHouse client version 0.0.26176. Connecting to localhost:9000. @@ -15,11 +15,11 @@ Connected to ClickHouse server version 0.0.26176. :) ``` -
+
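+برای نمونه، اتصال به یک سرور راه دور را می توان با مشخص کردن میزبان، پورت و نام کاربری انجام داد (آدرس سرور در این مثال فرضی است؛ توضیح کامل گزینه ها در بخش «گزینه های Command line» در ادامه آمده است):
+
+``` bash
+# اتصال به یک سرور راه دور؛ آدرس ch1.example.com فرضی است
+clickhouse-client --host ch1.example.com --port 9000 --user default
+```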
-کلاینت از آپشن های command-line و فایل های کانفیگ پشتیبانی می کند. برای اطلاعات بیشتر بخش "[پیکربندی](#interfaces_cli_configuration)" را مشاهده کنید.
+کلاینت از آپشن های command-line و فایل های کانفیگ پشتیبانی می کند. برای اطلاعات بیشتر بخش «[پیکربندی](#interfaces_cli_configuration)» را مشاهده کنید.

-## استفاده
+## استفاده {#stfdh}

کلاینت می تواند به دو صورت interactive و non-interactive (batch) مورد استفاده قرار گیرد. برای استفاده از حالت batch، پارامتر `query` را مشخص کنید، و یا داده ها را به `stdin` ارسال کنید (کلاینت تایید می کند که `stdin` ترمینال نیست) و یا از هر 2 استفاده کنید. مشابه HTTP interface، هنگامی که از پارامتر `query` و ارسال داده ها به `stdin` استفاده می کنید، درخواست، ترکیبی از پارامتر `query`، line feed، و داده ها در `stdin` است. این کار برای query های بزرگ INSERT مناسب است.

@@ -27,7 +27,7 @@ Connected to ClickHouse server version 0.0.26176.
-```bash +``` bash echo -ne "1, 'some text', '2016-08-14 00:00:00'\n2, 'some more text', '2016-08-14 00:00:01'" | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; cat <<_EOF | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; @@ -37,15 +37,16 @@ _EOF cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; ``` -
+ +
در حالت Batch، فرمت داده ها به صورت پیش فرض به صورت TabSeparated می باشد. شما میتوانید فرمت داده ها رو در هنگام اجرای query و با استفاده از شرط FORMAT مشخص کنید. -به طور پیش فرض شما فقط می توانید یک query را در خالت batch اجرا کنید.برای ساخت چندین query از یک "اسکریپت"، از پارامتر --multiquery استفاده کنید. این روش برای تمام query ها به جز INSERT کار می کند. نتایج query ها به صورت متوالی و بدون seperator اضافه تولید می شوند. به طور مشابه برای پردازش تعداد زیادی از query ها شما می توانید از 'clickhouse-client' برای هر query استفاده کنید. دقت کنید که ممکن است حدود 10 میلی ثانیه تا زمان راه اندازی برنامه 'clickhouse-client' زمان گرفته شود. +به طور پیش فرض شما فقط می توانید یک query را در خالت batch اجرا کنید.برای ساخت چندین query از یک «اسکریپت»، از پارامتر –multiquery استفاده کنید. این روش برای تمام query ها به جز INSERT کار می کند. نتایج query ها به صورت متوالی و بدون seperator اضافه تولید می شوند. به طور مشابه برای پردازش تعداد زیادی از query ها شما می توانید از ‘clickhouse-client’ برای هر query استفاده کنید. دقت کنید که ممکن است حدود 10 میلی ثانیه تا زمان راه اندازی برنامه ‘clickhouse-client’ زمان گرفته شود. در حالت intercative، شما یک command line برای درج query های خود دریافت می کنید. -اگر 'multiline' مشخص نشده باشد (به صورت پیش فرض): برای اجرای یک query، دکمه Enter را بزنید. سیمی کالن در انتهای query اجباری نیست. برای درج یک query چند خطی (multiline)، دکمه ی بک اسلش `\` را قبل از line feed فشار دهید. بعد از فشردن Enter، از شما برای درج خط بعدی query درخواست خواهد شد. +اگر ‘multiline’ مشخص نشده باشد (به صورت پیش فرض): برای اجرای یک query، دکمه Enter را بزنید. سیمی کالن در انتهای query اجباری نیست. برای درج یک query چند خطی (multiline)، دکمه ی بک اسلش `\` را قبل از line feed فشار دهید. بعد از فشردن Enter، از شما برای درج خط بعدی query درخواست خواهد شد. اگر چند خطی (multiline) مشخص شده باشد: برای اجرای query، در انتها سیمی کالن را وارد کنید و سپس Enter بزنید. اگر سیمی کالن از انتهای خط حذف می شد، از شما برای درج خط جدید query درخواست می شد. @@ -53,70 +54,69 @@ cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FORMA شما میتوانید از `\G` به جای سیمی کالن یا بعد از سیمی کالن استفاده کنید. این علامت، فرمت Vertical را نشان می دهد. در این فرمت، هر مقدار در یک خط جدا چاپ می شود که برای جداول عریض مناسب است. این ویژگی غیرمعمول برای سازگاری با MySQL CLI اضافه شد. -command line برا پایه 'replxx' می باشد. به عبارت دیگر، این محیط از shortcut های آشنا استفاده می کند و history دستورات را نگه می دار. history ها در فایل ~/.clickhouse-client-history نوشته می شوند. +command line برا پایه ‘replxx’ می باشد. به عبارت دیگر، این محیط از shortcut های آشنا استفاده می کند و history دستورات را نگه می دار. history ها در فایل ~/.clickhouse-client-history نوشته می شوند. به صورت پیش فرض فرمت خروجی PrettyCompact می باشد. شما میتوانید از طریق دستور FORMAT در یک query، یا با مشخص کردن `\G` در انتهای query، استفاده از آرگومان های `--format` یا `--vertical` یا از کانفیگ فایل کلاینت، فرمت خروجی را مشخص کنید. -برای خروج از کلاینت، Ctrl-D (یا Ctrl+C) را فشار دهید؛ و یا یکی از دستورات زیر را به جای اجرای query اجرا کنید: "exit", "quit", "logout", "exit;", "quit;", "logout;", "q", "Q", ":q" +برای خروج از کلاینت، Ctrl-D (یا Ctrl+C) را فشار دهید؛ و یا یکی از دستورات زیر را به جای اجرای query اجرا کنید: «exit», «quit», «logout», «exit;», «quit;», «logout;», «q», «Q», «:q» در هنگام اجرای یک query، کلاینت موارد زیر را نمایش می دهد: -1. Progress، که بیش از 10 بار در ثانیه بروز نخواهد شد ( به صورت پیش فرض). برای query های سریع، progress ممکن است زمانی برای نمایش پیدا نکند. -2. 
فرمت کردن query بعد از عملیات پارس کردن، به منظور دیباگ کردن query. -3. نمایش خروجی با توجه به نوع فرمت. -4. تعداد لاین های خروجی، زمان پاس شدن query، و میانگیم سرعت پردازش query. +1. Progress، که بیش از 10 بار در ثانیه بروز نخواهد شد ( به صورت پیش فرض). برای query های سریع، progress ممکن است زمانی برای نمایش پیدا نکند. +2. فرمت کردن query بعد از عملیات پارس کردن، به منظور دیباگ کردن query. +3. نمایش خروجی با توجه به نوع فرمت. +4. تعداد لاین های خروجی، زمان پاس شدن query، و میانگیم سرعت پردازش query. شما میتوانید query های طولانی را با فشردن Ctrl-C کنسل کنید. هر چند، بعد از این کار همچنان نیاز به انتظار چند ثانیه ای برای قطع کردن درخواست توسط سرور می باشید. امکان کنسل کردن یک query در مراحل خاص وجود ندارد. اگر شما صبر نکنید و برای بار دوم Ctrl+C را وارد کنید از client خارج می شوید. -کلاینت commant-line اجازه ی پاس دادن داده های external (جداول موقت external) را برای query ها می دهد. برای اطلاعات بیشتر به بخش "داده های External برای پردازش query" مراجعه کنید. +کلاینت commant-line اجازه ی پاس دادن داده های external (جداول موقت external) را برای query ها می دهد. برای اطلاعات بیشتر به بخش «داده های External برای پردازش query» مراجعه کنید. -## پیکربندی {#interfaces_cli_configuration} +## پیکربندی {#interfaces-cli-configuration} شما میتوانید، پارامتر ها را به `clickhouse-client` (تمام پارامترها دارای مقدار پیش فرض هستند) از دو روش زیر پاس بدید: -- از طریق Command Line +- از طریق Command Line - گزینه های Command-line مقادیر پیش فرض در ستینگ و کانفیگ فایل را نادیده میگیرد. + گزینه های Command-line مقادیر پیش فرض در ستینگ و کانفیگ فایل را نادیده میگیرد. -- کانفیگ فایل ها. +- کانفیگ فایل ها. - ستینگ های داخل کانفیگ فایل، مقادیر پیش فرض را نادیده می گیرد. + ستینگ های داخل کانفیگ فایل، مقادیر پیش فرض را نادیده می گیرد. -### گزینه های Command line +### گزینه های Command line {#gzynh-hy-command-line} -- `--host, -h` -– نام سرور، به صورت پیش فرض 'localhost' است. شما میتوانید یکی از موارد نام و یا IPv4 و یا IPv6 را در این گزینه مشخص کنید. -- `--port` – پورت اتصال به ClickHouse. مقدار پیش فرض: 9000. دقت کنید که پرت اینترفیس HTTP و اینتفریس native متفاوت است. -- `--user, -u` – نام کاربری جهت اتصال. پیش فرض: default. -- `--password` – پسورد جهت اتصال. پیش فرض: خالی -- `--query, -q` – مشخص کردن query برای پردازش در هنگام استفاده از حالت non-interactive. -- `--database, -d` – انتخاب دیتابیس در بدو ورود به کلاینت. مقدار پیش فرض: دیتابیس مشخص شده در تنظیمات سرور (پیش فرض 'default') -- `--multiline, -m` – اگر مشخص شود، یعنی اجازه ی نوشتن query های چند خطی را بده. (بعد از Enter، query را ارسال نکن). -- `--multiquery, -n` – اگر مشخص شود، اجازه ی اجرای چندین query که از طریق جمع و حلقه ها جدا شده اند را می دهد. فقط در حالت non-interactive کار می کند. -- `--format, -f` مشخص کردن نوع فرمت خروجی -- `--vertical, -E` اگر مشخص شود، از فرمت Vertical برای نمایش خروجی استفاده می شود. این گزینه مشابه '--format=Vertical' می باشد. در این فرمت، هر مقدار در یک خط جدید چاپ می شود، که در هنگام نمایش جداول عریض مفید است. -- `--time, -t` اگر مشخص شود، در حالت non-interactive زمان اجرای query در 'stderr' جاپ می شود. -- `--stacktrace` – اگر مشخص شود stack trase مربوط به اجرای query در هنگام رخ دادن یک exception چاپ می شود. -- `--config-file` – نام فایل پیکربندی. +- `--host, -h` -– نام سرور، به صورت پیش فرض ‘localhost’ است. شما میتوانید یکی از موارد نام و یا IPv4 و یا IPv6 را در این گزینه مشخص کنید. +- `--port` – پورت اتصال به ClickHouse. مقدار پیش فرض: 9000. دقت کنید که پرت اینترفیس HTTP و اینتفریس native متفاوت است. +- `--user, -u` – نام کاربری جهت اتصال. پیش فرض: default. +- `--password` – پسورد جهت اتصال. 
پیش فرض: خالی +- `--query, -q` – مشخص کردن query برای پردازش در هنگام استفاده از حالت non-interactive. +- `--database, -d` – انتخاب دیتابیس در بدو ورود به کلاینت. مقدار پیش فرض: دیتابیس مشخص شده در تنظیمات سرور (پیش فرض ‘default’) +- `--multiline, -m` – اگر مشخص شود، یعنی اجازه ی نوشتن query های چند خطی را بده. (بعد از Enter، query را ارسال نکن). +- `--multiquery, -n` – اگر مشخص شود، اجازه ی اجرای چندین query که از طریق جمع و حلقه ها جدا شده اند را می دهد. فقط در حالت non-interactive کار می کند. +- `--format, -f` مشخص کردن نوع فرمت خروجی +- `--vertical, -E` اگر مشخص شود، از فرمت Vertical برای نمایش خروجی استفاده می شود. این گزینه مشابه ‘–format=Vertical’ می باشد. در این فرمت، هر مقدار در یک خط جدید چاپ می شود، که در هنگام نمایش جداول عریض مفید است. +- `--time, -t` اگر مشخص شود، در حالت non-interactive زمان اجرای query در ‘stderr’ جاپ می شود. +- `--stacktrace` – اگر مشخص شود stack trase مربوط به اجرای query در هنگام رخ دادن یک exception چاپ می شود. +- `--config-file` – نام فایل پیکربندی. -### فایل های پیکربندی +### فایل های پیکربندی {#fyl-hy-pykhrbndy} `clickhouse-client` به ترتیب اولویت زیر از اولین فایل موجود برای ست کردن تنظیمات استفاده می کند: -- مشخص شده در پارامتر `--config-file` -- `./clickhouse-client.xml` -- `\~/.clickhouse-client/config.xml` -- `/etc/clickhouse-client/config.xml` +- مشخص شده در پارامتر `--config-file` +- `./clickhouse-client.xml` +- `\~/.clickhouse-client/config.xml` +- `/etc/clickhouse-client/config.xml` مثالی از یک کانفیگ فایل
-```xml +``` xml username password ``` - [مقاله اصلی](https://clickhouse.tech/docs/fa/interfaces/cli/) diff --git a/docs/fa/interfaces/cpp.md b/docs/fa/interfaces/cpp.md index 490f852792a..398d6e22687 100644 --- a/docs/fa/interfaces/cpp.md +++ b/docs/fa/interfaces/cpp.md @@ -1,4 +1,4 @@ -# C++ Client Library +# C++ Client Library {#c-client-library} See README at [clickhouse-cpp](https://github.com/ClickHouse/clickhouse-cpp) repository. diff --git a/docs/fa/interfaces/formats.md b/docs/fa/interfaces/formats.md index 5cea6162263..dc241e8d438 100644 --- a/docs/fa/interfaces/formats.md +++ b/docs/fa/interfaces/formats.md @@ -1,4 +1,4 @@ -
+
# فرمت های Input و Output {#formats} @@ -6,49 +6,48 @@ جدول زیر لیست فرمت های پشتیبانی شده برای هر نوع از query ها را نمایش می دهد. -Format | INSERT | SELECT --------|--------|-------- -[TabSeparated](formats.md#tabseparated) | ✔ | ✔ | -[TabSeparatedRaw](formats.md#tabseparatedraw) | ✗ | ✔ | -[TabSeparatedWithNames](formats.md#tabseparatedwithnames) | ✔ | ✔ | -[TabSeparatedWithNamesAndTypes](formats.md#tabseparatedwithnamesandtypes) | ✔ | ✔ | -[Template](#format-template) | ✔ | ✔ | -[TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | -[CSV](formats.md#csv) | ✔ | ✔ | -[CSVWithNames](formats.md#csvwithnames) | ✔ | ✔ | -[Values](formats.md#data-format-values) | ✔ | ✔ | -[Vertical](formats.md#vertical) | ✗ | ✔ | -[VerticalRaw](formats.md#verticalraw) | ✗ | ✔ | -[JSON](formats.md#json) | ✗ | ✔ | -[JSONCompact](formats.md#jsoncompact) | ✗ | ✔ | -[JSONEachRow](formats.md#jsoneachrow) | ✔ | ✔ | -[TSKV](formats.md#tskv) | ✔ | ✔ | -[Pretty](formats.md#pretty) | ✗ | ✔ | -[PrettyCompact](formats.md#prettycompact) | ✗ | ✔ | -[PrettyCompactMonoBlock](formats.md#prettycompactmonoblock) | ✗ | ✔ | -[PrettyNoEscapes](formats.md#prettynoescapes) | ✗ | ✔ | -[PrettySpace](formats.md#prettyspace) | ✗ | ✔ | -[Protobuf](#protobuf) | ✔ | ✔ | -[Avro](#data-format-avro) | ✔ | ✔ | -[AvroConfluent](#data-format-avro-confluent) | ✔ | ✗ | -[Parquet](#data-format-parquet) | ✔ | ✔ | -[ORC](#data-format-orc) | ✔ | ✗ | -[RowBinary](formats.md#rowbinary) | ✔ | ✔ | -[Native](formats.md#native) | ✔ | ✔ | -[Null](formats.md#null) | ✗ | ✔ | -[XML](formats.md#xml) | ✗ | ✔ | -[CapnProto](formats.md#capnproto) | ✔ | ✔ | +| Format | INSERT | SELECT | +|---------------------------------------------------------------------------|--------|--------| +| [TabSeparated](formats.md#tabseparated) | ✔ | ✔ | +| [TabSeparatedRaw](formats.md#tabseparatedraw) | ✗ | ✔ | +| [TabSeparatedWithNames](formats.md#tabseparatedwithnames) | ✔ | ✔ | +| [TabSeparatedWithNamesAndTypes](formats.md#tabseparatedwithnamesandtypes) | ✔ | ✔ | +| [Template](#format-template) | ✔ | ✔ | +| [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | +| [CSV](formats.md#csv) | ✔ | ✔ | +| [CSVWithNames](formats.md#csvwithnames) | ✔ | ✔ | +| [Values](formats.md#data-format-values) | ✔ | ✔ | +| [Vertical](formats.md#vertical) | ✗ | ✔ | +| [VerticalRaw](formats.md#verticalraw) | ✗ | ✔ | +| [JSON](formats.md#json) | ✗ | ✔ | +| [JSONCompact](formats.md#jsoncompact) | ✗ | ✔ | +| [JSONEachRow](formats.md#jsoneachrow) | ✔ | ✔ | +| [TSKV](formats.md#tskv) | ✔ | ✔ | +| [Pretty](formats.md#pretty) | ✗ | ✔ | +| [PrettyCompact](formats.md#prettycompact) | ✗ | ✔ | +| [PrettyCompactMonoBlock](formats.md#prettycompactmonoblock) | ✗ | ✔ | +| [PrettyNoEscapes](formats.md#prettynoescapes) | ✗ | ✔ | +| [PrettySpace](formats.md#prettyspace) | ✗ | ✔ | +| [Protobuf](#protobuf) | ✔ | ✔ | +| [Avro](#data-format-avro) | ✔ | ✔ | +| [AvroConfluent](#data-format-avro-confluent) | ✔ | ✗ | +| [Parquet](#data-format-parquet) | ✔ | ✔ | +| [ORC](#data-format-orc) | ✔ | ✗ | +| [RowBinary](formats.md#rowbinary) | ✔ | ✔ | +| [Native](formats.md#native) | ✔ | ✔ | +| [Null](formats.md#null) | ✗ | ✔ | +| [XML](formats.md#xml) | ✗ | ✔ | +| [CapnProto](formats.md#capnproto) | ✔ | ✔ | - -## TabSeparated +## TabSeparated {#tabseparated} در فرمت TabSeparated، داده ها به صورت سطر نوشته می شوند. هر سطر شامل مقادیر جدا شده با tab می باشد. هر مقدار با یک tab دنبال می شود، به جز آخرین مقدار یک سطر، که با line feed دنبال می شود. line feed unix در همه جا مورد تسافده قرار می گیرد. 
آخرین سطر از خروجی هم باید شامل line feed در انتها باشد. مقادیر در فرمت متنی بدون enclose با کوتیشون، و یا escape با کاراکترهای ویژه، نوشته می شوند. -اعداد Integer با فرم decimal نوشته می شوند. اعداد می توانند شامل کاراکتر اضافه "+" در ابتدای خود باشند. (در هنگام پارس کردن نادیده گرفته می شوند، و در هنگام فرمت کردن، ثبت نمی شوند). اعداد غیر منفی نمیتوانند شامل علامت منفی باشند. در هنگام خواندن، اجازه داده می شود که رشته خالی را به عنوان صفر، پارس کرد، یا (برای تایپ های sign) یک رشته که شامل فقط یک علامت منفی است به عنوان صفر پارس کرد. اعدادی که در data type مربوطه فیت نشوند ممکن است به عددی متفاوت تبدیل شوند و پیغام خطایی هم نمایش ندهند. +اعداد Integer با فرم decimal نوشته می شوند. اعداد می توانند شامل کاراکتر اضافه «+» در ابتدای خود باشند. (در هنگام پارس کردن نادیده گرفته می شوند، و در هنگام فرمت کردن، ثبت نمی شوند). اعداد غیر منفی نمیتوانند شامل علامت منفی باشند. در هنگام خواندن، اجازه داده می شود که رشته خالی را به عنوان صفر، پارس کرد، یا (برای تایپ های sign) یک رشته که شامل فقط یک علامت منفی است به عنوان صفر پارس کرد. اعدادی که در data type مربوطه فیت نشوند ممکن است به عددی متفاوت تبدیل شوند و پیغام خطایی هم نمایش ندهند. -اعداد Floating-point به فرم decimal نوشته می شوند. از دات به عنوان جدا کننده decimal استفاده می شود. نوشته های نمایشی مثل 'inf'، '+inf'، '-inf' و 'nan' پشتیبانی می شوند. ورودی اعداد floating-point می تواند با یه نقطه اعشار شروع یا پایان یابد. در هنگام فرمت، دقت اعداد floating-point ممکن است گم شوند. در هنگام پارس کردن، دقیقا نیازی به خواندن نزدیکترین عدد machine-representable نیست. +اعداد Floating-point به فرم decimal نوشته می شوند. از دات به عنوان جدا کننده decimal استفاده می شود. نوشته های نمایشی مثل ‘inf’، ‘+inf’، ‘-inf’ و ‘nan’ پشتیبانی می شوند. ورودی اعداد floating-point می تواند با یه نقطه اعشار شروع یا پایان یابد. در هنگام فرمت، دقت اعداد floating-point ممکن است گم شوند. در هنگام پارس کردن، دقیقا نیازی به خواندن نزدیکترین عدد machine-representable نیست. -Dates با فرمت YYY-MM-DD نوشته می شوند و به همین حالت پارس می شوند، اما با هر کاراکتری به عنوان جدا کننده. Dates به همراه زمان با فرمت YYYY-MM-DD hh:mm:ss نوشته می شوند و با همین فرمت پارس می شوند، اما با هر کاراکتری به عنوان جداکننده. این در منطقه زمان سیستم در زمانی که کلاینت یا سرور شروع می شود (بسته به اینکه کدام یک از داده ها را تشکیل می دهد) رخ می دهد. برای تاریخ همراه با زمان DST مشخص نمی شود. پس اگر یک دامپ دارای زمان DST باشد، دامپ، داده ها را به طور غیرمستقیم مطابقت نمی دهد و پارسینگ، یکی از دو ساعت را انتخاب خواهد کرد. در طول عملیات خواندن، تاریخ ها و تاریخ و ساعت های نادرست می توانند به صورت null و یا natural overflow پارس شوند، بدون اینکه پیغام خطایی نمایش دهند. +Dates با فرمت YYY-MM-DD نوشته می شوند و به همین حالت پارس می شوند، اما با هر کاراکتری به عنوان جدا کننده. Dates به همراه زمان با فرمت YYYY-MM-DD hh:mm:ss نوشته می شوند و با همین فرمت پارس می شوند، اما با هر کاراکتری به عنوان جداکننده. این در منطقه زمان سیستم در زمانی که کلاینت یا سرور شروع می شود (بسته به اینکه کدام یک از داده ها را تشکیل می دهد) رخ می دهد. برای تاریخ همراه با زمان DST مشخص نمی شود. پس اگر یک دامپ دارای زمان DST باشد، دامپ، داده ها را به طور غیرمستقیم مطابقت نمی دهد و پارسینگ، یکی از دو ساعت را انتخاب خواهد کرد. در طول عملیات خواندن، تاریخ ها و تاریخ و ساعت های نادرست می توانند به صورت null و یا natural overflow پارس شوند، بدون اینکه پیغام خطایی نمایش دهند. به عنوان یک استثنا، پارس کردن تاریخ به همراه ساعت، اگر مقدار دقیقا شامل 10 عدد decimal باشد، به عنوان فرمت unix timestamp پشتیبانی خواهد کرد. خروجی وابسته به time-zone نمی باشد. فرمت های YYYY-MM-DD hh: mm: ss و NNNNNNNNNN به صورت خودکار تمایز می یابند. 
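+برای نمونه، هر دو سطر زیر در ورودی TabSeparated به عنوان مقدار DateTime پارس می شوند؛ مقدار دوم یک unix timestamp ده رقمی است که به صورت خودکار تشخیص داده می شود (جدول `test.visits_dt` در این مثال فرضی است و فرض شده یک ستون از نوع DateTime دارد):
+
+``` bash
+echo -ne "2016-08-14 00:00:00\n1471132800\n" | clickhouse-client --query "INSERT INTO test.visits_dt FORMAT TSV"
+```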
@@ -56,14 +55,12 @@ Dates با فرمت YYY-MM-DD نوشته می شوند و به همین حالت
-``` -Hello\nworld + Hello\nworld -Hello\ -world -``` + Hello\ + world -
+
نوع دوم به دلیل پشتیبانی MySQL در هنگام نوشتن دامپ به صورت tab-separate، پشتیبانی می شود. @@ -75,7 +72,7 @@ world فرمت TabSeparate برای پردازش داده ها با استفاده از برنامه های شخصی سازی شده و اسکریپت ها مناسب است. TabSeparate به صورت پیش فرض در HTTP interface و در حالت batch کلاینت command-line مورد استفاده قرار می گیرد. همچنین این فرمت اجازه ی انتقال داده ها بین DBMS های مختلف را می دهد. برای مثال، شما می توانید از MySQL با این روش دامپ بگیرید و آن را در ClickHouse یا vice versa آپلود کنید. -فرمت TabSeparated از خروحی total values (هنگام استفاده از WITH TOTALS) و extreme values (در هنگامی که 'extreme' برابر با 1 است) پشتیبانی می کند. در این موارد، total value و extreme بعد از داده های اصلی در خروجی می آیند. نتایج اصلی، total values و extreme همگی با یک empty line از هم جدا می شوند. مثال: +فرمت TabSeparated از خروحی total values (هنگام استفاده از WITH TOTALS) و extreme values (در هنگامی که ‘extreme’ برابر با 1 است) پشتیبانی می کند. در این موارد، total value و extreme بعد از داده های اصلی در خروجی می آیند. نتایج اصلی، total values و extreme همگی با یک empty line از هم جدا می شوند. مثال:
@@ -83,98 +80,94 @@ world SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT TabSeparated`` ``` -``` -2014-03-17 1406958 -2014-03-18 1383658 -2014-03-19 1405797 -2014-03-20 1353623 -2014-03-21 1245779 -2014-03-22 1031592 -2014-03-23 1046491 + 2014-03-17 1406958 + 2014-03-18 1383658 + 2014-03-19 1405797 + 2014-03-20 1353623 + 2014-03-21 1245779 + 2014-03-22 1031592 + 2014-03-23 1046491 -0000-00-00 8873898 + 0000-00-00 8873898 -2014-03-17 1031592 -2014-03-23 1406958 -``` + 2014-03-17 1031592 + 2014-03-23 1406958 -
+
این فرمت نیز تحت نام `TSV` موجود است. - - -## TabSeparatedRaw +## TabSeparatedRaw {#tabseparatedraw} تفاوت آن با `TabSeperated` در این است که در این فرمت سطرها بدون escape نوشته می شوند. این فرمت فقط مناسب خروجی نتایج query ها می باشد، نه برای پارس کردن (دریافت داده ها و درج آن در جدول). -همچنین این فرمت تحت عنوان ` TSVRaw`وجود دارد. +همچنین این فرمت تحت عنوان `TSVRaw`وجود دارد. -## TabSeparatedWithNames +## TabSeparatedWithNames {#tabseparatedwithnames} تفاوت آن با فرمت `TabSeparated` در این است که، در این فرمت نام ستون ها در سطر اول قرار می گیرد. در طول پارس کردن، سطر اول به طور کامل نادیده گرفته می شود. شما نمی توانید نام ستون ها را برای تعیین موقعیت آنها یا بررسی صحت آنها استفاده کنید. (پشتیبانی از پارس کردن سطر header ممکن است در آینده اضافه شود.) -همچنین این فرمت تحت عنوان ` TSVWithNames`وجود دارد. +همچنین این فرمت تحت عنوان `TSVWithNames`وجود دارد. -## TabSeparatedWithNamesAndTypes +## TabSeparatedWithNamesAndTypes {#tabseparatedwithnamesandtypes} تفاوت آن با `TabSeparated` در این است که در این فرمت نام ستون ها در سطر اول نوشته می شود، و type ستون ها در سطر دوم نوشته می شود. در طی پارسینگ، سطر اول و دوم به طور کامل نادیده گرفته می شوند. -همچنین این فرمت تحت عنوان ` TSVWithNamesAndTypes`وجود دارد. - +همچنین این فرمت تحت عنوان `TSVWithNamesAndTypes`وجود دارد. ## Template {#format-template} This format allows to specify a custom format string with placeholders for values with specified escaping rule. -It uses settings `format_schema`, `format_schema_rows`, `format_schema_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) +It uses settings `format_schema`, `format_schema_rows`, `format_schema_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) Format string `format_schema_rows` specifies rows format with the following syntax: - `delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`, +`delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`, - where `delimiter_i` is a delimiter between values (`$` symbol can be escaped as `$$`), - `column_i` is a name of a column whose values are to be selected or inserted (if empty, then column will be skipped), - `serializeAs_i` is an escaping rule for the column values. The following escaping rules are supported: - - - `CSV`, `JSON`, `XML` (similarly to the formats of the same names) - - `Escaped` (similarly to `TSV`) - - `Quoted` (similarly to `Values`) - - `Raw` (without escaping, similarly to `TSVRaw`) - - `None` (no escaping rule, see further) - - If escaping rule is omitted, then`None` will be used. `XML` and `Raw` are suitable only for output. - - So, for the following format string: - - `Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}, ad price: $$${price:JSON};` - - the values of `SearchPhrase`, `c` and `price` columns, which are escaped as `Quoted`, `Escaped` and `JSON` will be printed (for select) or will be expected (for insert) between `Search phrase: `, `, count: `, `, ad price: $` and `;` delimiters respectively. For example: + where `delimiter_i` is a delimiter between values (`$` symbol can be escaped as `$$`), + `column_i` is a name of a column whose values are to be selected or inserted (if empty, then column will be skipped), + `serializeAs_i` is an escaping rule for the column values. 
The following escaping rules are supported: - `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` - - The `format_schema_rows_between_delimiter` setting specifies delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default) + - `CSV`, `JSON`, `XML` (similarly to the formats of the same names) + - `Escaped` (similarly to `TSV`) + - `Quoted` (similarly to `Values`) + - `Raw` (without escaping, similarly to `TSVRaw`) + - `None` (no escaping rule, see further) + + If escaping rule is omitted, then`None` will be used. `XML` and `Raw` are suitable only for output. + + So, for the following format string: + + `Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}, ad price: $$${price:JSON};` + + the values of `SearchPhrase`, `c` and `price` columns, which are escaped as `Quoted`, `Escaped` and `JSON` will be printed (for select) or will be expected (for insert) between `Search phrase: `, `, count: `, `, ad price: $` and `;` delimiters respectively. For example: + + `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` + +The `format_schema_rows_between_delimiter` setting specifies delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default) Format string `format_schema` has the same syntax as `format_schema_rows` and allows to specify a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names: - - `data` is the rows with data in `format_schema_rows` format, separated by `format_schema_rows_between_delimiter`. This placeholder must be the first placeholder in the format string. - - `totals` is the row with total values in `format_schema_rows` format (when using WITH TOTALS) - - `min` is the row with minimum values in `format_schema_rows` format (when extremes is set to 1) - - `max` is the row with maximum values in `format_schema_rows` format (when extremes is set to 1) - - `rows` is the total number of output rows - - `rows_before_limit` is the minimal number of rows there would have been without LIMIT. Output only if the query contains LIMIT. If the query contains GROUP BY, rows_before_limit_at_least is the exact number of rows there would have been without a LIMIT. - - `time` is the request execution time in seconds - - `rows_read` is the number of rows have been read - - `bytes_read` is the number of bytes (uncompressed) have been read - - The placeholders `data`, `totals`, `min` and `max` must not have escaping rule specified (or `None` must be specified explicitly). The remaining placeholders may have any escaping rule specified. - If the `format_schema` setting is an empty string, `${data}` is used as default value. - For insert queries format allows to skip some columns or some fields if prefix or suffix (see example). - - `Select` example: -```sql +- `data` is the rows with data in `format_schema_rows` format, separated by `format_schema_rows_between_delimiter`. This placeholder must be the first placeholder in the format string. +- `totals` is the row with total values in `format_schema_rows` format (when using WITH TOTALS) +- `min` is the row with minimum values in `format_schema_rows` format (when extremes is set to 1) +- `max` is the row with maximum values in `format_schema_rows` format (when extremes is set to 1) +- `rows` is the total number of output rows +- `rows_before_limit` is the minimal number of rows there would have been without LIMIT. 
Output only if the query contains LIMIT. If the query contains GROUP BY, rows\_before\_limit\_at\_least is the exact number of rows there would have been without a LIMIT. +- `time` is the request execution time in seconds +- `rows_read` is the number of rows have been read +- `bytes_read` is the number of bytes (uncompressed) have been read + +The placeholders `data`, `totals`, `min` and `max` must not have escaping rule specified (or `None` must be specified explicitly). The remaining placeholders may have any escaping rule specified. +If the `format_schema` setting is an empty string, `${data}` is used as default value. +For insert queries format allows to skip some columns or some fields if prefix or suffix (see example). + +`Select` example: + +``` sql SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 5 -FORMAT Template +FORMAT Template SETTINGS format_schema = ' Search phrases @@ -191,7 +184,8 @@ SETTINGS format_schema = ' format_schema_rows = ' ${SearchPhrase:XML} ${с:XML} ', format_schema_rows_between_delimiter = '\n ' ``` -```html + +``` html Search phrases @@ -212,52 +206,52 @@ format_schema_rows_between_delimiter = '\n ' ``` `Insert` example: -``` -Some header -Page views: 5, User id: 4324182021466249494, Useless field: hello, Duration: 146, Sign: -1 -Page views: 6, User id: 4324182021466249494, Useless field: world, Duration: 185, Sign: 1 -Total rows: 2 -``` -```sql -INSERT INTO UserActivity FORMAT Template SETTINGS -format_schema = 'Some header\n${data}\nTotal rows: ${:CSV}\n', + + Some header + Page views: 5, User id: 4324182021466249494, Useless field: hello, Duration: 146, Sign: -1 + Page views: 6, User id: 4324182021466249494, Useless field: world, Duration: 185, Sign: 1 + Total rows: 2 + +``` sql +INSERT INTO UserActivity FORMAT Template SETTINGS +format_schema = 'Some header\n${data}\nTotal rows: ${:CSV}\n', format_schema_rows = 'Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV}' ``` -`PageViews`, `UserID`, `Duration` and `Sign` inside placeholders are names of columns in the table. Values after `Useless field` in rows and after `\nTotal rows: ` in suffix will be ignored. + +`PageViews`, `UserID`, `Duration` and `Sign` inside placeholders are names of columns in the table. Values after `Useless field` in rows and after `\nTotal rows:` in suffix will be ignored. All delimiters in the input data must be strictly equal to delimiters in specified format strings. - + ## TemplateIgnoreSpaces {#templateignorespaces} This format is suitable only for input. -Similar to `Template`, but skips whitespace characters between delimiters and values in the input stream. However, if format strings contain whitespace characters, these characters will be expected in the input stream. Also allows to specify empty placeholders (`${}` or `${:None}`) to split some delimiter into separate parts to ignore spaces between them. Such placeholders are used only for skipping whitespace characters. -It's possible to read `JSON` using this format, if values of columns have the same order in all rows. For example, the following request can be used for inserting data from output example of format [JSON](#json): -```sql +Similar to `Template`, but skips whitespace characters between delimiters and values in the input stream. However, if format strings contain whitespace characters, these characters will be expected in the input stream. 
Also allows to specify empty placeholders (`${}` or `${:None}`) to split some delimiter into separate parts to ignore spaces between them. Such placeholders are used only for skipping whitespace characters. +It’s possible to read `JSON` using this format, if values of columns have the same order in all rows. For example, the following request can be used for inserting data from output example of format [JSON](#json): + +``` sql INSERT INTO table_name FORMAT TemplateIgnoreSpaces SETTINGS format_schema = '{${}"meta"${}:${:JSON},${}"data"${}:${}[${data}]${},${}"totals"${}:${:JSON},${}"extremes"${}:${:JSON},${}"rows"${}:${:JSON},${}"rows_before_limit_at_least"${}:${:JSON}${}}', format_schema_rows = '{${}"SearchPhrase"${}:${}${phrase:JSON}${},${}"c"${}:${}${cnt:JSON}${}}', format_schema_rows_between_delimiter = ',' ``` -## TSKV +## TSKV {#tskv} مشابه فرمت TabSeparated، اما خروجی به صورت name=value می باشد. نام ها مشابه روش TabSeparated، escape می شوند، و همچنین = symbol هم escape می شود.
-``` -SearchPhrase= count()=8267016 -SearchPhrase=bathroom interior design count()=2166 -SearchPhrase=yandex count()=1655 -SearchPhrase=spring 2014 fashion count()=1549 -SearchPhrase=freeform photos count()=1480 -SearchPhrase=angelina jolia count()=1245 -SearchPhrase=omsk count()=1112 -SearchPhrase=photos of dog breeds count()=1091 -SearchPhrase=curtain design count()=1064 -SearchPhrase=baku count()=1000 -``` + SearchPhrase= count()=8267016 + SearchPhrase=bathroom interior design count()=2166 + SearchPhrase=yandex count()=1655 + SearchPhrase=spring 2014 fashion count()=1549 + SearchPhrase=freeform photos count()=1480 + SearchPhrase=angelina jolia count()=1245 + SearchPhrase=omsk count()=1112 + SearchPhrase=photos of dog breeds count()=1091 + SearchPhrase=curtain design count()=1064 + SearchPhrase=baku count()=1000 -
+
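+برای نمونه، درج داده با این فرمت می تواند به شکل زیر باشد (جدول `test.tskv_example` با دو ستون `SearchPhrase` از نوع String و `c` از نوع UInt64 در این مثال فرضی است؛ مقادیر با tab از هم جدا می شوند):
+
+``` bash
+echo -ne "SearchPhrase=bathroom interior design\tc=2166\n" | clickhouse-client --query "INSERT INTO test.tskv_example FORMAT TSKV"
+```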
وقتی تعداد زیادی از ستون ها وجود دارد، این فرمت بی فایده است، و در حالت کلی دلیلی بر استفاده از این فرمت در این مواقع وجود ندارد. این فرمت در بعضی از دپارتمان های Yandex استفاده می شد. @@ -269,27 +263,25 @@ SearchPhrase=baku count()=1000 Comma Separated Values format ([RFC](https://tools.ietf.org/html/rfc4180)). -زمانی که از این روش برای فرمت استفاده می شود، سطر ها با دابل کوتیشن enclosed می شوند. دابل کوتیشن داخل یک رشته خروجی آن به صورت دو دابل کوتیشن در یک سطر است. قانون دیگری برای escape کردن کاراکترها وجود ندارد. تاریخ و تاریخ-ساعت در دابل کوتیشن ها enclosed می شوند. اعداد بدون دابل کوتیشن در خروجی می آیند. مقادیر با جدا کننده * مشخص می شوند. سطر ها با استفاده از line feed (LF) جدا می شوند. آرایه ها در csv به این صورت serialize می شوند: ابتدا آرایه به یک رشته با فرمت TabSeparate سریالایز می شوند، و سپس رشته ی حاصل در دابل کوتیشن برای csv ارسال می شود. Tuple ها در فرمت CSV در ستون های جدا سریالایز می شوند (به این ترتیب، nest ها در tuble از دست میروند) +زمانی که از این روش برای فرمت استفاده می شود، سطر ها با دابل کوتیشن enclosed می شوند. دابل کوتیشن داخل یک رشته خروجی آن به صورت دو دابل کوتیشن در یک سطر است. قانون دیگری برای escape کردن کاراکترها وجود ندارد. تاریخ و تاریخ-ساعت در دابل کوتیشن ها enclosed می شوند. اعداد بدون دابل کوتیشن در خروجی می آیند. مقادیر با جدا کننده \* مشخص می شوند. سطر ها با استفاده از line feed (LF) جدا می شوند. آرایه ها در csv به این صورت serialize می شوند: ابتدا آرایه به یک رشته با فرمت TabSeparate سریالایز می شوند، و سپس رشته ی حاصل در دابل کوتیشن برای csv ارسال می شود. Tuple ها در فرمت CSV در ستون های جدا سریالایز می شوند (به این ترتیب، nest ها در tuble از دست میروند)
-``` -clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV" < data.csv -``` + clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV" < data.csv -
+
-*به صورت پیش فرض — `,`. برای اطلاعات بیشتر [format_csv_delimiter](/operations/settings/settings/#settings-format_csv_delimiter) را ببینید. +\*به صورت پیش فرض — `,`. برای اطلاعات بیشتر [format\_csv\_delimiter](/operations/settings/settings/#settings-format_csv_delimiter) را ببینید. در هنگام پارس کردن، تمامی مقادیر می توانند با کوتیشن یا بدون کوتیشن پارس شوند. تک کوتیشن و دابل کوتیشن پشتیبانی می شود. سطر ها می توانند بدون کوتیشن تنظیم شوند. در این مورد سطر ها، جدا کننده ها با (CR یا LF) پارس می شوند. در موارد نقض RFC، در هنگام پارس کردن سطر ها بدون کوتیشن، فضاها و tab های پیشین نادید گرفته می شوند. برای line feed، یونیکس از (LF)، ویدنوز از (CR LF) و Mac OS کلاسیک (CR LF) پشتیبانی می کند. فرمت CSV خروجی total و extreme را همانند `TabSeparated` پشتیبانی می کنند. -## CSVWithNames +## CSVWithNames {#csvwithnames} همچنین header سطر را چاپ می کند، شبیه به `TabSeparatedWithNames`. -## JSON +## JSON {#json} خروجی داده ها با فرمت JSON. در کنال داده های جداول، خروجی JSON اسم ستون ها و type آنها به همراه اطلاعات بیشتر تولید می کند: تعداد سطر های خروجی، و همچنین تعداد رکورد های کل بدون در نظر گرفتن دستور LIMIT. مثال: @@ -299,7 +291,7 @@ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMA SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTALS ORDER BY c DESC LIMIT 5 FORMAT JSON ``` -```json +``` json { "meta": [ @@ -363,13 +355,13 @@ SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTA } ``` -
+
-JSON با جاوااسکریپت سازگار است. برای اطمینان از این، بعضی از کاراکتر ها ecape های اضافه دارند: اسلش `/` به صورت `\/` escape می شود؛ line break جایگزین یعنی `U+2028` و `U+2029` که باعث break در بعضی از مروگرها می شود، به شکل `\uXXXX` escape می شوند. کاراکتر های کنترلی ASCII هم escape می شوند: backspace، form feed، line feed، carriage return، و horizontal tab به ترتیب با `\b`، `\f`، `\n`، `\r`، `\t` جایگزین می شوند. همچنین بایت های باقی مانده در محدوده 00 تا 1F با استفاده از `\uXXXX` جایگزین می شوند. کاراکتر های بی اعتبار UTF-8 با � جایگزین می شوند، پس خروجی JSON شامل موارد معتبر UTF-8 می باشد. برای سازگاری با جاوااسکریپت، اعداد Int64 و Uint64 به صورت پیش فرض، با استفاده از دابل کوتیشن enclose می شوند. برای حذف کوتیشن، شما باید پارامتر output_format_json_quote_64bit_integers v رو برابر با 0 قرار دهید. +JSON با جاوااسکریپت سازگار است. برای اطمینان از این، بعضی از کاراکتر ها ecape های اضافه دارند: اسلش `/` به صورت `\/` escape می شود؛ line break جایگزین یعنی `U+2028` و `U+2029` که باعث break در بعضی از مروگرها می شود، به شکل `\uXXXX` escape می شوند. کاراکتر های کنترلی ASCII هم escape می شوند: backspace، form feed، line feed، carriage return، و horizontal tab به ترتیب با `\b`، `\f`، `\n`، `\r`، `\t` جایگزین می شوند. همچنین بایت های باقی مانده در محدوده 00 تا 1F با استفاده از `\uXXXX` جایگزین می شوند. کاراکتر های بی اعتبار UTF-8 با � جایگزین می شوند، پس خروجی JSON شامل موارد معتبر UTF-8 می باشد. برای سازگاری با جاوااسکریپت، اعداد Int64 و Uint64 به صورت پیش فرض، با استفاده از دابل کوتیشن enclose می شوند. برای حذف کوتیشن، شما باید پارامتر output\_format\_json\_quote\_64bit\_integers v رو برابر با 0 قرار دهید. `rows` – تعداد سطر های خروجی -`rows_before_limit_at_least` حداقل تعداد سطر ها در هنگام عدم استفاده از LIMIT. فقط در هنگامی که query دارای LIMIT است خروجی دارد. اگر query شامل GROUP BY باشد، مقدار rows_before_limit_at_least دقیقا با زمانی که از LIMIT استفاده نمی شود یکی است. +`rows_before_limit_at_least` حداقل تعداد سطر ها در هنگام عدم استفاده از LIMIT. فقط در هنگامی که query دارای LIMIT است خروجی دارد. اگر query شامل GROUP BY باشد، مقدار rows\_before\_limit\_at\_least دقیقا با زمانی که از LIMIT استفاده نمی شود یکی است. `totals` – مقدار TOTAL (زمانی که از WITH TOTALS استفاده می شود). @@ -377,7 +369,7 @@ JSON با جاوااسکریپت سازگار است. برای اطمینان ا این فرمت فقط مناسب خروجی query های می باشد، به این معنی که برای عملیات پارس کردن (دریافت داده برای insert در جدول) نیست. همچنین فرمت JSONEachRow را ببینید. -## JSONCompact +## JSONCompact {#jsoncompact} فقط در جاهایی که داده ها به جای object در array هستند خروجی آنها متفاوت است. @@ -385,7 +377,7 @@ JSON با جاوااسکریپت سازگار است. برای اطمینان ا
-```json +``` json { "meta": [ @@ -422,17 +414,17 @@ JSON با جاوااسکریپت سازگار است. برای اطمینان ا } ``` -
+
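The 64-bit integer quoting behaviour described above for the JSON family of formats can be checked quickly from the shell. A minimal sketch, assuming a local server reachable through `clickhouse-client` and that the setting is passed as a command-line option (standard client behaviour):

``` bash
# UInt64/Int64 values are wrapped in double quotes by default (JavaScript compatibility)
clickhouse-client --query="SELECT toUInt64(1) AS x FORMAT JSON"

# with output_format_json_quote_64bit_integers disabled, the same value is emitted as a bare number
clickhouse-client --output_format_json_quote_64bit_integers=0 \
    --query="SELECT toUInt64(1) AS x FORMAT JSON"
```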
این فرمت فقط مناسب خروجی query ها می باشد، به این معنی که برای عملیات پارس کردن (دریافت داده برای insert در جدول) نیست. همچنین فرمت JSONEachRow را ببینید.

-## JSONEachRow
+## JSONEachRow {#jsoneachrow}

هر سطر به صورت یک JSON Object جداگانه خروجی داده می شود (JSON Object ها با newline از هم جدا می شوند).
-```json +``` json {"SearchPhrase":"","count()":"8267016"} {"SearchPhrase":"bathroom interior design","count()":"2166"} {"SearchPhrase":"yandex","count()":"1655"} @@ -445,7 +437,7 @@ JSON با جاوااسکریپت سازگار است. برای اطمینان ا {"SearchPhrase":"baku","count()":"1000"} ``` -
+
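Because `JSONEachRow` can also be parsed on input, a small round-trip sketch may help. It assumes a hypothetical table `phrases(SearchPhrase String, c UInt64)` and a local `clickhouse-client`:

``` bash
# each input line is an independent JSON object; fields are matched to columns by name
echo '{"SearchPhrase":"clickhouse","c":42}
{"SearchPhrase":"","c":8267016}' | \
    clickhouse-client --query="INSERT INTO phrases FORMAT JSONEachRow"

# read the rows back in the same format
clickhouse-client --query="SELECT * FROM phrases FORMAT JSONEachRow"
```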
بر خلاف فرمت JSON، هیچ جایگزینی برای کاراکتر های بی اعتبار UTF-8 وجود ندارد. هر مجموعه ای از بایت های می تواند داخل سطر در خروجی باشند. پس داده ها بدون از دست دادن هیچ اطلاعاتی فرمت می شوند. مقادیر شبیه به JSON، escape می شوند. @@ -453,23 +445,23 @@ JSON با جاوااسکریپت سازگار است. برای اطمینان ا ### Usage of Nested Structures {#jsoneachrow-nested} -If you have a table with the [Nested](../data_types/nested_data_structures/nested.md) data type columns, you can insert JSON data having the same structure. Enable this functionality with the [input_format_import_nested_json](../operations/settings/settings.md#settings-input_format_import_nested_json) setting. +If you have a table with the [Nested](../data_types/nested_data_structures/nested.md) data type columns, you can insert JSON data having the same structure. Enable this functionality with the [input\_format\_import\_nested\_json](../operations/settings/settings.md#settings-input_format_import_nested_json) setting. For example, consider the following table: -```sql +``` sql CREATE TABLE json_each_row_nested (n Nested (s String, i Int32) ) ENGINE = Memory ``` As you can find in the `Nested` data type description, ClickHouse treats each component of the nested structure as a separate column, `n.s` and `n.i` for our table. So you can insert the data the following way: -```sql +``` sql INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n.s": ["abc", "def"], "n.i": [1, 23]} ``` -To insert data as hierarchical JSON object set [input_format_import_nested_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json). +To insert data as hierarchical JSON object set [input\_format\_import\_nested\_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json). -```json +``` json { "n": { "s": ["abc", "def"], @@ -480,46 +472,51 @@ To insert data as hierarchical JSON object set [input_format_import_nested_json= Without this setting ClickHouse throws the exception. -```sql +``` sql SELECT name, value FROM system.settings WHERE name = 'input_format_import_nested_json' ``` -```text + +``` text ┌─name────────────────────────────┬─value─┐ │ input_format_import_nested_json │ 0 │ └─────────────────────────────────┴───────┘ ``` -```sql + +``` sql INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} ``` -```text + +``` text Code: 117. DB::Exception: Unknown field found while parsing JSONEachRow format: n: (at row 1) ``` -```sql + +``` sql SET input_format_import_nested_json=1 INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} SELECT * FROM json_each_row_nested ``` -```text + +``` text ┌─n.s───────────┬─n.i────┐ │ ['abc','def'] │ [1,23] │ └───────────────┴────────┘ ``` -## Native +## Native {#native} -کارآمدترین فرمت. داده ها توسط بلاک ها و در فرمت باینری نوشته و خوانده می شوند. برای هر بلاک، تعداد سطرها، تعداد ستون ها، نام ستون ها و type آنها، و بخش هایی از ستون ها در این بلاک یکی پس از دیگری ثبت می شوند. به عبارت دیگر، این فرمت "columnar" است - این فرمت ستون ها را به سطر تبدیل نمی کند. این فرمت در حالت native interface و بین سرور و محیط ترمینال و همچنین کلاینت C++ استفاده می شود. +کارآمدترین فرمت. داده ها توسط بلاک ها و در فرمت باینری نوشته و خوانده می شوند. برای هر بلاک، تعداد سطرها، تعداد ستون ها، نام ستون ها و type آنها، و بخش هایی از ستون ها در این بلاک یکی پس از دیگری ثبت می شوند. به عبارت دیگر، این فرمت «columnar» است - این فرمت ستون ها را به سطر تبدیل نمی کند. 
این فرمت در حالت native interface و بین سرور و محیط ترمینال و همچنین کلاینت C++ استفاده می شود. شما می توانید از این فرمت برای تهیه دامپ سریع که فقط توسط مدیریت دیتابیس ClickHouse قابل خواندن است استفاده کنید. برای استفاده از این فرمت برای خودتان منطقی نیست. -## Null +## Null {#null} هیچی در خروجی نمایش داده نمی شود. با این حال، query پردازش می شود، و زمانی که از کلایت command-line استفاده می کنید، داده ها برای کلاینت ارسال می شوند. از این برای تست، شامل تست بهره وری استفاده می شود. به طور مشخص، این فرمت فقط برای خروجی مناسب است نه برای پارس کردن. -## Pretty +## Pretty {#pretty} خروجی داده ها به صورت جداول Unicode-art، همچنین استفاده از ANSI-escape برای تنظیم رنگ های ترمینال. یک جدول کامل کشیده می شود، و هر سطر دو خط از ترمینال را اشغال می کند. هر بلاکِ نتیجه، به عنوان یک جدول جدا چاپ می شود.پس بلاک ها می توانند بدون بافر کردن نتایج چاپ شوند (بافرینگ برای pre-calculate تمام مقادیر قابل مشاهده ضروری است). برای جلوگیری از دامپ زیاد داده ها در ترمینال، 10 هزار سطر اول چاپ می شوند. اگر تعداد سطر های بزرگتر مساوی 10 هزار باشد، پیغام " 10 هزار اول نمایش داده شد" چاپ می شود. این فرمت فقط مناسب خروجی نتایج query ها می باشد، نه برای پارس کردن (دریافت داده ها و درج آن در جدول). -فرمت Pretty از total values (هنگام استفاده از WITH TOTALS) و extreme (هنگام که 'extremes' برابر با 1 است) برای خروجی پشتیبانی می کند. در این موارد، total values و extreme values بعد از نمایش داده های اصلی در جداول جدا، چاپ می شوند. مثال (برای فرمت PrettyCompact نمایش داده شده است): +فرمت Pretty از total values (هنگام استفاده از WITH TOTALS) و extreme (هنگام که ‘extremes’ برابر با 1 است) برای خروجی پشتیبانی می کند. در این موارد، total values و extreme values بعد از نمایش داده های اصلی در جداول جدا، چاپ می شوند. مثال (برای فرمت PrettyCompact نمایش داده شده است):
@@ -527,70 +524,66 @@ SELECT * FROM json_each_row_nested SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT PrettyCompact ``` -``` -┌──EventDate─┬───────c─┐ -│ 2014-03-17 │ 1406958 │ -│ 2014-03-18 │ 1383658 │ -│ 2014-03-19 │ 1405797 │ -│ 2014-03-20 │ 1353623 │ -│ 2014-03-21 │ 1245779 │ -│ 2014-03-22 │ 1031592 │ -│ 2014-03-23 │ 1046491 │ -└────────────┴─────────┘ + ┌──EventDate─┬───────c─┐ + │ 2014-03-17 │ 1406958 │ + │ 2014-03-18 │ 1383658 │ + │ 2014-03-19 │ 1405797 │ + │ 2014-03-20 │ 1353623 │ + │ 2014-03-21 │ 1245779 │ + │ 2014-03-22 │ 1031592 │ + │ 2014-03-23 │ 1046491 │ + └────────────┴─────────┘ -Totals: -┌──EventDate─┬───────c─┐ -│ 0000-00-00 │ 8873898 │ -└────────────┴─────────┘ + Totals: + ┌──EventDate─┬───────c─┐ + │ 0000-00-00 │ 8873898 │ + └────────────┴─────────┘ -Extremes: -┌──EventDate─┬───────c─┐ -│ 2014-03-17 │ 1031592 │ -│ 2014-03-23 │ 1406958 │ -└────────────┴─────────┘ -``` + Extremes: + ┌──EventDate─┬───────c─┐ + │ 2014-03-17 │ 1031592 │ + │ 2014-03-23 │ 1406958 │ + └────────────┴─────────┘ -
+
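The totals/extremes output shown above can be reproduced from the shell by enabling the `extremes` setting for the query. A sketch that assumes the `test.hits` table from the example exists and that the setting is passed as a client option:

``` bash
# WITH TOTALS produces the "Totals" block; extremes=1 adds the "Extremes" block
clickhouse-client --extremes=1 --query="
    SELECT EventDate, count() AS c
    FROM test.hits
    GROUP BY EventDate WITH TOTALS
    ORDER BY EventDate
    FORMAT PrettyCompact"
```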
-## PrettyCompact +## PrettyCompact {#prettycompact} تفاوت آن با `Pretty` در این است که grid های کشیده شده بین سطر ها و خروجی فشرده تر است. این فرمت به صورت پیش فرض در محیط کلاینت در حالت interactive مورد استفاده قرار می گیرد. -## PrettyCompactMonoBlock +## PrettyCompactMonoBlock {#prettycompactmonoblock} تفاوت آن با `PrettyCompact` در این است که 10 هزار سطر خروجی بافر می شوند، و سپس در یک جدول چاپ می شوند. نه به صورت بلاک -## PrettyNoEscapes +## PrettyNoEscapes {#prettynoescapes} -تفاوت آن با Pretty در این است که از ANSI-escape استفاده نمی کند. این برای نمایش این فرمت در مروگر ضروری است، و همچنین برای استفاده از دستور 'watch' ضروری است. +تفاوت آن با Pretty در این است که از ANSI-escape استفاده نمی کند. این برای نمایش این فرمت در مروگر ضروری است، و همچنین برای استفاده از دستور ‘watch’ ضروری است. مثال:
-```bash +``` bash watch -n1 "clickhouse-client --query='SELECT event, value FROM system.events FORMAT PrettyCompactNoEscapes'" ``` -
+
شما می توانید برای نمایش در مرورگر از interface HTTP استفاده کنید. - - -### PrettyCompactNoEscapes +### PrettyCompactNoEscapes {#prettycompactnoescapes} همانند تنظیم قبلی می باشد. -### PrettySpaceNoEscapes +### PrettySpaceNoEscapes {#prettyspacenoescapes} همانند تنظیم قبلی می باشد.. -## PrettySpace +## PrettySpace {#prettyspace} تفاوت آن با `PrettyCompact` در این است که از whitespace (کاراکتر های space) به جای grid استفاده می کند. -## RowBinary +## RowBinary {#rowbinary} فرمت ها و پارس کردن داده ها، براساس سطر در فرمت باینری است.سطرها و مقادیر به صورت پیوسته و بدون جدا کننده لیست می شوند.این فرمت کم کارآمد تر از فرمت native است، از آنجایی که ردیف گرا است. @@ -606,11 +599,11 @@ watch -n1 "clickhouse-client --query='SELECT event, value FROM system.events FOR این فرمت برای دستور `INSERT INTO t VALUES ...` مورد استفاده قرار می گیرد، اما همچنین شما می تونید برای فرمت نتایج query استفاده کنید. -## Vertical +## Vertical {#vertical} مقدار هر ستون به همراه نام ستون در سطر جداگانه چاپ می شود. اگر هر سطر شامل تعداد زیادی ستون است، این فرمت جهت چاپ چند سطر مناسب است. این فرمت فقط مناسب خروجی نتایج query ها می باشد، نه برای پارس کردن (دریافت داده ها و درج آن در جدول). -## VerticalRaw +## VerticalRaw {#verticalraw} تفاوت آن با `Vertical` در این است که سطر ها escape نمی شوند. این فرمت فقط مناسب خروجی نتایج query ها می باشد، نه برای پارس کردن (دریافت داده ها و درج آن در جدول). @@ -618,40 +611,37 @@ watch -n1 "clickhouse-client --query='SELECT event, value FROM system.events FOR
-``` -:) SHOW CREATE TABLE geonames FORMAT VerticalRaw; -Row 1: -────── -statement: CREATE TABLE default.geonames ( geonameid UInt32, date Date DEFAULT CAST('2017-12-08' AS Date)) ENGINE = MergeTree(date, geonameid, 8192) + :) SHOW CREATE TABLE geonames FORMAT VerticalRaw; + Row 1: + ────── + statement: CREATE TABLE default.geonames ( geonameid UInt32, date Date DEFAULT CAST('2017-12-08' AS Date)) ENGINE = MergeTree(date, geonameid, 8192) -:) SELECT 'string with \'quotes\' and \t with some special \n characters' AS test FORMAT VerticalRaw; -Row 1: -────── -test: string with 'quotes' and with some special - characters -``` + :) SELECT 'string with \'quotes\' and \t with some special \n characters' AS test FORMAT VerticalRaw; + Row 1: + ────── + test: string with 'quotes' and with some special + characters -
+
در مقایسه با فرمت Vertical:
-``` -:) SELECT 'string with \'quotes\' and \t with some special \n characters' AS test FORMAT Vertical; -Row 1: -────── -test: string with \'quotes\' and \t with some special \n characters -``` -## XML + :) SELECT 'string with \'quotes\' and \t with some special \n characters' AS test FORMAT Vertical; + Row 1: + ────── + test: string with \'quotes\' and \t with some special \n characters -
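The escaping difference between `Vertical` and `VerticalRaw` illustrated above is easy to reproduce locally; a minimal sketch, assuming a running `clickhouse-client`:

``` bash
# Vertical escapes control characters, so the tab is printed as \t
clickhouse-client --query="SELECT 'a\tb' AS test FORMAT Vertical"

# VerticalRaw prints values as-is, so a literal tab character appears in the output
clickhouse-client --query="SELECT 'a\tb' AS test FORMAT VerticalRaw"
```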
+## XML {#xml} + +
فرمت XML فقط برای خروجی مناسب است، نه برای پارس کردن. مثال:
-```xml +``` xml @@ -713,9 +703,9 @@ test: string with \'quotes\' and \t with some special \n characters ``` -
+
-اگر نام فیلد، فرمت قابل قبولی نداشته باشد، اسم 'field' به عنوان نام عنصر استفاده می شود. به طور کلی، ساختار XML مشابه ساختار JSON می باشد. فقط در JSON، موارد بی اعتبار UTF-8 تبدیل به کاراکتر � می شوند که منجر به خروجی معتبر UTF-8 می شود. +اگر نام فیلد، فرمت قابل قبولی نداشته باشد، اسم ‘field’ به عنوان نام عنصر استفاده می شود. به طور کلی، ساختار XML مشابه ساختار JSON می باشد. فقط در JSON، موارد بی اعتبار UTF-8 تبدیل به کاراکتر � می شوند که منجر به خروجی معتبر UTF-8 می شود. در مقادیر رشته ای، کاراکتر های `>` و `&` به صورت `<` و `&` escape می شوند. @@ -723,9 +713,9 @@ test: string with \'quotes\' and \t with some special \n characters ## CapnProto {#capnproto} -Cap'n Proto یک فرمت پیام باینری شبیه به Protocol Buffer و Thrift می باشد، اما شبیه به JSON یا MessagePack نیست. +Cap’n Proto یک فرمت پیام باینری شبیه به Protocol Buffer و Thrift می باشد، اما شبیه به JSON یا MessagePack نیست. -پیغام های Cap'n Proto به صورت self-describing نیستند، به این معنی که آنها نیاز دارند که به صورت external، schema آنها شرح داده شود. schema به صورت on the fly اضافه می شود و برای هر query، cache می شود. +پیغام های Cap’n Proto به صورت self-describing نیستند، به این معنی که آنها نیاز دارند که به صورت external، schema آنها شرح داده شود. schema به صورت on the fly اضافه می شود و برای هر query، cache می شود.
@@ -734,22 +724,20 @@ SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase FORMAT CapnProto SETTINGS schema = 'schema:Message' ``` -
+
جایی که `schema.capnp` شبیه این است:
-``` -struct Message { - SearchPhrase @0 :Text; - c @1 :Uint64; -} -``` + struct Message { + SearchPhrase @0 :Text; + c @1 :Uint64; + } -
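Tying the query and the schema together from the shell might look like the sketch below. It assumes `schema.capnp` has been placed in the directory configured by `format_schema_path` (described just below) and that writing the binary result to a file is the desired behaviour:

``` bash
# 'schema:Message' means: file schema.capnp in format_schema_path, root struct Message
clickhouse-client --query="
    SELECT SearchPhrase, count() AS c
    FROM test.hits
    GROUP BY SearchPhrase
    FORMAT CapnProto
    SETTINGS schema = 'schema:Message'" > result.capnp
```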
+
-فایل های Schema در فایلی قرار دارند که این فایل در دایرکتوری مشخص شده کانفیگ [ format_schema_path](../operations/server_settings/settings.md) قرار گرفته اند. +فایل های Schema در فایلی قرار دارند که این فایل در دایرکتوری مشخص شده کانفیگ [format\_schema\_path](../operations/server_settings/settings.md) قرار گرفته اند. عملیات Deserialization موثر است و معمولا لود سیستم را افزایش نمی دهد. @@ -764,17 +752,17 @@ ClickHouse supports both `proto2` and `proto3` syntaxes. Repeated/optional/requi Usage examples: -```sql +``` sql SELECT * FROM test.table FORMAT Protobuf SETTINGS format_schema = 'schemafile:MessageType' ``` -```bash +``` bash cat protobuf_messages.bin | clickhouse-client --query "INSERT INTO test.table FORMAT Protobuf SETTINGS format_schema='schemafile:MessageType'" ``` where the file `schemafile.proto` looks like this: -```capnp +``` capnp syntax = "proto3"; message MessageType { @@ -785,13 +773,13 @@ message MessageType { }; ``` -To find the correspondence between table columns and fields of Protocol Buffers' message type ClickHouse compares their names. +To find the correspondence between table columns and fields of Protocol Buffers’ message type ClickHouse compares their names. This comparison is case-insensitive and the characters `_` (underscore) and `.` (dot) are considered as equal. -If types of a column and a field of Protocol Buffers' message are different the necessary conversion is applied. +If types of a column and a field of Protocol Buffers’ message are different the necessary conversion is applied. Nested messages are supported. For example, for the field `z` in the following message type -```capnp +``` capnp message MessageType { message XType { message YType { @@ -808,7 +796,7 @@ Nested messages are suitable to input or output a [nested data structures](../da Default values defined in a protobuf schema like this -```capnp +``` capnp syntax = "proto2"; message MessageType { @@ -824,43 +812,41 @@ See also [how to read/write length-delimited protobuf messages in popular langua ## Avro {#data-format-avro} -[Apache Avro](http://avro.apache.org/) is a row-oriented data serialization framework developed within Apache's Hadoop project. +[Apache Avro](http://avro.apache.org/) is a row-oriented data serialization framework developed within Apache’s Hadoop project. ClickHouse Avro format supports reading and writing [Avro data files](http://avro.apache.org/docs/current/spec.html#Object+Container+Files). -### Data Types Matching +### Data Types Matching {#data-types-matching} The table below shows supported data types and how they match ClickHouse [data types](../data_types/index.md) in `INSERT` and `SELECT` queries. 
-| Avro data type `INSERT` | ClickHouse data type | Avro data type `SELECT` | -| -------------------- | -------------------- | ------------------ | -| `boolean`, `int`, `long`, `float`, `double` | [Int(8\|16\|32\)](../data_types/int_uint.md), [UInt(8\|16\|32)](../data_types/int_uint.md) | `int` | -| `boolean`, `int`, `long`, `float`, `double` | [Int64](../data_types/int_uint.md), [UInt64](../data_types/int_uint.md) | `long` | -| `boolean`, `int`, `long`, `float`, `double` | [Float32](../data_types/float.md) | `float` | -| `boolean`, `int`, `long`, `float`, `double` | [Float64](../data_types/float.md) | `double` | -| `bytes`, `string`, `fixed`, `enum` | [String](../data_types/string.md) | `bytes` | -| `bytes`, `string`, `fixed` | [FixedString(N)](../data_types/fixedstring.md) | `fixed(N)` | -| `enum` | [Enum(8\|16)](../data_types/enum.md) | `enum` | -| `array(T)` | [Array(T)](../data_types/array.md) | `array(T)` | -| `union(null, T)`, `union(T, null)` | [Nullable(T)](../data_types/date.md) | `union(null, T)`| -| `null` | [Nullable(Nothing)](../data_types/special_data_types/nothing.md) | `null` | -| `int (date)` * | [Date](../data_types/date.md) | `int (date)` * | -| `long (timestamp-millis)` * | [DateTime64(3)](../data_types/datetime.md) | `long (timestamp-millis)` * | -| `long (timestamp-micros)` * | [DateTime64(6)](../data_types/datetime.md) | `long (timestamp-micros)` * | +| Avro data type `INSERT` | ClickHouse data type | Avro data type `SELECT` | +|---------------------------------------------|-------------------------------------------------------------------------------------------|------------------------------| +| `boolean`, `int`, `long`, `float`, `double` | [Int(8\|16\|32)](../data_types/int_uint.md), [UInt(8\|16\|32)](../data_types/int_uint.md) | `int` | +| `boolean`, `int`, `long`, `float`, `double` | [Int64](../data_types/int_uint.md), [UInt64](../data_types/int_uint.md) | `long` | +| `boolean`, `int`, `long`, `float`, `double` | [Float32](../data_types/float.md) | `float` | +| `boolean`, `int`, `long`, `float`, `double` | [Float64](../data_types/float.md) | `double` | +| `bytes`, `string`, `fixed`, `enum` | [String](../data_types/string.md) | `bytes` | +| `bytes`, `string`, `fixed` | [FixedString(N)](../data_types/fixedstring.md) | `fixed(N)` | +| `enum` | [Enum(8\|16)](../data_types/enum.md) | `enum` | +| `array(T)` | [Array(T)](../data_types/array.md) | `array(T)` | +| `union(null, T)`, `union(T, null)` | [Nullable(T)](../data_types/date.md) | `union(null, T)` | +| `null` | [Nullable(Nothing)](../data_types/special_data_types/nothing.md) | `null` | +| `int (date)` \* | [Date](../data_types/date.md) | `int (date)` \* | +| `long (timestamp-millis)` \* | [DateTime64(3)](../data_types/datetime.md) | `long (timestamp-millis)` \* | +| `long (timestamp-micros)` \* | [DateTime64(6)](../data_types/datetime.md) | `long (timestamp-micros)` \* | \* [Avro logical types](http://avro.apache.org/docs/current/spec.html#Logical+Types) - - Unsupported Avro data types: `record` (non-root), `map` Unsupported Avro logical data types: `uuid`, `time-millis`, `time-micros`, `duration` -### Inserting Data +### Inserting Data {#inserting-data} To insert data from an Avro file into ClickHouse table: -```bash +``` bash $ cat file.avro | clickhouse-client --query="INSERT INTO {some_table} FORMAT Avro" ``` @@ -871,20 +857,20 @@ Unused fields are skipped. Data types of a ClickHouse table columns can differ from the corresponding fields of the Avro data inserted. 
When inserting data, ClickHouse interprets data types according to the table above and then [casts](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) the data to corresponding column type. -### Selecting Data +### Selecting Data {#selecting-data} To select data from ClickHouse table into an Avro file: -```bash +``` bash $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Avro" > file.avro ``` Column names must: -- start with `[A-Za-z_]` -- subsequently contain only `[A-Za-z0-9_]` +- start with `[A-Za-z_]` +- subsequently contain only `[A-Za-z0-9_]` -Output Avro file compression and sync interval can be configured with [output_format_avro_codec](../operations/settings/settings.md#settings-output_format_avro_codec) and [output_format_avro_sync_interval](../operations/settings/settings.md#settings-output_format_avro_sync_interval) respectively. +Output Avro file compression and sync interval can be configured with [output\_format\_avro\_codec](../operations/settings/settings.md#settings-output_format_avro_codec) and [output\_format\_avro\_sync\_interval](../operations/settings/settings.md#settings-output_format_avro_sync_interval) respectively. ## AvroConfluent {#data-format-avro-confluent} @@ -894,17 +880,17 @@ Each Avro message embeds a schema id that can be resolved to the actual schema w Schemas are cached once resolved. -Schema Registry URL is configured with [format_avro_schema_registry_url](../operations/settings/settings.md#settings-format_avro_schema_registry_url) +Schema Registry URL is configured with [format\_avro\_schema\_registry\_url](../operations/settings/settings.md#settings-format_avro_schema_registry_url) -### Data Types Matching +### Data Types Matching {#data-types-matching-1} Same as [Avro](#data-format-avro) -### Usage +### Usage {#usage} To quickly verify schema resolution you can use [kafkacat](https://github.com/edenhill/kafkacat) with [clickhouse-local](../operations/utils/clickhouse-local.md): -```bash +``` bash $ kafkacat -b kafka-broker -C -t topic1 -o beginning -f '%s' -c 3 | clickhouse-local --input-format AvroConfluent --format_avro_schema_registry_url 'http://schema-registry' -S "field1 Int64, field2 String" -q 'select * from table' 1 a 2 b @@ -912,7 +898,8 @@ $ kafkacat -b kafka-broker -C -t topic1 -o beginning -f '%s' -c 3 | clickhouse- ``` To use `AvroConfluent` with [Kafka](../operations/table_engines/kafka.md): -```sql + +``` sql CREATE TABLE topic1_stream ( field1 String, @@ -931,34 +918,33 @@ SELECT * FROM topic1_stream; ``` !!! note "Warning" - Setting `format_avro_schema_registry_url` needs to be configured in `users.xml` to maintain it's value after a restart. - + Setting `format_avro_schema_registry_url` needs to be configured in `users.xml` to maintain it’s value after a restart. ## Parquet {#data-format-parquet} [Apache Parquet](http://parquet.apache.org/) is a columnar storage format widespread in the Hadoop ecosystem. ClickHouse supports read and write operations for this format. -### Data Types Matching +### Data Types Matching {#data-types-matching-2} The table below shows supported data types and how they match ClickHouse [data types](../data_types/index.md) in `INSERT` and `SELECT` queries. 
-| Parquet data type (`INSERT`) | ClickHouse data type | Parquet data type (`SELECT`) | -| -------------------- | ------------------ | ---- | -| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) | `UINT8` | -| `INT8` | [Int8](../data_types/int_uint.md) | `INT8` | -| `UINT16` | [UInt16](../data_types/int_uint.md) | `UINT16` | -| `INT16` | [Int16](../data_types/int_uint.md) | `INT16` | -| `UINT32` | [UInt32](../data_types/int_uint.md) | `UINT32` | -| `INT32` | [Int32](../data_types/int_uint.md) | `INT32` | -| `UINT64` | [UInt64](../data_types/int_uint.md) | `UINT64` | -| `INT64` | [Int64](../data_types/int_uint.md) | `INT64` | -| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) | `FLOAT` | -| `DOUBLE` | [Float64](../data_types/float.md) | `DOUBLE` | -| `DATE32` | [Date](../data_types/date.md) | `UINT16` | -| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) | `UINT32` | -| `STRING`, `BINARY` | [String](../data_types/string.md) | `STRING` | -| — | [FixedString](../data_types/fixedstring.md) | `STRING` | -| `DECIMAL` | [Decimal](../data_types/decimal.md) | `DECIMAL` | +| Parquet data type (`INSERT`) | ClickHouse data type | Parquet data type (`SELECT`) | +|------------------------------|---------------------------------------------|------------------------------| +| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) | `UINT8` | +| `INT8` | [Int8](../data_types/int_uint.md) | `INT8` | +| `UINT16` | [UInt16](../data_types/int_uint.md) | `UINT16` | +| `INT16` | [Int16](../data_types/int_uint.md) | `INT16` | +| `UINT32` | [UInt32](../data_types/int_uint.md) | `UINT32` | +| `INT32` | [Int32](../data_types/int_uint.md) | `INT32` | +| `UINT64` | [UInt64](../data_types/int_uint.md) | `UINT64` | +| `INT64` | [Int64](../data_types/int_uint.md) | `INT64` | +| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) | `FLOAT` | +| `DOUBLE` | [Float64](../data_types/float.md) | `DOUBLE` | +| `DATE32` | [Date](../data_types/date.md) | `UINT16` | +| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) | `UINT32` | +| `STRING`, `BINARY` | [String](../data_types/string.md) | `STRING` | +| — | [FixedString](../data_types/fixedstring.md) | `STRING` | +| `DECIMAL` | [Decimal](../data_types/decimal.md) | `DECIMAL` | ClickHouse supports configurable precision of `Decimal` type. The `INSERT` query treats the Parquet `DECIMAL` type as the ClickHouse `Decimal128` type. @@ -966,17 +952,17 @@ Unsupported Parquet data types: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, Data types of a ClickHouse table columns can differ from the corresponding fields of the Parquet data inserted. When inserting data, ClickHouse interprets data types according to the table above and then [cast](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) the data to that data type which is set for the ClickHouse table column. 
-### Inserting and Selecting Data +### Inserting and Selecting Data {#inserting-and-selecting-data} You can insert Parquet data from a file into ClickHouse table by the following command: -```bash +``` bash $ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet" ``` You can select data from a ClickHouse table and save them into some file in the Parquet format by the following command: -```bash +``` bash $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_file.pq} ``` @@ -986,42 +972,41 @@ To exchange data with Hadoop, you can use [HDFS table engine](../operations/tabl [Apache ORC](https://orc.apache.org/) is a columnar storage format widespread in the Hadoop ecosystem. You can only insert data in this format to ClickHouse. -### Data Types Matching +### Data Types Matching {#data-types-matching-3} The table below shows supported data types and how they match ClickHouse [data types](../data_types/index.md) in `INSERT` queries. -| ORC data type (`INSERT`) | ClickHouse data type | -| -------------------- | ------------------ | -| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) | -| `INT8` | [Int8](../data_types/int_uint.md) | -| `UINT16` | [UInt16](../data_types/int_uint.md) | -| `INT16` | [Int16](../data_types/int_uint.md) | -| `UINT32` | [UInt32](../data_types/int_uint.md) | -| `INT32` | [Int32](../data_types/int_uint.md) | -| `UINT64` | [UInt64](../data_types/int_uint.md) | -| `INT64` | [Int64](../data_types/int_uint.md) | -| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) | -| `DOUBLE` | [Float64](../data_types/float.md) | -| `DATE32` | [Date](../data_types/date.md) | -| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) | -| `STRING`, `BINARY` | [String](../data_types/string.md) | -| `DECIMAL` | [Decimal](../data_types/decimal.md) | +| ORC data type (`INSERT`) | ClickHouse data type | +|--------------------------|---------------------------------------| +| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) | +| `INT8` | [Int8](../data_types/int_uint.md) | +| `UINT16` | [UInt16](../data_types/int_uint.md) | +| `INT16` | [Int16](../data_types/int_uint.md) | +| `UINT32` | [UInt32](../data_types/int_uint.md) | +| `INT32` | [Int32](../data_types/int_uint.md) | +| `UINT64` | [UInt64](../data_types/int_uint.md) | +| `INT64` | [Int64](../data_types/int_uint.md) | +| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) | +| `DOUBLE` | [Float64](../data_types/float.md) | +| `DATE32` | [Date](../data_types/date.md) | +| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) | +| `STRING`, `BINARY` | [String](../data_types/string.md) | +| `DECIMAL` | [Decimal](../data_types/decimal.md) | ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type. Unsupported ORC data types: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. -The data types of ClickHouse table columns don't have to match the corresponding ORC data fields. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) the data to the data type set for the ClickHouse table column. +The data types of ClickHouse table columns don’t have to match the corresponding ORC data fields. 
When inserting data, ClickHouse interprets data types according to the table above and then [casts](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) the data to the data type set for the ClickHouse table column. -### Inserting Data +### Inserting Data {#inserting-data-1} You can insert ORC data from a file into ClickHouse table by the following command: -```bash +``` bash $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` To exchange data with Hadoop, you can use [HDFS table engine](../operations/table_engines/hdfs.md). - [مقاله اصلی](https://clickhouse.tech/docs/fa/interfaces/formats/) diff --git a/docs/fa/interfaces/http.md b/docs/fa/interfaces/http.md index 844558ad43a..d51b60f00ba 100644 --- a/docs/fa/interfaces/http.md +++ b/docs/fa/interfaces/http.md @@ -1,31 +1,31 @@ -
+
-# HTTP interface
+# HTTP interface {#http-interface}

HTTP interface به شما امکان استفاده از ClickHouse در هر پلتفرم و با هر زبان برنامه نویسی را می دهد. ما از این Interface برای زبان های Java و Perl و همچنین اسکریپت های shell استفاده می کنیم. در دیگر دپارتمان ها، HTTP interface در Perl، Python، و Go استفاده می شود. HTTP Interface محدود تر از native interface می باشد، اما سازگاری بهتری دارد.

-به صورت پیش فرض، clickhouse-server برای HTTP روی پورت 8123 گوش می دهد (این پورت را می توان در فایل کانفیگ تغییر داد). اگر شما یک درخواست GET / بدون پارامتر بسازید، رشته ی "Ok." را دریافت می کنید (به همراه line feed در انتها). شما می توانید از این درخواست برای اسکریپت های health-check استفاده کنید.
+به صورت پیش فرض، clickhouse-server برای HTTP روی پورت 8123 گوش می دهد (این پورت را می توان در فایل کانفیگ تغییر داد). اگر شما یک درخواست GET / بدون پارامتر بسازید، رشته ی «Ok.» را دریافت می کنید (به همراه line feed در انتها). شما می توانید از این درخواست برای اسکریپت های health-check استفاده کنید.
-```bash +``` bash $ curl 'http://localhost:8123/' Ok. ``` -
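Building on the health-check idea mentioned above, such a script could simply compare the response body against the expected string. The sketch below is an illustration only (not an official tool) and assumes the default port 8123:

``` bash
#!/usr/bin/env bash
# exit 0 when the server answers "Ok.", non-zero otherwise;
# the trailing line feed is stripped by the command substitution
if [ "$(curl -s 'http://localhost:8123/')" = "Ok." ]; then
    echo "clickhouse-server is up"
else
    echo "clickhouse-server is not responding" >&2
    exit 1
fi
```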
+
-درخواست های خود را پارامتر 'query'، یا با متد POST، یا ابتدای query را در پارامتر 'query' ارسال کنید، و بقیه را در POST (بعدا توضیح خواهیم داد که چرا این کار ضروری است). سایت URL محدود به 16 کیلوبایت است، پس هنگام ارسال query های بزرگ، اینو به خاطر داشته باشید +درخواست های خود را پارامتر ‘query’، یا با متد POST، یا ابتدای query را در پارامتر ‘query’ ارسال کنید، و بقیه را در POST (بعدا توضیح خواهیم داد که چرا این کار ضروری است). سایت URL محدود به 16 کیلوبایت است، پس هنگام ارسال query های بزرگ، اینو به خاطر داشته باشید اگر درخواست موفق آمیز باشد، استاتوس کد 200 را دریافت می کنید و نتایج در بدنه response می باشد. اگر اروری رخ دهد، استاتوس کد 500 را دریافت می کنید و توضیحات ارور در بدنه ی reponse قرار می گیرد. -در هنگام استفاده از متد GET، 'readonly' ست می شود. به عبارت دیگر، برای query هایی که قصد تغییر دیتا را دارند، شما فقط از طریق متد POST می توانید این تغییرات را انجام دهید. شما میتونید query رو در بدنه ی POST یه یا به عنوان پارامتر های URL ارسال کنید. +در هنگام استفاده از متد GET، ‘readonly’ ست می شود. به عبارت دیگر، برای query هایی که قصد تغییر دیتا را دارند، شما فقط از طریق متد POST می توانید این تغییرات را انجام دهید. شما میتونید query رو در بدنه ی POST یه یا به عنوان پارامتر های URL ارسال کنید. مثال:
-```bash +``` bash $ curl 'http://localhost:8123/?query=SELECT%201' 1 @@ -40,13 +40,13 @@ Date: Fri, 16 Nov 2012 19:21:50 GMT 1 ``` -
+
-همانطور که می بینید، curl is somewhat inconvenient in that spaces must be URL escaped. هر چند wget همه چیز را خودش escape می کنه، ما توصیه به استفاده از اون رو نمی کنیم، چون wget به خوبی با HTTP 1.1 در هنگام استفاده از هدر های keep-alive و Transfer-Encoding: chunked کار نمی کند. +همانطور که می بینید، curl is somewhat inconvenient in that spaces must be URL escaped. هر چند wget همه چیز را خودش escape می کنه، ما توصیه به استفاده از اون رو نمی کنیم، چون wget به خوبی با HTTP 1.1 در هنگام استفاده از هدر های keep-alive و Transfer-Encoding: chunked کار نمی کند.
-```bash +``` bash $ echo 'SELECT 1' | curl 'http://localhost:8123/' --data-binary @- 1 @@ -57,26 +57,26 @@ $ echo '1' | curl 'http://localhost:8123/?query=SELECT' --data-binary @- 1 ``` -
+
اگر بخشی از query در پارامتر ارسال شود، و بخش دیگر در POST، یک line feed بین دو بخش وارد می شود. مثال (این کار نمی کند):
-```bash +``` bash $ echo 'ECT 1' | curl 'http://localhost:8123/?query=SEL' --data-binary @- Code: 59, e.displayText() = DB::Exception: Syntax error: failed at position 0: SEL ECT 1 , expected One of: SHOW TABLES, SHOW DATABASES, SELECT, INSERT, CREATE, ATTACH, RENAME, DROP, DETACH, USE, SET, OPTIMIZE., e.what() = DB::Exception ``` -
+
-به صورت پیش فرض، داده ها با فرمت TabSeparated بر میگردند. (برای اطلاعات بیشتر بخش "فرمت" را مشاهده کنید). شما میتوانید از دستور FORMAT در query خود برای ست کردن فرمتی دیگر استفاده کنید. +به صورت پیش فرض، داده ها با فرمت TabSeparated بر میگردند. (برای اطلاعات بیشتر بخش «فرمت» را مشاهده کنید). شما میتوانید از دستور FORMAT در query خود برای ست کردن فرمتی دیگر استفاده کنید.
-```bash +``` bash $ echo 'SELECT 1 FORMAT Pretty' | curl 'http://localhost:8123/?' --data-binary @- ┏━━━┓ ┃ 1 ┃ @@ -85,7 +85,7 @@ $ echo 'SELECT 1 FORMAT Pretty' | curl 'http://localhost:8123/?' --data-binary @ └───┘ ``` -
+
برای query های INSERT متد POST ضروری است. در این مورد، شما می توانید ابتدای query خود را در URL parameter بنویسید، و از POST برای پاس داده داده ها برای درج استفاده کنید. داده ی برای درج می تواند، برای مثال یک دامپ tab-separated شده از MySQL باشد. به این ترتیب، query INSERT جایگزین LOAD DATA LOCAL INFILE از MySQL می شود. @@ -93,57 +93,57 @@ $ echo 'SELECT 1 FORMAT Pretty' | curl 'http://localhost:8123/?' --data-binary @
-```bash +``` bash echo 'CREATE TABLE t (a UInt8) ENGINE = Memory' | curl 'http://localhost:8123/' --data-binary @- ``` -
+
استفاده از query INSERT برای درج داده:
-```bash +``` bash echo 'INSERT INTO t VALUES (1),(2),(3)' | curl 'http://localhost:8123/' --data-binary @- ``` -
+
داده ها میتوانند جدا از پارامتر query ارسال شوند:
-```bash +``` bash echo '(4),(5),(6)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20VALUES' --data-binary @- ``` -
+
-شما می توانید هر نوع فرمت دیتایی مشخص کنید. فرمت 'Values' دقیقا مشابه زمانی است که شما INSERT INTO t VALUES را می نویسید: +شما می توانید هر نوع فرمت دیتایی مشخص کنید. فرمت ‘Values’ دقیقا مشابه زمانی است که شما INSERT INTO t VALUES را می نویسید:
-```bash +``` bash echo '(7),(8),(9)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20Values' --data-binary @- ``` -
+
برای درج داده ها از یک دامپ tab-separate، فرمت مشخص زیر را وارد کنید:
-```bash +``` bash echo -ne '10\n11\n12\n' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20TabSeparated' --data-binary @- ``` -
+
به دلیل پردازش موازی، نتایج query با ترتیب رندوم چاپ می شود:
-```bash +``` bash $ curl 'http://localhost:8123/?query=SELECT%20a%20FROM%20t' 7 8 @@ -159,33 +159,33 @@ $ curl 'http://localhost:8123/?query=SELECT%20a%20FROM%20t' 6 ``` -
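If a deterministic order is needed despite the parallel processing noted above, adding `ORDER BY` to the query is the usual fix; a sketch against the same table `t` used in these examples:

``` bash
# ORDER BY makes the output order stable regardless of parallel processing
curl 'http://localhost:8123/?query=SELECT%20a%20FROM%20t%20ORDER%20BY%20a'
```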
+
حدف جدول:
-```bash +``` bash echo 'DROP TABLE t' | curl 'http://localhost:8123/' --data-binary @- ``` -
+
برای درخواست هایی موفقی که داده ای از جدول بر نمیگردد، بدنه response خالی است. شما می توانید از فرمت فشرده سازی داخلی ClickHouse در هنگان انتقال داده ها استفاده کنید. این فشرده سازی داده، یک فرمت غیراستاندارد است، و شما باید از برنامه مخصوص فشرده سازی ClickHouse برای استفاده از آن استفاده کنید. (این برنامه در هنگام نصب پکیج clickhouse-client نصب شده است) -اگر شما در URL پارامتر 'compress=1' را قرار دهید، سرور داده های ارسالی به شما را فشرده سازی می کند. اگر شما پارامتر 'decompress=1' را در URL ست کنید، سرور داده های ارسالی توسط متد POST را decompress می کند. +اگر شما در URL پارامتر ‘compress=1’ را قرار دهید، سرور داده های ارسالی به شما را فشرده سازی می کند. اگر شما پارامتر ‘decompress=1’ را در URL ست کنید، سرور داده های ارسالی توسط متد POST را decompress می کند. همچنین استفاده از فشرده سازی استاندارد gzip در HTTP ممکن است. برای ارسال درخواست POST و فشرده سازی آن به صورت gzip، هدر `Content-Encoding: gzip` را به request خود اضافه کنید. برای اینکه ClickHouse، response فشرده شده به صورت gzip برای شما ارسال کند، ابتدا باید `enable_http_compression` را در تنظیمات ClickHouse فعال کنید و در ادامه هدر `Accept-Encoding: gzip` را به درخواست خود اضافه کنید. شما می توانید از این کار برای کاهش ترافیک شبکه هنگام ارسال مقدار زیادی از داده ها یا برای ایجاد dump هایی که بلافاصله فشرده می شوند، استفاده کنید. -شما می توانید از پارامتر 'database' در URL برای مشخص کردن دیتابیس پیش فرض استفاده کنید. +شما می توانید از پارامتر ‘database’ در URL برای مشخص کردن دیتابیس پیش فرض استفاده کنید.
-```bash +``` bash $ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?database=system' --data-binary @- 0 1 @@ -199,39 +199,39 @@ $ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?data 9 ``` -
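The gzip options described a few paragraphs above can be combined in a single request. A sketch, assuming `enable_http_compression` may also be passed as a per-query URL parameter and that `gunzip` is available on the client:

``` bash
# ask the server for a gzip-compressed response and decompress it locally;
# curl does not decompress automatically without --compressed, so we pipe through gunzip
echo 'SELECT number FROM system.numbers LIMIT 5' | \
    curl -sS -H 'Accept-Encoding: gzip' \
         'http://localhost:8123/?enable_http_compression=1' \
         --data-binary @- | gunzip -
```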
+
-به صورت پیش فرض، دیتابیس ثبت شده در تنظیمات سرور به عنوان دیتابیس پیش فرض مورد استفاده قرار می گیرد، این دیتابیس 'default' نامیده می شود. از سوی دیگر، شما می توانید همیشه نام دیتابیس را با دات و قبل از اسم جدول مشخص کنید. +به صورت پیش فرض، دیتابیس ثبت شده در تنظیمات سرور به عنوان دیتابیس پیش فرض مورد استفاده قرار می گیرد، این دیتابیس ‘default’ نامیده می شود. از سوی دیگر، شما می توانید همیشه نام دیتابیس را با دات و قبل از اسم جدول مشخص کنید. نام کاربری و پسورد می توانند به یکی از دو روش زیر ست شوند: -1. استفاده از HTTP Basic Authentication. مثال: +1. استفاده از HTTP Basic Authentication. مثال:
-```bash +``` bash echo 'SELECT 1' | curl 'http://user:password@localhost:8123/' -d @- ``` -
+
-2. با دو پارامتر 'user' و 'password' در URL. مثال: +1. با دو پارامتر ‘user’ و ‘password’ در URL. مثال:
-```bash +``` bash echo 'SELECT 1' | curl 'http://localhost:8123/?user=user&password=password' -d @- ``` -
+
-اگر نام کاربری مشخص نشود، نام کاربری 'default' استفاده می شود. اگر پسورد مشخص نشود، پسورد خالی استفاده می شود. شما همچنین می توانید از پارامتر های URL برای مشخص کردن هر تنظیمی برای اجرای یک query استفاده کنید. مثال: http://localhost:8123/?profile=web&max_rows_to_read=1000000000&query=SELECT+1 +اگر نام کاربری مشخص نشود، نام کاربری ‘default’ استفاده می شود. اگر پسورد مشخص نشود، پسورد خالی استفاده می شود. شما همچنین می توانید از پارامتر های URL برای مشخص کردن هر تنظیمی برای اجرای یک query استفاده کنید. مثال: http://localhost:8123/?profile=web&max\_rows\_to\_read=1000000000&query=SELECT+1 -برای اطلاعات بیشتر بخش "تنظیمات" را مشاهده کنید. +برای اطلاعات بیشتر بخش «تنظیمات» را مشاهده کنید.
-```bash +``` bash $ echo 'SELECT number FROM system.numbers LIMIT 10' | curl 'http://localhost:8123/?' --data-binary @- 0 1 @@ -245,21 +245,21 @@ $ echo 'SELECT number FROM system.numbers LIMIT 10' | curl 'http://localhost:812 9 ``` -
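Any per-query setting can be supplied the same way as in the example URL above; a minimal sketch using the `max_rows_to_read` parameter from that URL:

``` bash
# the setting applies only to this query; exceeding the limit makes the query fail
echo 'SELECT number FROM system.numbers LIMIT 10' | \
    curl -sS 'http://localhost:8123/?max_rows_to_read=1000000000' --data-binary @-
```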
+
-برای اطلاعات بیشتر در مورد دیگر پارامترها، بخش "SET" را ببینید. +برای اطلاعات بیشتر در مورد دیگر پارامترها، بخش «SET» را ببینید. به طور مشابه، شما می توانید از ClickHouse Session در HTTP استفاده کنید. برای این کار، شما نیاز به ست کردن پارامتر`session_id` برای درخواست را دارید. شما میتوانید از هر رشته ای برای ست کردن Session ID استفاده کنید. به صورت پیش فرض، یک session بعد از 60 ثانیه از اجرای آخرین درخواست نابود می شود. برای تغییر زمان timeout، باید `default_session_timeout` را در تنظیمات سرور تغییر دهید و یا پارامتر `session_timeout` در هنگام درخواست ست کنید. برای بررسی وضعیت status از پارامتر `session_check=1` استفاده کنید. با یک session تنها یک query در لحظه می توان اجرا کرد. -گزینه ای برای دریافت اطلاعات progress اجرای query وجود دارد. این گزینه هدر X-ClickHouse-Progress می باشد. برای این کار تنظیم send_progress_in_http_headers در کانفیگ فایل فعال کنید. +گزینه ای برای دریافت اطلاعات progress اجرای query وجود دارد. این گزینه هدر X-ClickHouse-Progress می باشد. برای این کار تنظیم send\_progress\_in\_http\_headers در کانفیگ فایل فعال کنید. -درخواست در حال اجرا، در صورت لاست شدن کانکشن HTTP به صورت اتوماتیک متوقف نمی شود. پارس کردن و فرمت کردن داده ها در سمت سرور صورت می گیرد، و استفاده از شبکه ممکن است ناکارآمد باشد. پارامتر 'query_id' می تونه به عنوان query id پاس داده شود (هر رشته ای قابل قبول است). برای اطلاعات بیشتر بخش "تنظیمات، replace_running_query" را مشاهده کنید. +درخواست در حال اجرا، در صورت لاست شدن کانکشن HTTP به صورت اتوماتیک متوقف نمی شود. پارس کردن و فرمت کردن داده ها در سمت سرور صورت می گیرد، و استفاده از شبکه ممکن است ناکارآمد باشد. پارامتر ‘query\_id’ می تونه به عنوان query id پاس داده شود (هر رشته ای قابل قبول است). برای اطلاعات بیشتر بخش «تنظیمات، replace\_running\_query» را مشاهده کنید. -پارامتر 'quoto_key' می تواند به عنوان quoto key پاس داده شود (هر رشته ای قابل قبول است). برای اطلاعات بیشتر بخش "Quotas" را مشاهده کنید. +پارامتر ‘quoto\_key’ می تواند به عنوان quoto key پاس داده شود (هر رشته ای قابل قبول است). برای اطلاعات بیشتر بخش «Quotas» را مشاهده کنید. -HTTP interface اجازه ی پاس دادن داده های external (جداول موقت external) به query را می دهد. برای اطلاعات بیشتر، بخش "داده های External برای پردازش query" را مشاهده کنید. +HTTP interface اجازه ی پاس دادن داده های external (جداول موقت external) به query را می دهد. برای اطلاعات بیشتر، بخش «داده های External برای پردازش query» را مشاهده کنید. -## بافرینگ Response +## بافرینگ Response {#bfryng-response} شما می توانید در سمت سرور قابلی بافرینگ response را فعال کنید. پارامترهای `buffer_size` و `wait_end_of_query` در URL برای این هدف ارائه شده اند. @@ -271,11 +271,11 @@ HTTP interface اجازه ی پاس دادن داده های external (جداو
-```bash +``` bash curl -sS 'http://localhost:8123/?max_result_bytes=4000000&buffer_size=3000000&wait_end_of_query=1' -d 'SELECT toUInt8(number) FROM system.numbers LIMIT 9000000 FORMAT RowBinary' ``` -
+
از بافرینگ به منظور اجتناب از شرایطی که یک خطای پردازش query رخ داده بعد از response کد و هدر های ارسال شده به کلاینت استفاده کنید. در این شرایط، پیغام خطا در انتهای بنده response نوشته می شود، و در سمت کلاینت، پیغام خطا فقط از طریق مرحله پارس کردن قابل شناسایی است. diff --git a/docs/fa/interfaces/index.md b/docs/fa/interfaces/index.md index 6d99795f264..30adb2e5806 100644 --- a/docs/fa/interfaces/index.md +++ b/docs/fa/interfaces/index.md @@ -1,22 +1,23 @@ -
+
# رابط ها {#interfaces} ClickHouse دو اینترفیس شبکه را فراهم می کند (هر دو می توانند به صورت اختیاری در TLS برای امنیت اضافی پیچیده شوند): -* [HTTP](http.md), که مستند شده و به راحتی به طور مستقیم استفاده می شود. -* [بومی TCP](tcp.md), که دارای سربار کمتر است. +- [HTTP](http.md), که مستند شده و به راحتی به طور مستقیم استفاده می شود. +- [بومی TCP](tcp.md), که دارای سربار کمتر است. اگرچه در بیشتر موارد توصیه می شود از ابزار یا کتابخانه مناسب استفاده کنید تا به طور مستقیم با آن ها ارتباط برقرار نکنید. به طور رسمی توسط یانداکس پشتیبانی می شوند عبارتند از: -* [خط فرمان خط](cli.md) -* [راننده JDBC](jdbc.md) -* [راننده ODBC](odbc.md) -* [C ++ کتابخانه مشتری](cpp.md) +\* [خط فرمان خط](cli.md) +\* [راننده JDBC](jdbc.md) +\* [راننده ODBC](odbc.md) +\* [C ++ کتابخانه مشتری](cpp.md) همچنین برای کار با ClickHouse طیف گسترده ای از کتابخانه های شخص ثالث وجود دارد: -* [کتابخانه های مشتری](third-party/client_libraries.md) -* [ادغام](third-party/integrations.md) -* [رابط های بصری](third-party/gui.md) +\* [کتابخانه های مشتری](third-party/client_libraries.md) +\* [ادغام](third-party/integrations.md) +\* [رابط های بصری](third-party/gui.md)
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/interfaces/) diff --git a/docs/fa/interfaces/jdbc.md b/docs/fa/interfaces/jdbc.md index 5c47948335b..417931a16b7 100644 --- a/docs/fa/interfaces/jdbc.md +++ b/docs/fa/interfaces/jdbc.md @@ -1,13 +1,13 @@ -
+
-# درایور JDBC +# درایور JDBC {#drywr-jdbc} درایور رسمی JDBC برای ClickHouse وجود دارد. برای اطلاعات بیشتر [اینجا](https://github.com/ClickHouse/clickhouse-jdbc) را ببینید. - درایور JDBC توسط سازمان های دیگر اجرا می شوند. -- [ClickHouse-Native-JDBC](https://github.com/housepower/ClickHouse-Native-JDBC) +- [ClickHouse-Native-JDBC](https://github.com/housepower/ClickHouse-Native-JDBC)
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/interfaces/jdbc/) diff --git a/docs/fa/interfaces/odbc.md b/docs/fa/interfaces/odbc.md index 35b1cc508ab..03d8d8dbd9c 100644 --- a/docs/fa/interfaces/odbc.md +++ b/docs/fa/interfaces/odbc.md @@ -1,7 +1,9 @@ -
-# ODBC درایور +
+ +# ODBC درایور {#odbc-drywr} درایور رسمی ODBC برای ClickHouse وجود دارد. برای اطلاعات بیشتر [اینجا](https://github.com/ClickHouse/clickhouse-odbc) را ببینید.
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/interfaces/odbc/) diff --git a/docs/fa/interfaces/tcp.md b/docs/fa/interfaces/tcp.md index bf226fa7bf0..f7e455de4e1 100644 --- a/docs/fa/interfaces/tcp.md +++ b/docs/fa/interfaces/tcp.md @@ -1,8 +1,9 @@ -
+
-# رابط بومی (TCP)
+# رابط بومی (TCP) {#rbt-bwmy-tcp}

-پروتکل بومی در [خط فرمان خط] (cli.md)، برای برقراری ارتباط بین سرور در طی پردازش پرس و جو توزیع شده، و همچنین در سایر برنامه های C ++ استفاده می شود. متاسفانه، پروتکل ClickHouse بومی هنوز مشخصات رسمی ندارد، اما می توان آن را از کد منبع ClickHouse (شروع [از اینجا](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/src/Client)) و / یا با رهگیری و تجزیه و تحلیل ترافیک TCP.
+پروتکل بومی در [کلاینت خط فرمان](cli.md)، برای برقراری ارتباط بین سرورها در طی پردازش پرس و جوی توزیع شده، و همچنین در سایر برنامه های C++ استفاده می شود. متاسفانه، پروتکل بومی ClickHouse هنوز مشخصات رسمی ندارد، اما می توان آن را از کد منبع ClickHouse (شروع [از اینجا](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/src/Client)) و/یا با رهگیری و تجزیه و تحلیل ترافیک TCP به دست آورد.
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/interfaces/tcp/) diff --git a/docs/fa/interfaces/third-party/client_libraries.md b/docs/fa/interfaces/third-party/client_libraries.md index 0057230781a..a2c17d053fb 100644 --- a/docs/fa/interfaces/third-party/client_libraries.md +++ b/docs/fa/interfaces/third-party/client_libraries.md @@ -1,52 +1,55 @@ -
+
-# کتابخانه های مشتری شخص ثالث +# کتابخانه های مشتری شخص ثالث {#khtbkhnh-hy-mshtry-shkhs-thlth} !!! warning "سلب مسئولیت" Yandex نه حفظ کتابخانه ها در زیر ذکر شده و نشده انجام هر آزمایش های گسترده ای برای اطمینان از کیفیت آنها. -- Python - - [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm) - - [clickhouse-driver](https://github.com/mymarilyn/clickhouse-driver) - - [clickhouse-client](https://github.com/yurial/clickhouse-client) -- PHP - - [phpClickHouse](https://github.com/smi2/phpClickHouse) - - [clickhouse-php-client](https://github.com/8bitov/clickhouse-php-client) - - [clickhouse-client](https://github.com/bozerkins/clickhouse-client) - - [PhpClickHouseClient](https://github.com/SevaCode/PhpClickHouseClient) -- Go - - [clickhouse](https://github.com/kshvakov/clickhouse/) - - [go-clickhouse](https://github.com/roistat/go-clickhouse) - - [mailrugo-clickhouse](https://github.com/mailru/go-clickhouse) - - [golang-clickhouse](https://github.com/leprosus/golang-clickhouse) -- NodeJs - - [clickhouse (NodeJs)](https://github.com/TimonKK/clickhouse) - - [node-clickhouse](https://github.com/apla/node-clickhouse) -- Perl - - [perl-DBD-ClickHouse](https://github.com/elcamlost/perl-DBD-ClickHouse) - - [HTTP-ClickHouse](https://metacpan.org/release/HTTP-ClickHouse) - - [AnyEvent-ClickHouse](https://metacpan.org/release/AnyEvent-ClickHouse) -- Ruby - - [ClickHouse (Ruby)](https://github.com/shlima/click_house) -- R - - [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r) - - [RClickhouse](https://github.com/IMSMWU/RClickhouse) -- Java - - [clickhouse-client-java](https://github.com/VirtusAI/clickhouse-client-java) -- Scala - - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) -- Kotlin - - [AORM](https://github.com/TanVD/AORM) -- C# - - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) - - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) - - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) -- Elixir - - [clickhousex](https://github.com/appodeal/clickhousex/) -- Nim - - [nim-clickhouse](https://github.com/leonardoce/nim-clickhouse) +- Python + - [infi.clickhouse\_orm](https://github.com/Infinidat/infi.clickhouse_orm) + - [clickhouse-driver](https://github.com/mymarilyn/clickhouse-driver) + - [clickhouse-client](https://github.com/yurial/clickhouse-client) +- PHP + - [smi2/phpclickhouse](https://packagist.org/packages/smi2/phpClickHouse) + - [8bitov/clickhouse-php-client](https://packagist.org/packages/8bitov/clickhouse-php-client) + - [bozerkins/clickhouse-client](https://packagist.org/packages/bozerkins/clickhouse-client) + - [simpod/clickhouse-client](https://packagist.org/packages/simpod/clickhouse-client) + - [seva-code/php-click-house-client](https://packagist.org/packages/seva-code/php-click-house-client) + - [SeasClick C++ client](https://github.com/SeasX/SeasClick) +- Go + - [clickhouse](https://github.com/kshvakov/clickhouse/) + - [go-clickhouse](https://github.com/roistat/go-clickhouse) + - [mailrugo-clickhouse](https://github.com/mailru/go-clickhouse) + - [golang-clickhouse](https://github.com/leprosus/golang-clickhouse) +- NodeJs + - [clickhouse (NodeJs)](https://github.com/TimonKK/clickhouse) + - [node-clickhouse](https://github.com/apla/node-clickhouse) +- Perl + - [perl-DBD-ClickHouse](https://github.com/elcamlost/perl-DBD-ClickHouse) + - [HTTP-ClickHouse](https://metacpan.org/release/HTTP-ClickHouse) + - [AnyEvent-ClickHouse](https://metacpan.org/release/AnyEvent-ClickHouse) +- 
Ruby + - [ClickHouse (Ruby)](https://github.com/shlima/click_house) +- R + - [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r) + - [RClickhouse](https://github.com/IMSMWU/RClickhouse) +- Java + - [clickhouse-client-java](https://github.com/VirtusAI/clickhouse-client-java) +- Scala + - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) +- Kotlin + - [AORM](https://github.com/TanVD/AORM) +- C\# + - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) + - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) + - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) +- Elixir + - [clickhousex](https://github.com/appodeal/clickhousex/) +- Nim + - [nim-clickhouse](https://github.com/leonardoce/nim-clickhouse) ما این کتابخانه ها را تست نکردیم. آنها به صورت تصادفی انتخاب شده اند.
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/interfaces/third-party/client_libraries/) diff --git a/docs/fa/interfaces/third-party/gui.md b/docs/fa/interfaces/third-party/gui.md index cd6862d1091..38e2422a7f9 100644 --- a/docs/fa/interfaces/third-party/gui.md +++ b/docs/fa/interfaces/third-party/gui.md @@ -1,65 +1,64 @@ -
+
-# interface های visual توسعه دهندگان third-party +# interface های visual توسعه دهندگان third-party {#interface-hy-visual-tws-h-dhndgn-third-party} -## متن باز +## متن باز {#mtn-bz} -### Tabix +### Tabix {#tabix} interface تحت وب برای ClickHouse در پروژه [Tabix](https://github.com/tabixio/tabix). ویژگی ها: -- کار با ClickHouse به صورت مستقیم و از طریق مرورگر، بدون نیاز به نرم افزار اضافی. -- ادیتور query به همراه syntax highlighting. -- ویژگی Auto-completion برای دستورات. -- ابزارهایی برای آنالیز گرافیکی اجرای query. -- گزینه های Color scheme. +- کار با ClickHouse به صورت مستقیم و از طریق مرورگر، بدون نیاز به نرم افزار اضافی. +- ادیتور query به همراه syntax highlighting. +- ویژگی Auto-completion برای دستورات. +- ابزارهایی برای آنالیز گرافیکی اجرای query. +- گزینه های Color scheme. [مستندات Tabix](https://tabix.io/doc/). - -### HouseOps +### HouseOps {#houseops} [HouseOps](https://github.com/HouseOps/HouseOps) نرم افزار Desktop برای سیستم عامل های Linux و OSX و Windows می باشد.. ویژگی ها: -- ابزار Query builder به همراه syntax highlighting. نمایش نتایج به صورت جدول و JSON Object. -- خروجی نتایج به صورت csv و JSON Object. -- Pنمایش Processes List ها به همراه توضیحات، ویژگی حالت record و kill کردن process ها. -- نمودار دیتابیس به همراه تمام جداول و ستون ها به همراه اطلاعات اضافی. -- نماش آسان سایز ستون ها. -- تنظیمات سرور. -- مدیریت دیتابیس (بزودی); -- مدیریت کاربران (بزودی); -- آنالیز داده ها به صورت Real-Time (بزودی); -- مانیتورینگ کلاستر/زیرساخت (بزودی); -- مدیریت کلاستر (بزودی); -- مانیتورینگ کافکا و جداول replicate (بزودی); -- و بسیاری از ویژگی های دیگر برای شما. +- ابزار Query builder به همراه syntax highlighting. نمایش نتایج به صورت جدول و JSON Object. +- خروجی نتایج به صورت csv و JSON Object. +- Pنمایش Processes List ها به همراه توضیحات، ویژگی حالت record و kill کردن process ها. +- نمودار دیتابیس به همراه تمام جداول و ستون ها به همراه اطلاعات اضافی. +- نماش آسان سایز ستون ها. +- تنظیمات سرور. +- مدیریت دیتابیس (بزودی); +- مدیریت کاربران (بزودی); +- آنالیز داده ها به صورت Real-Time (بزودی); +- مانیتورینگ کلاستر/زیرساخت (بزودی); +- مدیریت کلاستر (بزودی); +- مانیتورینگ کافکا و جداول replicate (بزودی); +- و بسیاری از ویژگی های دیگر برای شما. -### LightHouse +### LightHouse {#lighthouse} [LightHouse](https://github.com/VKCOM/lighthouse) رابط کاربری سبک وزن برای ClickHouse است. امکانات: -- لیست جدول با فیلتر کردن و ابرداده. -- پیش نمایش جدول با فیلتر کردن و مرتب سازی. -- اعداد نمایش داده شده فقط خواندنی +- لیست جدول با فیلتر کردن و ابرداده. +- پیش نمایش جدول با فیلتر کردن و مرتب سازی. +- اعداد نمایش داده شده فقط خواندنی -### DBeaver +### DBeaver {#dbeaver} [DBeaver](https://dbeaver.io/) - مشتری دسکتاپ دسکتاپ دسکتاپ با پشتیبانی ClickHouse. امکانات: -- توسعه پرس و جو با برجسته نحو -- پیش نمایش جدول -- تکمیل خودکار +- توسعه پرس و جو با برجسته نحو +- پیش نمایش جدول +- تکمیل خودکار -### clickhouse-cli +### clickhouse-cli {#clickhouse-cli} [clickhouse-cli](https://github.com/hatarist/clickhouse-cli) یک مشتری خط فرمان برای ClickHouse است که در پایتون 3 نوشته شده است. @@ -69,24 +68,25 @@ interface تحت وب برای ClickHouse در پروژه [Tabix](https://github - پشتیبانی از Pager برای خروجی داده. - دستورات پست سفارشی مانند PostgreSQL. -### clickhouse-flamegraph +### clickhouse-flamegraph {#clickhouse-flamegraph} [clickhouse-flamegraph](https://github.com/Slach/clickhouse-flamegraph) یک ابزار تخصصی برای تجسم است `system.trace_log`مانند[flamegraph](http://www.brendangregg.com/flamegraphs.html). 
-## تجاری +## تجاری {#tjry} -### DataGrip +### DataGrip {#datagrip} [DataGrip](https://www.jetbrains.com/datagrip/) IDE پایگاه داده از JetBrains با پشتیبانی اختصاصی برای ClickHouse است. این ابزار همچنین به سایر ابزارهای مبتنی بر IntelliJ تعبیه شده است: PyCharm، IntelliJ IDEA، GoLand، PhpStorm و دیگران. امکانات: -- تکمیل کد بسیار سریع -- نحو برجسته ClickHouse. -- پشتیبانی از ویژگی های خاص برای ClickHouse، برای مثال ستون های توپی، موتورهای جدول. -- ویرایشگر داده. -- Refactorings. -- جستجو و ناوبری +- تکمیل کد بسیار سریع +- نحو برجسته ClickHouse. +- پشتیبانی از ویژگی های خاص برای ClickHouse، برای مثال ستون های توپی، موتورهای جدول. +- ویرایشگر داده. +- Refactorings. +- جستجو و ناوبری
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/interfaces/third-party/gui/) diff --git a/docs/fa/interfaces/third-party/integrations.md b/docs/fa/interfaces/third-party/integrations.md index fc222e6c421..7374a696de0 100644 --- a/docs/fa/interfaces/third-party/integrations.md +++ b/docs/fa/interfaces/third-party/integrations.md @@ -1,88 +1,89 @@ -
+
-# کتابخانه ادغام ثالث +# کتابخانه ادغام ثالث {#khtbkhnh-dgm-thlth} !!! warning "سلب مسئولیت" Yandex نه حفظ کتابخانه ها در زیر ذکر شده و نشده انجام هر آزمایش های گسترده ای برای اطمینان از کیفیت آنها. -## محصولات زیربنایی +## محصولات زیربنایی {#mhswlt-zyrbnyy} -- سیستم های مدیریت پایگاه داده رابطه ای - - [MySQL](https://www.mysql.com) - - [ProxySQL](https://github.com/sysown/proxysql/wiki/ClickHouse-Support) - - [clickhouse-mysql-data-reader](https://github.com/Altinity/clickhouse-mysql-data-reader) - - [horgh-replicator](https://github.com/larsnovikov/horgh-replicator) - - [PostgreSQL](https://www.postgresql.org) - - [clickhousedb_fdw](https://github.com/Percona-Lab/clickhousedb_fdw) - - [infi.clickhouse_fdw](https://github.com/Infinidat/infi.clickhouse_fdw) (استفاده می کند [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm)) - - [pg2ch](https://github.com/mkabilov/pg2ch) - - [MSSQL](https://en.wikipedia.org/wiki/Microsoft_SQL_Server) - - [ClickHouseMightrator](https://github.com/zlzforever/ClickHouseMigrator) -- صف پیام - - [Kafka](https://kafka.apache.org) - - [clickhouse_sinker](https://github.com/housepower/clickhouse_sinker) (استفاده می کند [Go client](https://github.com/kshvakov/clickhouse/)) -- فروشگاه شی - - [S3](https://en.wikipedia.org/wiki/Amazon_S3) - - [clickhouse-backup](https://github.com/AlexAkulov/clickhouse-backup) -- ارکستراسیون کانتینر - - [Kubernetes](https://kubernetes.io) - - [clickhouse-operator](https://github.com/Altinity/clickhouse-operator) -- مدیریت تنظیمات - - [puppet](https://puppet.com) - - [innogames/clickhouse](https://forge.puppet.com/innogames/clickhouse) - - [mfedotov/clickhouse](https://forge.puppet.com/mfedotov/clickhouse) -- نظارت بر - - [Graphite](https://graphiteapp.org) - - [graphouse](https://github.com/yandex/graphouse) - - [carbon-clickhouse](https://github.com/lomik/carbon-clickhouse) + - - [graphite-clickhouse](https://github.com/lomik/graphite-clickhouse) - - [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer) - optimizes staled partitions in [\*GraphiteMergeTree](../../operations/table_engines/graphitemergetree.md#graphitemergetree) if rules from [rollup configuration](../../operations/table_engines/graphitemergetree.md#rollup-configuration) could be applied - - [Grafana](https://grafana.com/) - - [clickhouse-grafana](https://github.com/Vertamedia/clickhouse-grafana) - - [Prometheus](https://prometheus.io/) - - [clickhouse_exporter](https://github.com/f1yegor/clickhouse_exporter) - - [PromHouse](https://github.com/Percona-Lab/PromHouse) - - [Nagios](https://www.nagios.org/) - - [check_clickhouse](https://github.com/exogroup/check_clickhouse/) - - [Zabbix](https://www.zabbix.com) - - [clickhouse-zabbix-template](https://github.com/Altinity/clickhouse-zabbix-template) - - [Sematext](https://sematext.com/) - - [clickhouse ادغام](https://github.com/sematext/sematext-agent-integrations/tree/master/clickhouse) -- ثبت نام - - [rsyslog](https://www.rsyslog.com/) - - [omclickhouse](https://www.rsyslog.com/doc/master/configuration/modules/omclickhouse.html) - - [fluentd](https://www.fluentd.org) - - [loghouse](https://github.com/flant/loghouse) (برای [Kubernetes](https://kubernetes.io)) - - [logagent](https://www.sematext.com/logagent) - - [logagent output-plugin-clickhouse](https://sematext.com/docs/logagent/output-plugin-clickhouse/) -- جغرافیایی - - [MaxMind](https://dev.maxmind.com/geoip/) - - [clickhouse-maxmind-geoip](https://github.com/AlexeyKupershtokh/clickhouse-maxmind-geoip) +- سیستم های مدیریت 
پایگاه داده رابطه ای + - [MySQL](https://www.mysql.com) + - [ProxySQL](https://github.com/sysown/proxysql/wiki/ClickHouse-Support) + - [clickhouse-mysql-data-reader](https://github.com/Altinity/clickhouse-mysql-data-reader) + - [horgh-replicator](https://github.com/larsnovikov/horgh-replicator) + - [PostgreSQL](https://www.postgresql.org) + - [clickhousedb\_fdw](https://github.com/Percona-Lab/clickhousedb_fdw) + - [infi.clickhouse\_fdw](https://github.com/Infinidat/infi.clickhouse_fdw) (استفاده می کند [infi.clickhouse\_orm](https://github.com/Infinidat/infi.clickhouse_orm)) + - [pg2ch](https://github.com/mkabilov/pg2ch) + - [MSSQL](https://en.wikipedia.org/wiki/Microsoft_SQL_Server) + - [ClickHouseMightrator](https://github.com/zlzforever/ClickHouseMigrator) +- صف پیام + - [Kafka](https://kafka.apache.org) + - [clickhouse\_sinker](https://github.com/housepower/clickhouse_sinker) (استفاده می کند [Go client](https://github.com/kshvakov/clickhouse/)) +- فروشگاه شی + - [S3](https://en.wikipedia.org/wiki/Amazon_S3) + - [clickhouse-backup](https://github.com/AlexAkulov/clickhouse-backup) +- ارکستراسیون کانتینر + - [Kubernetes](https://kubernetes.io) + - [clickhouse-operator](https://github.com/Altinity/clickhouse-operator) +- مدیریت تنظیمات + - [puppet](https://puppet.com) + - [innogames/clickhouse](https://forge.puppet.com/innogames/clickhouse) + - [mfedotov/clickhouse](https://forge.puppet.com/mfedotov/clickhouse) +- نظارت بر + - [Graphite](https://graphiteapp.org) + - [graphouse](https://github.com/yandex/graphouse) + - [carbon-clickhouse](https://github.com/lomik/carbon-clickhouse) + + - [graphite-clickhouse](https://github.com/lomik/graphite-clickhouse) + - [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer) - optimizes staled partitions in [\*GraphiteMergeTree](../../operations/table_engines/graphitemergetree.md#graphitemergetree) if rules from [rollup configuration](../../operations/table_engines/graphitemergetree.md#rollup-configuration) could be applied + - [Grafana](https://grafana.com/) + - [clickhouse-grafana](https://github.com/Vertamedia/clickhouse-grafana) + - [Prometheus](https://prometheus.io/) + - [clickhouse\_exporter](https://github.com/f1yegor/clickhouse_exporter) + - [PromHouse](https://github.com/Percona-Lab/PromHouse) + - [Nagios](https://www.nagios.org/) + - [check\_clickhouse](https://github.com/exogroup/check_clickhouse/) + - [Zabbix](https://www.zabbix.com) + - [clickhouse-zabbix-template](https://github.com/Altinity/clickhouse-zabbix-template) + - [Sematext](https://sematext.com/) + - [clickhouse ادغام](https://github.com/sematext/sematext-agent-integrations/tree/master/clickhouse) +- ثبت نام + - [rsyslog](https://www.rsyslog.com/) + - [omclickhouse](https://www.rsyslog.com/doc/master/configuration/modules/omclickhouse.html) + - [fluentd](https://www.fluentd.org) + - [loghouse](https://github.com/flant/loghouse) (برای [Kubernetes](https://kubernetes.io)) + - [logagent](https://www.sematext.com/logagent) + - [logagent output-plugin-clickhouse](https://sematext.com/docs/logagent/output-plugin-clickhouse/) +- جغرافیایی + - [MaxMind](https://dev.maxmind.com/geoip/) + - [clickhouse-maxmind-geoip](https://github.com/AlexeyKupershtokh/clickhouse-maxmind-geoip) -## اکوسیستم زبان برنامه نویسی +## اکوسیستم زبان برنامه نویسی {#khwsystm-zbn-brnmh-nwysy} -- Python - - [SQLAlchemy](https://www.sqlalchemy.org) - - [sqlalchemy-clickhouse](https://github.com/cloudflare/sqlalchemy-clickhouse) (استفاده می کند 
[infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm)) - - [pandas](https://pandas.pydata.org) - - [pandahouse](https://github.com/kszucs/pandahouse) -- R - - [dplyr](https://db.rstudio.com/dplyr/) - - [RClickhouse](https://github.com/IMSMWU/RClickhouse) (استفاده می کند [clickhouse-cpp](https://github.com/artpaul/clickhouse-cpp)) -- Java - - [Hadoop](http://hadoop.apache.org) - - [clickhouse-hdfs-loader](https://github.com/jaykelin/clickhouse-hdfs-loader) (استفاده می کند [JDBC](../../query_language/table_functions/jdbc.md)) -- Scala - - [Akka](https://akka.io) - - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) -- C# - - [ADO.NET](https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/ado-net-overview) - - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) - - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) - - [ClickHouse.Net.Migrations](https://github.com/ilyabreev/ClickHouse.Net.Migrations) -- Elixir - - [Ecto](https://github.com/elixir-ecto/ecto) - - [clickhouse_ecto](https://github.com/appodeal/clickhouse_ecto) +- Python + - [SQLAlchemy](https://www.sqlalchemy.org) + - [sqlalchemy-clickhouse](https://github.com/cloudflare/sqlalchemy-clickhouse) (استفاده می کند [infi.clickhouse\_orm](https://github.com/Infinidat/infi.clickhouse_orm)) + - [pandas](https://pandas.pydata.org) + - [pandahouse](https://github.com/kszucs/pandahouse) +- R + - [dplyr](https://db.rstudio.com/dplyr/) + - [RClickhouse](https://github.com/IMSMWU/RClickhouse) (استفاده می کند [clickhouse-cpp](https://github.com/artpaul/clickhouse-cpp)) +- Java + - [Hadoop](http://hadoop.apache.org) + - [clickhouse-hdfs-loader](https://github.com/jaykelin/clickhouse-hdfs-loader) (استفاده می کند [JDBC](../../query_language/table_functions/jdbc.md)) +- Scala + - [Akka](https://akka.io) + - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) +- C\# + - [ADO.NET](https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/ado-net-overview) + - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) + - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) + - [ClickHouse.Net.Migrations](https://github.com/ilyabreev/ClickHouse.Net.Migrations) +- Elixir + - [Ecto](https://github.com/elixir-ecto/ecto) + - [clickhouse\_ecto](https://github.com/appodeal/clickhouse_ecto)
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/interfaces/third-party/integrations/) diff --git a/docs/fa/interfaces/third-party/proxy.md b/docs/fa/interfaces/third-party/proxy.md index 783af4749e0..5ecc5caf751 100644 --- a/docs/fa/interfaces/third-party/proxy.md +++ b/docs/fa/interfaces/third-party/proxy.md @@ -1,37 +1,38 @@ -
-# سرورهای پروکسی از توسعه دهندگان شخص ثالث +
+ +# سرورهای پروکسی از توسعه دهندگان شخص ثالث {#srwrhy-prwkhsy-z-tws-h-dhndgn-shkhs-thlth} [chproxy](https://github.com/Vertamedia/chproxy)، یک پراکسی HTTP و تعادل بار برای پایگاه داده ClickHouse است. امکانات -* مسیریابی و پاسخ دهی کاربر به کاربر. -* محدودیت انعطاف پذیر -* تمدید SSL cerificate به صورت خودکار. +- مسیریابی و پاسخ دهی کاربر به کاربر. +- محدودیت انعطاف پذیر +- تمدید SSL cerificate به صورت خودکار. اجرا شده در برو -## KittenHouse +## KittenHouse {#kittenhouse} [KittenHouse](https://github.com/VKCOM/kittenhouse) طراحی شده است که یک پروکسی محلی بین ClickHouse و سرور برنامه باشد در صورتی که غیر ممکن است یا ناخوشایند بافر کردن اطلاعات INSERT در قسمت درخواست شما. امکانات: -* بافر حافظه در حافظه و درایو. -* مسیریابی در جدول -* تعادل بار و بررسی سلامت. +- بافر حافظه در حافظه و درایو. +- مسیریابی در جدول +- تعادل بار و بررسی سلامت. اجرا شده در برو -## ClickHouse-Bulk +## ClickHouse-Bulk {#clickhouse-bulk} [ClickHouse-Bulk](https://github.com/nikepan/clickhouse-bulk) یک ClickHouse جمع کننده ساده است. امکانات: -* درخواست گروهی و ارسال توسط آستانه یا فاصله. -* چند سرور از راه دور -* احراز هویت پایه +- درخواست گروهی و ارسال توسط آستانه یا فاصله. +- چند سرور از راه دور +- احراز هویت پایه اجرا شده در برو diff --git a/docs/fa/introduction/distinctive_features.md b/docs/fa/introduction/distinctive_features.md index 189cc504f94..8879190233f 100644 --- a/docs/fa/introduction/distinctive_features.md +++ b/docs/fa/introduction/distinctive_features.md @@ -1,8 +1,8 @@ -
+
-# ویژگی های برجسته ClickHouse +# ویژگی های برجسته ClickHouse {#wyjgy-hy-brjsth-clickhouse} -## مدیریت دیتابیس ستون گرای واقعی +## مدیریت دیتابیس ستون گرای واقعی {#mdyryt-dytbys-stwn-gry-wq-y} در یک مدیریت دیتابیس ستون گرای واقعی، هیچ مقداری فضای اضافی برای ذخیره سازی ندارد. برای مثال، این به این معنیست که برای مقادیر، constant-length باید پشتیبانی شوند تا از ذخیره سازی طول مقدار به عنوان یه عدد integer کنار مقدار جلوگیری شود. در این مورد، یک میلیارد مقدار Uint8 باید در واقع در حالت غیرفشرده 1 گیگابایت فضا اشغال کند، در غیراین صورت به شدت بر عملکرد CPU تاثیر میگذارد. این خیلی مهم هست که داده ها به صورت compact ذخیره سازی شوند حتی زمانی که uncompressed هستند، از آنجا که سرعت سرعت decompress (CPU Usage) عمدتا به حجم داده های uncompress بستگی دارد. @@ -10,58 +10,58 @@ همچنین توجه داشته باشید که ClickHouse یک مدیریت دیتابیس است نه فقط یک دیتابیس. ClickHouse اجازه میدهد که در زمان اجرا اقدام به ساخت جدول، دیتابیس کنید، داده load کنید و query های خود را بدون restart و یا reconfigure مجدد سرور، اجرا کنید -## فشرده سازی داده ها +## فشرده سازی داده ها {#fshrdh-szy-ddh-h} بعضی دیتابیس های ستون گرا (InfiniDB CE یا MonetDB) از فشرده سازی داده ها استفاده نمی کنند. با این حال، فشرده سازی داده ها برای رسیدن به عملکرد عالی ضروری است. -## Disk storage of data +## Disk storage of data {#disk-storage-of-data} خیلی از مدیریت دیتابیس های ستون گرا (مثل SAP HANA و Google PowerDrill) فقط داخل RAM کار می کنند. این رویکرد منجر به تخصیص بودجه سخت افزاری بالا برای تحقق آنالیز های real-time می شود. ClickHouse برای کار بر روی هارد دیسک های معمولی طراحی شده است، که هزینه ی برای هر گیگابایت داده را تضمین می کند، اما اگر SSD و RAM موجود باشد به طور کامل مورد استفاده قرار می گیرد. -## پردازش موازی روی چندین هسته +## پردازش موازی روی چندین هسته {#prdzsh-mwzy-rwy-chndyn-hsth} query های بزرگ به طور طبیعی با استفاده از تمام منابع موجود در روی سرور فعلی، موازی سازی می شوند. - -## پردازش توزیع شده بر روی چندین سرور +## پردازش توزیع شده بر روی چندین سرور {#prdzsh-twzy-shdh-br-rwy-chndyn-srwr} تقریبا هیچ کدام از DBMS هایی که بالاتر ذکر شد، از اجرای query ها به صورت توزیع شده پشتیبانی نمی کنند. در ClickHouse، داده ها می توانن در shard های مختلف مستقر شوند. هر shard میتوامند گروهی از replica ها برای بالا بردن تحمل پذیری در برابر خطا (fault tolerance) را داشته باشد. یک query به صورت موازی بر روی تمامی shard های اجرا می شود. این برای کاربر شفاف است. -## پشتیبانی SQL +## پشتیبانی SQL {#pshtybny-sql} اگر شما با SQL آشنا باشید، ما واقعا نمیتونیم در مورد پشتیبانی از SQL صحبت کنیم. تمام توابع اسم های مختلفی دارند. با این حال، این یک زبان بر پایه SQL هست که نمیتواند در بسیاری از موارد با SQL متفاوت باشد. JOIN ها پشتیبانی می شود. subquery ها در FROM، IN و JOIN پشتیبانی می شود. Dependent subquery ها پشتیبانی نمی شود. ClickHouse زبان بر پایه SQL است که در بسیاری از موارد از استاندارد SQL پیروی می کند. GROUP BY، ORDER BY، scalar subquery ها در FROM، IN و JOIN پشتیبانی می شود. subquery های مرتبط و window function ها پشتیبانی نمی شود. -## موتور بردارد +## موتور بردارد {#mwtwr-brdrd} داده ها نه فقط براساس ستون ها ذخیره می شوند، بلکه با استفاده از برداردها (بخشی از ستون ها) پردازش می شوند. این قابلیت باعث رسیدن به بهره وری بالای CPU می شود. -## بروزرسانی داده ها به صورت Real-Time +## بروزرسانی داده ها به صورت Real-Time {#brwzrsny-ddh-h-bh-swrt-real-time} ClickHouse از جداول دارای Primary Key پشتیبانی می کند. به منظور اجرای query های range بر روی Primary Key، داده ها به صورت افزایشی (مرتب شده) با استفاده از merge tree ذخیره سازی می شوند. با توجه به این، داده ها می توانند به طور پیوسته به جدول اضافه شوند. 
هیچ نوع lock در هنگام مصرف داده ها وجود ندارد. -## Index +## Index {#index} داشتن داده ها به صورت فیزیکی و مرتب شده براساس Primary Key این قابلیت را می دهد که استخراج کردن داده برای مقدار خاص و یا مقادیر range با کمترین latencey، یعنی کمتر از چند هزار میلی ثانیه ممکن شود. -## مناسب برای query های آنلاین +## مناسب برای query های آنلاین {#mnsb-bry-query-hy-anlyn} latency پایین به این معنی است که query ها بتونن بدون delay و بدون تلاش برای رسیدن به پیش پاسخ(از قبل محاسبه شده) دقیقا در همان لحظه که کاربر در حال load صفحه است پردازش شوند. به عبارتی دیگر، آنلاین -## پشتیبانی از محاسبات تقریبی +## پشتیبانی از محاسبات تقریبی {#pshtybny-z-mhsbt-tqryby} ClickHouse روش های مختلفی برای کسب دقیق performance ارائه می دهد: -1. توابع Aggregate برای محاسبات تقریبی تعداد مقادیر متمایز (distinct)، median و quantity ها -2. اجرای یک query بر پایه بخشی از داده ها (داده ی sample) و دریافت خروجی تقریبی. در این مورد داده ی نسبتا کمتری از دیسک بازیابی می شود. -3. اجرای یک Aggregation برای تعداد محدودی از کلید های تصافی، به جای تمام کلید ها. در شرایط خاص برای توزیع کلید در داده ها، این روش کمک می کند به نتایج منطقی برسیم با استفاده از منابع کمتر. +1. توابع Aggregate برای محاسبات تقریبی تعداد مقادیر متمایز (distinct)، median و quantity ها +2. اجرای یک query بر پایه بخشی از داده ها (داده ی sample) و دریافت خروجی تقریبی. در این مورد داده ی نسبتا کمتری از دیسک بازیابی می شود. +3. اجرای یک Aggregation برای تعداد محدودی از کلید های تصافی، به جای تمام کلید ها. در شرایط خاص برای توزیع کلید در داده ها، این روش کمک می کند به نتایج منطقی برسیم با استفاده از منابع کمتر. -## Replication داده ها و integrity +## Replication داده ها و integrity {#replication-ddh-h-w-integrity} ClickHouse از روش asynchronous multimaster replication استفاده می کند. بعد از نوشتن داده در یکی از replica های موجود، داده به صورت توزیع شده به بقیه replica ها منتقل می شود. این سیستم داده های مشابه را در replica های مختلف نگه داری می کند. در اکثر موارد که سیستم fail می شوند، داده ها به صورت اتوماتیک restore می شوند و یا در موارد پیچیده به صورت نیمه اتوماتیک restore می شوند. برای اطلاعات بیشتر، به بخش [replication داده ها](../operations/table_engines/replication.md) مراجعه کنید.
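As a concrete sketch of the replication model described in the paragraph above, the following shows how a replicated table is typically declared. This is a minimal, hedged example: the table name, the columns, the ZooKeeper path and the `{shard}`/`{replica}` macros are illustrative assumptions (the macros have to be defined in the server configuration).

``` sql
-- Every replica of a shard is created with the same ZooKeeper path and its own replica name;
-- after an INSERT lands on any replica, the other replicas fetch the new parts asynchronously.
CREATE TABLE hits_replicated
(
    EventDate Date,
    UserID    UInt64
)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/hits_replicated', '{replica}')
PARTITION BY toYYYYMM(EventDate)
ORDER BY (EventDate, UserID);
```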
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/introduction/distinctive_features/) diff --git a/docs/fa/introduction/features_considered_disadvantages.md b/docs/fa/introduction/features_considered_disadvantages.md index 13a9828e6b9..5a8ea156da5 100644 --- a/docs/fa/introduction/features_considered_disadvantages.md +++ b/docs/fa/introduction/features_considered_disadvantages.md @@ -1,10 +1,10 @@ -
+
-# ویژگی های از ClickHouse که می تواند معایبی باشد. +# ویژگی های از ClickHouse که می تواند معایبی باشد. {#wyjgy-hy-z-clickhouse-khh-my-twnd-m-yby-bshd} -1. بدون پشتیبانی کامل از تراکنش -2. عدم توانایی برای تغییر و یا حذف داده های در حال حاضر وارد شده با سرعت بالا و تاخیر کم. برای پاک کردن و یا اصلاح داده ها، به عنوان مثال برای پیروی از [GDPR](https://gdpr-info.eu)، دسته ای پاک و به روزرسانی وجود دارد.حال توسعه می باشد. -3. Sparse index باعث می شود ClickHouse چندان مناسب اجرای پرسمان های point query برای دریافت یک ردیف از داده ها با استفاده از کلید آنها نباشد. +1. بدون پشتیبانی کامل از تراکنش +2. عدم توانایی برای تغییر و یا حذف داده های در حال حاضر وارد شده با سرعت بالا و تاخیر کم. برای پاک کردن و یا اصلاح داده ها، به عنوان مثال برای پیروی از [GDPR](https://gdpr-info.eu)، دسته ای پاک و به روزرسانی وجود دارد.حال توسعه می باشد. +3. Sparse index باعث می شود ClickHouse چندان مناسب اجرای پرسمان های point query برای دریافت یک ردیف از داده ها با استفاده از کلید آنها نباشد.
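Point 2 in the list above corresponds to ClickHouse mutations (`ALTER TABLE … DELETE/UPDATE`), which modify data in batches rather than per row. A minimal sketch, assuming a hypothetical `hits` table and columns:

``` sql
-- Mutations run asynchronously and rewrite whole data parts, so they are suited to
-- occasional batch clean-up (for example GDPR requests), not to frequent point updates.
ALTER TABLE hits DELETE WHERE UserID = 42;
ALTER TABLE hits UPDATE Title = '' WHERE UserID = 42;
```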
diff --git a/docs/fa/introduction/history.md b/docs/fa/introduction/history.md index db523df31c8..7882c89eae5 100644 --- a/docs/fa/introduction/history.md +++ b/docs/fa/introduction/history.md @@ -1,6 +1,6 @@ -
+
-# ClickHouse ﻪﭽﺨﯾﺭﺎﺗ +# ClickHouse ﻪﭽﺨﯾﺭﺎﺗ {#clickhouse} ClickHouse در ابتدا برای قدرت به Yandex.Metrica دومین بستر آنالیز وب در دنیا توسعه داده شد، و همچنان جز اصلی آن است. ClickHouse اجازه می دهند که با بیش از 13 تریلیون رکورد در دیتابیس و بیش از 20 میلیارد event در روز، گزارش های مستقیم (On the fly) از داده های non-aggregate تهیه کنیم. این مقاله پیشنیه ی تاریخی در ارتباط با اهداف اصلی ClickHouse قبل از آنکه به یک محصول open source تبدیل شود، می دهد. @@ -8,34 +8,34 @@ Yandex.Metrica تولید گزارش های برپایه بازدید و session از آوریل 2014، Yandex.Metrica تقریبا 12 میلیارد event شامل page view و click در روز دریافت کرد. تمام این event ها باید به ترتیب برای ساخت گزارش های سفارشی ذخیره سازی می شدند. یک query ممکن است نیاز به اسکن کردن میلیون ها سطر با زمان کمتر از چند صد میلی ثانیه، یا چند صد میلیون سطر در عرض چند ثانیه داشته باشد. -## استفاده در Yandex.Metrica و دیگر سرویس های Yandex +## استفاده در Yandex.Metrica و دیگر سرویس های Yandex {#stfdh-dr-yandex-metrica-w-dygr-srwys-hy-yandex} ClickHouse با چندین اهداف در Yandex.Metrica استفاده می شود. وظیفه اصلی آن ساخت گزارش های آنلاین از داده های non-aggregate می باشد. ClickHouse در یک کلاستر با سایز 374 سرور، که بیش از 20.3 تریلیون سطر در دیتابیس را دارد مورد استفاده قرار می گیرد. اندازه فشرده داده ها، بدون شمارش داده های تکراری و replication، حدود 2 پتابایت می باشد. اندازه ی غیرفشرده داده ها (در فرمت TSV) حدودا 17 پتابایت می باشد. ClickHouse همچنین در موارد زیراستفاده می شود: -- ذخیره سازی داده ها برای Session replay از Yandex.Metrica. -- پردازش داده های Intermediate. -- ساخت گزارش های سراسری از آنالیز ها. -- اجرای query ها برای debug کردن موتور Yandex.Metrica. -- آنالیز لاگ های به دست آمده از API ها و user interface. +- ذخیره سازی داده ها برای Session replay از Yandex.Metrica. +- پردازش داده های Intermediate. +- ساخت گزارش های سراسری از آنالیز ها. +- اجرای query ها برای debug کردن موتور Yandex.Metrica. +- آنالیز لاگ های به دست آمده از API ها و user interface. ClickHouse حداقل در دوازده جای دیگر سرویس Yandex نصب شده است: در search verticals، Market، Direct، Business Analytics، Mobile Development، AdFox، سرویس های شخصی و.. -## داده های Aggregate , Non-Aggregate +## داده های Aggregate , Non-Aggregate {#ddh-hy-aggregate-non-aggregate} یک دیدگاه محبوب وجود دارد که شما باید، داده های خود را به منظور کاهش اندازه داده ها Aggregate کنید. اما به دلایل زیر، aggregate کردن داده ها راه حل بسیار محدودی است: -- شما باید لیست گزارش های از قبل تعریف شده توسط کاربر که نیاز به تهیه گزارش آنها را دارید، داشته باشید. -- کاربر نمیتواند گزارش های سفارشی تهیه کند. -- در هنگام aggregate کردن تعداد بسیار زیاد key، اندازه ی داده ها کم نمی شود و aggregate بی فایده است. -- برای تعداد زیادی از گزارش ها، aggregate های متنوع و تغییرپذیر زیادی وجود دارد. (انفجار ترکیبی). -- هنگام aggregate کردن key ها با cardinality بالا (مثل URL ها)، اندازه داده ها به اندازه کافی کاهش پیدا نمی کند (کمتر از دو برابر). -- به این دلیل اندازه ی داده ها با aggregate کردن ممکن است به جای شکستن، رشد هم بکند. -- کاربر تمام گزارش هایی که ما تولید کردیم را نگاه نمی کند. بخش بزرگی از محاسبات بی فایده است. -- یکپارچگی منطقی داده ها ممکن است برای aggregate های مختلف نقض شود. +- شما باید لیست گزارش های از قبل تعریف شده توسط کاربر که نیاز به تهیه گزارش آنها را دارید، داشته باشید. +- کاربر نمیتواند گزارش های سفارشی تهیه کند. +- در هنگام aggregate کردن تعداد بسیار زیاد key، اندازه ی داده ها کم نمی شود و aggregate بی فایده است. +- برای تعداد زیادی از گزارش ها، aggregate های متنوع و تغییرپذیر زیادی وجود دارد. (انفجار ترکیبی). 
+- هنگام aggregate کردن key ها با cardinality بالا (مثل URL ها)، اندازه داده ها به اندازه کافی کاهش پیدا نمی کند (کمتر از دو برابر). +- به این دلیل اندازه ی داده ها با aggregate کردن ممکن است به جای شکستن، رشد هم بکند. +- کاربر تمام گزارش هایی که ما تولید کردیم را نگاه نمی کند. بخش بزرگی از محاسبات بی فایده است. +- یکپارچگی منطقی داده ها ممکن است برای aggregate های مختلف نقض شود. اگر ما هیچ چیزی را aggregate نکنیم و با داده های non-aggregate کار کنیم، در واقع این ممکن است باعث کاهش اندازه ی محاسبات شود. @@ -46,4 +46,5 @@ Yandex.Metrica دارای یک سیستم تخصصی برای aggregate کردن برای حذف محدودیت های OLAPServer و حل مشکلات کار با داده های Non-Aggregate برای تمام گزارش ها، ما مدیریت دیتابیس ClicHouse را توسعه دادیم..
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/introduction/ya_metrika_task/) diff --git a/docs/fa/introduction/performance.md b/docs/fa/introduction/performance.md index 552b49966fc..5eb876f5ee1 100644 --- a/docs/fa/introduction/performance.md +++ b/docs/fa/introduction/performance.md @@ -1,28 +1,29 @@ -
+
-# Performance +# Performance {#performance} با توجه به نتایج تست های Yandex، ClickHouse بهترین عملکرد را برای سناریوهای عملیاتی قابل مقایسه با دیگر سیستم های در کلاس خود را از خود نشان داد. این تست ها شامل بالاترین توان عملیاتی برای query های طولانی، و کمترین latency برای query های کوتاه بود. نتایج این تست های در [صفحه ی جدا](https://clickhouse.tech/benchmark.html) موجود است. benchmark های زیادی وجود دارند که این نتایج را تایید می کنند. میتوانید این نتایج را جستجو کنید و یا [این لینک های benchmark](https://clickhouse.tech/#independent-benchmarks). مستقل را ببینید. -## توان عملیاتی برای یک query بزرگ +## توان عملیاتی برای یک query بزرگ {#twn-mlyty-bry-ykh-query-bzrg} توان عملیاتی می تواند به صورت تعداد سطر در ثانیه و یا تعداد مگابایت در ثانیه اندازه گیری شود. اگر داده ها در page cache قرار داشته باشند، یک query برای اجرا شدن بر روی سخت افزارهای مدرن چندان پیچیده نخواهد بود و با سرعت تقریبا 2 تا 10 گیگابایت در ثانیه برای داده های غیرفشرده و در یک سرور پردازش خواهد شد (برای ساده ترین موارد، سرعت ممکن است به 30 گیگابایت در ثانیه برسد). اگر داده ها در page cache قرار نداشته باشند، سرعت محدود به دیسک و همچنین چگونگی فشرده سازی داده ها بر روی دیسک می باشد. برای مثال اگر یک دیسک اجازه ی خواندن داده ها با سرعت 400 مگابایت در ثانیه را بدهد، و داده ها با نرخ 3 فشرده سازی شده باشند، سرعت در حدود 1.2 گیگابایت در ثانیه خواهد بود. برای گرفتن تعداد رکورد در ثانیه، سرعت بایت در ثانیه را تقسیم بر کل سایز ستون ها مورد استفاده در query می کنیم. برای مثال اگر 10 بایت از ستوه ها استخراج می شود، سرعت در حدود 100 تا 200 میلیون سطر در ثانیه می باشد. سرعت پردازش در سرویس توزیع شده تقریبا به صورت خطی افزایش پیدا می کند، اما فقط وقتی که نتایج سطرهای به دست آمده از aggeration یا مرتب سازی زیاد بزرگ نباشند. -## Latency در زمان پردازش query های کوتاه +## Latency در زمان پردازش query های کوتاه {#latency-dr-zmn-prdzsh-query-hy-khwth} اگر یک query از Primary Key استفاده کند و تعداد زیادی از سطر ها را برای پردازش select نکند (صدها هزار)، و از تعداد زیادی ستون استفاده نکند،اگر داده ها در page cache قرار داشته باشند، ما میتوانیم انتظار latency کمتر از 50 میلی ثانیه را داشته باشیم. در غیر این صورت محاسبه زمان براساس تعداد seek ها انجام خواهد گرفت. اگر شما از هارد های دیسکی استفاده می کنید، برای سیستمی که overload ندارد، محاسبه تقریبی latency با استفاده از این فرمول ممکن است: زمان seek (10 ms) \* تعداد ستون های مورد نیاز در query \* تعداد قطعات داده -## توان عملیاتی در هنگام پردازش تعداد زیادی از query های کوتاه +## توان عملیاتی در هنگام پردازش تعداد زیادی از query های کوتاه {#twn-mlyty-dr-hngm-prdzsh-t-dd-zydy-z-query-hy-khwth} تحت شرایط مشابه، ClickHouse توانایی رسیدگی به چند صد query در ثانیه به ازای یک سرور را دارد ( بالای چند هزار در ثانیه در بهترین مورد). از آنجایی که این سناریو در مدیریت دیتابیس های آنالیزی معمول نیست، بهتر است نهایتا انتظار چند صد query در ثانیه را داشته باشید. -## Performance در هنگام درج داده ها +## Performance در هنگام درج داده ها {#performance-dr-hngm-drj-ddh-h} پیشنهاد می کنیم درج داده ها را به صورت دسته ای و حداقل 100 سطر در هر دسته انجام دهید و یا بیش از یک درخواست insert در ثانیه را نداشته باشید. در هنگام درج داده در جدول MergeTree از یک dump جدا شده با tab، سرعت درج داده از 50 تا 200 مگابایت در ثانیه می باشد. اگر سطر های درج شده حدود 1 کیلوبایت باشند، سرعت حدود 50 هزار تا 200 هزار سطر در ثانیه می باشد. اگر سطر ها کوچک باشند بازدهی بالایی در تعداد سطر در ثانیه خواهیم داشت. در Banner System Data -`>` 500 هزار سطر در ثانیه، در Graphite data -`>` 1 میلیون سطر در ثانیه). برای بهبود کارایی، شما می توانید چندین insert را به صورت موازی اجرا کنید، که در این حالت کارایی سیستم به صورت خطی افزایش می یابد.
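A minimal sketch of the batching advice above; the `hits` table and its columns are illustrative assumptions:

``` sql
-- A single INSERT carrying many rows is far cheaper than many single-row INSERTs;
-- aim for large batches (hundreds to thousands of rows) or at most roughly one INSERT per second.
INSERT INTO hits (EventDate, UserID) VALUES
    ('2019-01-01', 101),
    ('2019-01-01', 102),
    ('2019-01-01', 103);
```

Several such INSERTs can also be issued in parallel, which, as noted above, scales throughput roughly linearly.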
+ [مقاله اصلی](https://clickhouse.tech/docs/fa/introduction/performance/) diff --git a/docs/fa/operations/performance/sampling_query_profiler_example_result.txt b/docs/fa/operations/performance/sampling_query_profiler_example_result.txt new file mode 120000 index 00000000000..58c5abe7122 --- /dev/null +++ b/docs/fa/operations/performance/sampling_query_profiler_example_result.txt @@ -0,0 +1 @@ +../../../en/operations/performance/sampling_query_profiler_example_result.txt \ No newline at end of file diff --git a/docs/ja/commercial/cloud.md b/docs/ja/commercial/cloud.md new file mode 120000 index 00000000000..eb58e4a90be --- /dev/null +++ b/docs/ja/commercial/cloud.md @@ -0,0 +1 @@ +../../en/commercial/cloud.md \ No newline at end of file diff --git a/docs/ja/images/logo.svg b/docs/ja/images/logo.svg index 865b96d98c7..b5ab923ff65 100644 --- a/docs/ja/images/logo.svg +++ b/docs/ja/images/logo.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/ja/index.md b/docs/ja/index.md index 7d4ddc11eb2..0feff817023 100644 --- a/docs/ja/index.md +++ b/docs/ja/index.md @@ -1,15 +1,15 @@ -# ClickHouseとは? +# ClickHouseとは? {#clickhousetoha} ClickHouseは、クエリのオンライン分析処理(OLAP)用の列指向のデータベース管理システム(DBMS)です。 「通常の」行指向のDBMSでは、データは次の順序で保存されます。 -| Row | WatchID | JavaEnable | Title | GoodEvent | EventTime | -| ------ | ------------------- | ---------- | ------------------ | --------- | ------------------- | -| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | -| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | -| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | -| #N | ... | ... | ... | ... | ... | +| Row | WatchID | JavaEnable | Title | GoodEvent | EventTime | +|-----|-------------|------------|--------------------|-----------|---------------------| +| \#0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | +| \#1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | +| \#2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | +| \#N | … | … | … | … | … | つまり、行に関連するすべての値は物理的に隣り合わせに格納されます。 @@ -18,13 +18,13 @@ ClickHouseは、クエリのオンライン分析処理(OLAP)用の列指向 列指向のDBMSでは、データは次のように保存されます: -| Row: | #0 | #1 | #2 | #N | -| ----------- | ------------------- | ------------------- | ------------------- | ------------------- | -| WatchID: | 89354350662 | 90329509958 | 89953706054 | ... | -| JavaEnable: | 1 | 0 | 1 | ... | -| Title: | Investor Relations | Contact us | Mission | ... | -| GoodEvent: | 1 | 1 | 1 | ... | -| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | ... 
| +| Row: | \#0 | \#1 | \#2 | \#N | +|-------------|---------------------|---------------------|---------------------|-----| +| WatchID: | 89354350662 | 90329509958 | 89953706054 | … | +| JavaEnable: | 1 | 0 | 1 | … | +| Title: | Investor Relations | Contact us | Mission | … | +| GoodEvent: | 1 | 1 | 1 | … | +| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | … | これらの例は、データが配置される順序のみを示しています。 異なる列の値は別々に保存され、同じ列のデータは一緒に保存されます。 @@ -37,25 +37,25 @@ ClickHouseは、クエリのオンライン分析処理(OLAP)用の列指向 システムの負荷が高いほど、使用シナリオの要件に一致するようにセットアップされたシステムをカスタマイズすることがより重要になり、このカスタマイズはより細かくなります。大きく異なるシナリオに等しく適したシステムはありません。システムがさまざまなシナリオに適応可能である場合、高負荷下では、システムはすべてのシナリオを同等に不十分に処理するか、1つまたはいくつかの可能なシナリオでうまく機能します。 -## OLAPシナリオの主要なプロパティ +## OLAPシナリオの主要なプロパティ {#olapsinarionozhu-yao-napuropatei} -- リクエストの大部分は読み取りアクセス用である。 -- データは、単一行ではなく、かなり大きなバッチ(> 1000行)で更新されます。または、まったく更新されない。 -- データはDBに追加されるが、変更されない。 -- 読み取りの場合、非常に多くの行がDBから抽出されるが、一部の列のみ。 -- テーブルは「幅が広く」、多数の列が含まれる。 -- クエリは比較的まれ(通常、サーバーあたり毎秒数百あるいはそれ以下の数のクエリ)。 -- 単純なクエリでは、約50ミリ秒の遅延が容認される。 -- 列の値はかなり小さく、数値や短い文字列(たとえば、URLごとに60バイト)。 -- 単一のクエリを処理する場合、高いスループットが必要(サーバーあたり毎秒最大数十億行)。 -- トランザクションは必要ない。 -- データの一貫性の要件が低い。 -- クエリごとに1つの大きなテーブルがある。 1つを除くすべてのテーブルは小さい。 -- クエリ結果は、ソースデータよりも大幅に小さくなる。つまり、データはフィルター処理または集計されるため、結果は単一サーバーのRAMに収まる。 +- リクエストの大部分は読み取りアクセス用である。 +- データは、単一行ではなく、かなり大きなバッチ(\> 1000行)で更新されます。または、まったく更新されない。 +- データはDBに追加されるが、変更されない。 +- 読み取りの場合、非常に多くの行がDBから抽出されるが、一部の列のみ。 +- テーブルは「幅が広く」、多数の列が含まれる。 +- クエリは比較的まれ(通常、サーバーあたり毎秒数百あるいはそれ以下の数のクエリ)。 +- 単純なクエリでは、約50ミリ秒の遅延が容認される。 +- 列の値はかなり小さく、数値や短い文字列(たとえば、URLごとに60バイト)。 +- 単一のクエリを処理する場合、高いスループットが必要(サーバーあたり毎秒最大数十億行)。 +- トランザクションは必要ない。 +- データの一貫性の要件が低い。 +- クエリごとに1つの大きなテーブルがある。 1つを除くすべてのテーブルは小さい。 +- クエリ結果は、ソースデータよりも大幅に小さくなる。つまり、データはフィルター処理または集計されるため、結果は単一サーバーのRAMに収まる。 OLAPシナリオは、他の一般的なシナリオ(OLTPやKey-Valueアクセスなど)とは非常に異なることが容易にわかります。 したがって、まともなパフォーマンスを得るには、OLTPまたはKey-Value DBを使用して分析クエリを処理しようとするのは無意味です。 たとえば、分析にMongoDBまたはRedisを使用しようとすると、OLAPデータベースに比べてパフォーマンスが非常に低下します。 -## OLAPシナリオで列指向データベースがよりよく機能する理由 +## OLAPシナリオで列指向データベースがよりよく機能する理由 {#olapsinariodelie-zhi-xiang-detabesugayoriyokuji-neng-suruli-you} 列指向データベースは、OLAPシナリオにより適しています。ほとんどのクエリの処理が少なくとも100倍高速です。 理由を以下に詳しく説明しますが、その根拠は視覚的に簡単に説明できます: @@ -69,25 +69,30 @@ OLAPシナリオは、他の一般的なシナリオ(OLTPやKey-Valueアクセ 違いがわかりましたか? -### Input/output +### Input/output {#inputoutput} -1. 分析クエリでは、少数のテーブル列のみを読み取る必要があります。列指向のデータベースでは、必要なデータのみを読み取ることができます。たとえば、100のうち5つの列が必要な場合、I/Oが20倍削減されることが期待できます。 -2. データはパケットで読み取られるため、圧縮が容易です。列のデータも圧縮が簡単です。これにより、I/Oボリュームがさらに削減されます。 -3. I/Oの削減により、より多くのデータがシステムキャッシュに収まります。 +1. 分析クエリでは、少数のテーブル列のみを読み取る必要があります。列指向のデータベースでは、必要なデータのみを読み取ることができます。たとえば、100のうち5つの列が必要な場合、I/Oが20倍削減されることが期待できます。 +2. データはパケットで読み取られるため、圧縮が容易です。列のデータも圧縮が簡単です。これにより、I/Oボリュームがさらに削減されます。 +3. I/Oの削減により、より多くのデータがシステムキャッシュに収まります。 たとえば、「各広告プラットフォームのレコード数をカウントする」クエリでは、1つの「広告プラットフォームID」列を読み取る必要がありますが、これは非圧縮では1バイトの領域を要します。トラフィックのほとんどが広告プラットフォームからのものではない場合、この列は少なくとも10倍の圧縮が期待できます。高速な圧縮アルゴリズムを使用すれば、1秒あたり少なくとも非圧縮データに換算して数ギガバイトの速度でデータを展開できます。つまり、このクエリは、単一のサーバーで1秒あたり約数十億行の速度で処理できます。この速度はまさに実際に達成されます。 -
Example -```bash +
+ +Example + +``` bash $ clickhouse-client ClickHouse client version 0.0.52053. Connecting to localhost:9000. Connected to ClickHouse server version 0.0.52053. ``` -```sql + +``` sql SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 ``` -```text + +``` text ┌─CounterID─┬──count()─┐ │ 114208 │ 56057344 │ │ 115080 │ 51619590 │ @@ -114,16 +119,16 @@ SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIM
-### CPU +### CPU {#cpu} クエリを実行するには大量の行を処理する必要があるため、個別の行ではなくベクター全体のすべての操作をディスパッチするか、ディスパッチコストがほとんどないようにクエリエンジンを実装すると効率的です。 適切なディスクサブシステムでこれを行わないと、クエリインタープリターが必然的にCPUを失速させます。 データを列に格納し、可能な場合は列ごとに処理することは理にかなっています。 これを行うには2つの方法があります: -1. ベクトルエンジン。 すべての操作は、個別の値ではなく、ベクトルに対して記述されます。 これは、オペレーションを頻繁に呼び出す必要がなく、ディスパッチコストが無視できることを意味します。 操作コードには、最適化された内部サイクルが含まれています。 +1. ベクトルエンジン。 すべての操作は、個別の値ではなく、ベクトルに対して記述されます。 これは、オペレーションを頻繁に呼び出す必要がなく、ディスパッチコストが無視できることを意味します。 操作コードには、最適化された内部サイクルが含まれています。 -2. コード生成。 クエリ用に生成されたコードには、すべての間接的な呼び出しが含まれています。 +2. コード生成。 クエリ用に生成されたコードには、すべての間接的な呼び出しが含まれています。 これは、単純なクエリを実行する場合には意味がないため、「通常の」データベースでは実行されません。 ただし、例外があります。 たとえば、MemSQLはコード生成を使用して、SQLクエリを処理する際の遅延を減らします。 (比較のために、分析DBMSではレイテンシではなくスループットの最適化が必要です。) diff --git a/docs/ja/introduction/distinctive_features.md b/docs/ja/introduction/distinctive_features.md index da023db9825..eb9c7391fe5 100644 --- a/docs/ja/introduction/distinctive_features.md +++ b/docs/ja/introduction/distinctive_features.md @@ -1,6 +1,6 @@ -# ClickHouseの特徴 +# ClickHouseの特徴 {#clickhousenote-zheng} -## 真の列指向DBMS +## 真の列指向DBMS {#zhen-nolie-zhi-xiang-dbms} 真の列指向DBMSでは、値に関する余分なデータは保存されません。 これは、固定長の値がサポートされ、「データ長」が格納されないことを意味します。 例として、10億個のUInt8型の値は、圧縮されていない状態で約1 GBを消費しますが、そうしないとCPU負荷に大きく影響します。 これは、データの展開速度(CPU使用率)が、主に非圧縮データのサイズに依存するため、データを(冗長なデータが含まれないように)コンパクトに格納することが非常に重要であるためです。 @@ -8,51 +8,51 @@ また、ClickHouseは単一のデータベースではなく、データベース管理システムであることに注意してください。 ClickHouseを使用すると、サーバーを再構成および再起動することなく、実行時にテーブルとデータベースを作成し、データを読み込み、クエリを実行できます。 -## データ圧縮 +## データ圧縮 {#detaya-suo} 一部の列指向DBMS(InfiniDB CEおよびMonetDB)は、データ圧縮を使用しません。ただし、優れたパフォーマンスを実現するには、データ圧縮が重要な役割を果たします。 -## データのディスクストレージ +## データのディスクストレージ {#detanodeisukusutorezi} -データを主キーで物理的に並べ替えておくと、特定の値または値範囲のデータを数十ミリ秒未満の低遅延で抽出できます。一部の列指向DBMS(SAP HANAやGoogle PowerDrillなど)は、RAMでのみ機能します。このアプローチでは、リアルタイム分析を行う上で、実際に必要なハードウェアよりも規模の大きな予算の割り当てが推奨されます。 これに対しClickHouseは、通常のハードドライブで動作するように設計されています。 つまり、データストレージのGBあたりのコストは低くなり、さらに利用可能なSSDや追加のRAMがあれば、それらもフルに使用されます。 +データを主キーで物理的に並べ替えておくと、特定の値または値範囲のデータを数十ミリ秒未満の低遅延で抽出できます。一部の列指向DBMS(SAP HANAやGoogle PowerDrillなど)は、RAMでのみ機能します。このアプローチでは、リアルタイム分析を行う上で、実際に必要なハードウェアよりも規模の大きな予算の割り当てが推奨されます。 これに対しClickHouseは、通常のハードドライブで動作するように設計されています。 つまり、データストレージのGBあたりのコストは低くなり、さらに利用可能なSSDや追加のRAMがあれば、それらもフルに使用されます。 -## マルチコアでの並列処理 +## マルチコアでの並列処理 {#marutikoadenobing-lie-chu-li} 大規模なクエリは、現在のサーバーで使用可能なすべての必要なリソースを使用して、自然な方法で並列化されます。 -## 複数のサーバーでの分散処理 +## 複数のサーバーでの分散処理 {#fu-shu-nosabadenofen-san-chu-li} 前述の列指向DBMSのほとんどは、分散クエリ処理をサポートしていません。 ClickHouseでは、データを異なるシャードに置くことができます。各シャードは、フォールトトレランスに使用されるレプリカのグループにすることができます。クエリは、すべてのシャードで並行して処理されます。これはユーザーにとって透過的です。 -## SQLのサポート +## SQLのサポート {#sqlnosapoto} ClickHouseは、多くの場合、標準SQLと同一のSQLに基づく宣言的なクエリ言語をサポートしています。サポートされるクエリには、GROUP BY、ORDER BYや、FROM、IN、およびJOIN句中のサブクエリ、およびスカラーサブクエリが含まれます。従属サブクエリとウィンドウ関数はサポートされていません。 -## ベクトルエンジン +## ベクトルエンジン {#bekutoruenzin} データは列ごとに格納されるだけでなく、ベクトル(列の一部)によって処理されます。これにより、高いCPU効率を実現できます。 -## リアルタイムのデータ更新 +## リアルタイムのデータ更新 {#riarutaimunodetageng-xin} ClickHouseは、主キーを持つテーブルをサポートします。 主キーの範囲でクエリをすばやく実行するために、データはマージツリーを使用して随時並べ替えられます。これにより、データを継続的にテーブルに追加できます。新しいデータが取り込まれたときにロックは取得されません。 -## インデックス +## インデックス {#indetukusu} データを主キーで物理的に並べ替えることにより、特定の値または値範囲のデータを数十ミリ秒未満の低遅延で抽出できます。 -## オンラインクエリとの親和性 +## オンラインクエリとの親和性 {#onrainkueritonoqin-he-xing} 低レイテンシであるとは、UIページの読み込み中に、事前計算を伴わないでクエリを遅延なく処理できることを意味します。これはオンラインであるとも言えます。 -## 近似計算のサポート +## 近似計算のサポート {#jin-si-ji-suan-nosapoto} ClickHouseには、精度を犠牲にしてパフォーマンスを得るための様々な方法が用意されています。 -1. ユニーク値、中央値、および分位数を近似計算のための集計関数。 -2. データの一部(サンプル)に基づいてクエリを実行し、近似結果を取得します。この場合、ディスクから取得されるデータは近似に比例して少なくなります。 -3. 
すべてのキーではなく、限られた数のランダムキーに対して集計を実行します。データ内のキー分布が特定の条件下であれば、使用するリソースが少なくなり、かなり正確な結果が得られます。 +1. ユニーク値、中央値、および分位数を近似計算のための集計関数。 +2. データの一部(サンプル)に基づいてクエリを実行し、近似結果を取得します。この場合、ディスクから取得されるデータは近似に比例して少なくなります。 +3. すべてのキーではなく、限られた数のランダムキーに対して集計を実行します。データ内のキー分布が特定の条件下であれば、使用するリソースが少なくなり、かなり正確な結果が得られます。 -## データのレプリケーションとデータ整合性のサポート +## データのレプリケーションとデータ整合性のサポート {#detanorepurikesiyontodetazheng-he-xing-nosapoto} 非同期マルチマスターレプリケーションを使用します。使用可能なレプリカに書き込まれた後、データはバックグラウンドで残りのすべてのレプリカに配布されます。システムは、異なるレプリカで同一のデータを維持します。ほとんどの障害後の回復は、自動的に、複雑な場合には半自動的に実行されます。 diff --git a/docs/ja/introduction/features_considered_disadvantages.md b/docs/ja/introduction/features_considered_disadvantages.md index 596fb9ba86f..0c2eebed9b7 100644 --- a/docs/ja/introduction/features_considered_disadvantages.md +++ b/docs/ja/introduction/features_considered_disadvantages.md @@ -1,8 +1,7 @@ -# 欠点と考えられるClickHouseの機能 +# 欠点と考えられるClickHouseの機能 {#qian-dian-tokao-erareruclickhousenoji-neng} -1. 本格的なトランザクションはありません。 -2. 既に挿入されたデータの変更または削除を、高頻度かつ低遅延に行う機能はありません。 [GDPR](https://gdpr-info.eu)に準拠するなど、データをクリーンアップまたは変更するために、バッチ削除およびバッチ更新が利用可能です。 -3. インデックスが疎であるため、ClickHouseは、キーで単一行を取得するようなクエリにはあまり適していません。 +1. 本格的なトランザクションはありません。 +2. 既に挿入されたデータの変更または削除を、高頻度かつ低遅延に行う機能はありません。 [GDPR](https://gdpr-info.eu)に準拠するなど、データをクリーンアップまたは変更するために、バッチ削除およびバッチ更新が利用可能です。 +3. インデックスが疎であるため、ClickHouseは、キーで単一行を取得するようなクエリにはあまり適していません。 [Original article](https://clickhouse.yandex/docs/en/introduction/features_considered_disadvantages/) - diff --git a/docs/ja/introduction/history.md b/docs/ja/introduction/history.md index fbae277542d..2a5631bda1e 100644 --- a/docs/ja/introduction/history.md +++ b/docs/ja/introduction/history.md @@ -1,39 +1,39 @@ -# ClickHouseの歴史 +# ClickHouseの歴史 {#clickhousenoli-shi} -ClickHouseは元々、 [世界で2番目に大きなWeb分析プラットフォーム ](http://w3techs.com/technologies/overview/traffic_analysis/all) である [Yandex.Metrica](https://metrica.yandex.com/)を強化するために開発されたもので、このシステムのコアコンポーネントであり続けています。データベースには13兆を超えるレコードがあり、毎日200億を超えるイベントが発生しますが、ClickHouseでは集計されていないデータから直接カスタムレポートを生成できます。この記事では、ClickHouseの開発の初期段階におけるClickHouseの目標について簡単に説明します。 +ClickHouseは元々、 [世界で2番目に大きなWeb分析プラットフォーム](http://w3techs.com/technologies/overview/traffic_analysis/all) である [Yandex.Metrica](https://metrica.yandex.com/)を強化するために開発されたもので、このシステムのコアコンポーネントであり続けています。データベースには13兆を超えるレコードがあり、毎日200億を超えるイベントが発生しますが、ClickHouseでは集計されていないデータから直接カスタムレポートを生成できます。この記事では、ClickHouseの開発の初期段階におけるClickHouseの目標について簡単に説明します。 Yandex.Metricaは、ユーザーが定義した任意のセグメントを使用して、ヒットとセッションに基づいてカスタマイズされたレポートをその都度作成します。これには、多くの場合、一意のユーザー数などの複雑な集計を作成する必要があり、レポートを作成するための新しいデータがリアルタイムで受信されます。 2014年4月の時点で、Yandex.Metricaは毎日約120億のイベント(ページビューとクリック)を追跡していました。カスタムレポートを作成するには、これらすべてのイベントを保存する必要があります。単一のクエリで、数百ミリ秒以内に数百万行をスキャンしたり、わずか数秒で数億行をスキャンする必要があります。 -## Yandex.Metricaおよびその他のYandexサービスでの用途 +## Yandex.Metricaおよびその他のYandexサービスでの用途 {#yandex-metricaoyobisonota-noyandexsabisudenoyong-tu} ClickHouseは、Yandex.Metricaで複数の目的に使用されます。その主なタスクは、非集計データを使用してオンラインでレポートを作成することです。 374台のサーバーからなるクラスターを使用し、20.3兆行をデータベースに保存します。圧縮されたデータの量は、重複データとレプリケーションを除いて約2PBです。非圧縮データ(TSV形式)の量は約17PBにもなります。 ClickHouseは以下の目的にも使用されます。 -- Yandex.Metricaのデータをセッションリプレイのために保存する。 -- 中間データを処理する。 -- Analyticsを使用したグローバルレポートの作成。 -- Yandex.Metricaエンジンをデバッグするためのクエリの実行。 -- APIおよびユーザーインターフェイスからのログの分析。 +- Yandex.Metricaのデータをセッションリプレイのために保存する。 +- 中間データを処理する。 +- Analyticsを使用したグローバルレポートの作成。 +- Yandex.Metricaエンジンをデバッグするためのクエリの実行。 +- APIおよびユーザーインターフェイスからのログの分析。 ClickHouseは少なくとも、そのほか12のYandexのサービス(検索分野、Market、Direct、ビジネス分析、モバイル開発、AdFox、パーソナルサービスなど)で利用されています。 -## 集約されたデータと非集約データ +## 集約されたデータと非集約データ 
{#ji-yue-saretadetatofei-ji-yue-deta} 統計を効果的に計算するには、データの量を減らすため、データを集計する必要があるという一般的な意見があります。 ただし次の理由により、データ集約は非常に限られた解決策です。 -- ユーザーが必要とするレポートの事前定義リストが必要です。 -- ユーザーはカスタムレポートを作成できません。 -- 大量のキーを集約する場合、データ量は削減されず、集約は役に立ちません。 -- 多数のレポートの場合、集計のバリエーションが多すぎます(組み合わせ爆発) -- カーディナリティの高いキー(URLなど)を集約する場合、データの量はそれほど減少しません(たかだか半分程度)。 -- このため、集約されたデータの量は減少するどころか増加する場合があります。 -- 生成した全てのレポートをユーザが見るわけではありません。計算の大部分は無駄になっています。 -- データの論理的な整合性は、さまざまな集計に対して違反する可能性があります。 +- ユーザーが必要とするレポートの事前定義リストが必要です。 +- ユーザーはカスタムレポートを作成できません。 +- 大量のキーを集約する場合、データ量は削減されず、集約は役に立ちません。 +- 多数のレポートの場合、集計のバリエーションが多すぎます(組み合わせ爆発) +- カーディナリティの高いキー(URLなど)を集約する場合、データの量はそれほど減少しません(たかだか半分程度)。 +- このため、集約されたデータの量は減少するどころか増加する場合があります。 +- 生成した全てのレポートをユーザが見るわけではありません。計算の大部分は無駄になっています。 +- データの論理的な整合性は、さまざまな集計に対して違反する可能性があります。 何も集約せず、集約されていないデータを操作する場合、実際には計算量が減る可能性があります。 diff --git a/docs/ja/introduction/performance.md b/docs/ja/introduction/performance.md index 013a47497da..6e4763649ef 100644 --- a/docs/ja/introduction/performance.md +++ b/docs/ja/introduction/performance.md @@ -1,25 +1,25 @@ -# パフォーマンス +# パフォーマンス {#pahuomansu} Yandexの内部テスト結果によると、ClickHouseは、テスト可能なクラスのシステム間で同等の動作シナリオで最高のパフォーマンス(長時間のクエリで最も高いスループットと、短時間のクエリで最小のレイテンシの両方)を示します。 [別のページで](https://clickhouse.yandex/benchmark.html)テスト結果を表示できます 。 -これは、多数の独立したベンチマークでも確認されています。インターネット検索で見つけることは難しくありませんし、 [ 私達がまとめた関連リンク集 ](https://clickhouse.yandex/#independent-benchmarks) から見つけることもできます。 +これは、多数の独立したベンチマークでも確認されています。インターネット検索で見つけることは難しくありませんし、 [私達がまとめた関連リンク集](https://clickhouse.yandex/#independent-benchmarks) から見つけることもできます。 -## 単一の巨大なクエリのスループット +## 単一の巨大なクエリのスループット {#dan-yi-noju-da-nakuerinosurupututo} スループットは、1秒あたりの行数または1秒あたりのメガバイト数で測定できます。データがページキャッシュに配置される場合、モダンなハードウェアで実行される、それほど複雑でないクエリは、単一サーバ上の非圧縮データに対し 約2〜10GB/秒 の速度で処理されます (最も単純な場合、速度は30GB/秒)。データがページキャッシュに配置されない場合、速度はディスクサブシステムとデータ圧縮率に依存します。たとえば、ディスクサブシステムが400 MB /秒でのデータの読み取りを許可し、データ圧縮率が3の場合、速度は約1.2 GB /秒になります。 1秒あたりの行数で速度を計算するには、1秒あたりのバイト数での速度をクエリで使用される列の合計サイズで除算します。たとえば、10バイトの列が抽出される場合、速度は1秒あたり約1億から2億行になります。 分散処理の場合、処理速度はほぼ線形に向上しますが、これは集約または並べ替えの結果として生じる行の数があまり大きくない場合のみです。 -## 短いクエリを処理するときのレイテンシ +## 短いクエリを処理するときのレイテンシ {#duan-ikueriwochu-li-surutokinoreitensi} -クエリが主キーを使用し、処理する行数がそれほど多くなく(数十万)、列数も多くない場合、データがページキャッシュにあれば50ミリ秒未満のレイテンシ(最良の場合は1桁のミリ秒)が期待できます。それ以外の場合、レイテンシはシーク数から計算されます。ディスクドライブを使用する場合、過負荷になっていないシステムの場合、レイテンシは次の式で計算されます: シーク時間(10ミリ秒) * クエリされる列の数 * データ部分の数 +クエリが主キーを使用し、処理する行数がそれほど多くなく(数十万)、列数も多くない場合、データがページキャッシュにあれば50ミリ秒未満のレイテンシ(最良の場合は1桁のミリ秒)が期待できます。それ以外の場合、レイテンシはシーク数から計算されます。ディスクドライブを使用する場合、過負荷になっていないシステムの場合、レイテンシは次の式で計算されます: シーク時間(10ミリ秒) \* クエリされる列の数 \* データ部分の数 -## 大量の短いクエリを処理するときのスループット +## 大量の短いクエリを処理するときのスループット {#da-liang-noduan-ikueriwochu-li-surutokinosurupututo} 同じ条件下で、ClickHouseは1台のサーバーで毎秒数百のクエリを処理できます(最良の場合は数千まで)。このシナリオは分析DBMSでは一般的ではないため、1秒あたり最大100クエリを想定することをお勧めします。 -## データ挿入時のパフォーマンス +## データ挿入時のパフォーマンス {#detacha-ru-shi-nopahuomansu} 少なくとも1000行のパケットにデータを挿入することをお勧めします。または、1秒あたり1回のリクエストを超えないでください。タブ区切りのダンプデータをMergeTreeテーブルに挿入する場合、挿入速度は50〜200MB/sになります。挿入された行のサイズが約1Kbの場合、速度は毎秒50,000〜200,000行になります。行が小さい場合、パフォーマンスは1秒あたりの行数で高くなります(Banner System データ- `>` 500,000行/秒、Graphite データ- `>` 1,000,000行/秒)。パフォーマンスを向上させるために、複数のINSERTクエリを並行して作成することで、パフォーマンスを線形に向上できます。 -[Original article](https://clickhouse.yandex/docs/ja/introduction/performance/) \ No newline at end of file +[Original article](https://clickhouse.yandex/docs/ja/introduction/performance/) diff --git a/docs/ja/operations/performance/sampling_query_profiler_example_result.txt b/docs/ja/operations/performance/sampling_query_profiler_example_result.txt new file mode 120000 
index 00000000000..58c5abe7122 --- /dev/null +++ b/docs/ja/operations/performance/sampling_query_profiler_example_result.txt @@ -0,0 +1 @@ +../../../en/operations/performance/sampling_query_profiler_example_result.txt \ No newline at end of file diff --git a/docs/ru/commercial/cloud.md b/docs/ru/commercial/cloud.md new file mode 120000 index 00000000000..eb58e4a90be --- /dev/null +++ b/docs/ru/commercial/cloud.md @@ -0,0 +1 @@ +../../en/commercial/cloud.md \ No newline at end of file diff --git a/docs/ru/data_types/array.md b/docs/ru/data_types/array.md index c9b6df35ede..0fa13d54cae 100644 --- a/docs/ru/data_types/array.md +++ b/docs/ru/data_types/array.md @@ -1,53 +1,58 @@ -# Array(T) {#data_type-array} +# Array(T) {#data-type-array} Массив из элементов типа `T`. `T` может любым, в том числе, массивом. Таким образом поддержаны многомерные массивы. -## Создание массива +## Создание массива {#sozdanie-massiva} Массив можно создать с помощью функции: -```sql +``` sql array(T) ``` Также можно использовать квадратные скобки -```sql +``` sql [] ``` Пример создания массива: -```sql + +``` sql SELECT array(1, 2) AS x, toTypeName(x) ``` -```text + +``` text ┌─x─────┬─toTypeName(array(1, 2))─┐ │ [1,2] │ Array(UInt8) │ └───────┴─────────────────────────┘ ``` -```sql + +``` sql SELECT [1, 2] AS x, toTypeName(x) ``` -```text + +``` text ┌─x─────┬─toTypeName([1, 2])─┐ │ [1,2] │ Array(UInt8) │ └───────┴────────────────────┘ ``` -## Особенности работы с типами данных +## Особенности работы с типами данных {#osobennosti-raboty-s-tipami-dannykh} -При создании массива "на лету" ClickHouse автоматически определяет тип аргументов как наиболее узкий тип данных, в котором можно хранить все перечисленные аргументы. Если среди аргументов есть [NULL](../query_language/syntax.md#null-literal) или аргумент типа [Nullable](nullable.md#data_type-nullable), то тип элементов массива — [Nullable](nullable.md). +При создании массива «на лету» ClickHouse автоматически определяет тип аргументов как наиболее узкий тип данных, в котором можно хранить все перечисленные аргументы. Если среди аргументов есть [NULL](../query_language/syntax.md#null-literal) или аргумент типа [Nullable](nullable.md#data_type-nullable), то тип элементов массива — [Nullable](nullable.md). Если ClickHouse не смог подобрать тип данных, то он сгенерирует исключение. Это произойдёт, например, при попытке создать массив одновременно со строками и числами `SELECT array(1, 'a')`. Примеры автоматического определения типа данных: -```sql +``` sql SELECT array(1, 2, NULL) AS x, toTypeName(x) ``` -```text + +``` text ┌─x──────────┬─toTypeName(array(1, 2, NULL))─┐ │ [1,2,NULL] │ Array(Nullable(UInt8)) │ └────────────┴───────────────────────────────┘ @@ -55,10 +60,11 @@ SELECT array(1, 2, NULL) AS x, toTypeName(x) Если попытаться создать массив из несовместимых типов данных, то ClickHouse выбросит исключение: -```sql +``` sql SELECT array(1, 'a') ``` -```text + +``` text Received exception from server (version 1.1.54388): Code: 386. DB::Exception: Received from localhost:9000, 127.0.0.1. DB::Exception: There is no supertype for types UInt8, String because some of them are String/FixedString and some of them are not. ``` diff --git a/docs/ru/data_types/boolean.md b/docs/ru/data_types/boolean.md index f4a908840ae..f868ebf6d14 100644 --- a/docs/ru/data_types/boolean.md +++ b/docs/ru/data_types/boolean.md @@ -1,4 +1,4 @@ -# Булевы значения +# Булевы значения {#bulevy-znacheniia} Отдельного типа для булевых значений нет. 
Для них используется тип UInt8, в котором используются только значения 0 и 1. diff --git a/docs/ru/data_types/date.md b/docs/ru/data_types/date.md index 3e39f72d5ed..8db0de8a57d 100644 --- a/docs/ru/data_types/date.md +++ b/docs/ru/data_types/date.md @@ -1,4 +1,4 @@ -# Date {#data_type-date} +# Date {#data-type-date} Дата. Хранится в двух байтах в виде (беззнакового) числа дней, прошедших от 1970-01-01. Позволяет хранить значения от чуть больше, чем начала unix-эпохи до верхнего порога, определяющегося константой на этапе компиляции (сейчас - до 2106 года, последний полностью поддерживаемый год - 2105). Минимальное значение выводится как 0000-00-00. diff --git a/docs/ru/data_types/datetime.md b/docs/ru/data_types/datetime.md index ef9498c9871..5e17b10e4e1 100644 --- a/docs/ru/data_types/datetime.md +++ b/docs/ru/data_types/datetime.md @@ -1,20 +1,20 @@ -# DateTime {#data_type-datetime} +# DateTime {#data-type-datetime} Позволяет хранить момент времени, который может быть представлен как календарная дата и время. Синтаксис: -```sql +``` sql DateTime([timezone]) ``` -Диапазон значений: [1970-01-01 00:00:00, 2105-12-31 23:59:59]. +Диапазон значений: \[1970-01-01 00:00:00, 2105-12-31 23:59:59\]. Точность: 1 секунда. -## Использование +## Использование {#ispolzovanie} -Момент времени сохраняется как [Unix timestamp](https://ru.wikipedia.org/wiki/Unix-%D0%B2%D1%80%D0%B5%D0%BC%D1%8F), независимо от часового пояса и переходов на летнее/зимнее время. Дополнительно, тип `DateTime` позволяет хранить часовой пояс, единый для всей колонки, который влияет на то, как будут отображаться значения типа `DateTime` в текстовом виде и как будут парситься значения заданные в виде строк ('2020-01-01 05:00:01'). Часовой пояс не хранится в строках таблицы (выборки), а хранится в метаданных колонки. +Момент времени сохраняется как [Unix timestamp](https://ru.wikipedia.org/wiki/Unix-%D0%B2%D1%80%D0%B5%D0%BC%D1%8F), независимо от часового пояса и переходов на летнее/зимнее время. Дополнительно, тип `DateTime` позволяет хранить часовой пояс, единый для всей колонки, который влияет на то, как будут отображаться значения типа `DateTime` в текстовом виде и как будут парситься значения заданные в виде строк (‘2020-01-01 05:00:01’). Часовой пояс не хранится в строках таблицы (выборки), а хранится в метаданных колонки. Список поддерживаемых временных зон можно найти в [IANA Time Zone Database](https://www.iana.org/time-zones). Пакет `tzdata`, содержащий [базу данных часовых поясов IANA](https://www.iana.org/time-zones), должен быть установлен в системе. Используйте команду `timedatectl list-timezones` для получения списка часовых поясов, известных локальной системе. @@ -24,51 +24,58 @@ DateTime([timezone]) ClickHouse отображает значения типа `DateTime` в формате `YYYY-MM-DD hh:mm:ss`. Отображение можно поменять с помощью функции [formatDateTime](../query_language/functions/date_time_functions.md#formatdatetime). -При вставке данных в ClickHouse, можно использовать различные форматы даты и времени в зависимости от значения настройки [date_time_input_format](../operations/settings/settings.md#settings-date_time_input_format). +При вставке данных в ClickHouse, можно использовать различные форматы даты и времени в зависимости от значения настройки [date\_time\_input\_format](../operations/settings/settings.md#settings-date_time_input_format). 
-## Примеры +## Примеры {#primery} **1.** Создание таблицы с столбцом типа `DateTime` и вставка данных в неё: -```sql +``` sql CREATE TABLE dt ( - `timestamp` DateTime('Europe/Moscow'), + `timestamp` DateTime('Europe/Moscow'), `event_id` UInt8 ) ENGINE = TinyLog; ``` -```sql + +``` sql INSERT INTO dt Values (1546300800, 1), ('2019-01-01 00:00:00', 2); ``` -```sql + +``` sql SELECT * FROM dt; ``` -```text + +``` text ┌───────────timestamp─┬─event_id─┐ │ 2019-01-01 03:00:00 │ 1 │ │ 2019-01-01 00:00:00 │ 2 │ └─────────────────────┴──────────┘ ``` -* При вставке даты-времени как целого числа, оно трактуется как Unix Timestamp (UTC). Unix timestamp `1546300800` в часовом поясе `Europe/London (UTC+0)` представляет время `'2019-01-01 00:00:00'`. Однако, столбец `timestamp` имеет тип `DateTime('Europe/Moscow (UTC+3)')`, так что при выводе в виде строки время отобразится как `2019-01-01 03:00:00`. -* При вставке даты-времени в виде строки, время трактуется соответственно часовому поясу установленному для колонки. `'2019-01-01 00:00:00'` трактуется как время по Москве (и в базу сохраняется `1546290000`) +- При вставке даты-времени как целого числа, оно трактуется как Unix Timestamp (UTC). Unix timestamp `1546300800` в часовом поясе `Europe/London (UTC+0)` представляет время `'2019-01-01 00:00:00'`. Однако, столбец `timestamp` имеет тип `DateTime('Europe/Moscow (UTC+3)')`, так что при выводе в виде строки время отобразится как `2019-01-01 03:00:00`. +- При вставке даты-времени в виде строки, время трактуется соответственно часовому поясу установленному для колонки. `'2019-01-01 00:00:00'` трактуется как время по Москве (и в базу сохраняется `1546290000`) **2.** Фильтрация по значениям даты-времени -```sql +``` sql SELECT * FROM dt WHERE timestamp = toDateTime('2019-01-01 00:00:00', 'Europe/Moscow') ``` -```text + +``` text ┌───────────timestamp─┬─event_id─┐ │ 2019-01-01 00:00:00 │ 2 │ └─────────────────────┴──────────┘ ``` + Фильтровать по колонке типа `DateTime` можно, указывая строковое значение в фильтре `WHERE`. 
Конвертация будет выполнена автоматически: -```sql + +``` sql SELECT * FROM dt WHERE timestamp = '2019-01-01 00:00:00' ``` -```text + +``` text ┌───────────timestamp─┬─event_id─┐ │ 2019-01-01 03:00:00 │ 1 │ └─────────────────────┴──────────┘ @@ -76,10 +83,11 @@ SELECT * FROM dt WHERE timestamp = '2019-01-01 00:00:00' **3.** Получение часового пояса для колонки типа `DateTime`: -```sql +``` sql SELECT toDateTime(now(), 'Europe/Moscow') AS column, toTypeName(column) AS x ``` -```text + +``` text ┌──────────────column─┬─x─────────────────────────┐ │ 2019-10-16 04:12:04 │ DateTime('Europe/Moscow') │ └─────────────────────┴───────────────────────────┘ @@ -87,27 +95,28 @@ SELECT toDateTime(now(), 'Europe/Moscow') AS column, toTypeName(column) AS x **4.** Конвертация часовых поясов -```sql -SELECT -toDateTime(timestamp, 'Europe/London') as lon_time, +``` sql +SELECT +toDateTime(timestamp, 'Europe/London') as lon_time, toDateTime(timestamp, 'Europe/Moscow') as mos_time FROM dt ``` -```text + +``` text ┌───────────lon_time──┬────────────mos_time─┐ │ 2019-01-01 00:00:00 │ 2019-01-01 03:00:00 │ │ 2018-12-31 21:00:00 │ 2019-01-01 00:00:00 │ └─────────────────────┴─────────────────────┘ ``` -## See Also +## See Also {#see-also} -- [Функции преобразования типов](../query_language/functions/type_conversion_functions.md) -- [Функции для работы с датой и временем](../query_language/functions/date_time_functions.md) -- [Функции для работы с массивами](../query_language/functions/array_functions.md) -- [Настройка `date_time_input_format`](../operations/settings/settings.md#settings-date_time_input_format) -- [Конфигурационный параметр сервера `timezone`](../operations/server_settings/settings.md#server_settings-timezone) -- [Операторы для работы с датой и временем](../query_language/operators.md#operators-datetime) -- [Тип данных `Date`](date.md) +- [Функции преобразования типов](../query_language/functions/type_conversion_functions.md) +- [Функции для работы с датой и временем](../query_language/functions/date_time_functions.md) +- [Функции для работы с массивами](../query_language/functions/array_functions.md) +- [Настройка `date_time_input_format`](../operations/settings/settings.md#settings-date_time_input_format) +- [Конфигурационный параметр сервера `timezone`](../operations/server_settings/settings.md#server_settings-timezone) +- [Операторы для работы с датой и временем](../query_language/operators.md#operators-datetime) +- [Тип данных `Date`](date.md) [Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/datetime/) diff --git a/docs/ru/data_types/datetime64.md b/docs/ru/data_types/datetime64.md index 96476984f87..c5bf2c9e31c 100644 --- a/docs/ru/data_types/datetime64.md +++ b/docs/ru/data_types/datetime64.md @@ -1,62 +1,69 @@ -# DateTime64 {#data_type-datetime64} +# DateTime64 {#data-type-datetime64} -Позволяет хранить момент времени, который может быть представлен как календарная дата и время, с заданной суб-секундной точностью. +Позволяет хранить момент времени, который может быть представлен как календарная дата и время, с заданной суб-секундной точностью. Размер тика/точность: 10-precision секунд, где precision - целочисленный параметр типа. Синтаксис: -```sql + +``` sql DateTime64(precision, [timezone]) ``` -Данные хранятся в виде количества 'тиков', прошедших с момента начала эпохи (1970-01-01 00:00:00 UTC), в Int64. Размер тика определяется параметром precision. 
Дополнительно, тип `DateTime64` позволяет хранить часовой пояс, единый для всей колонки, который влияет на то, как будут отображаться значения типа `DateTime64` в текстовом виде и как будут парситься значения заданные в виде строк ('2020-01-01 05:00:01.000'). Часовой пояс не хранится в строках таблицы (выборки), а хранится в метаданных колонки. Подробнее см. [DateTime](datetime.md). +Данные хранятся в виде количества ‘тиков’, прошедших с момента начала эпохи (1970-01-01 00:00:00 UTC), в Int64. Размер тика определяется параметром precision. Дополнительно, тип `DateTime64` позволяет хранить часовой пояс, единый для всей колонки, который влияет на то, как будут отображаться значения типа `DateTime64` в текстовом виде и как будут парситься значения заданные в виде строк (‘2020-01-01 05:00:01.000’). Часовой пояс не хранится в строках таблицы (выборки), а хранится в метаданных колонки. Подробнее см. [DateTime](datetime.md). -## Пример +## Пример {#primer} **1.** Создание таблицы с столбцом типа `DateTime64` и вставка данных в неё: -```sql +``` sql CREATE TABLE dt ( - `timestamp` DateTime64(3, 'Europe/Moscow'), + `timestamp` DateTime64(3, 'Europe/Moscow'), `event_id` UInt8 ) ENGINE = TinyLog ``` -```sql + +``` sql INSERT INTO dt Values (1546300800000, 1), ('2019-01-01 00:00:00', 2) ``` -```sql + +``` sql SELECT * FROM dt ``` -```text + +``` text ┌───────────────timestamp─┬─event_id─┐ │ 2019-01-01 03:00:00.000 │ 1 │ │ 2019-01-01 00:00:00.000 │ 2 │ └─────────────────────────┴──────────┘ ``` -* При вставке даты-времени как числа (аналогично 'Unix timestamp'), время трактуется как UTC. Unix timestamp `1546300800` в часовом поясе `Europe/London (UTC+0)` представляет время `'2019-01-01 00:00:00'`. Однако, столбец `timestamp` имеет тип `DateTime('Europe/Moscow (UTC+3)')`, так что при выводе в виде строки время отобразится как `2019-01-01 03:00:00`. -* При вставке даты-времени в виде строки, время трактуется соответственно часовому поясу установленному для колонки. `'2019-01-01 00:00:00'` трактуется как время по Москве (и в базу сохраняется `'2018-12-31 21:00:00'` в виде Unix Timestamp) +- При вставке даты-времени как числа (аналогично ‘Unix timestamp’), время трактуется как UTC. Unix timestamp `1546300800` в часовом поясе `Europe/London (UTC+0)` представляет время `'2019-01-01 00:00:00'`. Однако, столбец `timestamp` имеет тип `DateTime('Europe/Moscow (UTC+3)')`, так что при выводе в виде строки время отобразится как `2019-01-01 03:00:00`. +- При вставке даты-времени в виде строки, время трактуется соответственно часовому поясу установленному для колонки. 
`'2019-01-01 00:00:00'` трактуется как время по Москве (и в базу сохраняется `'2018-12-31 21:00:00'` в виде Unix Timestamp) **2.** Фильтрация по значениям даты-времени -```sql +``` sql SELECT * FROM dt WHERE timestamp = toDateTime64('2019-01-01 00:00:00', 3, 'Europe/Moscow') ``` -```text + +``` text ┌───────────────timestamp─┬─event_id─┐ │ 2019-01-01 00:00:00.000 │ 2 │ └─────────────────────────┴──────────┘ ``` + В отличие от типа `DateTime`, `DateTime64` не конвертируется из строк автоматически **3.** Получение часового пояса для значения типа `DateTime64`: -```sql +``` sql SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS x ``` -```text + +``` text ┌──────────────────column─┬─x──────────────────────────────┐ │ 2019-10-16 04:12:04.000 │ DateTime64(3, 'Europe/Moscow') │ └─────────────────────────┴────────────────────────────────┘ @@ -64,27 +71,27 @@ SELECT toDateTime64(now(), 3, 'Europe/Moscow') AS column, toTypeName(column) AS **4.** Конвертация часовых поясов -```sql -SELECT -toDateTime64(timestamp, 3, 'Europe/London') as lon_time, +``` sql +SELECT +toDateTime64(timestamp, 3, 'Europe/London') as lon_time, toDateTime64(timestamp, 3, 'Europe/Moscow') as mos_time FROM dt ``` -```text + +``` text ┌───────────────lon_time──┬────────────────mos_time─┐ │ 2019-01-01 00:00:00.000 │ 2019-01-01 03:00:00.000 │ │ 2018-12-31 21:00:00.000 │ 2019-01-01 00:00:00.000 │ └─────────────────────────┴─────────────────────────┘ ``` -## See Also - -- [Функции преобразования типов](../query_language/functions/type_conversion_functions.md) -- [Функции для работы с датой и временем](../query_language/functions/date_time_functions.md) -- [Функции для работы с массивами](../query_language/functions/array_functions.md) -- [Настройка `date_time_input_format`](../operations/settings/settings.md#settings-date_time_input_format) -- [Конфигурационный параметр сервера `timezone`](../operations/server_settings/settings.md#server_settings-timezone) -- [Операторы для работы с датой и временем](../query_language/operators.md#operators-datetime) -- [Тип данных `Date`](date.md) -- [Тип данных `DateTime`](datetime.md) +## See Also {#see-also} +- [Функции преобразования типов](../query_language/functions/type_conversion_functions.md) +- [Функции для работы с датой и временем](../query_language/functions/date_time_functions.md) +- [Функции для работы с массивами](../query_language/functions/array_functions.md) +- [Настройка `date_time_input_format`](../operations/settings/settings.md#settings-date_time_input_format) +- [Конфигурационный параметр сервера `timezone`](../operations/server_settings/settings.md#server_settings-timezone) +- [Операторы для работы с датой и временем](../query_language/operators.md#operators-datetime) +- [Тип данных `Date`](date.md) +- [Тип данных `DateTime`](datetime.md) diff --git a/docs/ru/data_types/decimal.md b/docs/ru/data_types/decimal.md index e8e8a15fe49..8975406bf4b 100644 --- a/docs/ru/data_types/decimal.md +++ b/docs/ru/data_types/decimal.md @@ -1,44 +1,43 @@ - -# Decimal(P, S), Decimal32(S), Decimal64(S), Decimal128(S) +# Decimal(P, S), Decimal32(S), Decimal64(S), Decimal128(S) {#decimalp-s-decimal32s-decimal64s-decimal128s} Знаковые дробные числа с сохранением точности операций сложения, умножения и вычитания. Для деления осуществляется отбрасывание (не округление) знаков, не попадающих в младший десятичный разряд. -## Параметры +## Параметры {#parametry} -- P - precision. Значение из диапазона [ 1 : 38 ]. 
Определяет, сколько десятичных знаков (с учетом дробной части) может содержать число. -- S - scale. Значение из диапазона [ 0 : P ]. Определяет, сколько десятичных знаков содержится в дробной части числа. +- P - precision. Значение из диапазона \[ 1 : 38 \]. Определяет, сколько десятичных знаков (с учетом дробной части) может содержать число. +- S - scale. Значение из диапазона \[ 0 : P \]. Определяет, сколько десятичных знаков содержится в дробной части числа. В зависимости от параметра P Decimal(P, S) является синонимом: -- P из [ 1 : 9 ] - для Decimal32(S) -- P из [ 10 : 18 ] - для Decimal64(S) -- P из [ 19 : 38 ] - для Decimal128(S) +- P из \[ 1 : 9 \] - для Decimal32(S) +- P из \[ 10 : 18 \] - для Decimal64(S) +- P из \[ 19 : 38 \] - для Decimal128(S) -## Диапазоны Decimal +## Диапазоны Decimal {#diapazony-decimal} -- Decimal32(S) - ( -1 * 10^(9 - S), 1 * 10^(9 - S) ) -- Decimal64(S) - ( -1 * 10^(18 - S), 1 * 10^(18 - S) ) -- Decimal128(S) - ( -1 * 10^(38 - S), 1 * 10^(38 - S) ) +- Decimal32(S) - ( -1 \* 10^(9 - S), 1 \* 10^(9 - S) ) +- Decimal64(S) - ( -1 \* 10^(18 - S), 1 \* 10^(18 - S) ) +- Decimal128(S) - ( -1 \* 10^(38 - S), 1 \* 10^(38 - S) ) Например, Decimal32(4) содержит числа от -99999.9999 до 99999.9999 c шагом 0.0001. -## Внутреннее представление +## Внутреннее представление {#vnutrennee-predstavlenie} Внутри данные представляются как знаковые целые числа, соответсвующей разрядности. Реальные диапазоны, хранящиеся в ячейках памяти несколько больше заявленных. Заявленные диапазоны Decimal проверяются только при вводе числа из строкового представления. Поскольку современные CPU не поддерживают 128-битные числа, операции над Decimal128 эмулируются программно. Decimal128 работает в разы медленней чем Decimal32/Decimal64. -## Операции и типы результата +## Операции и типы результата {#operatsii-i-tipy-rezultata} Результат операции между двумя Decimal расширяется до большего типа (независимо от порядка аргументов). -- Decimal64(S1) Decimal32(S2) -> Decimal64(S) -- Decimal128(S1) Decimal32(S2) -> Decimal128(S) -- Decimal128(S1) Decimal64(S2) -> Decimal128(S) +- Decimal64(S1) Decimal32(S2) -\> Decimal64(S) +- Decimal128(S1) Decimal32(S2) -\> Decimal128(S) +- Decimal128(S1) Decimal64(S2) -\> Decimal128(S) Для размера дробной части (scale) результата действуют следующие правила: -- сложение, вычитание: S = max(S1, S2). -- умножение: S = S1 + S2. -- деление: S = S1. +- сложение, вычитание: S = max(S1, S2). +- умножение: S = S1 + S2. +- деление: S = S1. При операциях между Decimal и целыми числами результатом является Decimal, аналогичный аргументу. @@ -47,38 +46,44 @@ Часть функций над Decimal возвращают Float64 (например, var, stddev). Для некоторых из них промежуточные операции проходят в Decimal. Для таких функций результат над одинаковыми данными во Float64 и Decimal может отличаться, несмотря на одинаковый тип результата. -## Проверка переполнений +## Проверка переполнений {#proverka-perepolnenii} При выполнении операций над типом Decimal могут происходить целочисленные переполнения. Лишняя дробная часть отбрасывается (не округляется). Лишняя целочисленная часть приводит к исключению. -```sql + +``` sql SELECT toDecimal32(2, 4) AS x, x / 3 ``` -```text + +``` text ┌──────x─┬─divide(toDecimal32(2, 4), 3)─┐ │ 2.0000 │ 0.6666 │ └────────┴──────────────────────────────┘ ``` -```sql +``` sql SELECT toDecimal32(4.2, 8) AS x, x * x ``` -```text + +``` text DB::Exception: Scale is out of bounds. 
``` -```sql +``` sql SELECT toDecimal32(4.2, 8) AS x, 6 * x ``` -```text + +``` text DB::Exception: Decimal math overflow. ``` -Проверка переполнения приводит к замедлению операций. При уверенности, что типа результата хватит для его записи проверку переполнения можно отключить настройкой decimal_check_overflow. В этом случае при переполнении вернется неверное значение: -```sql +Проверка переполнения приводит к замедлению операций. При уверенности, что типа результата хватит для его записи проверку переполнения можно отключить настройкой decimal\_check\_overflow. В этом случае при переполнении вернется неверное значение: + +``` sql SET decimal_check_overflow = 0; SELECT toDecimal32(4.2, 8) AS x, 6 * x ``` -```text + +``` text ┌──────────x─┬─multiply(6, toDecimal32(4.2, 8))─┐ │ 4.20000000 │ -17.74967296 │ └────────────┴──────────────────────────────────┘ @@ -86,10 +91,11 @@ SELECT toDecimal32(4.2, 8) AS x, 6 * x Переполнения происходят не только на арифметических операциях, но и на операциях сравнения. Отключать проверку стоит только при полной уверенности в корректности результата: -```sql +``` sql SELECT toDecimal32(1, 8) < 100 ``` -```text + +``` text DB::Exception: Can't compare. ``` diff --git a/docs/ru/data_types/domains/ipv4.md b/docs/ru/data_types/domains/ipv4.md index cce7c17d21b..2903404774b 100644 --- a/docs/ru/data_types/domains/ipv4.md +++ b/docs/ru/data_types/domains/ipv4.md @@ -1,16 +1,16 @@ -## IPv4 +## IPv4 {#ipv4} `IPv4` — это домен, базирующийся на типе данных `UInt32` предназначенный для хранения адресов IPv4. Он обеспечивает компактное хранение данных с удобным для человека форматом ввода-вывода, и явно отображаемым типом данных в структуре таблицы. -### Применение +### Применение {#primenenie} -```sql +``` sql CREATE TABLE hits (url String, from IPv4) ENGINE = MergeTree() ORDER BY url; DESCRIBE TABLE hits; ``` -```text +``` text ┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐ │ url │ String │ │ │ │ │ │ from │ IPv4 │ │ │ │ │ @@ -19,19 +19,19 @@ DESCRIBE TABLE hits; Или вы можете использовать домен IPv4 в качестве ключа: -```sql +``` sql CREATE TABLE hits (url String, from IPv4) ENGINE = MergeTree() ORDER BY from; ``` `IPv4` поддерживает вставку в виде строк с текстовым представлением IPv4 адреса: -```sql +``` sql INSERT INTO hits (url, from) VALUES ('https://wikipedia.org', '116.253.40.133')('https://clickhouse.tech', '183.247.232.58')('https://clickhouse.yandex/docs/en/', '116.106.34.242'); SELECT * FROM hits; ``` -```text +``` text ┌─url────────────────────────────────┬───────────from─┐ │ https://clickhouse.tech/docs/en/ │ 116.106.34.242 │ │ https://wikipedia.org │ 116.253.40.133 │ @@ -41,11 +41,11 @@ SELECT * FROM hits; Значения хранятся в компактной бинарной форме: -```sql +``` sql SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; ``` -```text +``` text ┌─toTypeName(from)─┬─hex(from)─┐ │ IPv4 │ B7F7E83A │ └──────────────────┴───────────┘ @@ -54,11 +54,11 @@ SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; Значения с доменным типом данных не преобразуются неявно в другие типы данных, кроме `UInt32`. 
Если необходимо преобразовать значение типа `IPv4` в строку, то это необходимо делать явно с помощью функции `IPv4NumToString()`: -```sql +``` sql SELECT toTypeName(s), IPv4NumToString(from) AS s FROM hits LIMIT 1; ``` -```text +``` text ┌─toTypeName(IPv4NumToString(from))─┬─s──────────────┐ │ String │ 183.247.232.58 │ └───────────────────────────────────┴────────────────┘ @@ -66,11 +66,11 @@ SELECT toTypeName(s), IPv4NumToString(from) AS s FROM hits LIMIT 1; Или приводить к типу данных `UInt32`: -```sql +``` sql SELECT toTypeName(i), CAST(from AS UInt32) AS i FROM hits LIMIT 1; ``` -```text +``` text ┌─toTypeName(CAST(from, 'UInt32'))─┬──────────i─┐ │ UInt32 │ 3086477370 │ └──────────────────────────────────┴────────────┘ diff --git a/docs/ru/data_types/domains/ipv6.md b/docs/ru/data_types/domains/ipv6.md index e25805b7931..045a2ad1960 100644 --- a/docs/ru/data_types/domains/ipv6.md +++ b/docs/ru/data_types/domains/ipv6.md @@ -1,37 +1,37 @@ -## IPv6 +## IPv6 {#ipv6} -`IPv6` — это домен, базирующийся на типе данных `FixedString(16)`, предназначенный для хранения адресов IPv6. Он обеспечивает компактное хранение данных с удобным для человека форматом ввода-вывода, и явно отображаемым типом данных в структуре таблицы. +`IPv6` — это домен, базирующийся на типе данных `FixedString(16)`, предназначенный для хранения адресов IPv6. Он обеспечивает компактное хранение данных с удобным для человека форматом ввода-вывода, и явно отображаемым типом данных в структуре таблицы. -### Применение +### Применение {#primenenie} -```sql +``` sql CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY url; DESCRIBE TABLE hits; ``` -```text +``` text ┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐ │ url │ String │ │ │ │ │ │ from │ IPv6 │ │ │ │ │ └──────┴────────┴──────────────┴────────────────────┴─────────┴──────────────────┘ ``` -Или вы можете использовать домен `IPv6` в качестве ключа: +Или вы можете использовать домен `IPv6` в качестве ключа: -```sql +``` sql CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY from; ``` `IPv6` поддерживает вставку в виде строк с текстовым представлением IPv6 адреса: -```sql +``` sql INSERT INTO hits (url, from) VALUES ('https://wikipedia.org', '2a02:aa08:e000:3100::2')('https://clickhouse.tech', '2001:44c8:129:2632:33:0:252:2')('https://clickhouse.yandex/docs/en/', '2a02:e980:1e::1'); SELECT * FROM hits; ``` -```text +``` text ┌─url────────────────────────────────┬─from──────────────────────────┐ │ https://clickhouse.tech │ 2001:44c8:129:2632:33:0:252:2 │ │ https://clickhouse.tech/docs/en/ │ 2a02:e980:1e::1 │ @@ -41,11 +41,11 @@ SELECT * FROM hits; Значения хранятся в компактной бинарной форме: -```sql +``` sql SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; ``` -```text +``` text ┌─toTypeName(from)─┬─hex(from)────────────────────────┐ │ IPv6 │ 200144C8012926320033000002520002 │ └──────────────────┴──────────────────────────────────┘ @@ -54,11 +54,11 @@ SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; Значения с доменным типом данных не преобразуются неявно в другие типы данных, кроме `FixedString(16)`. 
Если необходимо преобразовать значение типа `IPv6` в строку, то это необходимо делать явно с помощью функции `IPv6NumToString()`: -```sql +``` sql SELECT toTypeName(s), IPv6NumToString(from) AS s FROM hits LIMIT 1; ``` -```text +``` text ┌─toTypeName(IPv6NumToString(from))─┬─s─────────────────────────────┐ │ String │ 2001:44c8:129:2632:33:0:252:2 │ └───────────────────────────────────┴───────────────────────────────┘ @@ -66,11 +66,11 @@ SELECT toTypeName(s), IPv6NumToString(from) AS s FROM hits LIMIT 1; Или приводить к типу данных `FixedString(16)`: -```sql +``` sql SELECT toTypeName(i), CAST(from AS FixedString(16)) AS i FROM hits LIMIT 1; ``` -```text +``` text ┌─toTypeName(CAST(from, 'FixedString(16)'))─┬─i───────┐ │ FixedString(16) │ ��� │ └───────────────────────────────────────────┴─────────┘ diff --git a/docs/ru/data_types/domains/overview.md b/docs/ru/data_types/domains/overview.md index 03818d03a20..6feac834e5e 100644 --- a/docs/ru/data_types/domains/overview.md +++ b/docs/ru/data_types/domains/overview.md @@ -1,26 +1,26 @@ -# Домены +# Домены {#domeny} Домены — это типы данных специального назначения, которые добавляют некоторые дополнительные функции поверх существующего базового типа. На данный момент ClickHouse не поддерживает пользовательские домены. Вы можете использовать домены везде, где можно использовать соответствующий базовый тип: -* Создание столбца с доменным типом данных. -* Чтение/запись значений из/в столбец с доменным типом данных. -* Используйте его как индекс, если базовый тип можно использовать в качестве индекса. -* Вызов функций со значениями столбца, имеющего доменный тип данных. -* и так далее. +- Создание столбца с доменным типом данных. +- Чтение/запись значений из/в столбец с доменным типом данных. +- Используйте его как индекс, если базовый тип можно использовать в качестве индекса. +- Вызов функций со значениями столбца, имеющего доменный тип данных. +- и так далее. -### Дополнительные возможности доменов +### Дополнительные возможности доменов {#dopolnitelnye-vozmozhnosti-domenov} -* Явное название типа данных столбца в запросах `SHOW CREATE TABLE` и `DESCRIBE TABLE` -* Ввод данных в удобном человеку формате `INSERT INTO domain_table(domain_column) VALUES(...)` -* Вывод данных в удобном человеку формате `SELECT domain_column FROM domain_table` -* Загрузка данных из внешнего источника в удобном для человека формате: `INSERT INTO domain_table FORMAT CSV ...` +- Явное название типа данных столбца в запросах `SHOW CREATE TABLE` и `DESCRIBE TABLE` +- Ввод данных в удобном человеку формате `INSERT INTO domain_table(domain_column) VALUES(...)` +- Вывод данных в удобном человеку формате `SELECT domain_column FROM domain_table` +- Загрузка данных из внешнего источника в удобном для человека формате: `INSERT INTO domain_table FORMAT CSV ...` -### Ограничения +### Ограничения {#ogranicheniia} -* Невозможно преобразовать базовый тип данных в доменный для индексного столбца с помощью `ALTER TABLE`. -* Невозможно неявно преобразовывать строковые значение в значения с доменным типом данных при вставке данных из другого столбца или таблицы. -* Домен не добавляет ограничения на хранимые значения. +- Невозможно преобразовать базовый тип данных в доменный для индексного столбца с помощью `ALTER TABLE`. +- Невозможно неявно преобразовывать строковые значение в значения с доменным типом данных при вставке данных из другого столбца или таблицы. +- Домен не добавляет ограничения на хранимые значения. 
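Второе ограничение означает, что при переносе данных из обычного столбца `String` в столбец с доменным типом преобразование нужно задавать явно. Ниже — примерный набросок для домена `IPv4`: таблица `hits` со столбцом `from IPv4` взята из примеров выше, таблица-источник `raw_hits` — условная, а конкретная функция преобразования зависит от домена (здесь предположительно используется `toIPv4()`).

``` sql
-- Условная таблица-источник, в которой адреса хранятся как обычные строки.
CREATE TABLE raw_hits (url String, from_str String) ENGINE = MergeTree() ORDER BY url;

-- Неявного преобразования String -> IPv4 при вставке из другого столбца нет,
-- поэтому такой запрос оставлен закомментированным:
-- INSERT INTO hits (url, from) SELECT url, from_str FROM raw_hits;

-- Преобразование указано явно:
INSERT INTO hits (url, from) SELECT url, toIPv4(from_str) FROM raw_hits;
```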
[Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/domains/overview) diff --git a/docs/ru/data_types/enum.md b/docs/ru/data_types/enum.md index b157c745602..2ee7c77028b 100644 --- a/docs/ru/data_types/enum.md +++ b/docs/ru/data_types/enum.md @@ -1,4 +1,4 @@ -# Enum +# Enum {#enum} Перечисляемый тип данных, содержащий именованные значения. @@ -6,16 +6,16 @@ ClickHouse поддерживает: -- 8-битный `Enum`. Может содержать до 256 значений, пронумерованных в диапазоне `[-128, 127]`. -- 16-битный `Enum`. Может содержать до 65536 значений, пронумерованных в диапазоне `[-32768, 32767]`. +- 8-битный `Enum`. Может содержать до 256 значений, пронумерованных в диапазоне `[-128, 127]`. +- 16-битный `Enum`. Может содержать до 65536 значений, пронумерованных в диапазоне `[-32768, 32767]`. ClickHouse автоматически выбирает размерность `Enum` при вставке данных. Чтобы точно понимать размер хранимых данных можно использовать типы `Enum8` или `Enum16`. -## Примеры использования +## Примеры использования {#primery-ispolzovaniia} Создадим таблицу со столбцом типа `Enum8('hello' = 1, 'world' = 2)`. -```sql +``` sql CREATE TABLE t_enum ( x Enum('hello' = 1, 'world' = 2) @@ -25,26 +25,30 @@ ENGINE = TinyLog В столбец `x` можно сохранять только значения, перечисленные при определении типа, т.е. `'hello'` или `'world'`. Если вы попытаетесь сохранить любое другое значение, ClickHouse сгенерирует исключение. ClickHouse автоматически выберет размерность 8-bit для этого `Enum`. -```sql +``` sql INSERT INTO t_enum VALUES ('hello'), ('world'), ('hello') ``` -```text + +``` text Ok. ``` -```sql + +``` sql insert into t_enum values('a') ``` -```text + +``` text Exception on client: Code: 49. DB::Exception: Unknown element 'a' for type Enum('hello' = 1, 'world' = 2) ``` При запросе данных из таблицы ClickHouse выдаст строковые значения из `Enum`. -```sql +``` sql SELECT * FROM t_enum ``` -```text + +``` text ┌─x─────┐ │ hello │ │ world │ @@ -54,10 +58,11 @@ SELECT * FROM t_enum Если необходимо увидеть цифровые эквиваленты строкам, то необходимо привести тип `Enum` к целочисленному. -```sql +``` sql SELECT CAST(x AS Int8) FROM t_enum ``` -```text + +``` text ┌─CAST(x, 'Int8')─┐ │ 1 │ │ 2 │ @@ -67,16 +72,17 @@ SELECT CAST(x AS Int8) FROM t_enum Чтобы создать значение типа `Enum` в запросе, также необходимо использовать функцию `CAST`. -```sql +``` sql SELECT toTypeName(CAST('a', 'Enum(\'a\' = 1, \'b\' = 2)')) ``` -```text + +``` text ┌─toTypeName(CAST('a', 'Enum(\'a\' = 1, \'b\' = 2)'))─┐ │ Enum8('a' = 1, 'b' = 2) │ └─────────────────────────────────────────────────────┘ ``` -## Общие правила и особенности использования +## Общие правила и особенности использования {#obshchie-pravila-i-osobennosti-ispolzovaniia} Для каждого из значений прописывается число в диапазоне `-128 .. 127` для `Enum8` или в диапазоне `-32768 .. 32767` для `Enum16`. Все строки должны быть разными, числа - тоже. Разрешена пустая строка. При указании такого типа (в определении таблицы), числа могут идти не подряд и в произвольном порядке. При этом, порядок не имеет значения. @@ -84,7 +90,7 @@ SELECT toTypeName(CAST('a', 'Enum(\'a\' = 1, \'b\' = 2)')) `Enum` может быть передан в тип [Nullable](nullable.md). Таким образом, если создать таблицу запросом -```sql +``` sql CREATE TABLE t_enum_nullable ( x Nullable( Enum8('hello' = 1, 'world' = 2) ) @@ -94,7 +100,7 @@ ENGINE = TinyLog , то в ней можно будет хранить не только `'hello'` и `'world'`, но и `NULL`. 
-```sql +``` sql INSERT INTO t_enum_nullable Values('hello'),('world'),(NULL) ``` @@ -111,7 +117,7 @@ INSERT INTO t_enum_nullable Values('hello'),('world'),(NULL) Для Enum-а естественным образом определяется функция `toString`, которая возвращает его строковое значение. Также для Enum-а определяются функции `toT`, где T - числовой тип. При совпадении T с типом столбца Enum-а, преобразование работает бесплатно. -При ALTER, есть возможность бесплатно изменить тип Enum-а, если меняется только множество значений. При этом, можно добавлять новые значения; можно удалять старые значения (это безопасно только если они ни разу не использовались, так как это не проверяется). В качестве "защиты от дурака", нельзя менять числовые значения у имеющихся строк - в этом случае, кидается исключение. +При ALTER, есть возможность бесплатно изменить тип Enum-а, если меняется только множество значений. При этом, можно добавлять новые значения; можно удалять старые значения (это безопасно только если они ни разу не использовались, так как это не проверяется). В качестве «защиты от дурака», нельзя менять числовые значения у имеющихся строк - в этом случае, кидается исключение. При ALTER, есть возможность поменять Enum8 на Enum16 и обратно - так же, как можно поменять Int8 на Int16. diff --git a/docs/ru/data_types/fixedstring.md b/docs/ru/data_types/fixedstring.md index 07e45f71cab..ba91fcde9b3 100644 --- a/docs/ru/data_types/fixedstring.md +++ b/docs/ru/data_types/fixedstring.md @@ -1,10 +1,10 @@ -# FixedString +# FixedString {#fixedstring} -Строка фиксированной длины `N` байт (не символов, не кодовых точек). +Строка фиксированной длины `N` байт (не символов, не кодовых точек). Чтобы объявить столбец типа `FixedString`, используйте следующий синтаксис: -```sql +``` sql FixedString(N) ``` @@ -14,23 +14,23 @@ Примеры значений, которые можно эффективно хранить в столбцах типа `FixedString`: -- Двоичное представление IP-адреса (`FixedString(16)` для IPv6). -- Коды языков (ru_RU, en_US ... ). -- Коды валют (USD, RUB ... ). -- Двоичное представление хэшей (`FixedString(16)` для MD5, `FixedString(32)` для SHA256). +- Двоичное представление IP-адреса (`FixedString(16)` для IPv6). +- Коды языков (ru\_RU, en\_US … ). +- Коды валют (USD, RUB … ). +- Двоичное представление хэшей (`FixedString(16)` для MD5, `FixedString(32)` для SHA256). Для хранения значений UUID используйте тип данных [UUID](uuid.md). При вставке данных, ClickHouse: -- Дополняет строку нулевыми байтами, если строка содержит меньше байтов, чем `N`. -- Генерирует исключение `Too large value for FixedString(N)`, если строка содержит более `N` байт. +- Дополняет строку нулевыми байтами, если строка содержит меньше байтов, чем `N`. +- Генерирует исключение `Too large value for FixedString(N)`, если строка содержит более `N` байт. При выборе данных ClickHouse не обрезает нулевые байты в конце строки. Если вы используете секцию `WHERE`, то необходимо добавлять нулевые байты вручную, чтобы ClickHouse смог сопоставить выражение из фильтра значению `FixedString`. Следующий пример показывает, как использовать секцию `WHERE` с `FixedString`. Рассмотрим следующую таблицу с единственным столбцом типа `FixedString(2)`: -```text +``` text ┌─name──┐ │ b │ └───────┘ @@ -38,19 +38,19 @@ Запрос `SELECT * FROM FixedStringTable WHERE a = 'b'` не возвращает необходимых данных. Необходимо дополнить шаблон фильтра нулевыми байтами. 
-```sql +``` sql SELECT * FROM FixedStringTable WHERE a = 'b\0' ``` -```text + +``` text ┌─a─┐ │ b │ └───┘ - ``` Это поведение отличается от поведения MySQL для типа `CHAR`, где строки дополняются пробелами, а пробелы перед выводом вырезаются. -Обратите внимание, что длина значения `FixedString(N)` постоянна. Функция [length](../query_language/functions/array_functions.md#array_functions-length) возвращает `N` даже если значение `FixedString(N)` заполнено только нулевыми байтами, однако функция [empty](../query_language/functions/string_functions.md#string_functions-empty) в этом же случае возвращает `1`. +Обратите внимание, что длина значения `FixedString(N)` постоянна. Функция [length](../query_language/functions/array_functions.md#array_functions-length) возвращает `N` даже если значение `FixedString(N)` заполнено только нулевыми байтами, однако функция [empty](../query_language/functions/string_functions.md#empty) в этом же случае возвращает `1`. [Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/fixedstring/) diff --git a/docs/ru/data_types/float.md b/docs/ru/data_types/float.md index 23c0097cd19..229bef5ff52 100644 --- a/docs/ru/data_types/float.md +++ b/docs/ru/data_types/float.md @@ -1,70 +1,80 @@ -# Float32, Float64 +# Float32, Float64 {#float32-float64} [Числа с плавающей запятой](https://en.wikipedia.org/wiki/IEEE_754). Типы эквивалентны типам языка С: -- `Float32` - `float`; -- `Float64` - `double`. +- `Float32` - `float`; +- `Float64` - `double`. Рекомендуется хранить данные в целочисленном виде всегда, когда это возможно. Например, переводите в целочисленные значения числа с фиксированной точностью, такие как денежные суммы или времена загрузки страниц в миллисекундах. -## Особенности использования чисел с плавающей запятой +## Особенности использования чисел с плавающей запятой {#osobennosti-ispolzovaniia-chisel-s-plavaiushchei-zapiatoi} -- При вычислениях с числами с плавающей запятой возможна ошибка округления. +- При вычислениях с числами с плавающей запятой возможна ошибка округления. -```sql + + +``` sql SELECT 1 - 0.9 ``` -```text + +``` text ┌───────minus(1, 0.9)─┐ │ 0.09999999999999998 │ └─────────────────────┘ ``` -- Результат вычисления зависит от метода вычисления (типа процессора и архитектуры вычислительной системы). -- При вычислениях с плавающей запятой возможно появление таких категорий числа как бесконечность (`Inf`) и "не число" (`NaN`). Это необходимо учитывать при обработке результатов вычислений. -- При чтении чисел с плавающей запятой из строк, в качестве результата может быть получено не обязательно ближайшее машинно-представимое число. - -## NaN и Inf {#data_type-float-nan-inf} +- Результат вычисления зависит от метода вычисления (типа процессора и архитектуры вычислительной системы). +- При вычислениях с плавающей запятой возможно появление таких категорий числа как бесконечность (`Inf`) и «не число» (`NaN`). Это необходимо учитывать при обработке результатов вычислений. +- При чтении чисел с плавающей запятой из строк, в качестве результата может быть получено не обязательно ближайшее машинно-представимое число. + +## NaN и Inf {#data-type-float-nan-inf} В отличие от стандартного SQL, ClickHouse поддерживает следующие категории чисел с плавающей запятой: -- `Inf` - бесконечность. +- `Inf` - бесконечность. 
-```sql + + +``` sql SELECT 0.5 / 0 ``` -```text +``` text ┌─divide(0.5, 0)─┐ │ inf │ └────────────────┘ ``` -- `-Inf` - отрицательная бесконечность; +- `-Inf` - отрицательная бесконечность; -```sql + + +``` sql SELECT -0.5 / 0 ``` -```text +``` text ┌─divide(-0.5, 0)─┐ │ -inf │ └─────────────────┘ ``` -- `NaN` - не число. +- `NaN` - не число. -```sql + + +``` sql SELECT 0 / 0 ``` -```text + +``` text ┌─divide(0, 0)─┐ │ nan │ └──────────────┘ ``` - Смотрите правила сортировки `NaN` в разделе [Секция ORDER BY](../query_language/select.md). + Смотрите правила сортировки `NaN` в разделе [Секция ORDER BY](../query_language/select.md). [Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/float/) diff --git a/docs/ru/data_types/index.md b/docs/ru/data_types/index.md index 2ed9b9b6a4f..538b3098f30 100644 --- a/docs/ru/data_types/index.md +++ b/docs/ru/data_types/index.md @@ -1,6 +1,6 @@ -# Типы данных {#data_types} +# Типы данных {#data-types} -ClickHouse может сохранять в ячейках таблиц данные различных типов. +ClickHouse может сохранять в ячейках таблиц данные различных типов. Раздел содержит описания поддерживаемых типов данных и специфику их использования и/или реализации, если таковые имеются. [Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/) diff --git a/docs/ru/data_types/int_uint.md b/docs/ru/data_types/int_uint.md index 1b74432c013..417e05db626 100644 --- a/docs/ru/data_types/int_uint.md +++ b/docs/ru/data_types/int_uint.md @@ -1,22 +1,19 @@ - -# UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64 +# UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64 {#uint8-uint16-uint32-uint64-int8-int16-int32-int64} Целые числа фиксированной длины, без знака или со знаком. -## Диапазоны Int +## Диапазоны Int {#diapazony-int} -- Int8 - [ -128 : 127 ] -- Int16 - [ -32768 : 32767 ] -- Int32 - [ -2147483648 : 2147483647 ] -- Int64 - [ -9223372036854775808 : 9223372036854775807 ] +- Int8 - \[ -128 : 127 \] +- Int16 - \[ -32768 : 32767 \] +- Int32 - \[ -2147483648 : 2147483647 \] +- Int64 - \[ -9223372036854775808 : 9223372036854775807 \] +## Диапазоны Uint {#diapazony-uint} - -## Диапазоны Uint - -- UInt8 - [ 0 : 255 ] -- UInt16 - [ 0 : 65535 ] -- UInt32 - [ 0 : 4294967295 ] -- UInt64 - [ 0 : 18446744073709551615 ] +- UInt8 - \[ 0 : 255 \] +- UInt16 - \[ 0 : 65535 \] +- UInt32 - \[ 0 : 4294967295 \] +- UInt64 - \[ 0 : 18446744073709551615 \] [Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/int_uint/) diff --git a/docs/ru/data_types/nested_data_structures/aggregatefunction.md b/docs/ru/data_types/nested_data_structures/aggregatefunction.md index 3c71db5e8a4..641d8aa8386 100644 --- a/docs/ru/data_types/nested_data_structures/aggregatefunction.md +++ b/docs/ru/data_types/nested_data_structures/aggregatefunction.md @@ -1,4 +1,4 @@ -# AggregateFunction(name, types_of_arguments...) {#data_type-aggregatefunction} +# AggregateFunction(name, types\_of\_arguments…) {#data-type-aggregatefunction} Промежуточное состояние агрегатной функции. Чтобы его получить, используются агрегатные функции с суффиксом `-State`. Чтобы в дальнейшем получить агрегированные данные необходимо использовать те же агрегатные функции с суффиксом `-Merge`. @@ -6,15 +6,15 @@ **Параметры** -- Имя агрегатной функции. +- Имя агрегатной функции. - Для параметрических агрегатных функций указываются также их параметры. + Для параметрических агрегатных функций указываются также их параметры. -- Типы аргументов агрегатной функции. +- Типы аргументов агрегатной функции. 
**Пример** -```sql +``` sql CREATE TABLE t ( column1 AggregateFunction(uniq, UInt64), @@ -25,15 +25,15 @@ CREATE TABLE t [uniq](../../query_language/agg_functions/reference.md#agg_function-uniq), anyIf ([any](../../query_language/agg_functions/reference.md#agg_function-any)+[If](../../query_language/agg_functions/combinators.md#agg-functions-combinator-if)) и [quantiles](../../query_language/agg_functions/reference.md) — агрегатные функции, поддержанные в ClickHouse. -## Особенности использования +## Особенности использования {#osobennosti-ispolzovaniia} -### Вставка данных +### Вставка данных {#vstavka-dannykh} Для вставки данных используйте `INSERT SELECT` с агрегатными `-State`-функциями. **Примеры функций** -```sql +``` sql uniqState(UserID) quantilesState(0.5, 0.9)(SendTiming) ``` @@ -42,7 +42,7 @@ quantilesState(0.5, 0.9)(SendTiming) В запросах `SELECT` значения типа `AggregateFunction` выводятся во всех форматах, которые поддерживает ClickHouse, в виде implementation-specific бинарных данных. Если с помощью `SELECT` выполнить дамп данных, например, в формат `TabSeparated`, то потом этот дамп можно загрузить обратно с помощью запроса `INSERT`. -### Выборка данных +### Выборка данных {#vyborka-dannykh} При выборке данных из таблицы `AggregatingMergeTree`, используйте `GROUP BY` и те же агрегатные функции, что и при вставке данных, но с суффиксом `-Merge`. @@ -50,13 +50,13 @@ quantilesState(0.5, 0.9)(SendTiming) Например, следующие два запроса возвращают один и тот же результат: -```sql +``` sql SELECT uniq(UserID) FROM table SELECT uniqMerge(state) FROM (SELECT uniqState(UserID) AS state FROM table GROUP BY RegionID) ``` -## Пример использования +## Пример использования {#primer-ispolzovaniia} Смотрите в описании движка [AggregatingMergeTree](../../operations/table_engines/aggregatingmergetree.md). diff --git a/docs/ru/data_types/nested_data_structures/index.md b/docs/ru/data_types/nested_data_structures/index.md index ec665f2dbb5..73d9019c96a 100644 --- a/docs/ru/data_types/nested_data_structures/index.md +++ b/docs/ru/data_types/nested_data_structures/index.md @@ -1,2 +1,3 @@ -# Вложенные структуры данных +# Вложенные структуры данных {#vlozhennye-struktury-dannykh} + [Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/nested_data_structures/) diff --git a/docs/ru/data_types/nested_data_structures/nested.md b/docs/ru/data_types/nested_data_structures/nested.md index face89c02d1..0e43383b283 100644 --- a/docs/ru/data_types/nested_data_structures/nested.md +++ b/docs/ru/data_types/nested_data_structures/nested.md @@ -1,10 +1,10 @@ -# Nested(Name1 Type1, Name2 Type2, ...) +# Nested(Name1 Type1, Name2 Type2, …) {#nestedname1-type1-name2-type2} Вложенная структура данных - это как будто вложенная таблица. Параметры вложенной структуры данных - имена и типы столбцов, указываются так же, как у запроса CREATE. Каждой строке таблицы может соответствовать произвольное количество строк вложенной структуры данных. 
Пример: -```sql +``` sql CREATE TABLE test.visits ( CounterID UInt32, @@ -35,7 +35,7 @@ CREATE TABLE test.visits Пример: -```sql +``` sql SELECT Goals.ID, Goals.EventTime @@ -44,7 +44,7 @@ WHERE CounterID = 101500 AND length(Goals.ID) < 5 LIMIT 10 ``` -```text +``` text ┌─Goals.ID───────────────────────┬─Goals.EventTime───────────────────────────────────────────────────────────────────────────┐ │ [1073752,591325,591325] │ ['2014-03-17 16:38:10','2014-03-17 16:38:48','2014-03-17 16:42:27'] │ │ [1073752] │ ['2014-03-17 00:28:25'] │ @@ -61,9 +61,9 @@ LIMIT 10 Проще всего понимать вложенную структуру данных, как набор из нескольких столбцов-массивов одинаковых длин. -Единственное место, где в запросе SELECT можно указать имя целой вложенной структуры данных, а не отдельных столбцов - секция ARRAY JOIN. Подробнее см. раздел "Секция ARRAY JOIN". Пример: +Единственное место, где в запросе SELECT можно указать имя целой вложенной структуры данных, а не отдельных столбцов - секция ARRAY JOIN. Подробнее см. раздел «Секция ARRAY JOIN». Пример: -```sql +``` sql SELECT Goal.ID, Goal.EventTime @@ -73,7 +73,7 @@ WHERE CounterID = 101500 AND length(Goals.ID) < 5 LIMIT 10 ``` -```text +``` text ┌─Goal.ID─┬──────Goal.EventTime─┐ │ 1073752 │ 2014-03-17 16:38:10 │ │ 591325 │ 2014-03-17 16:38:48 │ diff --git a/docs/ru/data_types/nullable.md b/docs/ru/data_types/nullable.md index b379c619383..c7b09368707 100644 --- a/docs/ru/data_types/nullable.md +++ b/docs/ru/data_types/nullable.md @@ -1,4 +1,4 @@ -# Nullable(TypeName) {#data_type-nullable} +# Nullable(TypeName) {#data-type-nullable} Позволяет работать как со значением типа `TypeName` так и с отсутствием этого значения ([NULL](../query_language/syntax.md)) в одной и той же переменной, в том числе хранить `NULL` в таблицах вместе со значения типа `TypeName`. Например, в столбце типа `Nullable(Int8)` можно хранить значения типа `Int8`, а в тех строках, где значения нет, будет храниться `NULL`. @@ -8,37 +8,39 @@ `NULL` — значение по умолчанию для типа `Nullable`, если в конфигурации сервера ClickHouse не указано иное. -##Особенности хранения +## Особенности хранения {#osobennosti-khraneniia} Для хранения значения типа `Nullable` ClickHouse использует: -- Отдельный файл с масками `NULL` (далее маска). -- Непосредственно файл со значениями. +- Отдельный файл с масками `NULL` (далее маска). +- Непосредственно файл со значениями. Маска определяет, что лежит в ячейке данных: `NULL` или значение. В случае, когда маска указывает, что в ячейке хранится `NULL`, в файле значений хранится значение по умолчанию для типа данных. Т.е. если, например, поле имеет тип `Nullable(Int8)`, то ячейка будет хранить значение по умолчанию для `Int8`. Эта особенность увеличивает размер хранилища. -!!! info +!!! info "Info" Почти всегда использование `Nullable` снижает производительность, учитывайте это при проектировании своих баз. 
-## Пример использования +## Пример использования {#primer-ispolzovaniia} -```sql +``` sql CREATE TABLE t_null(x Int8, y Nullable(Int8)) ENGINE TinyLog ``` -```sql + +``` sql INSERT INTO t_null VALUES (1, NULL), (2, 3) ``` -```sql + +``` sql SELECT x + y from t_null ``` -```text + +``` text ┌─plus(x, y)─┐ │ ᴺᵁᴸᴸ │ │ 5 │ └────────────┘ - ``` [Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/nullable/) diff --git a/docs/ru/data_types/special_data_types/expression.md b/docs/ru/data_types/special_data_types/expression.md index 6242805927e..1f4b960fb10 100644 --- a/docs/ru/data_types/special_data_types/expression.md +++ b/docs/ru/data_types/special_data_types/expression.md @@ -1,4 +1,4 @@ -# Expression +# Expression {#expression} Используется для представления лямбда-выражений в функциях высшего порядка. diff --git a/docs/ru/data_types/special_data_types/index.md b/docs/ru/data_types/special_data_types/index.md index e4f7a418d7c..d5aff1501db 100644 --- a/docs/ru/data_types/special_data_types/index.md +++ b/docs/ru/data_types/special_data_types/index.md @@ -1,4 +1,4 @@ -# Служебные типы данных +# Служебные типы данных {#sluzhebnye-tipy-dannykh} Значения служебных типов данных не могут сохраняться в таблицу и выводиться в качестве результата, а возникают как промежуточный результат выполнения запроса. diff --git a/docs/ru/data_types/special_data_types/interval.md b/docs/ru/data_types/special_data_types/interval.md index 6762f9bc850..22912bdbca1 100644 --- a/docs/ru/data_types/special_data_types/interval.md +++ b/docs/ru/data_types/special_data_types/interval.md @@ -7,26 +7,27 @@ Структура: -- Интервал времени в виде положительного целого числа. -- Тип интервала. +- Интервал времени в виде положительного целого числа. +- Тип интервала. Поддержанные типы интервалов: -- `SECOND` -- `MINUTE` -- `HOUR` -- `DAY` -- `WEEK` -- `MONTH` -- `QUARTER` -- `YEAR` +- `SECOND` +- `MINUTE` +- `HOUR` +- `DAY` +- `WEEK` +- `MONTH` +- `QUARTER` +- `YEAR` Каждому типу интервала соответствует отдельный тип данных. Например, тип данных `IntervalDay` соответствует интервалу `DAY`: -```sql +``` sql SELECT toTypeName(INTERVAL 4 DAY) ``` -```text + +``` text ┌─toTypeName(toIntervalDay(4))─┐ │ IntervalDay │ └──────────────────────────────┘ @@ -36,23 +37,25 @@ SELECT toTypeName(INTERVAL 4 DAY) Значения типов `Interval` можно использовать в арифметических операциях со значениями типов [Date](../../data_types/date.md) и [DateTime](../../data_types/datetime.md). Например, можно добавить 4 дня к текущей дате: -```sql +``` sql SELECT now() as current_date_time, current_date_time + INTERVAL 4 DAY ``` -```text + +``` text ┌───current_date_time─┬─plus(now(), toIntervalDay(4))─┐ │ 2019-10-23 10:58:45 │ 2019-10-27 10:58:45 │ └─────────────────────┴───────────────────────────────┘ ``` -Нельзя объединять интервалы различных типов. Нельзя использовать интервалы вида `4 DAY 1 HOUR`. Вместо этого выражайте интервал в единицах меньших или равных минимальной единице интервала, например, интервал "1 день и 1 час" можно выразить как `25 HOUR` или `90000 SECOND`. +Нельзя объединять интервалы различных типов. Нельзя использовать интервалы вида `4 DAY 1 HOUR`. Вместо этого выражайте интервал в единицах меньших или равных минимальной единице интервала, например, интервал «1 день и 1 час» можно выразить как `25 HOUR` или `90000 SECOND`. Арифметические операции со значениями типов `Interval` не доступны, однако можно последовательно добавлять различные интервалы к значениям типов `Date` и `DateTime`. 
Например: -```sql +``` sql SELECT now() AS current_date_time, current_date_time + INTERVAL 4 DAY + INTERVAL 3 HOUR ``` -```text + +``` text ┌───current_date_time─┬─plus(plus(now(), toIntervalDay(4)), toIntervalHour(3))─┐ │ 2019-10-23 11:16:28 │ 2019-10-27 14:16:28 │ └─────────────────────┴────────────────────────────────────────────────────────┘ @@ -60,15 +63,16 @@ SELECT now() AS current_date_time, current_date_time + INTERVAL 4 DAY + INTERVAL Следующий запрос приведёт к генерированию исключения: -```sql +``` sql select now() AS current_date_time, current_date_time + (INTERVAL 4 DAY + INTERVAL 3 HOUR) ``` -```text + +``` text Received exception from server (version 19.14.1): -Code: 43. DB::Exception: Received from localhost:9000. DB::Exception: Wrong argument types for function plus: if one argument is Interval, then another must be Date or DateTime.. +Code: 43. DB::Exception: Received from localhost:9000. DB::Exception: Wrong argument types for function plus: if one argument is Interval, then another must be Date or DateTime.. ``` -## Смотрите также +## Смотрите также {#smotrite-takzhe} -- Оператор[INTERVAL](../../query_language/operators.md#operator-interval) -- Функция приведения типа [toInterval](../../query_language/functions/type_conversion_functions.md#function-tointerval) +- Оператор[INTERVAL](../../query_language/operators.md#operator-interval) +- Функция приведения типа [toInterval](../../query_language/functions/type_conversion_functions.md#function-tointerval) diff --git a/docs/ru/data_types/special_data_types/nothing.md b/docs/ru/data_types/special_data_types/nothing.md index fde3722e684..ab4b96cc519 100644 --- a/docs/ru/data_types/special_data_types/nothing.md +++ b/docs/ru/data_types/special_data_types/nothing.md @@ -1,14 +1,14 @@ - -# Nothing +# Nothing {#nothing} Этот тип данных предназначен только для того, чтобы представлять [NULL](../../query_language/syntax.md), т.е. отсутствие значения. Невозможно создать значение типа `Nothing`, поэтому он используется там, где значение не подразумевается. Например, `NULL` записывается как `Nullable(Nothing)` ([Nullable](../../data_types/nullable.md) — это тип данных, позволяющий хранить `NULL` в таблицах). Также тип `Nothing` используется для обозначения пустых массивов: -```sql +``` sql SELECT toTypeName(Array()) ``` -```text + +``` text ┌─toTypeName(array())─┐ │ Array(Nothing) │ └─────────────────────┘ diff --git a/docs/ru/data_types/special_data_types/set.md b/docs/ru/data_types/special_data_types/set.md index 6326cf6e9ce..9a99ed0ca9b 100644 --- a/docs/ru/data_types/special_data_types/set.md +++ b/docs/ru/data_types/special_data_types/set.md @@ -1,4 +1,4 @@ -# Set +# Set {#set} Используется для представления правой части выражения IN. diff --git a/docs/ru/data_types/string.md b/docs/ru/data_types/string.md index ec03eff41e7..4669d154df2 100644 --- a/docs/ru/data_types/string.md +++ b/docs/ru/data_types/string.md @@ -1,10 +1,9 @@ - -# String +# String {#string} Строки произвольной длины. Длина не ограничена. Значение может содержать произвольный набор байт, включая нулевые байты. Таким образом, тип String заменяет типы VARCHAR, BLOB, CLOB и т. п. из других СУБД. -## Кодировки +## Кодировки {#kodirovki} В ClickHouse нет понятия кодировок. Строки могут содержать произвольный набор байт, который хранится и выводится, как есть. Если вам нужно хранить тексты, рекомендуется использовать кодировку UTF-8. 
По крайней мере, если у вас терминал работает в кодировке UTF-8 (это рекомендуется), вы сможете читать и писать свои значения без каких-либо преобразований. diff --git a/docs/ru/data_types/tuple.md b/docs/ru/data_types/tuple.md index 5f7e4063690..17732d04953 100644 --- a/docs/ru/data_types/tuple.md +++ b/docs/ru/data_types/tuple.md @@ -1,5 +1,4 @@ - -# Tuple(T1, T2, ...) +# Tuple(T1, T2, …) {#tuplet1-t2} Кортеж из элементов любого [типа](index.md#data_types). Элементы кортежа могут быть одного или разных типов. @@ -7,35 +6,37 @@ Кортежи могут быть результатом запроса. В этом случае, в текстовых форматах кроме JSON, значения выводятся в круглых скобках через запятую. В форматах JSON, кортежи выводятся в виде массивов (в квадратных скобках). -## Создание кортежа +## Создание кортежа {#sozdanie-kortezha} Кортеж можно создать с помощью функции -```sql +``` sql tuple(T1, T2, ...) ``` Пример создания кортежа: -```sql +``` sql SELECT tuple(1,'a') AS x, toTypeName(x) ``` -```text + +``` text ┌─x───────┬─toTypeName(tuple(1, 'a'))─┐ │ (1,'a') │ Tuple(UInt8, String) │ └─────────┴───────────────────────────┘ ``` -## Особенности работы с типами данных +## Особенности работы с типами данных {#osobennosti-raboty-s-tipami-dannykh} -При создании кортежа "на лету" ClickHouse автоматически определяет тип каждого аргументов как минимальный из типов, который может сохранить значение аргумента. Если аргумент — [NULL](../query_language/syntax.md#null-literal), то тип элемента кортежа — [Nullable](nullable.md). +При создании кортежа «на лету» ClickHouse автоматически определяет тип каждого аргументов как минимальный из типов, который может сохранить значение аргумента. Если аргумент — [NULL](../query_language/syntax.md#null-literal), то тип элемента кортежа — [Nullable](nullable.md). Пример автоматического определения типа данных: -```sql +``` sql SELECT tuple(1,NULL) AS x, toTypeName(x) ``` -```text + +``` text ┌─x────────┬─toTypeName(tuple(1, NULL))──────┐ │ (1,NULL) │ Tuple(UInt8, Nullable(Nothing)) │ └──────────┴─────────────────────────────────┘ diff --git a/docs/ru/data_types/uuid.md b/docs/ru/data_types/uuid.md index 523f4f8c64a..24f43cc5d24 100644 --- a/docs/ru/data_types/uuid.md +++ b/docs/ru/data_types/uuid.md @@ -4,21 +4,21 @@ Пример UUID значения представлен ниже: -```text +``` text 61f0c404-5cb3-11e7-907b-a6006ad3dba0 ``` Если при вставке новой записи значение для UUID-колонки не указано, UUID идентификатор будет заполнен нулями: -```text +``` text 00000000-0000-0000-0000-000000000000 ``` -## Как сгенерировать UUID +## Как сгенерировать UUID {#kak-sgenerirovat-uuid} Для генерации UUID-значений предназначена функция [generateUUIDv4](../query_language/functions/uuid_functions.md). -## Примеры использования +## Примеры использования {#primery-ispolzovaniia} Ниже представлены примеры работы с UUID. @@ -26,16 +26,19 @@ Этот пример демонстрирует, как создать таблицу с UUID-колонкой и добавить в нее сгенерированный UUID. -```sql +``` sql CREATE TABLE t_uuid (x UUID, y String) ENGINE=TinyLog ``` -```sql + +``` sql INSERT INTO t_uuid SELECT generateUUIDv4(), 'Example 1' ``` -```sql + +``` sql SELECT * FROM t_uuid ``` -```text + +``` text ┌────────────────────────────────────x─┬─y─────────┐ │ 417ddc5d-e556-4d27-95dd-a34d84e46a50 │ Example 1 │ └──────────────────────────────────────┴───────────┘ @@ -45,24 +48,25 @@ SELECT * FROM t_uuid В этом примере, при добавлении записи в таблицу значение для UUID-колонки не задано. UUID будет заполнен нулями. 
-```sql +``` sql INSERT INTO t_uuid (y) VALUES ('Example 2') ``` -```sql + +``` sql SELECT * FROM t_uuid ``` -```text + +``` text ┌────────────────────────────────────x─┬─y─────────┐ │ 417ddc5d-e556-4d27-95dd-a34d84e46a50 │ Example 1 │ │ 00000000-0000-0000-0000-000000000000 │ Example 2 │ └──────────────────────────────────────┴───────────┘ ``` -## Ограничения +## Ограничения {#ogranicheniia} Тип данных UUID можно использовать только с функциями, которые поддерживаются типом данных [String](string.md) (например, [min](../query_language/agg_functions/reference.md#agg_function-min), [max](../query_language/agg_functions/reference.md#agg_function-max), и [count](../query_language/agg_functions/reference.md#agg_function-count)). -Тип данных UUID не поддерживается арифметическими операциями (например, [abs](../query_language/functions/arithmetic_functions.md#arithm_func-abs)) или агрегатными функциями, такими как [sum](../query_language/agg_functions/reference.md#agg_function-sum) и [avg](../query_language/agg_functions/reference.md#agg_function-avg). +Тип данных UUID не поддерживается арифметическими операциями (например, [abs](../query_language/functions/arithmetic_functions.md#arithm_func-abs)) или агрегатными функциями, такими как [sum](../query_language/agg_functions/reference.md#agg_function-sum) и [avg](../query_language/agg_functions/reference.md#agg_function-avg). [Original article](https://clickhouse.tech/docs/en/data_types/uuid/) - diff --git a/docs/ru/database_engines/index.md b/docs/ru/database_engines/index.md index 52aefe1bba2..982324e0408 100644 --- a/docs/ru/database_engines/index.md +++ b/docs/ru/database_engines/index.md @@ -1,4 +1,4 @@ -# Движки баз данных +# Движки баз данных {#dvizhki-baz-dannykh} Движки баз данных обеспечивают работу с таблицами. @@ -6,5 +6,4 @@ Также можно использовать следующие движки баз данных: -- [MySQL](mysql.md) - +- [MySQL](mysql.md) diff --git a/docs/ru/database_engines/lazy.md b/docs/ru/database_engines/lazy.md deleted file mode 120000 index 66830dcdb2f..00000000000 --- a/docs/ru/database_engines/lazy.md +++ /dev/null @@ -1 +0,0 @@ -../../en/database_engines/lazy.md \ No newline at end of file diff --git a/docs/ru/database_engines/lazy.md b/docs/ru/database_engines/lazy.md new file mode 100644 index 00000000000..c1538ec91bc --- /dev/null +++ b/docs/ru/database_engines/lazy.md @@ -0,0 +1,13 @@ +# Lazy {#lazy} + +Хранит таблицы в оперативной памяти только в течение `expiration_time_in_seconds` секунд после последнего обращения к ним. Может использоваться только с таблицами \*Log. + +Он оптимизирован для хранения множества небольших таблиц \*Log, для которых обычно существует большой временной интервал между обращениями. + +## Создание базы данных {#creating-a-database} + +``` sql +CREATE DATABASE testlazy ENGINE = Lazy(expiration_time_in_seconds); +``` + +[Оригинальная статья](https://clickhouse.tech/docs/ru/database_engines/lazy/) diff --git a/docs/ru/database_engines/mysql.md b/docs/ru/database_engines/mysql.md index 28922638744..420ca370297 100644 --- a/docs/ru/database_engines/mysql.md +++ b/docs/ru/database_engines/mysql.md @@ -1,4 +1,4 @@ -# MySQL +# MySQL {#mysql} Позволяет подключаться к базам данных на удалённом MySQL сервере и выполнять запросы `INSERT` и `SELECT` для обмена данными между ClickHouse и MySQL.
@@ -6,52 +6,53 @@ Не поддерживаемые виды запросов: -- `ATTACH`/`DETACH` -- `DROP` -- `RENAME` -- `CREATE TABLE` -- `ALTER` +- `ATTACH`/`DETACH` +- `DROP` +- `RENAME` +- `CREATE TABLE` +- `ALTER` -## Создание базы данных +## Создание базы данных {#sozdanie-bazy-dannykh} -```sql +``` sql CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] ENGINE = MySQL('host:port', 'database', 'user', 'password') ``` **Параметры движка** -- `host:port` — адрес сервера MySQL. -- `database` — имя базы данных на удалённом сервере. -- `user` — пользователь MySQL. -- `password` — пароль пользователя. +- `host:port` — адрес сервера MySQL. +- `database` — имя базы данных на удалённом сервере. +- `user` — пользователь MySQL. +- `password` — пароль пользователя. -## Поддержка типов данных +## Поддержка типов данных {#podderzhka-tipov-dannykh} -| MySQL | ClickHouse | -| ------ | ------------ | -| UNSIGNED TINYINT | [UInt8](../data_types/int_uint.md) | -| TINYINT | [Int8](../data_types/int_uint.md) | -| UNSIGNED SMALLINT | [UInt16](../data_types/int_uint.md) | -| SMALLINT | [Int16](../data_types/int_uint.md) | -| UNSIGNED INT, UNSIGNED MEDIUMINT | [UInt32](../data_types/int_uint.md) | -| INT, MEDIUMINT | [Int32](../data_types/int_uint.md) | -| UNSIGNED BIGINT | [UInt64](../data_types/int_uint.md) | -| BIGINT | [Int64](../data_types/int_uint.md) | -| FLOAT | [Float32](../data_types/float.md) | -| DOUBLE | [Float64](../data_types/float.md) | -| DATE | [Date](../data_types/date.md) | -| DATETIME, TIMESTAMP | [DateTime](../data_types/datetime.md) | -| BINARY | [FixedString](../data_types/fixedstring.md) | +| MySQL | ClickHouse | +|----------------------------------|---------------------------------------------| +| UNSIGNED TINYINT | [UInt8](../data_types/int_uint.md) | +| TINYINT | [Int8](../data_types/int_uint.md) | +| UNSIGNED SMALLINT | [UInt16](../data_types/int_uint.md) | +| SMALLINT | [Int16](../data_types/int_uint.md) | +| UNSIGNED INT, UNSIGNED MEDIUMINT | [UInt32](../data_types/int_uint.md) | +| INT, MEDIUMINT | [Int32](../data_types/int_uint.md) | +| UNSIGNED BIGINT | [UInt64](../data_types/int_uint.md) | +| BIGINT | [Int64](../data_types/int_uint.md) | +| FLOAT | [Float32](../data_types/float.md) | +| DOUBLE | [Float64](../data_types/float.md) | +| DATE | [Date](../data_types/date.md) | +| DATETIME, TIMESTAMP | [DateTime](../data_types/datetime.md) | +| BINARY | [FixedString](../data_types/fixedstring.md) | Все прочие типы данных преобразуются в [String](../data_types/string.md). [Nullable](../data_types/nullable.md) поддержан. 
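Например, условный набросок того, как отображаются «прочие» типы (имена таблицы и столбцов здесь предположительные, база `test` и база `mysql_db` с движком MySQL создаются так же, как в примерах ниже): столбец MySQL типа `TEXT`, допускающий `NULL`, должен отображаться в ClickHouse как `Nullable(String)`, а `INT UNSIGNED NOT NULL` — как `UInt32`.

``` sql
-- Предположим, что в MySQL-базе test объявлена таблица:
--   CREATE TABLE comments (id INT UNSIGNED NOT NULL, body TEXT);

-- Тогда на стороне ClickHouse (после создания базы mysql_db, как в примерах ниже):
DESCRIBE TABLE mysql_db.comments;

-- Ожидаемые типы столбцов: id UInt32, body Nullable(String).
```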
-## Примеры использования +## Примеры использования {#primery-ispolzovaniia} Таблица в MySQL: -```text + +``` text mysql> USE test; Database changed @@ -72,17 +73,18 @@ mysql> select * from mysql_table; +--------+-------+ 1 row in set (0,00 sec) ``` + База данных в ClickHouse, позволяющая обмениваться данными с сервером MySQL: -```sql +``` sql CREATE DATABASE mysql_db ENGINE = MySQL('localhost:3306', 'test', 'my_user', 'user_password') ``` -```sql +``` sql SHOW DATABASES ``` -```text +``` text ┌─name─────┐ │ default │ │ mysql_db │ @@ -90,38 +92,37 @@ SHOW DATABASES └──────────┘ ``` -```sql +``` sql SHOW TABLES FROM mysql_db ``` -```text +``` text ┌─name─────────┐ │ mysql_table │ └──────────────┘ ``` -```sql +``` sql SELECT * FROM mysql_db.mysql_table ``` -```text +``` text ┌─int_id─┬─value─┐ │ 1 │ 2 │ └────────┴───────┘ ``` -```sql +``` sql INSERT INTO mysql_db.mysql_table VALUES (3,4) ``` -```sql +``` sql SELECT * FROM mysql_db.mysql_table ``` -```text +``` text ┌─int_id─┬─value─┐ │ 1 │ 2 │ │ 3 │ 4 │ └────────┴───────┘ ``` - diff --git a/docs/ru/development/browse_code.md b/docs/ru/development/browse_code.md index e18ba21b889..55a1628301f 100644 --- a/docs/ru/development/browse_code.md +++ b/docs/ru/development/browse_code.md @@ -1,4 +1,4 @@ -# Навигация по коду ClickHouse +# Навигация по коду ClickHouse {#navigatsiia-po-kodu-clickhouse} Для навигации по коду онлайн доступен **Woboq**, он расположен [здесь](https://clickhouse-test-reports.s3.yandex.net/codebrowser/html_report///ClickHouse/dbms/src/index.html). В нём реализовано удобное перемещение между исходными файлами, семантическая подсветка, подсказки, индексация и поиск. Слепок кода обновляется ежедневно. diff --git a/docs/ru/development/contrib.md b/docs/ru/development/contrib.md index c0f2c5e1d6f..7a9c7c7c14d 100644 --- a/docs/ru/development/contrib.md +++ b/docs/ru/development/contrib.md @@ -1,35 +1,35 @@ -# Используемые сторонние библиотеки +# Используемые сторонние библиотеки {#ispolzuemye-storonnie-biblioteki} -| Библиотека | Лицензия | -| ------- | ------- | -| base64 | [BSD 2-Clause License](https://github.com/aklomp/base64/blob/a27c565d1b6c676beaf297fe503c4518185666f7/LICENSE) | -| boost | [Boost Software License 1.0](https://github.com/ClickHouse-Extras/boost-extra/blob/6883b40449f378019aec792f9983ce3afc7ff16e/LICENSE_1_0.txt) | -| brotli | [MIT](https://github.com/google/brotli/blob/master/LICENSE) | -| capnproto | [MIT](https://github.com/capnproto/capnproto/blob/master/LICENSE) | -| cctz | [Apache License 2.0](https://github.com/google/cctz/blob/4f9776a310f4952454636363def82c2bf6641d5f/LICENSE.txt) | -| double-conversion | [BSD 3-Clause License](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) | -| FastMemcpy | [MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) | -| googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) | -| h3 | [Apache License 2.0](https://github.com/uber/h3/blob/master/LICENSE) -| hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) | -| libbtrie | [BSD 2-Clause License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) | -| libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | -| libdivide | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | -| libgsasl | [LGPL 
v2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) -| libhdfs3 | [Apache License 2.0](https://github.com/ClickHouse-Extras/libhdfs3/blob/bd6505cbb0c130b0db695305b9a38546fa880e5a/LICENSE.txt) | -| libmetrohash | [Apache License 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libmetrohash/LICENSE) | -| libpcg-random | [Apache License 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libpcg-random/LICENSE-APACHE.txt) | -| libressl | [OpenSSL License](https://github.com/ClickHouse-Extras/ssl/blob/master/COPYING) | -| librdkafka | [BSD 2-Clause License](https://github.com/edenhill/librdkafka/blob/363dcad5a23dc29381cc626620e68ae418b3af19/LICENSE) | -| libwidechar\_width | [CC0 1.0 Universal](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libwidechar_width/LICENSE) | -| llvm | [BSD 3-Clause License](https://github.com/ClickHouse-Extras/llvm/blob/163def217817c90fb982a6daf384744d8472b92b/llvm/LICENSE.TXT) | -| lz4 | [BSD 2-Clause License](https://github.com/lz4/lz4/blob/c10863b98e1503af90616ae99725ecd120265dfb/LICENSE) | -| mariadb-connector-c | [LGPL v2.1](https://github.com/ClickHouse-Extras/mariadb-connector-c/blob/3.1/COPYING.LIB) | -| murmurhash | [Public Domain](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/murmurhash/LICENSE) -| pdqsort | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/pdqsort/license.txt) | -| poco | [Boost Software License - Version 1.0](https://github.com/ClickHouse-Extras/poco/blob/fe5505e56c27b6ecb0dcbc40c49dc2caf4e9637f/LICENSE) | -| protobuf | [BSD 3-Clause License](https://github.com/ClickHouse-Extras/protobuf/blob/12735370922a35f03999afff478e1c6d7aa917a4/LICENSE) | -| re2 | [BSD 3-Clause License](https://github.com/google/re2/blob/7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0/LICENSE) | -| UnixODBC | [LGPL v2.1](https://github.com/ClickHouse-Extras/UnixODBC/tree/b0ad30f7f6289c12b76f04bfb9d466374bb32168) | -| zlib-ng | [Zlib License](https://github.com/ClickHouse-Extras/zlib-ng/blob/develop/LICENSE.md) | -| zstd | [BSD 3-Clause License](https://github.com/facebook/zstd/blob/dev/LICENSE) | +| Библиотека | Лицензия | +|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------| +| base64 | [BSD 2-Clause License](https://github.com/aklomp/base64/blob/a27c565d1b6c676beaf297fe503c4518185666f7/LICENSE) | +| boost | [Boost Software License 1.0](https://github.com/ClickHouse-Extras/boost-extra/blob/6883b40449f378019aec792f9983ce3afc7ff16e/LICENSE_1_0.txt) | +| brotli | [MIT](https://github.com/google/brotli/blob/master/LICENSE) | +| capnproto | [MIT](https://github.com/capnproto/capnproto/blob/master/LICENSE) | +| cctz | [Apache License 2.0](https://github.com/google/cctz/blob/4f9776a310f4952454636363def82c2bf6641d5f/LICENSE.txt) | +| double-conversion | [BSD 3-Clause License](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) | +| FastMemcpy | [MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) | +| googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) | +| h3 | [Apache License 2.0](https://github.com/uber/h3/blob/master/LICENSE) | +| hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) | +| libbtrie | [BSD 2-Clause 
License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) | +| libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | +| libdivide | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | +| libgsasl | [LGPL v2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) | +| libhdfs3 | [Apache License 2.0](https://github.com/ClickHouse-Extras/libhdfs3/blob/bd6505cbb0c130b0db695305b9a38546fa880e5a/LICENSE.txt) | +| libmetrohash | [Apache License 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libmetrohash/LICENSE) | +| libpcg-random | [Apache License 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libpcg-random/LICENSE-APACHE.txt) | +| libressl | [OpenSSL License](https://github.com/ClickHouse-Extras/ssl/blob/master/COPYING) | +| librdkafka | [BSD 2-Clause License](https://github.com/edenhill/librdkafka/blob/363dcad5a23dc29381cc626620e68ae418b3af19/LICENSE) | +| libwidechar\_width | [CC0 1.0 Universal](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libwidechar_width/LICENSE) | +| llvm | [BSD 3-Clause License](https://github.com/ClickHouse-Extras/llvm/blob/163def217817c90fb982a6daf384744d8472b92b/llvm/LICENSE.TXT) | +| lz4 | [BSD 2-Clause License](https://github.com/lz4/lz4/blob/c10863b98e1503af90616ae99725ecd120265dfb/LICENSE) | +| mariadb-connector-c | [LGPL v2.1](https://github.com/ClickHouse-Extras/mariadb-connector-c/blob/3.1/COPYING.LIB) | +| murmurhash | [Public Domain](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/murmurhash/LICENSE) | +| pdqsort | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/pdqsort/license.txt) | +| poco | [Boost Software License - Version 1.0](https://github.com/ClickHouse-Extras/poco/blob/fe5505e56c27b6ecb0dcbc40c49dc2caf4e9637f/LICENSE) | +| protobuf | [BSD 3-Clause License](https://github.com/ClickHouse-Extras/protobuf/blob/12735370922a35f03999afff478e1c6d7aa917a4/LICENSE) | +| re2 | [BSD 3-Clause License](https://github.com/google/re2/blob/7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0/LICENSE) | +| UnixODBC | [LGPL v2.1](https://github.com/ClickHouse-Extras/UnixODBC/tree/b0ad30f7f6289c12b76f04bfb9d466374bb32168) | +| zlib-ng | [Zlib License](https://github.com/ClickHouse-Extras/zlib-ng/blob/develop/LICENSE.md) | +| zstd | [BSD 3-Clause License](https://github.com/facebook/zstd/blob/dev/LICENSE) | diff --git a/docs/ru/development/developer_instruction.md b/docs/ru/development/developer_instruction.md index 5f541359b65..4613c3ca409 100644 --- a/docs/ru/development/developer_instruction.md +++ b/docs/ru/development/developer_instruction.md @@ -1,45 +1,41 @@ Сборка ClickHouse поддерживается на Linux, FreeBSD, Mac OS X. +# Если вы используете Windows {#esli-vy-ispolzuete-windows} -# Если вы используете Windows +Если вы используете Windows, вам потребуется создать виртуальную машину с Ubuntu. Для работы с виртуальной машиной, установите VirtualBox. Скачать Ubuntu можно на сайте: https://www.ubuntu.com/\#download Создайте виртуальную машину из полученного образа. Выделите для неё не менее 4 GB оперативной памяти. Для запуска терминала в Ubuntu, найдите в меню программу со словом terminal (gnome-terminal, konsole или что-то в этом роде) или нажмите Ctrl+Alt+T. -Если вы используете Windows, вам потребуется создать виртуальную машину с Ubuntu. 
Для работы с виртуальной машиной, установите VirtualBox. Скачать Ubuntu можно на сайте: https://www.ubuntu.com/#download Создайте виртуальную машину из полученного образа. Выделите для неё не менее 4 GB оперативной памяти. Для запуска терминала в Ubuntu, найдите в меню программу со словом terminal (gnome-terminal, konsole или что-то в этом роде) или нажмите Ctrl+Alt+T. - -# Если вы используете 32-битную систему +# Если вы используете 32-битную систему {#esli-vy-ispolzuete-32-bitnuiu-sistemu} ClickHouse не работает и не собирается на 32-битных системах. Получите доступ к 64-битной системе и продолжайте. - -# Создание репозитория на GitHub +# Создание репозитория на GitHub {#sozdanie-repozitoriia-na-github} Для работы с репозиторием ClickHouse, вам потребуется аккаунт на GitHub. Наверное, он у вас уже есть. Если аккаунта нет - зарегистрируйтесь на https://github.com/. Создайте ssh ключи, если их нет, и загрузите публичные ключи на GitHub. Это потребуется для отправки изменений. Для работы с GitHub можно использовать такие же ssh ключи, как и для работы с другими ssh серверами - скорее всего, они уже у вас есть. -Создайте fork репозитория ClickHouse. Для этого, на странице https://github.com/ClickHouse/ClickHouse нажмите на кнопку "fork" в правом верхнем углу. Вы получите полную копию репозитория ClickHouse на своём аккаунте, которая называется "форк". Процесс разработки состоит в том, чтобы внести нужные изменения в свой форк репозитория, а затем создать "pull request" для принятия изменений в основной репозиторий. +Создайте fork репозитория ClickHouse. Для этого, на странице https://github.com/ClickHouse/ClickHouse нажмите на кнопку «fork» в правом верхнем углу. Вы получите полную копию репозитория ClickHouse на своём аккаунте, которая называется «форк». Процесс разработки состоит в том, чтобы внести нужные изменения в свой форк репозитория, а затем создать «pull request» для принятия изменений в основной репозиторий. Для работы с git репозиториями, установите `git`. В Ubuntu выполните в терминале: -``` -sudo apt update -sudo apt install git -``` + + sudo apt update + sudo apt install git Краткое руководство по использованию Git: https://services.github.com/on-demand/downloads/github-git-cheat-sheet.pdf Подробное руководство по использованию Git: https://git-scm.com/book/ru/v2 +# Клонирование репозитория на рабочую машину {#klonirovanie-repozitoriia-na-rabochuiu-mashinu} -# Клонирование репозитория на рабочую машину - -Затем вам потребуется загрузить исходники для работы на свой компьютер. Это называется "клонирование репозитория", потому что создаёт на вашем компьютере локальную копию репозитория, с которой вы будете работать. +Затем вам потребуется загрузить исходники для работы на свой компьютер. Это называется «клонирование репозитория», потому что создаёт на вашем компьютере локальную копию репозитория, с которой вы будете работать. Выполните в терминале: -``` -git clone --recursive git@github.com:ClickHouse/ClickHouse.git -cd ClickHouse -``` + + git clone --recursive git@github.com:ClickHouse/ClickHouse.git + cd ClickHouse + Замените *yandex* на имя вашего аккаунта на GitHub. Эта команда создаст директорию ClickHouse, содержащую рабочую копию проекта. @@ -47,73 +43,67 @@ cd ClickHouse Необходимо, чтобы путь к рабочей копии не содержал пробелы в именах директорий. Это может привести к проблемам в работе системы сборки. Обратите внимание, что репозиторий ClickHouse использует submodules. 
Так называются ссылки на дополнительные репозитории (например, внешние библиотеки, от которых зависит проект). Это значит, что при клонировании репозитория, следует указывать ключ `--recursive`, как в примере выше. Если репозиторий был клонирован без submodules, то для их скачивания, необходимо выполнить: -``` -git submodule init -git submodule update -``` + + git submodule init + git submodule update + Проверить наличие submodules можно с помощью команды `git submodule status`. Если вы получили сообщение об ошибке: -``` -Permission denied (publickey). -fatal: Could not read from remote repository. -Please make sure you have the correct access rights -and the repository exists. -``` + Permission denied (publickey). + fatal: Could not read from remote repository. + + Please make sure you have the correct access rights + and the repository exists. + Как правило это означает, что отсутствуют ssh ключи для соединения с GitHub. Ключи расположены в директории `~/.ssh`. В интерфейсе GitHub, в настройках, необходимо загрузить публичные ключи, чтобы он их понимал. Вы также можете клонировать репозиторий по протоколу https: -``` -git clone https://github.com/ClickHouse/ClickHouse.git -``` + + git clone https://github.com/ClickHouse/ClickHouse.git + Этот вариант не подходит для отправки изменений на сервер. Вы можете временно его использовать, а затем добавить ssh ключи и заменить адрес репозитория с помощью команды `git remote`. Вы можете также добавить для своего локального репозитория адрес оригинального репозитория Яндекса, чтобы притягивать оттуда обновления: -``` -git remote add upstream git@github.com:ClickHouse/ClickHouse.git -``` + + git remote add upstream git@github.com:ClickHouse/ClickHouse.git + После этого, вы сможете добавлять в свой репозиторий обновления из репозитория Яндекса с помощью команды `git pull upstream master`. - -## Работа с сабмодулями git +## Работа с сабмодулями git {#rabota-s-sabmoduliami-git} Работа с сабмодулями git может быть достаточно болезненной. Следующие команды позволят содержать их в порядке: -``` -# ! Каждая команда принимает аргумент --recursive -# Обновить URLs удалённого репозитория для каждого сабмодуля, используется относительно редко -git submodule sync -# Добавить новые сабмодули -git submodule init -# Обновить сабмодули до актуального состояния -git submodule update -# Две последние команды могут быть объединены вместе: -git submodule update --init -``` + # ! Каждая команда принимает аргумент --recursive + # Обновить URLs удалённого репозитория для каждого сабмодуля, используется относительно редко + git submodule sync + # Добавить новые сабмодули + git submodule init + # Обновить сабмодули до актуального состояния + git submodule update + # Две последние команды могут быть объединены вместе: + git submodule update --init The next commands would help you to reset all submodules to the initial state (!WARING! - any chenges inside will be deleted): Следующие команды помогут сбросить все сабмодули в изначальное состояние (!ВНИМАНИЕ! 
- все изменения в сабмодулях будут утеряны): -``` -# Synchronizes submodules' remote URL with .gitmodules -# Обновить URLs удалённого репозитория для каждого сабмодуля -git submodule sync --recursive -# Обновить существующие модули и добавить отсутствующие -git submodule update --init --recursive -# Удалить все изменения в сабмодуле относительно HEAD -git submodule foreach git reset --hard -# Очистить игнорируемые файлы -git submodule foreach git clean -xfd -# Повторить последние 4 команды для каждого из сабмодулей -git submodule foreach git submodule sync --recursive -git submodule foreach git submodule update --init --recursive -git submodule foreach git submodule foreach git reset --hard -git submodule foreach git submodule foreach git clean -xfd -``` + # Synchronizes submodules' remote URL with .gitmodules + # Обновить URLs удалённого репозитория для каждого сабмодуля + git submodule sync --recursive + # Обновить существующие модули и добавить отсутствующие + git submodule update --init --recursive + # Удалить все изменения в сабмодуле относительно HEAD + git submodule foreach git reset --hard + # Очистить игнорируемые файлы + git submodule foreach git clean -xfd + # Повторить последние 4 команды для каждого из сабмодулей + git submodule foreach git submodule sync --recursive + git submodule foreach git submodule update --init --recursive + git submodule foreach git submodule foreach git reset --hard + git submodule foreach git submodule foreach git clean -xfd - -# Система сборки +# Система сборки {#sistema-sborki} ClickHouse использует систему сборки CMake и Ninja. @@ -127,20 +117,17 @@ Ninja - система запуска сборочных задач. Если у вас Arch или Gentoo, то вы сами знаете, как установить CMake. Для установки CMake и Ninja на Mac OS X, сначала установите Homebrew, а затем, с помощью него, установите всё остальное. -``` -/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" -brew install cmake ninja -``` + + /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" + brew install cmake ninja Проверьте версию CMake: `cmake --version`. Если версия меньше 3.3, то установите новую версию с сайта https://cmake.org/download/ - -# Необязательные внешние библиотеки +# Необязательные внешние библиотеки {#neobiazatelnye-vneshnie-biblioteki} ClickHouse использует для сборки некоторое количество внешних библиотек. Но ни одну из них не требуется отдельно устанавливать, так как они собираются вместе с ClickHouse, из исходников, которые расположены в submodules. Посмотреть набор этих библиотек можно в директории contrib. - -# Компилятор C++ +# Компилятор C++ {#kompiliator-c} В качестве компилятора C++ поддерживается GCC начиная с версии 9 или Clang начиная с версии 8. @@ -148,61 +135,58 @@ ClickHouse использует для сборки некоторое коли Для установки GCC под Ubuntu, выполните: `sudo apt install gcc g++`. -Проверьте версию gcc: `gcc --version`. Если версия меньше 9, то следуйте инструкции: https://clickhouse.tech/docs/en/development/build/#install-gcc-9 +Проверьте версию gcc: `gcc --version`. Если версия меньше 9, то следуйте инструкции: https://clickhouse.tech/docs/en/development/build/\#install-gcc-9 Сборка под Mac OS X поддерживается только для компилятора Clang. Чтобы установить его выполните `brew install llvm` Если вы решили использовать Clang, вы также можете установить `libc++` и `lld`, если вы знаете, что это такое. При желании, установите `ccache`. 
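Например, установка Clang вместе с `libc++`, `lld` и `ccache` под Ubuntu может выглядеть примерно так (это лишь набросок: названия пакетов приведены для примера и могут отличаться в зависимости от версии дистрибутива):

    # установка компилятора и вспомогательных инструментов (набросок)
    sudo apt install clang libc++-dev libc++abi-dev lld ccache
    # выбрать clang в качестве компилятора для последующего запуска cmake
    export CC=clang CXX=clang++

Устанавливайте только те инструменты, которыми действительно собираетесь пользоваться.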
- -# Процесс сборки +# Процесс сборки {#protsess-sborki} Теперь вы готовы к сборке ClickHouse. Для размещения собранных файлов, рекомендуется создать отдельную директорию build внутри директории ClickHouse: -``` -mkdir build -cd build -``` -Вы можете иметь несколько разных директорий (build_release, build_debug) для разных вариантов сборки. + + mkdir build + cd build + +Вы можете иметь несколько разных директорий (build\_release, build\_debug) для разных вариантов сборки. Находясь в директории build, выполните конфигурацию сборки с помощью CMake. Перед первым запуском необходимо выставить переменные окружения, отвечающие за выбор компилятора (в данном примере это - gcc версии 9). Linux: -``` -export CC=gcc-9 CXX=g++-9 -cmake .. -``` + + export CC=gcc-9 CXX=g++-9 + cmake .. Mac OS X: -``` -export CC=clang CXX=clang++ -cmake .. -``` + + export CC=clang CXX=clang++ + cmake .. + Переменная CC отвечает за компилятор C (сокращение от слов C Compiler), переменная CXX отвечает за выбор компилятора C++ (символ X - это как плюс, но положенный набок, ради того, чтобы превратить его в букву). Для более быстрой сборки, можно использовать debug вариант - сборку без оптимизаций. Для этого, укажите параметр `-D CMAKE_BUILD_TYPE=Debug`: -``` -cmake -D CMAKE_BUILD_TYPE=Debug .. -``` + + cmake -D CMAKE_BUILD_TYPE=Debug .. + Вы можете изменить вариант сборки, выполнив эту команду в директории build. Запустите ninja для сборки: -``` -ninja clickhouse-server clickhouse-client -``` + + ninja clickhouse-server clickhouse-client + В этом примере собираются только нужные в первую очередь программы. Если вы хотите собрать все программы (утилиты и тесты), то запустите ninja без параметров: -``` -ninja -``` + + ninja Для полной сборки требуется около 30 GB свободного места на диске или 15 GB для сборки только основных программ. При наличии небольшого количества оперативной памяти на компьютере, следует ограничить количество параллельных задач с помощью параметра `-j`: -``` -ninja -j 1 clickhouse-server clickhouse-client -``` + + ninja -j 1 clickhouse-server clickhouse-client + На машинах с 4 GB памяти, рекомендуется указывать значение 1, а если памяти до 8 GB, укажите значение 2. Если вы получили сообщение `ninja: error: loading 'build.ninja': No such file or directory`, значит конфигурация сборки прошла с ошибкой и вам необходимо посмотреть на сообщение об ошибке выше. @@ -212,56 +196,47 @@ ninja -j 1 clickhouse-server clickhouse-client В процессе сборки могут появится сообщения `libprotobuf WARNING` про protobuf файлы в библиотеке libhdfs2. Это не имеет значения. При успешной сборке, вы получите готовый исполняемый файл `ClickHouse/build/dbms/programs/clickhouse`: -``` -ls -l dbms/programs/clickhouse -``` + ls -l dbms/programs/clickhouse -# Запуск собранной версии ClickHouse +# Запуск собранной версии ClickHouse {#zapusk-sobrannoi-versii-clickhouse} Для запуска сервера из под текущего пользователя, с выводом логов в терминал и с использованием примеров конфигурационных файлов, расположенных в исходниках, перейдите в директорию `ClickHouse/dbms/programs/server/` (эта директория находится не в директории build) и выполните: -``` -../../../build/dbms/programs/clickhouse server -``` + ../../../build/dbms/programs/clickhouse server В этом случае, ClickHouse будет использовать конфигурационные файлы, расположенные в текущей директории. Вы можете запустить `clickhouse server` из любой директории, передав ему путь к конфигурационному файлу в аргументе командной строки `--config-file`. 
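Например, собранный сервер можно запустить из корня рабочей копии, явно указав конфигурационный файл из исходников (пути приведены для примера, подставьте свои):

    # запуск собранного сервера с конфигурацией из исходников
    ./build/dbms/programs/clickhouse server --config-file dbms/programs/server/config.xml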
Для подключения к ClickHouse с помощью clickhouse-client, в соседнем терминале, зайдите в директорию `ClickHouse/build/dbms/programs/` и выполните `clickhouse client`. Если вы получили сообщение `Connection refused` на Mac OS X или FreeBSD, то укажите для клиента 127.0.0.1 в качестве имени хоста: -``` -clickhouse client --host 127.0.0.1 -``` + + clickhouse client --host 127.0.0.1 Вы можете заменить собранным вами ClickHouse продакшен версию, установленную в системе. Для этого, установите ClickHouse на свою машину по инструкции с официального сайта. Затем выполните: -``` -sudo service clickhouse-server stop -sudo cp ClickHouse/build/dbms/programs/clickhouse /usr/bin/ -sudo service clickhouse-server start -``` + + sudo service clickhouse-server stop + sudo cp ClickHouse/build/dbms/programs/clickhouse /usr/bin/ + sudo service clickhouse-server start Обратите внимание, что `clickhouse-client`, `clickhouse-server` и другие, являеются симлинками на общий бинарник `clickhouse`. Также вы можете запустить собранный вами ClickHouse с конфигурационным файлом системного ClickHouse: -``` -sudo service clickhouse-server stop -sudo -u clickhouse ClickHouse/build/dbms/programs/clickhouse server --config-file /etc/clickhouse-server/config.xml -``` + sudo service clickhouse-server stop + sudo -u clickhouse ClickHouse/build/dbms/programs/clickhouse server --config-file /etc/clickhouse-server/config.xml -# Среда разработки +# Среда разработки {#sreda-razrabotki} Если вы не знаете, какую среду разработки использовать, то рекомендуется использовать CLion. CLion является платным ПО, но его можно использовать бесплатно в течение пробного периода. Также он бесплатен для учащихся. CLion можно использовать как под Linux, так и под Mac OS X. -Также в качестве среды разработки, вы можете использовать KDevelop или QTCreator. KDevelop - очень удобная, но нестабильная среда разработки. Если KDevelop вылетает через небольшое время после открытия проекта, вам следует нажать на кнопку "Stop All" как только он открыл список файлов проекта. После этого, KDevelop можно будет использовать. +Также в качестве среды разработки, вы можете использовать KDevelop или QTCreator. KDevelop - очень удобная, но нестабильная среда разработки. Если KDevelop вылетает через небольшое время после открытия проекта, вам следует нажать на кнопку «Stop All» как только он открыл список файлов проекта. После этого, KDevelop можно будет использовать. В качестве простых редакторов кода можно использовать Sublime Text или Visual Studio Code или Kate (все варианты доступны под Linux). На всякий случай заметим, что CLion самостоятельно создаёт свою build директорию, самостоятельно выбирает тип сборки debug по-умолчанию, для конфигурации использует встроенную в CLion версию CMake вместо установленного вами, а для запуска задач использует make вместо ninja. Это нормально, просто имейте это ввиду, чтобы не возникало путаницы. 
- -# Написание кода +# Написание кода {#napisanie-koda} Описание архитектуры ClickHouse: https://clickhouse.tech/docs/ru/development/architecture/ @@ -269,41 +244,37 @@ sudo -u clickhouse ClickHouse/build/dbms/programs/clickhouse server --config-fil Разработка тестов: https://clickhouse.tech/docs/ru/development/tests/ -Список задач: https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/instructions/easy_tasks_sorted_ru.md +Список задач: https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/instructions/easy\_tasks\_sorted\_ru.md - -# Тестовые данные +# Тестовые данные {#testovye-dannye} Разработка ClickHouse часто требует загрузки реалистичных наборов данных. Особенно это важно для тестирования производительности. Специально для вас мы подготовили набор данных, представляющий собой анонимизированные данные Яндекс.Метрики. Загрузка этих данных потребует ещё 3 GB места на диске. Для выполнения большинства задач разработки, загружать эти данные не обязательно. -``` -sudo apt install wget xz-utils + sudo apt install wget xz-utils -wget https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz -wget https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz + wget https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz + wget https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz -xz -v -d hits_v1.tsv.xz -xz -v -d visits_v1.tsv.xz + xz -v -d hits_v1.tsv.xz + xz -v -d visits_v1.tsv.xz -clickhouse-client + clickhouse-client -CREATE TABLE test.hits ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming 
Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree PARTITION BY toYYYYMM(EventDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID), EventTime); + CREATE TABLE test.hits ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID 
UInt64, ShareService String, ShareURL String, ShareTitle String, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree PARTITION BY toYYYYMM(EventDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID), EventTime); -CREATE TABLE test.visits ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), `Goals.ID` Array(UInt32), `Goals.Serial` Array(UInt32), `Goals.EventTime` Array(DateTime), `Goals.Price` Array(Int64), `Goals.OrderID` Array(String), `Goals.CurrencyID` Array(UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, `TraficSource.ID` Array(Int8), `TraficSource.SearchEngineID` Array(UInt16), `TraficSource.AdvEngineID` Array(UInt8), `TraficSource.PlaceID` Array(UInt16), `TraficSource.SocialSourceNetworkID` Array(UInt8), 
`TraficSource.Domain` Array(String), `TraficSource.SearchPhrase` Array(String), `TraficSource.SocialSourcePage` Array(String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), `Market.Type` Array(UInt8), `Market.GoalID` Array(UInt32), `Market.OrderID` Array(String), `Market.OrderPrice` Array(Int64), `Market.PP` Array(UInt32), `Market.DirectPlaceID` Array(UInt32), `Market.DirectOrderID` Array(UInt32), `Market.DirectBannerID` Array(UInt32), `Market.GoodID` Array(String), `Market.GoodName` Array(String), `Market.GoodQuantity` Array(Int32), `Market.GoodPrice` Array(Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID); + CREATE TABLE test.visits ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), `Goals.ID` Array(UInt32), `Goals.Serial` Array(UInt32), `Goals.EventTime` Array(DateTime), `Goals.Price` Array(Int64), `Goals.OrderID` Array(String), `Goals.CurrencyID` Array(UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, 
ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, `TraficSource.ID` Array(Int8), `TraficSource.SearchEngineID` Array(UInt16), `TraficSource.AdvEngineID` Array(UInt8), `TraficSource.PlaceID` Array(UInt16), `TraficSource.SocialSourceNetworkID` Array(UInt8), `TraficSource.Domain` Array(String), `TraficSource.SearchPhrase` Array(String), `TraficSource.SocialSourcePage` Array(String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), `Market.Type` Array(UInt8), `Market.GoalID` Array(UInt32), `Market.OrderID` Array(String), `Market.OrderPrice` Array(Int64), `Market.PP` Array(UInt32), `Market.DirectPlaceID` Array(UInt32), `Market.DirectOrderID` Array(UInt32), `Market.DirectBannerID` Array(UInt32), `Market.GoodID` Array(String), `Market.GoodName` Array(String), `Market.GoodQuantity` Array(Int32), `Market.GoodPrice` Array(Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID); -clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.hits FORMAT TSV" < hits_v1.tsv -clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.visits FORMAT TSV" < visits_v1.tsv -``` + clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.hits FORMAT TSV" < hits_v1.tsv + clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.visits FORMAT TSV" < visits_v1.tsv +# Создание pull request {#sozdanie-pull-request} -# Создание pull request +Откройте свой форк репозитория в интерфейсе GitHub. Если вы вели разработку в бранче, выберите этот бранч. На странице будет доступна кнопка «Pull request». По сути, это означает «создать заявку на принятие моих изменений в основной репозиторий». -Откройте свой форк репозитория в интерфейсе GitHub. Если вы вели разработку в бранче, выберите этот бранч. На странице будет доступна кнопка "Pull request". По сути, это означает "создать заявку на принятие моих изменений в основной репозиторий". 
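Перед созданием pull request не забудьте отправить ветку с вашими изменениями в свой форк на GitHub (имя ветки здесь условное):

    # отправить локальную ветку с изменениями в свой форк (origin)
    git push -u origin my-feature-branch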
+Pull request можно создать, даже если работа над задачей ещё не завершена. В этом случае, добавьте в его название слово «WIP» (work in progress). Название можно будет изменить позже. Это полезно для совместного просмотра и обсуждения изменений, а также для запуска всех имеющихся тестов. Введите краткое описание изменений - впоследствии, оно будет использовано для релизных changelog. -Pull request можно создать, даже если работа над задачей ещё не завершена. В этом случае, добавьте в его название слово "WIP" (work in progress). Название можно будет изменить позже. Это полезно для совместного просмотра и обсуждения изменений, а также для запуска всех имеющихся тестов. Введите краткое описание изменений - впоследствии, оно будет использовано для релизных changelog. +Тесты будут запущены, как только сотрудники Яндекса поставят для pull request тег «Can be tested». Результаты первых проверок (стиль кода) появятся уже через несколько минут. Результаты сборки появятся примерно через пол часа. Результаты основного набора тестов будут доступны в пределах часа. -Тесты будут запущены, как только сотрудники Яндекса поставят для pull request тег "Can be tested". Результаты первых проверок (стиль кода) появятся уже через несколько минут. Результаты сборки появятся примерно через пол часа. Результаты основного набора тестов будут доступны в пределах часа. - -Система подготовит сборки ClickHouse специально для вашего pull request. Для их получения, нажмите на ссылку "Details" у проверки "Clickhouse build check". Там вы сможете найти прямые ссылки на собранные .deb пакеты ClickHouse, которые, при желании, вы даже сможете установить на свои продакшен серверы (если не страшно). +Система подготовит сборки ClickHouse специально для вашего pull request. Для их получения, нажмите на ссылку «Details» у проверки «Clickhouse build check». Там вы сможете найти прямые ссылки на собранные .deb пакеты ClickHouse, которые, при желании, вы даже сможете установить на свои продакшен серверы (если не страшно). Вероятнее всего, часть сборок не будет успешной с первого раза. Ведь мы проверяем сборку кода и gcc и clang, а при сборке с помощью clang включаются почти все существующие в природе warnings (всегда с флагом `-Werror`). На той же странице, вы сможете найти логи сборки - вам не обязательно самому собирать ClickHouse всеми возможными способами. diff --git a/docs/ru/development/style.md b/docs/ru/development/style.md index d47eb1c0d12..091419394f7 100644 --- a/docs/ru/development/style.md +++ b/docs/ru/development/style.md @@ -1,6 +1,6 @@ -# Как писать код на C++ +# Как писать код на C++ {#kak-pisat-kod-na-c} -## Общее +## Общее {#obshchee} **1.** Этот текст носит рекомендательный характер. @@ -10,7 +10,7 @@ **4.** Многие правила продиктованы не какими либо разумными соображениями, а сложившейся практикой. -## Форматирование +## Форматирование {#formatirovanie} **1.** Большую часть форматирования сделает автоматически `clang-format`. @@ -18,7 +18,7 @@ **3.** Открывающая и закрывающие фигурные скобки на отдельной строке. -```cpp +``` cpp inline void readBoolText(bool & x, ReadBuffer & buf) { char tmp = '0'; @@ -29,30 +29,30 @@ inline void readBoolText(bool & x, ReadBuffer & buf) **4.** Если всё тело функции — один `statement`, то его можно разместить на одной строке. При этом, вокруг фигурных скобок ставятся пробелы (кроме пробела на конце строки). -```cpp +``` cpp inline size_t mask() const { return buf_size() - 1; } inline size_t place(HashValue x) const { return x & mask(); } ``` **5.** Для функций. 
Пробелы вокруг скобок не ставятся. -```cpp +``` cpp void reinsert(const Value & x) ``` -```cpp +``` cpp memcpy(&buf[place_value], &x, sizeof(x)); ``` **6.** В выражениях `if`, `for`, `while` и т.д. перед открывающей скобкой ставится пробел (в отличие от вызовов функций). -```cpp +``` cpp for (size_t i = 0; i < rows; i += storage.index_granularity) ``` -**7.** Вокруг бинарных операторов (`+`, `-`, `*`, `/`, `%`, ...), а также тернарного оператора `?:` ставятся пробелы. +**7.** Вокруг бинарных операторов (`+`, `-`, `*`, `/`, `%`, …), а также тернарного оператора `?:` ставятся пробелы. -```cpp +``` cpp UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); UInt8 month = (s[5] - '0') * 10 + (s[6] - '0'); UInt8 day = (s[8] - '0') * 10 + (s[9] - '0'); @@ -60,7 +60,7 @@ UInt8 day = (s[8] - '0') * 10 + (s[9] - '0'); **8.** Если ставится перенос строки, то оператор пишется на новой строке, и перед ним увеличивается отступ. -```cpp +``` cpp if (elapsed_ns) message << " (" << rows_read_on_server * 1000000000 / elapsed_ns << " rows/s., " @@ -69,7 +69,7 @@ if (elapsed_ns) **9.** Внутри строки можно, выполнять выравнивание с помощью пробелов. -```cpp +``` cpp dst.ClickLogID = click.LogID; dst.ClickEventID = click.EventID; dst.ClickGoodEvent = click.GoodEvent; @@ -79,7 +79,7 @@ dst.ClickGoodEvent = click.GoodEvent; При необходимости, оператор может быть перенесён на новую строку. В этом случае, перед ним увеличивается отступ. -**11.** Унарные операторы `--`, `++`, `*`, `&`, ... не отделяются от аргумента пробелом. +**11.** Унарные операторы `--`, `++`, `*`, `&`, … не отделяются от аргумента пробелом. **12.** После запятой ставится пробел, а перед — нет. Аналогично для точки с запятой внутри выражения `for`. @@ -87,7 +87,7 @@ dst.ClickGoodEvent = click.GoodEvent; **14.** В выражении `template <...>`, между `template` и `<` ставится пробел, а после `<` и до `>` не ставится. -```cpp +``` cpp template struct AggregatedStatElement {} @@ -95,7 +95,7 @@ struct AggregatedStatElement **15.** В классах и структурах, `public`, `private`, `protected` пишется на том же уровне, что и `class/struct`, а остальной код с отступом. -```cpp +``` cpp template class MultiVersion { @@ -108,11 +108,11 @@ public: **16.** Если на весь файл один `namespace` и кроме него ничего существенного нет, то отступ внутри `namespace` не нужен. -**17.** Если блок для выражения `if`, `for`, `while`, ... состоит из одного `statement`, то фигурные скобки не обязательны. Вместо этого поместите `statement` на отдельную строку. Это правило справедливо и для вложенных `if`, `for`, `while`, ... +**17.** Если блок для выражения `if`, `for`, `while`, … состоит из одного `statement`, то фигурные скобки не обязательны. Вместо этого поместите `statement` на отдельную строку. Это правило справедливо и для вложенных `if`, `for`, `while`, … Если внутренний `statement` содержит фигурные скобки или `else`, то внешний блок следует писать в фигурных скобках. -```cpp +``` cpp /// Finish write. for (auto & stream : streams) stream.second->finalize(); @@ -124,7 +124,7 @@ for (auto & stream : streams) **20.** В строковых литералах можно использовать не-ASCII. -```cpp +``` cpp << ", " << (timer.elapsed() / chunks_stats.hits) << " μsec/hit."; ``` @@ -136,7 +136,7 @@ for (auto & stream : streams) **24.** `const` (относящийся к значению) пишется до имени типа. 
-```cpp +``` cpp //correct const char * pos const std::string & s @@ -146,7 +146,7 @@ char const * pos **25.** При объявлении указателя или ссылки, символы `*` и `&` отделяются пробелами с обеих сторон. -```cpp +``` cpp //correct const char * pos //incorrect @@ -160,7 +160,7 @@ const char *pos `using` может быть объявлен локально, например, внутри функции. -```cpp +``` cpp //correct using FileStreams = std::map>; FileStreams streams; @@ -170,14 +170,14 @@ std::map> streams; **27.** Нельзя объявлять несколько переменных разных типов в одном выражении. -```cpp +``` cpp //incorrect int x, *y; ``` **28.** C-style cast не используется. -```cpp +``` cpp //incorrect std::cerr << (int)c <<; std::endl; //correct @@ -196,17 +196,17 @@ std::cerr << static_cast(c) << std::endl; **32.** Всегда используйте префиксный инкремент/декремент, если постфиксный не нужен. -```cpp +``` cpp for (Names::const_iterator it = column_names.begin(); it != column_names.end(); ++it) ``` -## Комментарии +## Комментарии {#kommentarii} **1.** Необходимо обязательно писать комментарии во всех нетривиальных местах. Это очень важно. При написании комментария, можно успеть понять, что код не нужен вообще, или что всё сделано неверно. -```cpp +``` cpp /** Part of piece of memory, that can be used. * For example, if internal_buffer is 1MB, and there was only 10 bytes loaded to buffer from file for reading, * then working_buffer will have size of only 10 bytes @@ -218,7 +218,7 @@ for (Names::const_iterator it = column_names.begin(); it != column_names.end(); **3.** Комментарии пишутся до соответствующего кода. В редких случаях после, на той же строке. -```cpp +``` cpp /** Parses and executes the query. */ void executeQuery( @@ -236,7 +236,7 @@ void executeQuery( **6.** Нельзя писать комментарии, которые не дают дополнительной информации. В частности, нельзя писать пустые комментарии вроде этого: -```cpp +``` cpp /* * Procedure Name: * Original procedure name: @@ -257,9 +257,9 @@ void executeQuery( */ ``` -Пример взят с ресурса [http://home.tamk.fi/~jaalto/course/coding-style/doc/unmaintainable-code/](http://home.tamk.fi/~jaalto/course/coding-style/doc/unmaintainable-code/). +Пример взят с ресурса http://home.tamk.fi/~jaalto/course/coding-style/doc/unmaintainable-code/. -**7.** Нельзя писать мусорные комментарии (автор, дата создания...) в начале каждого файла. +**7.** Нельзя писать мусорные комментарии (автор, дата создания…) в начале каждого файла. **8.** Однострочные комментарии начинаются с трёх слешей: `///` , многострочные с `/**`. Такие комментарии считаются «документирующими». @@ -267,7 +267,7 @@ void executeQuery( **9.** В начале и конце многострочного комментария, не должно быть пустых строк (кроме строки, на которой закрывается многострочный комментарий). -**10.** Для закомментированных кусков кода, используются обычные, не "документирующие" комментарии. +**10.** Для закомментированных кусков кода, используются обычные, не «документирующие» комментарии. **11.** Удаляйте закомментированные куски кода перед коммитом. @@ -275,45 +275,45 @@ void executeQuery( **13.** Не пишите прописными буквами. Не используйте излишнее количество знаков препинания. -```cpp +``` cpp /// WHAT THE FAIL??? ``` **14.** Не составляйте из комментариев строки-разделители. -```cpp +``` cpp ///****************************************************** ``` **15.** Не нужно писать в комментарии диалог (лучше сказать устно). -```cpp +``` cpp /// Why did you do this stuff? 
``` **16.** Не нужно писать комментарий в конце блока о том, что представлял собой этот блок. -```cpp +``` cpp /// for ``` -## Имена +## Имена {#imena} **1.** В именах переменных и членов класса используйте маленькие буквами с подчёркиванием. -```cpp +``` cpp size_t max_block_size; ``` **2.** Имена функций (методов) camelCase с маленькой буквы. -```cpp +``` cpp std::string getName() const override { return "Memory"; } ``` **3.** Имена классов (структур) - CamelCase с большой буквы. Префиксы кроме I для интерфейсов - не используются. -```cpp +``` cpp class StorageMemory : public IStorage ``` @@ -323,21 +323,21 @@ class StorageMemory : public IStorage В более сложных случаях - либо также, как имена классов, либо можно добавить в начало букву `T`. -```cpp +``` cpp template struct AggregatedStatElement ``` **6.** Имена констант — параметров шаблонов: либо также, как имена переменных, либо `N` в простом случае. -```cpp +``` cpp template struct ExtractDomain ``` **7.** Для абстрактных классов (интерфейсов) можно добавить в начало имени букву `I`. -```cpp +``` cpp class IBlockInputStream ``` @@ -345,13 +345,13 @@ class IBlockInputStream В остальных случаях используйте имя, описывающее смысл. -```cpp +``` cpp bool info_successfully_loaded = false; ``` -**9.** В именах `define` и глобальных констант используется ALL_CAPS с подчёркиванием. +**9.** В именах `define` и глобальных констант используется ALL\_CAPS с подчёркиванием. -```cpp +``` cpp #define MAX_SRC_TABLE_NAMES_TO_STORE 1000 ``` @@ -363,12 +363,12 @@ bool info_successfully_loaded = false; **11.** Если имя содержит сокращение, то: -- для имён переменных, всё сокращение пишется маленькими буквами `mysql_connection` (не `mySQL_connection`). -- для имён классов и функций, сохраняются большие буквы в сокращении `MySQLConnection` (не `MySqlConnection`). +- для имён переменных, всё сокращение пишется маленькими буквами `mysql_connection` (не `mySQL_connection`). +- для имён классов и функций, сохраняются большие буквы в сокращении `MySQLConnection` (не `MySqlConnection`). **12.** Параметры конструктора, использующиеся сразу же для инициализации соответствующих членов класса, следует назвать также, как и члены класса, добавив подчёркивание в конец. -```cpp +``` cpp FileQueueProcessor( const std::string & path_, const std::string & prefix_, @@ -385,13 +385,13 @@ FileQueueProcessor( **13.** Именование локальных переменных и членов класса никак не отличается (никакие префиксы не нужны). -```cpp +``` cpp timer (not m_timer) ``` -**14.** Константы в `enum` — CamelCase с большой буквы. Также допустим ALL_CAPS. Если `enum` не локален, то используйте `enum class`. +**14.** Константы в `enum` — CamelCase с большой буквы. Также допустим ALL\_CAPS. Если `enum` не локален, то используйте `enum class`. -```cpp +``` cpp enum class CompressionMethod { QuickLZ = 0, @@ -401,13 +401,13 @@ enum class CompressionMethod **15.** Все имена - по-английски. Транслит с русского использовать нельзя. -```text +``` text не Stroka ``` **16.** Сокращения (из нескольких букв разных слов) в именах можно использовать только если они являются общепринятыми (если для сокращения можно найти расшифровку в английской википедии или сделав поисковый запрос). -```text +``` text `AST`, `SQL`. Не `NVDH` (что-то неведомое) @@ -419,8 +419,7 @@ enum class CompressionMethod **17.** Имена файлов с исходниками на C++ должны иметь расширение только `.cpp`. Заголовочные файлы - только `.h`. - -## Как писать код +## Как писать код {#kak-pisat-kod} **1.** Управление памятью. 
@@ -432,9 +431,9 @@ enum class CompressionMethod Примеры: -- проще всего разместить объект на стеке, или сделать его членом другого класса. -- для большого количества маленьких объектов используйте контейнеры. -- для автоматического освобождения маленького количества объектов, выделенных на куче, используйте `shared_ptr/unique_ptr`. +- проще всего разместить объект на стеке, или сделать его членом другого класса. +- для большого количества маленьких объектов используйте контейнеры. +- для автоматического освобождения маленького количества объектов, выделенных на куче, используйте `shared_ptr/unique_ptr`. **2.** Управление ресурсами. @@ -450,7 +449,7 @@ enum class CompressionMethod В функциях потока, следует ловить и запоминать все исключения, чтобы выкинуть их в основном потоке после `join`. -```cpp +``` cpp /// Если вычислений ещё не было - вычислим первый блок синхронно if (!started) { @@ -466,14 +465,14 @@ if (exception) Ни в коем случае не «проглатывайте» исключения без разбора. Ни в коем случае, не превращайте все исключения без разбора в сообщения в логе. -```cpp +``` cpp //Not correct catch (...) {} ``` Если вам нужно проигнорировать какие-то исключения, то игнорируйте только конкретные, а остальные кидайте обратно. -```cpp +``` cpp catch (const DB::Exception & e) { if (e.code() == ErrorCodes::UNKNOWN_AGGREGATE_FUNCTION) @@ -485,7 +484,7 @@ catch (const DB::Exception & e) При использовании функций, использующих коды возврата или `errno`, проверяйте результат и кидайте исключение. -```cpp +``` cpp if (0 != close(fd)) throwFromErrno("Cannot close file " + file_name, ErrorCodes::CANNOT_CLOSE_FILE); ``` @@ -502,16 +501,16 @@ if (0 != close(fd)) Используйте следующие варианты: -- Сделайте функцию (`done()` или `finalize()`), которая позволяет заранее выполнить всю работу, в процессе которой может возникнуть исключение. Если эта функция была вызвана, то затем в деструкторе не должно возникать исключений. -- Слишком сложную работу (например, отправку данных по сети) можно вообще не делать в деструкторе, рассчитывая, что пользователь заранее позовёт метод для завершения работы. -- Если в деструкторе возникло исключение, желательно не "проглатывать" его, а вывести информацию в лог (если в этом месте доступен логгер). -- В простых программах, если соответствующие исключения не ловятся, и приводят к завершению работы с записью информации в лог, можно не беспокоиться об исключениях, вылетающих из деструкторов, так как вызов `std::terminate` (в случае `noexcept` по умолчанию в C++11), является приемлемым способом обработки исключения. +- Сделайте функцию (`done()` или `finalize()`), которая позволяет заранее выполнить всю работу, в процессе которой может возникнуть исключение. Если эта функция была вызвана, то затем в деструкторе не должно возникать исключений. +- Слишком сложную работу (например, отправку данных по сети) можно вообще не делать в деструкторе, рассчитывая, что пользователь заранее позовёт метод для завершения работы. +- Если в деструкторе возникло исключение, желательно не «проглатывать» его, а вывести информацию в лог (если в этом месте доступен логгер). +- В простых программах, если соответствующие исключения не ловятся, и приводят к завершению работы с записью информации в лог, можно не беспокоиться об исключениях, вылетающих из деструкторов, так как вызов `std::terminate` (в случае `noexcept` по умолчанию в C++11), является приемлемым способом обработки исключения. **6.** Отдельные блоки кода. 
Внутри одной функции, можно создать отдельный блок кода, для того, чтобы сделать некоторые переменные локальными в нём, и для того, чтобы соответствующие деструкторы были вызваны при выходе из блока. -```cpp +``` cpp Block block = data.in->read(); { @@ -527,11 +526,11 @@ ready_any.set(); В программах офлайн обработки данных: -- cначала добейтесь более-менее максимальной производительности на одном процессорном ядре, потом можно распараллеливать код, но только если есть необходимость. +- cначала добейтесь более-менее максимальной производительности на одном процессорном ядре, потом можно распараллеливать код, но только если есть необходимость. В программах - серверах: -- используйте пул потоков для обработки запросов. На данный момент, у нас не было задач, в которых была бы необходимость использовать userspace context switching. +- используйте пул потоков для обработки запросов. На данный момент, у нас не было задач, в которых была бы необходимость использовать userspace context switching. Fork для распараллеливания не используется. @@ -583,7 +582,7 @@ Fork для распараллеливания не используется. В некоторых редких случаях, может потребоваться возвращать значение через аргумент функции. В этом случае, аргументом будет ссылка. -```cpp +``` cpp using AggregateFunctionPtr = std::shared_ptr; /** Позволяет создать агрегатную функцию по её имени. @@ -615,7 +614,7 @@ public: Если потом вам потребовалась отложенная инициализация, то вы можете дописать конструктор по умолчанию (который создаст объект с некорректным состоянием). Или, для небольшого количества объектов, можно использовать `shared_ptr/unique_ptr`. -```cpp +``` cpp Loader(DB::Connection * connection_, const std::string & query, size_t max_block_size_); /// Для отложенной инициализации @@ -670,13 +669,13 @@ Loader() {} **24.** Не нужно использовать `trailing return type` для функций, если в этом нет необходимости. -```cpp +``` cpp [auto f() -> void;]{.strike} ``` **25.** Объявление и инициализация переменных. -```cpp +``` cpp //right way std::string s = "Hello"; std::string s{"Hello"}; @@ -687,14 +686,13 @@ auto s = std::string{"Hello"}; **26.** Для виртуальных функций, пишите `virtual` в базовом классе, а в классах-наследниках, пишите `override` и не пишите `virtual`. - -## Неиспользуемые возможности языка C++ +## Неиспользуемые возможности языка C++ {#neispolzuemye-vozmozhnosti-iazyka-c} **1.** Виртуальное наследование не используется. **2.** Спецификаторы исключений из C++03 не используются. -## Платформа +## Платформа {#platforma} **1.** Мы пишем код под конкретную платформу. @@ -708,7 +706,7 @@ auto s = std::string{"Hello"}; **4.** ОС - Linux Ubuntu, не более старая, чем Precise. -**5.** Код пишется под процессор с архитектурой x86_64. +**5.** Код пишется под процессор с архитектурой x86\_64. Набор инструкций минимальный из поддержаных нашими серверами. Сейчас это - SSE4.2. @@ -718,7 +716,7 @@ auto s = std::string{"Hello"}; **8.** Код разрабатывается и отлаживается с релизными параметрами сборки. -## Инструментарий +## Инструментарий {#instrumentarii} **1.** Хорошая среда разработки - KDevelop. @@ -746,8 +744,7 @@ auto s = std::string{"Hello"}; **10.** Ненужный код удаляется из исходников. - -## Библиотеки +## Библиотеки {#biblioteki} **1.** Используются стандартная библиотека C++14 (допустимо использовать экспериментальные расширения) а также фреймворки `boost`, `Poco`. @@ -763,7 +760,7 @@ auto s = std::string{"Hello"}; **5.** Предпочтение всегда отдаётся уже использующимся библиотекам. 
-## Общее +## Общее {#obshchee-1} **1.** Пишите как можно меньше кода. @@ -777,7 +774,7 @@ auto s = std::string{"Hello"}; **6.** Приветствуется упрощение и уменьшение объёма кода. -## Дополнительно +## Дополнительно {#dopolnitelno} **1.** Явное указание `std::` для типов из `stddef.h`. @@ -805,32 +802,32 @@ auto s = std::string{"Hello"}; Допустимо использовать любой стиль переноса, похожий на приведённые ниже: -```cpp +``` cpp function( T1 x1, T2 x2) ``` -```cpp +``` cpp function( size_t left, size_t right, const & RangesInDataParts ranges, size_t limit) ``` -```cpp +``` cpp function(size_t left, size_t right, const & RangesInDataParts ranges, size_t limit) ``` -```cpp +``` cpp function(size_t left, size_t right, const & RangesInDataParts ranges, size_t limit) ``` -```cpp +``` cpp function( size_t left, size_t right, diff --git a/docs/ru/extended_roadmap.md b/docs/ru/extended_roadmap.md index f8950c238ec..7beb6da4de9 100644 --- a/docs/ru/extended_roadmap.md +++ b/docs/ru/extended_roadmap.md @@ -1,51 +1,51 @@ -# Планы разработки ClickHouse 2020. +# Планы разработки ClickHouse 2020. {#plany-razrabotki-clickhouse-2020} Здесь собраны важные задачи на 2020 год. Многие из них присутствуют в GitHub Issues. Данный текст следует рассматривать как рабочий черновик со сводкой и кратким описанием задач, ссылками и материалами для быстрого доступа на одной странице. Классификация задач условная. Так как ClickHouse - open-source продукт, мы хотим, чтобы рабочий процесс был также максимально открытым. В связи с этим, вам следует ожидать наличия на данной странице несколько большего количества деталей описания рабочего процесса, чем вы могли бы предположить - настолько близко, насколько рабочий процесс видят разработчики. Так как неотъемлимой частью процесса разработки является исправление недостатков продукта и улучшение качества кода, на данной странице вы найдёте весьма подробные описания таких деталей. Для других open-source продуктов такой подход к разработке обычно нехарактерен. Благодаря тому, что для каждой задачи указаны её зависимости, вы сможете понять, какие подготовительные работы требуются, что позволяет более точно понимать сроки реализации. +## 1. Хранение данных, индексация. {#khranenie-dannykh-indeksatsiia} -## 1. Хранение данных, индексация. +### 1.1. Индексы по z-Order curve, normalized z-Order curve. {#indeksy-po-z-order-curve-normalized-z-order-curve} -### 1.1. Индексы по z-Order curve, normalized z-Order curve. +[\#6286](https://github.com/ClickHouse/ClickHouse/pull/6286) -[#6286](https://github.com/ClickHouse/ClickHouse/pull/6286) - -Задача также относится к категории "17. Работа с географическими данными", так как geoHash - это частный случай z-Order curve. +Задача также относится к категории «17. Работа с географическими данными», так как geoHash - это частный случай z-Order curve. Также связана с задачей 24.27 для нечёткого поиска полудубликатов строк, так как позволит индексировать min-hash. -Задача "normalized z-Order curve" в перспективе может быть полезна для БК и Метрики, так как позволяет смешивать OrderID и PageID и избежать дублирования данных. +Задача «normalized z-Order curve» в перспективе может быть полезна для БК и Метрики, так как позволяет смешивать OrderID и PageID и избежать дублирования данных. В задаче также вводится способ индексации путём обращения функции нескольких аргументов на интервале, что имеет смысл для дальнейшего развития. 
Изначально делал [Андрей Чулков](https://github.com/achulkov2), ВШЭ, теперь (не) доделывает [Ольга Хвостикова](https://github.com/stavrolia), но сроки немного сдвинуты из-за задачи 25.9. Будем надеятся на лучшее. Upd. Доделывать будет другой человек. Приоритет не высокий. -### 1.2. Wait-free каталог баз данных. +### 1.2. Wait-free каталог баз данных. {#wait-free-katalog-baz-dannykh} Q2. Делает [Александр Токмаков](https://github.com/tavplubix), первый рабочий вариант в декабре 2019. Нужно для DataLens и Яндекс.Метрики. Манипуляции с каталогом баз данных: запросы CREATE TABLE, DROP TABLE, RENAME TABLE и DATABASE, требуют синхронизации с помощью блокировок. Эта синхронизация становится весьма сложной, так как на неё полагается много внутренних структур данных. -Предлагается реализовать альтернативный подход, в котором таблицы и базы данных являются всего лишь ссылками на persistent объекты. Подробное описание задачи: [#6787](https://github.com/ClickHouse/ClickHouse/issues/6787) +Предлагается реализовать альтернативный подход, в котором таблицы и базы данных являются всего лишь ссылками на persistent объекты. Подробное описание задачи: [\#6787](https://github.com/ClickHouse/ClickHouse/issues/6787) Upd. Сделана крупная часть задачи, но ориентироваться стоит уже на Q2. -### 1.3. Неблокирующие ALTER. +### 1.3. Неблокирующие ALTER. {#neblokiruiushchie-alter} Q1. И полностью immutable куски. Делает [Александр Сапин](https://github.com/alesapin). Готов приступить к задаче в конце ноября 2019. Нужно для Яндекс.Метрики. Upd. Большая часть задачи реализована и добавлена в master. Есть незначительные технические долги. Остаётся реализация неблокирующего изменения метаданных таблицы. -### 1.4. Нетранзитивные ALTER столбцов. +### 1.4. Нетранзитивные ALTER столбцов. {#netranzitivnye-alter-stolbtsov} Требует 1.3. Будет делать [Александр Сапин](https://github.com/alesapin). -### 1.5. ALTER RENAME COLUMN. -[#6861](https://github.com/ClickHouse/ClickHouse/issues/6861) +### 1.5. ALTER RENAME COLUMN. {#alter-rename-column} + +[\#6861](https://github.com/ClickHouse/ClickHouse/issues/6861) Требует 1.3. Будет делать [Александр Сапин](https://github.com/alesapin). -### 1.6. Полиморфные куски данных. +### 1.6. Полиморфные куски данных. {#polimorfnye-kuski-dannykh} Компактные куски - Q1, куски в оперативке Q1/Q2. @@ -53,28 +53,28 @@ Upd. Большая часть задачи реализована и добав Делает [Антон Попов](https://github.com/CurtizJ), первый рабочий вариант в декабре. Пререквизит чтобы снизить сложность мелких INSERT, что в свою очередь нужно для 1.12, иначе задача 1.12 не сможет нормально работать. Особенно нужно для Яндекс.Облака. -Данные в таблицах типа MergeTree в ClickHouse хранятся в виде набора независимых "кусков". Внутри куска, каждый столбец, а также индекс, хранится в отдельных файлах. Это сделано для возможности быстрых манипуляций со столбцами (пример - запрос ALTER DROP COLUMN). При вставке данных (INSERT), создаётся новый кусок. Для таблиц с большим количеством столбцов, запросы INSERT с маленьким количеством строк являются неэффективными, так как требуют создания большого количества файлов в файловой системе. Это является врождённой особенностью ClickHouse - одной из первой проблем, с которыми сталкиваются пользователи. Пользователям приходится буферизовывать данные и собирать их в более крупные пачки перед вставкой в ClickHouse. +Данные в таблицах типа MergeTree в ClickHouse хранятся в виде набора независимых «кусков». Внутри куска, каждый столбец, а также индекс, хранится в отдельных файлах. 
Это сделано для возможности быстрых манипуляций со столбцами (пример - запрос ALTER DROP COLUMN). При вставке данных (INSERT), создаётся новый кусок. Для таблиц с большим количеством столбцов, запросы INSERT с маленьким количеством строк являются неэффективными, так как требуют создания большого количества файлов в файловой системе. Это является врождённой особенностью ClickHouse - одной из первой проблем, с которыми сталкиваются пользователи. Пользователям приходится буферизовывать данные и собирать их в более крупные пачки перед вставкой в ClickHouse. Для смягчения эффекта от этой проблемы, в ClickHouse существуют таблицы типа Buffer. Они накапливают данные в оперативке перед записью в другую таблицу. Впрочем, таблицы Buffer не являются полноценным решением проблемы из-за: - наличия блокировок при вставке; - переупорядочивание вставляемых данных; - неатомарность перекладывания данных из Buffer в результирующую таблицу. Вместо этого предлагается разрешить кускам таблиц типа MergeTree располагать данные в разных форматах. А именно: - в оперативной памяти; - на диске со всеми столбцами в одном файле; - на диске со столбцами в отдельных файлах: в зависимости от размера куска и прошедшего времени. Для размещения кусков в оперативной памяти, придётся также реализовать опциональную поддержку write-ahead log с настраиваемыми правилами по сбросу на диск. Это позволит избавиться от проблем с мелкими вставками для MergeTree таблиц. Для ReplicatedMergeTree таблиц, это решит проблему лишь частично. -### 1.7. Буферизация и WAL в MergeTree. +### 1.7. Буферизация и WAL в MergeTree. {#buferizatsiia-i-wal-v-mergetree} Требует 1.6. -### 1.8. + Перенос между разделами по TTL. +### 1.8. + Перенос между разделами по TTL. {#perenos-mezhdu-razdelami-po-ttl} Делает [Владимир Чеботарёв](https://github.com/excitoon), Altinity. Декабрь 2019. Q1. Закоммичено, но есть технический долг, который исправляется сейчас. Готово. -### 1.9. Использование TTL для прореживания данных. +### 1.9. Использование TTL для прореживания данных. {#ispolzovanie-ttl-dlia-prorezhivaniia-dannykh} Будет делать Сорокин Николай, ВШЭ и Яндекс. -Сейчас пользователь может задать в таблице выражение, которое определяет, сколько времени хранятся данные. Обычно это выражение задаётся относительно значения столбца с датой - например: удалять данные через три месяца. https://clickhouse.tech/docs/ru/operations/table_engines/mergetree/#table_engine-mergetree-ttl +Сейчас пользователь может задать в таблице выражение, которое определяет, сколько времени хранятся данные. Обычно это выражение задаётся относительно значения столбца с датой - например: удалять данные через три месяца. https://clickhouse.tech/docs/ru/operations/table\_engines/mergetree/\#table\_engine-mergetree-ttl Это может быть задано для всей таблицы (тогда строки целиком удаляются после указанного времени) или для отдельных столбцов (тогда данные столбца физически удаляются с диска, а строки в таблице остаются; при чтении значений столбца, они читаются как значения по-умолчанию). @@ -88,19 +88,19 @@ Q1. Закоммичено, но есть технический долг, ко А вот пункт 2 требуется продумать. Не очевидно даже, какой лучше использовать синтаксис для этого при создании таблицы. Но мы придумаем - сразу видно несколько вариантов. -Частный случай такой задачи уже есть в https://clickhouse.tech/docs/ru/operations/table_engines/graphitemergetree/ Но это было сделано для конкретной задачи. А надо обобщить. 
+Частный случай такой задачи уже есть в https://clickhouse.tech/docs/ru/operations/table\_engines/graphitemergetree/ Но это было сделано для конкретной задачи. А надо обобщить. -### 1.10. Пережатие старых данных в фоне. +### 1.10. Пережатие старых данных в фоне. {#perezhatie-starykh-dannykh-v-fone} Будет делать Кирилл Барухов, ВШЭ, экспериментальная реализация к весне 2020. Нужно для Яндекс.Метрики. Алгоритмы сжатия типа LZ77 позволяют потратить больше времени на сжатие данных, чтобы сжать данные сильнее, но при этом без проигрыша по скорости разжатия данных. В частности, этим свойством обладает LZ4 и ZSTD, которые используются в ClickHouse. Это позволяет использовать свободные ресурсы CPU, когда сервер не нагружен, для пережатия данных, чтобы данные занимали меньше места на дисках, и при этом сохранить или даже улучшить скорость обработки запросов. -В то же время, ClickHouse обычно используется для "импульсного" сценария нагрузки. Запрос от пользователя обрабатывается максимально быстро, используя все ресурсы CPU, но в среднем по времени, сервер недостаточно нагружен. +В то же время, ClickHouse обычно используется для «импульсного» сценария нагрузки. Запрос от пользователя обрабатывается максимально быстро, используя все ресурсы CPU, но в среднем по времени, сервер недостаточно нагружен. Предлагается добавить в ClickHouse настройки по пережатию данных и фоновые потоки, выполняющие эту задачу. -### 1.11. Виртуальная файловая система. +### 1.11. Виртуальная файловая система. {#virtualnaia-failovaia-sistema} В процессе реализации, сейчас на VFS переведены Log, TinyLog, StripeLog, готовится MergeTree. @@ -110,9 +110,9 @@ Q2. ClickHouse использует для хранения данных локальную файловую систему. Существует сценарий работы, в котором размещение старых (архивных) данных было бы выгодно на удалённой файловой системе. Если файловая система POSIX совместимая, то это не составляет проблем: ClickHouse успешно работает с Ceph, GlusterFS, MooseFS. Также востребованным является сценарий использования S3 (из-за доступности в облаке) или HDFS (для интеграции с Hadoop). Но эти файловые системы не являются POSIX совместимыми. Хотя для них существуют FUSE драйверы, но скорость работы сильно страдает и поддержка неполная. -ClickHouse использует небольшое подмножество функций ФС, но в то же время, и некоторые специфические части: симлинки и хардлинки, O_DIRECT. Предлагается выделить всё взаимодействие с файловой системой в отдельный интерфейс. +ClickHouse использует небольшое подмножество функций ФС, но в то же время, и некоторые специфические части: симлинки и хардлинки, O\_DIRECT. Предлагается выделить всё взаимодействие с файловой системой в отдельный интерфейс. -### 1.12. Экспериментальная реализация VFS поверх S3 и HDFS. +### 1.12. Экспериментальная реализация VFS поверх S3 и HDFS. {#eksperimentalnaia-realizatsiia-vfs-poverkh-s3-i-hdfs} Q2. @@ -121,38 +121,36 @@ Q2. Upd. Олег будет делать только часть про HDFS. -### 1.13. Ускорение запросов с FINAL. +### 1.13. Ускорение запросов с FINAL. {#uskorenie-zaprosov-s-final} Требует 2.1. Делает [Николай Кочетов](https://github.com/KochetovNicolai). Нужно для Яндекс.Метрики. -### 1.14. Не писать столбцы, полностью состоящие из нулей. +### 1.14. Не писать столбцы, полностью состоящие из нулей. {#ne-pisat-stolbtsy-polnostiu-sostoiashchie-iz-nulei} Антон Попов. Q1/Q2. В очереди. Простая задача, является небольшим пререквизитом для потенциальной поддержки полуструктурированных данных. - -### 1.15. Возможность иметь разный первичный ключ в разных кусках. +### 1.15. 
Возможность иметь разный первичный ключ в разных кусках. {#vozmozhnost-imet-raznyi-pervichnyi-kliuch-v-raznykh-kuskakh} Сложная задача, только после 1.3. -### 1.16. Несколько физических представлений для одного куска данных. +### 1.16. Несколько физических представлений для одного куска данных. {#neskolko-fizicheskikh-predstavlenii-dlia-odnogo-kuska-dannykh} Сложная задача, только после 1.3 и 1.6. Позволяет компенсировать 21.20. -### 1.17. Несколько сортировок для одной таблицы. +### 1.17. Несколько сортировок для одной таблицы. {#neskolko-sortirovok-dlia-odnoi-tablitsy} Сложная задача, только после 1.3 и 1.6. -### 1.18. Отдельное хранение файлов кусков. +### 1.18. Отдельное хранение файлов кусков. {#otdelnoe-khranenie-failov-kuskov} Требует 1.3 и 1.6. Полная замена hard links на sym links, что будет лучше для 1.12. - -## 2. Крупные рефакторинги. +## 2. Крупные рефакторинги. {#krupnye-refaktoringi} Для обоснования необходимости смотрите ссылки в описании других задач. -### 2.1. Переделка конвейера выполнения запросов на Processors. +### 2.1. Переделка конвейера выполнения запросов на Processors. {#peredelka-konveiera-vypolneniia-zaprosov-na-processors} Делает [Николай Кочетов](https://github.com/KochetovNicolai). Финальная стадия разработки. Включение по-умолчанию в конце декабря 2019. Удаление старого кода в начале 2020. @@ -163,27 +161,27 @@ Upd. Включили по-умолчанию. Удаление старого Upd. Уже есть первый релиз, в котором это включено по-умолчанию. -### 2.2. Инфраструктура событий/метрик/ограничений/квот/трассировки. +### 2.2. Инфраструктура событий/метрик/ограничений/квот/трассировки. {#infrastruktura-sobytiimetrikogranicheniikvottrassirovki} В очереди. https://gist.github.com/alexey-milovidov/d62d73222d83b9319dc519cbb13aeff6 -### 2.3. Перенос столбцового ser/de из DataType в Column. +### 2.3. Перенос столбцового ser/de из DataType в Column. {#perenos-stolbtsovogo-serde-iz-datatype-v-column} В очереди. -### 2.4. Перевод LowCardinality из DataType в Column. Добавление ColumnSparse. +### 2.4. Перевод LowCardinality из DataType в Column. Добавление ColumnSparse. {#perevod-lowcardinality-iz-datatype-v-column-dobavlenie-columnsparse} Требует 2.3. -### 2.5. Версионирование состояний агрегатных функций. +### 2.5. Версионирование состояний агрегатных функций. {#versionirovanie-sostoianii-agregatnykh-funktsii} В очереди. -### 2.6. Правая часть IN как тип данных. Выполнение IN в виде скалярного подзапроса. +### 2.6. Правая часть IN как тип данных. Выполнение IN в виде скалярного подзапроса. {#pravaia-chast-in-kak-tip-dannykh-vypolnenie-in-v-vide-skaliarnogo-podzaprosa} Требует 2.1. -### 2.7. Нормализация Context. +### 2.7. Нормализация Context. {#normalizatsiia-context} В очереди. Нужно для YQL. @@ -191,71 +189,70 @@ Upd. Уже есть первый релиз, в котором это вклю Upd. Каталог БД вынесен из Context. -### 2.8. Декларативный парсер запросов. +### 2.8. Декларативный парсер запросов. {#deklarativnyi-parser-zaprosov} Средний приоритет. Нужно для YQL. -### 2.9. Логгировние в format-стиле. +### 2.9. Логгировние в format-стиле. {#loggirovnie-v-format-stile} Делает [Иван Лежанкин](https://github.com/abyss7). Низкий приоритет. -[#6049](https://github.com/ClickHouse/ClickHouse/issues/6049#issuecomment-570836998) +[\#6049](https://github.com/ClickHouse/ClickHouse/issues/6049#issuecomment-570836998) -### 2.10. Запрашивать у таблиц не столбцы, а срезы. +### 2.10. Запрашивать у таблиц не столбцы, а срезы. {#zaprashivat-u-tablits-ne-stolbtsy-a-srezy} В очереди. -### 2.11. 
Разбирательство и нормализация функциональности для bitmap. +### 2.11. Разбирательство и нормализация функциональности для bitmap. {#razbiratelstvo-i-normalizatsiia-funktsionalnosti-dlia-bitmap} В очереди. -### 2.12. Декларативные сигнатуры функций. +### 2.12. Декларативные сигнатуры функций. {#deklarativnye-signatury-funktsii} -[#3775](https://github.com/ClickHouse/ClickHouse/pull/3775) +[\#3775](https://github.com/ClickHouse/ClickHouse/pull/3775) Задачу делает Алексей Миловидов. Прогресс 50% и разработка временно приостановлена. -### 2.13. Каждая функция в отдельном файле. +### 2.13. Каждая функция в отдельном файле. {#kazhdaia-funktsiia-v-otdelnom-faile} Задачу делает Алексей Миловидов. Прогресс 80%. Потребуется помощь других разработчиков. -### 2.14. Все функции с состоянием переделать на FunctionBuilder. +### 2.14. Все функции с состоянием переделать на FunctionBuilder. {#vse-funktsii-s-sostoianiem-peredelat-na-functionbuilder} Долг [Николай Кочетов](https://github.com/KochetovNicolai). Сейчас код находится в переходном состоянии, что неприемлемо. -### 2.15. Функция subscribe для IStorage. +### 2.15. Функция subscribe для IStorage. {#funktsiia-subscribe-dlia-istorage} Для нормализации работы materialized views поверх Merge, Distributed, Kafka. -## 3. Документация. +## 3. Документация. {#dokumentatsiia} Здесь задачи только по инфраструктуре документации. -### 3.1. Перенос документации по функциям в код. +### 3.1. Перенос документации по функциям в код. {#perenos-dokumentatsii-po-funktsiiam-v-kod} Требует 2.12 и 2.13. Хотим в Q1/Q2, средний приоритет. -### 3.2. Перенос однородных частей документации в код. +### 3.2. Перенос однородных частей документации в код. {#perenos-odnorodnykh-chastei-dokumentatsii-v-kod} Требует 3.1. -### + 3.3. Исправить катастрофически отвратительно неприемлемый поиск по документации. +### + 3.3. Исправить катастрофически отвратительно неприемлемый поиск по документации. {#ispravit-katastroficheski-otvratitelno-nepriemlemyi-poisk-po-dokumentatsii} [Иван Блинков](https://github.com/blinkov/) - очень хороший человек. Сам сайт документации основан на технологиях, не удовлетворяющих требованиям задачи, и эти технологии трудно исправить. Задачу будет делать первый встретившийся нам frontend разработчик, которого мы сможем заставить это сделать. Upd. Иван Блинков сделал эту задачу путём замены треш-технологий на нормальные. -### 3.4. + Добавить японский язык в документацию. +### 3.4. + Добавить японский язык в документацию. {#dobavit-iaponskii-iazyk-v-dokumentatsiiu} Эту задачу сделает [Иван Блинков](https://github.com/blinkov/), до конца декабря 2019. Сделано. +## 4. Сетевое взаимодействие. {#setevoe-vzaimodeistvie} -## 4. Сетевое взаимодействие. - -### 4.1. Уменьшение числа потоков при распределённых запросах. +### 4.1. Уменьшение числа потоков при распределённых запросах. {#umenshenie-chisla-potokov-pri-raspredelionnykh-zaprosakh} [Никита Лапков](https://github.com/laplab), весна 2020. Upd. Есть прототип. Upd. Он не работает. -### 4.2. Спекулятивное выполнение запросов на нескольких репликах. +### 4.2. Спекулятивное выполнение запросов на нескольких репликах. {#spekuliativnoe-vypolnenie-zaprosov-na-neskolkikh-replikakh} Нужно для Яндекс.Метрики. Требует 4.1. @@ -265,68 +262,68 @@ Upd. Иван Блинков сделал эту задачу путём зам Сейчас для распределённых запросов используется по потоку на соединение. Это позволяет хорошо распараллелить вычисления над полученными данными и утилизировать сеть, но становится сильно избыточным для больших кластеров. 
Для примера, создание 1000 потоков для чтения данных из 1000 серверов кластера - лишь расходует ресурсы и увеличивает время выполнения запроса. Вместо этого необходимо использовать количество потоков не большее количества процессорных ядер, и мультиплексировать в одном потоке общение с серверами. Реализация нетривиальна, так как мультиплексировать необходимо каждую стадию общения по сети, включая установку соединения и обмен handshake. -### 4.3. Ограничение числа одновременных скачиваний с реплик. +### 4.3. Ограничение числа одновременных скачиваний с реплик. {#ogranichenie-chisla-odnovremennykh-skachivanii-s-replik} Дмитрий Григорьев, ВШЭ. Изначально делал Олег Алексеенков, но пока решение не готово, хотя там не так уж много доделывать. -### 4.4. Ограничение сетевой полосы при репликации. +### 4.4. Ограничение сетевой полосы при репликации. {#ogranichenie-setevoi-polosy-pri-replikatsii} Дмитрий Григорьев, ВШЭ. Нужно для Метрики. -### 4.5. Возможность продолжить передачу куска данных при репликации после сбоя. +### 4.5. Возможность продолжить передачу куска данных при репликации после сбоя. {#vozmozhnost-prodolzhit-peredachu-kuska-dannykh-pri-replikatsii-posle-sboia} Дмитрий Григорьев, ВШЭ. -### 4.6. p2p передача для GLOBAL подзапросов. +### 4.6. p2p передача для GLOBAL подзапросов. {#p2p-peredacha-dlia-global-podzaprosov} -### 4.7. Ленивая загрузка множеств для IN и JOIN с помощью k/v запросов. +### 4.7. Ленивая загрузка множеств для IN и JOIN с помощью k/v запросов. {#lenivaia-zagruzka-mnozhestv-dlia-in-i-join-s-pomoshchiu-kv-zaprosov} -### 4.8. Разделить background pool для fetch и merge. +### 4.8. Разделить background pool для fetch и merge. {#razdelit-background-pool-dlia-fetch-i-merge} Дмитрий Григорьев, ВШЭ. В очереди. Исправить проблему, что восстанавливающаяся реплика перестаёт мержить. Частично компенсируется 4.3. -## 5. Операции. +## 5. Операции. {#operatsii} -### 5.1. Разделение задач на более мелкие куски в clickhouse-copier. +### 5.1. Разделение задач на более мелкие куски в clickhouse-copier. {#razdelenie-zadach-na-bolee-melkie-kuski-v-clickhouse-copier} -[#9075](https://github.com/ClickHouse/ClickHouse/pull/9075) +[\#9075](https://github.com/ClickHouse/ClickHouse/pull/9075) Q1. Нужно для Метрики, в очереди. Никита Михайлов. Upd. Задача на финальной стадии разработки. -### 5.2. Автонастройка лимита на оперативку и размера кэшей. +### 5.2. Автонастройка лимита на оперативку и размера кэшей. {#avtonastroika-limita-na-operativku-i-razmera-keshei} -### 5.3. + Встроенная ручка для Prometheus. +### 5.3. + Встроенная ручка для Prometheus. {#vstroennaia-ruchka-dlia-prometheus} Сделано. https://github.com/Vdimir -### 5.4. Opt-in сообщать в клиенте, если вышла новая версия. +### 5.4. Opt-in сообщать в клиенте, если вышла новая версия. {#opt-in-soobshchat-v-kliente-esli-vyshla-novaia-versiia} -### 5.5. + LTS релизы. +### 5.5. + LTS релизы. {#lts-relizy} Требует 7.5. Задачу хочет Метрика, Облако, БК, Маркет и Altinity. Первой LTS версией уже стала версия 19.14. Метрика, БК, Маркет, Altinity уже используют более свежие версии чем LTS. -## 6. Инструментирование. +## 6. Инструментирование. {#instrumentirovanie} -### 6.1. + Исправления сэмплирующего профайлера запросов. +### 6.1. + Исправления сэмплирующего профайлера запросов. {#ispravleniia-sempliruiushchego-profailera-zaprosov} Михаил Филимонов, Altinity. Ноябрь 2019. Сделано. Осталось ещё проверить работоспособность профайлера в первом потоке (что важно для INSERT). Иван Лежанкин. Q1. Сделано. -### 6.2. + Добавление memory profiler. 
+### 6.2. + Добавление memory profiler. {#dobavlenie-memory-profiler} -[#6387](https://github.com/ClickHouse/ClickHouse/issues/6387) +[\#6387](https://github.com/ClickHouse/ClickHouse/issues/6387) Сравнительно простая задача, но только для опытных разработчиков. Нужна всем. Иван Лежанкин. Q1. Сделано. -### 6.3. Учёт оперативки total расширить не только на запросы. +### 6.3. Учёт оперативки total расширить не только на запросы. {#uchiot-operativki-total-rasshirit-ne-tolko-na-zaprosy} Исправление долгоживущей проблемы с дрифтом учёта оперативки. Нужна для Метрики и БК. Иван Лежанкин. Q1. -### 6.4. Поддержка perf events как метрик запроса. +### 6.4. Поддержка perf events как метрик запроса. {#podderzhka-perf-events-kak-metrik-zaprosa} Делает Андрей Скобцов, ВШЭ. @@ -334,90 +331,89 @@ Upd. Задача на финальной стадии разработки. Есть прототип. -### 6.5. Эксперименты с LLVM X-Ray. +### 6.5. Эксперименты с LLVM X-Ray. {#eksperimenty-s-llvm-x-ray} Требует 2.2. -### 6.6. + Стек трейс для любых исключений. +### 6.6. + Стек трейс для любых исключений. {#stek-treis-dlia-liubykh-iskliuchenii} Сейчас есть стек трейс для почти всех, но не всех исключений. Требует 7.4. -### 6.7. + Таблица system.stack_trace. +### 6.7. + Таблица system.stack\_trace. {#tablitsa-system-stack-trace} Сравнительно простая задача, но только для опытных разработчиков. -### 6.8. Таблица system.crashes. +### 6.8. Таблица system.crashes. {#tablitsa-system-crashes} Сравнительно простая задача, но только для опытных разработчиков. -### 6.9. Отправлять информацию клиенту, если сервер падает по сигналу. +### 6.9. Отправлять информацию клиенту, если сервер падает по сигналу. {#otpravliat-informatsiiu-klientu-esli-server-padaet-po-signalu} -### 6.10. Сбор общих системных метрик. +### 6.10. Сбор общих системных метрик. {#sbor-obshchikh-sistemnykh-metrik} +## 7. Сопровождение разработки. {#soprovozhdenie-razrabotki} -## 7. Сопровождение разработки. - -### 7.1. + ICU в submodules. +### 7.1. + ICU в submodules. {#icu-v-submodules} Добавление в submodules также нужно для Аркадии (7.26). -### 7.2. + LLVM в submodules. +### 7.2. + LLVM в submodules. {#llvm-v-submodules} Сделал Алексей Миловидов. -### 7.3. Обновление Poco. +### 7.3. Обновление Poco. {#obnovlenie-poco} Алексанр Кузьменков. -### 7.4. + Включить libc++, libc++-abi при сборке с gcc. +### 7.4. + Включить libc++, libc++-abi при сборке с gcc. {#vkliuchit-libc-libc-abi-pri-sborke-s-gcc} Сейчас включено только при сборке с clang, но продакшен сборка использует gcc. Требует 7.2 и, возможно, 7.1 (только в случае новой версии ICU). -### 7.5. + Начать публиковать LTS релизы. +### 7.5. + Начать публиковать LTS релизы. {#nachat-publikovat-lts-relizy} [Александр Сапин](https://github.com/alesapin). -### 7.6. + Правильный статистический тест для comparison mode в clickhouse-performance-test. +### 7.6. + Правильный статистический тест для comparison mode в clickhouse-performance-test. {#pravilnyi-statisticheskii-test-dlia-comparison-mode-v-clickhouse-performance-test} Задачу начал делать Дмитрий Рубашкин (ВШЭ). Сейчас продолжает [Александр Кузьменков](https://github.com/akuzm). Сделано, работает в CI. -### 7.7. Доделать тесты под MSan. +### 7.7. Доделать тесты под MSan. {#dodelat-testy-pod-msan} Уже есть ASan, TSan, UBSan. Не хватает тестов под MSan. Они уже добавлены в CI, но не проходят. [Александр Кузьменков](https://github.com/akuzm) и [Александр Токмаков](https://github.com/tavplubix). -### 7.8. Добавить clang-tidy. +### 7.8. Добавить clang-tidy. {#dobavit-clang-tidy} Уже есть PVS-Studio. 
Мы очень довольны, но этого недостаточно. Upd. Алексей Миловидов. Добавлено некоторое множество проверок, но нужно рассмотреть все проверки подряд и добавить всё, что можно. -### 7.9. Проверки на стиль имён с помощью clang-tidy. +### 7.9. Проверки на стиль имён с помощью clang-tidy. {#proverki-na-stil-imion-s-pomoshchiu-clang-tidy} -### 7.10. Включение UBSan и MSan в интеграционных тестах. +### 7.10. Включение UBSan и MSan в интеграционных тестах. {#vkliuchenie-ubsan-i-msan-v-integratsionnykh-testakh} UBSan включен в функциональных тестах, но не включен в интеграционных тестах. Требует 7.7. -### 7.11. Включение \*San в unit тестах. +### 7.11. Включение \*San в unit тестах. {#vkliuchenie-san-v-unit-testakh} У нас мало unit тестов по сравнению с функциональными тестами и их использование не обязательно. Но они всё-равно важны и нет причин не запускать их под всеми видами sanitizers. -### 7.12. Показывать тестовое покрытие нового кода в PR. +### 7.12. Показывать тестовое покрытие нового кода в PR. {#pokazyvat-testovoe-pokrytie-novogo-koda-v-pr} Пока есть просто показ тестового покрытия всего кода. -### 7.13. + Включение аналога -Weverything в gcc. +### 7.13. + Включение аналога -Weverything в gcc. {#vkliuchenie-analoga-weverything-v-gcc} Мы используем -Wall -Wextra -Weverything -Werror. При сборке с clang, -Weverything уже включено. Но в gcc есть уникальные warning-и, отсутствующие в clang. Сделал Wolf Kreuzerkrieg. -### 7.14. + Альтернатива для readline и libedit. +### 7.14. + Альтернатива для readline и libedit. {#alternativa-dlia-readline-i-libedit} Подключение replxx вместо readline сделал Иван Лежанкин. -### 7.14.1. Улучшение возможностей интерактивного режима clickhouse-client. +### 7.14.1. Улучшение возможностей интерактивного режима clickhouse-client. {#uluchshenie-vozmozhnostei-interaktivnogo-rezhima-clickhouse-client} Тагир Кускаров, ВШЭ. @@ -447,51 +443,50 @@ Upd. В рамках данной задачи добавляем подстве Вместо этого предлагается в качестве примера изучить прототип текстового редактора Kilo: https://viewsourcecode.org/snaptoken/kilo/ и реализовать всю необходимую функциональность. - -### 7.15. + Замена libressl обратно на openssl. +### 7.15. + Замена libressl обратно на openssl. {#zamena-libressl-obratno-na-openssl} Поводом использования libressl послужило желание нашего хорошего друга из известной компании несколько лет назад. Но сейчас ситуация состоит в том, что openssl продолжает развиваться, а libressl не особо, и можно спокойно менять обратно. Нужно для Яндекс.Облака для поддержки TLS 1.3. -### 7.16. tzdata внутри бинарника. +### 7.16. tzdata внутри бинарника. {#tzdata-vnutri-binarnika} Как в Аркадии, но только в качестве fallback. -### 7.17. + Доделать tgz пакеты. +### 7.17. + Доделать tgz пакеты. {#dodelat-tgz-pakety} Уже давно собираются универсальные tgz пакеты, но по нелепой случайности из них исчез install скрипт. [Александр Сапин](https://github.com/alesapin). Может делегировать эту задачу кому угодно. Upd. Сделано всё кроме инструкции на сайте. Для этого требуется создать директории testing/stable/prestable на repo.yandex.ru. Внезапно оказалось, что человек, отвечающий за это, в отпуске, и он не отвечает на вопрос, кто его заместитель. Q1. -### 7.18. + Доделать бинарники под Mac. +### 7.18. + Доделать бинарники под Mac. {#dodelat-binarniki-pod-mac} Уже есть автосборка бинарников под Mac на каждый коммит и PR, но с недостатками. [Иван Лежанкин](https://github.com/abyss7). Требует 7.1, 7.2. Рекомендуется 7.14. Сейчас не хватает по крайней мере SSL и ICU. 
Нужно для Яндекс.Облака. Upd. Сделано SSL. Ориентируемся в Q1, но приоритет средний и может потеряться. -### 7.18.1. Поместить ссылку на собранные бинарники под Mac на сайт. +### 7.18.1. Поместить ссылку на собранные бинарники под Mac на сайт. {#pomestit-ssylku-na-sobrannye-binarniki-pod-mac-na-sait} Сейчас людям приходится делать несколько кликов, чтобы их скачать. [Иван Лежанкин](https://github.com/abyss7) или [Александр Сапин](https://github.com/alesapin). -### 7.19. + Доделать (проверить) автосборку под AArch64. +### 7.19. + Доделать (проверить) автосборку под AArch64. {#dodelat-proverit-avtosborku-pod-aarch64} -https://github.com/ClickHouse/ClickHouse/issues/8027#issuecomment-566670282 +https://github.com/ClickHouse/ClickHouse/issues/8027\#issuecomment-566670282 Проверили на настоящем сервере Huawei, а также в специальном Docker контейнере, который содержит внутри qemu-user-static. Также можно проверить на Cavium, на Raspberry Pi а также на твоём Android телефоне. -### 7.20. Автосборка для FreeBSD x86_64. +### 7.20. Автосборка для FreeBSD x86\_64. {#avtosborka-dlia-freebsd-x86-64} [Иван Лежанкин](https://github.com/abyss7). Upd. В процессе реализации, есть pull request. -### 7.21. Автосборка для Linux ppc64. +### 7.21. Автосборка для Linux ppc64. {#avtosborka-dlia-linux-ppc64} [Иван Лежанкин](https://github.com/abyss7). -### 7.22. Дэшборд для pull requests. +### 7.22. Дэшборд для pull requests. {#deshbord-dlia-pull-requests} Дарья Петрова, УрФУ. @@ -513,22 +508,22 @@ Upd. В процессе реализации, есть pull request. - отсортировать проверки по важности: например, если у внешнего разработчика проходят все проверки кроме стиля кода, то мы можем взять это в работу сами; - если для предыдущего коммита проверка была завершена, а для последнего коммита ещё только идёт - то можно отображать в таблице статус предыдущей проверки более блёклым цветом. -Предлагается реализовать несколько вариантов сортировок. Очевидное - по времени обновления, более интересно - некое ранжирование с целью выяснить, "что лучше взять в работу прямо сейчас". +Предлагается реализовать несколько вариантов сортировок. Очевидное - по времени обновления, более интересно - некое ранжирование с целью выяснить, «что лучше взять в работу прямо сейчас». Похожие продукты уже есть, например: http://prs.mozilla.io/yandex:ClickHouse К сожалению, этот продукт заброшен, да и делает не совсем то, что нужно. По своему усмотрению, можно взять из него что-нибудь полезное. -### 7.23. Функции для fuzzing. +### 7.23. Функции для fuzzing. {#funktsii-dlia-fuzzing} Андрей Некрашевич, ВШЭ. Fuzzing тестирование - это тестирование случайными данными. Мы рассмотрим несколько подходов к этой задачи: -1. Добавление в SQL диалект ClickHouse функций для генерации случайных данных (пример - случайные бинарные строки заданной длины, случайные валидные UTF-8 строки) и "порчи" данных (например, поменять значения случайных бит с заданной частотой). Это будет использовано для тестирования SQL-функций ClickHouse. +1. Добавление в SQL диалект ClickHouse функций для генерации случайных данных (пример - случайные бинарные строки заданной длины, случайные валидные UTF-8 строки) и «порчи» данных (например, поменять значения случайных бит с заданной частотой). Это будет использовано для тестирования SQL-функций ClickHouse. 
Можно добавить функции: `randomString(length)` `randomFixedString(length)` - - строка заданной длины с равномерно распределёнными случайными байтами; +- строка заданной длины с равномерно распределёнными случайными байтами; `randomStringASCII(length)` `randomStringUTF8(length)` @@ -541,23 +536,23 @@ Fuzzing тестирование - это тестирование случай Upd. Сергей Штыков сделал функцию `randomPrintableASCII`. -### 7.24. Fuzzing лексера и парсера запросов; кодеков и форматов. +### 7.24. Fuzzing лексера и парсера запросов; кодеков и форматов. {#fuzzing-leksera-i-parsera-zaprosov-kodekov-i-formatov} Андрей Некрашевич, ВШЭ. Продолжение 7.23. -2. Использование AFL или LibFuzzer для тестирования отдельных частей кодовой базы ClickHouse. +1. Использование AFL или LibFuzzer для тестирования отдельных частей кодовой базы ClickHouse. -3. Генерация и выполнение случайных синтаксически корректных запросов на случайных данных. +2. Генерация и выполнение случайных синтаксически корректных запросов на случайных данных. -### 7.25. Синхронизация релизов в Аркадию. +### 7.25. Синхронизация релизов в Аркадию. {#sinkhronizatsiia-relizov-v-arkadiiu} Изначально занимался Олег Алексеенков. Сейчас он перешёл работать в дружественный отдел, но обещает продолжать синхронизацию. Затем, возможно, [Иван Лежанкин](https://github.com/abyss7). Но сейчас приостановлено, так как Максим из YT должен исправить регрессию производительности в анализе индекса. Максим из YT сказал, что сделает это после нового года. -Максим из YT сказал, что "мы планируем в январе добиться". +Максим из YT сказал, что «мы планируем в январе добиться». Максим сейчас занимается собираемостью YT с новой версией ClickHouse. Нужно для CHYT и YQL. @@ -566,27 +561,27 @@ UPD: Все патчи Максима отправлены в master. Задач Upd: Задача в процессе реализации. Синхронизироваться будет master. Делает [Иван Лежанкин](https://github.com/abyss7) -### 7.26. Побайтовая идентичность репозитория с Аркадией. +### 7.26. Побайтовая идентичность репозитория с Аркадией. {#pobaitovaia-identichnost-repozitoriia-s-arkadiei} Команда DevTools. Прогресс по задаче под вопросом. -### 7.27. Запуск автотестов в Аркадии. +### 7.27. Запуск автотестов в Аркадии. {#zapusk-avtotestov-v-arkadii} Требует 7.26. Коллеги начали делать, есть результат. -### 7.29. Опции clickhouse install, stop, start вместо postinst, init.d, systemd скриптов. +### 7.29. Опции clickhouse install, stop, start вместо postinst, init.d, systemd скриптов. {#optsii-clickhouse-install-stop-start-vmesto-postinst-init-d-systemd-skriptov} Низкий приоритет. -### 7.30. Возможность переключения бинарных файлов на продакшене без выкладки пакетов. +### 7.30. Возможность переключения бинарных файлов на продакшене без выкладки пакетов. {#vozmozhnost-perekliucheniia-binarnykh-failov-na-prodakshene-bez-vykladki-paketov} Низкий приоритет. -### 7.31. Зеркалирование нагрузки между серверами. +### 7.31. Зеркалирование нагрузки между серверами. {#zerkalirovanie-nagruzki-mezhdu-serverami} В очереди. Нужно для Яндекс.Метрики. -### 7.32. Обфускация продакшен запросов. +### 7.32. Обфускация продакшен запросов. {#obfuskatsiia-prodakshen-zaprosov} Роман Ильговский. Нужно для Яндекс.Метрики. @@ -596,120 +591,120 @@ Upd: Задача в процессе реализации. Синхронизи Обфускация запросов: имея секретные запросы и структуру таблиц, заменить имена полей и константы, чтобы запросы можно было использовать в качестве публично доступных тестов. -### 7.33. Выкладывать патч релизы в репозиторий автоматически. +### 7.33. 
Выкладывать патч релизы в репозиторий автоматически. {#vykladyvat-patch-relizy-v-repozitorii-avtomaticheski} В очереди. Иван Лежанкин. -### 7.34. Бэкпортировать bugfix автоматически. +### 7.34. Бэкпортировать bugfix автоматически. {#bekportirovat-bugfix-avtomaticheski} В очереди. Иван Лежанкин. -### 7.35. Начальные правила для авто-merge. +### 7.35. Начальные правила для авто-merge. {#nachalnye-pravila-dlia-avto-merge} Зелёные проверки и два ревью. [Александр Сапин](https://github.com/alesapin). Может делегировать эту задачу кому угодно. -### 7.36. Понятие доверенных контрибьюторов. +### 7.36. Понятие доверенных контрибьюторов. {#poniatie-doverennykh-kontribiutorov} Контрибьюторы, у которых есть 5 померженных PR. Для их новых PR автотесты запускаются сразу. [Александр Сапин](https://github.com/alesapin). Может делегировать эту задачу кому угодно. Сейчас добавляем некоторых доверенных контрибьюторов в ручном режиме. -### 7.37. Разобраться с repo.yandex.ru. +### 7.37. Разобраться с repo.yandex.ru. {#razobratsia-s-repo-yandex-ru} Есть жалобы на скорость загрузки и неудобство maintenance, operations, visibility. Upd. Иван Блинков настроил CDN repo.clickhouse.tech, что решает проблему с доступностью зарубежом. Вопрос с operations, visibility пока актуален. -## 8. Интеграция с внешними системами. +## 8. Интеграция с внешними системами. {#integratsiia-s-vneshnimi-sistemami} -### 8.1. Поддержка ALTER MODIFY SETTING для Kafka. +### 8.1. Поддержка ALTER MODIFY SETTING для Kafka. {#podderzhka-alter-modify-setting-dlia-kafka} Также - возможность указать все настройки форматов в Kafka. Altinity. Никто не делает эту задачу. -### 8.2. Поддержка Mongo Atlas URI. +### 8.2. Поддержка Mongo Atlas URI. {#podderzhka-mongo-atlas-uri} [Александр Кузьменков](https://github.com/akuzm). -### 8.3. Доработки globs (правильная поддержка диапазонов, уменьшение числа одновременных stream-ов). +### 8.3. Доработки globs (правильная поддержка диапазонов, уменьшение числа одновременных stream-ов). {#dorabotki-globs-pravilnaia-podderzhka-diapazonov-umenshenie-chisla-odnovremennykh-stream-ov} [Ольга Хвостикова](https://github.com/stavrolia). Уменьшение числа stream-ов сделано, а вот правильная поддержка диапазонов - нет. Будем надеяться на Q1/Q2. -### 8.4. Унификация File, HDFS, S3 под URL. +### 8.4. Унификация File, HDFS, S3 под URL. {#unifikatsiia-file-hdfs-s3-pod-url} -### 8.5. + Аутентификация в S3. +### 8.5. + Аутентификация в S3. {#autentifikatsiia-v-s3} [Владимир Чеботарёв](https://github.com/excitoon), Altinity. -### 8.6. Kerberos аутентификация для HDFS и Kafka. +### 8.6. Kerberos аутентификация для HDFS и Kafka. {#kerberos-autentifikatsiia-dlia-hdfs-i-kafka} Андрей Коняев, ArenaData. Он куда-то пропал. -### 8.7. + Исправление мелочи HDFS на очень старых ядрах Linux. +### 8.7. + Исправление мелочи HDFS на очень старых ядрах Linux. {#ispravlenie-melochi-hdfs-na-ochen-starykh-iadrakh-linux} В ядрах 2.6 отсутствует один системный вызов, который библиотека hdfs3 использует без необходимости. Сделал Amos Bird. -### 8.8. + Поддержка виртуальных столбцов с именем файла и путём. +### 8.8. + Поддержка виртуальных столбцов с именем файла и путём. {#podderzhka-virtualnykh-stolbtsov-s-imenem-faila-i-putiom} [Ольга Хвостикова](https://github.com/stavrolia). -### 8.9. + Поддержка сжатых файлов (gz, bz) на чтение и запись. +### 8.9. + Поддержка сжатых файлов (gz, bz) на чтение и запись. {#podderzhka-szhatykh-failov-gz-bz-na-chtenie-i-zapis} Сделал [Andrey Bodrov](https://github.com/apbodrov) -### 8.10. 
Запись в табличную функцию ODBC. +### 8.10. Запись в табличную функцию ODBC. {#zapis-v-tablichnuiu-funktsiiu-odbc} Артемий Бобровский, ВШЭ -### 8.11. Движок таблиц для чтения из Mongo. +### 8.11. Движок таблиц для чтения из Mongo. {#dvizhok-tablits-dlia-chteniia-iz-mongo} Артемий Бобровский, ВШЭ -### 8.12. Пропуск столбцов в форматах Parquet, ORC. +### 8.12. Пропуск столбцов в форматах Parquet, ORC. {#propusk-stolbtsov-v-formatakh-parquet-orc} Артемий Бобровский, ВШЭ -### 8.13. Поддержка массивов в Parquet, ORC. +### 8.13. Поддержка массивов в Parquet, ORC. {#podderzhka-massivov-v-parquet-orc} Артемий Бобровский, ВШЭ -### 8.14. Запись данных в ORC. +### 8.14. Запись данных в ORC. {#zapis-dannykh-v-orc} Возможно, Андрей Коняев, ArenaData (зависит от желания). -### 8.15. Запись данных в CapNProto. +### 8.15. Запись данных в CapNProto. {#zapis-dannykh-v-capnproto} -### 8.16. + Поддержка формата Avro. +### 8.16. + Поддержка формата Avro. {#podderzhka-formata-avro} Andrew Onyshchuk. Есть pull request. Q1. Сделано. Формат Apache Avro является компактным структурированным построчным бинарным форматом данных с внешней схемой. Этот формат часто используется совместно с Kafka и поддержка его в качестве одного из форматов ввода-вывода в ClickHouse является востребованной пользователями. -### 8.16.1. + Поддержка формата JSONEachRow, засунутого в массив. +### 8.16.1. + Поддержка формата JSONEachRow, засунутого в массив. {#podderzhka-formata-jsoneachrow-zasunutogo-v-massiv} Павел Круглов, ВШЭ и Яндекс. Есть pull request. -### 8.16.2. Поддержка формата Thrift. +### 8.16.2. Поддержка формата Thrift. {#podderzhka-formata-thrift} Павел Круглов, ВШЭ и Яндекс. -### 8.16.3. Поддержка формата MsgPack. +### 8.16.3. Поддержка формата MsgPack. {#podderzhka-formata-msgpack} Павел Круглов, ВШЭ и Яндекс. Задача взята в работу. -### 8.16.4. Формат Regexp. +### 8.16.4. Формат Regexp. {#format-regexp} Павел Круглов, ВШЭ и Яндекс. Есть pull request. -### 8.17. ClickHouse как MySQL реплика. +### 8.17. ClickHouse как MySQL реплика. {#clickhouse-kak-mysql-replika} Ильяс Адюгамов, ВШЭ. @@ -717,11 +712,11 @@ Andrew Onyshchuk. Есть pull request. Q1. Сделано. Встроенная в ClickHouse возможность работать в качестве реплики MySQL даст преимущества для дальнейшего развития. -### 8.18. + ClickHouse как Federated MySQL. +### 8.18. + ClickHouse как Federated MySQL. {#clickhouse-kak-federated-mysql} Maxim Fedotov, Wargaming + Yuri Baranov, Яндекс. -### 8.19. Интеграция с RabbitMQ. +### 8.19. Интеграция с RabbitMQ. {#integratsiia-s-rabbitmq} Ксения Сумарокова, ВШЭ. @@ -731,33 +726,33 @@ Maxim Fedotov, Wargaming + Yuri Baranov, Яндекс. Есть pull request в процессе разработки. -### 8.20. Интеграция с SQS. +### 8.20. Интеграция с SQS. {#integratsiia-s-sqs} Низкий приоритет. -### 8.21. Поддержка произвольного количества языков для имён регионов. +### 8.21. Поддержка произвольного количества языков для имён регионов. {#podderzhka-proizvolnogo-kolichestva-iazykov-dlia-imion-regionov} Нужно для БК. Декабрь 2019. В декабре для БК сделан минимальный вариант этой задачи. Максимальный вариант, вроде, никому не нужен. -### 8.22. Поддержка синтаксиса для переменных в стиле MySQL. +### 8.22. Поддержка синтаксиса для переменных в стиле MySQL. {#podderzhka-sintaksisa-dlia-peremennykh-v-stile-mysql} При парсинге запроса преобразовывать синтаксис вида `@@version_full` в вызов функции `getGlobalVariable('version_full')`. Поддержать популярные MySQL переменные. Может быть поможет Юрий Баранов, если будет энтузиазм. Upd. 
Юрий Баранов работает в Google, там запрещено разрабатывать ClickHouse. -### 8.23. Подписка для импорта обновляемых и ротируемых логов в ФС. +### 8.23. Подписка для импорта обновляемых и ротируемых логов в ФС. {#podpiska-dlia-importa-obnovliaemykh-i-rotiruemykh-logov-v-fs} Желательно 2.15. -## 9. Безопасность. +## 9. Безопасность. {#bezopasnost} -### 9.1. + Ограничение на хосты в запросах ко внешним системам. +### 9.1. + Ограничение на хосты в запросах ко внешним системам. {#ogranichenie-na-khosty-v-zaprosakh-ko-vneshnim-sistemam} Михаил Коротов. -### 9.2. Преднастроенные именованные соединения к внешним БД. +### 9.2. Преднастроенные именованные соединения к внешним БД. {#prednastroennye-imenovannye-soedineniia-k-vneshnim-bd} Валерий Батурин, ВШЭ. @@ -765,77 +760,76 @@ ClickHouse предоставляет возможность обратитьс Вместо этого предлагается описывать необходимые данные в конфигурационном файле сервера или в отдельном сервисе и ссылаться на них по именам. -### 9.3. Поддержка TLS для ZooKeeper. +### 9.3. Поддержка TLS для ZooKeeper. {#podderzhka-tls-dlia-zookeeper} +## 10. Внешние словари. {#vneshnie-slovari} -## 10. Внешние словари. - -### 10.1. + Исправление зависания в библиотеке доступа к YT. +### 10.1. + Исправление зависания в библиотеке доступа к YT. {#ispravlenie-zavisaniia-v-biblioteke-dostupa-k-yt} Библиотека для доступа к YT не переживает учения. Нужно для БК и Метрики. Поиск причин - [Александр Сапин](https://github.com/alesapin). Дальшейшее исправление возможно на стороне YT. -Цитата: "Оказывается для YT-клиента зависания на несколько минут это нормально. Убрал внутренние ретраи, снизил таймауты. Однозначно станет лучше". +Цитата: «Оказывается для YT-клиента зависания на несколько минут это нормально. Убрал внутренние ретраи, снизил таймауты. Однозначно станет лучше». -### 10.2. Исправление SIGILL в библиотеке доступа к YT. +### 10.2. Исправление SIGILL в библиотеке доступа к YT. {#ispravlenie-sigill-v-biblioteke-dostupa-k-yt} Код YT использует SIGILL вместо abort. Это, опять же, происходит при учениях. Нужно для БК и Метрики. Поиск причин - [Александр Сапин](https://github.com/alesapin). Дальшейшее исправление возможно на стороне YT. Upd. Одну причину устранили, но ещё что-то неизвестное осталось. -### 10.3. Возможность чтения данных из статических таблиц в YT словарях. +### 10.3. Возможность чтения данных из статических таблиц в YT словарях. {#vozmozhnost-chteniia-dannykh-iz-staticheskikh-tablits-v-yt-slovariakh} Нужно для БК и Метрики. -### 10.4. Словарь из YDB (KikiMR). +### 10.4. Словарь из YDB (KikiMR). {#slovar-iz-ydb-kikimr} Нужно для Метрики, а делать будет таинственный незнакомец из команды KikiMR (под вопросом). Таинственный незнакомец не подтверждает, что он будет делать эту задачу. -### 10.5. Закрытие соединений и уменьшение числа соединений для MySQL и ODBC. +### 10.5. Закрытие соединений и уменьшение числа соединений для MySQL и ODBC. {#zakrytie-soedinenii-i-umenshenie-chisla-soedinenii-dlia-mysql-i-odbc} Нужно для Метрики. Для MySQL сделал Clément Rodriguez. -### 10.6. Словари из Cassandra и Couchbase. +### 10.6. Словари из Cassandra и Couchbase. {#slovari-iz-cassandra-i-couchbase} -### 10.7. Поддержка Nullable в словарях. +### 10.7. Поддержка Nullable в словарях. {#podderzhka-nullable-v-slovariakh} Артём Стрельцов, Николай Дегтеринский, Наталия Михненко, ВШЭ. -### 10.8. Поддержка массивов в словарях. +### 10.8. Поддержка массивов в словарях. {#podderzhka-massivov-v-slovariakh} Артём Стрельцов, Николай Дегтеринский, Наталия Михненко, ВШЭ. -### 10.9. 
Уменьшение блокировок для cache словарей за счёт одновременных запросов одного и того же. +### 10.9. Уменьшение блокировок для cache словарей за счёт одновременных запросов одного и того же. {#umenshenie-blokirovok-dlia-cache-slovarei-za-schiot-odnovremennykh-zaprosov-odnogo-i-togo-zhe} Заменено в пользу 10.10, 10.11. -### 10.10. + Возможность использования старых значений из cache словаря пока они перезапрашиваются. +### 10.10. + Возможность использования старых значений из cache словаря пока они перезапрашиваются. {#vozmozhnost-ispolzovaniia-starykh-znachenii-iz-cache-slovaria-poka-oni-perezaprashivaiutsia} Никита Михайлов. Q1. Нужно для БК и Метрики. -### 10.11. + Возможность исключительно асинхронных запросов в cache словарях. +### 10.11. + Возможность исключительно асинхронных запросов в cache словарях. {#vozmozhnost-iskliuchitelno-asinkhronnykh-zaprosov-v-cache-slovariakh} Никита Михайлов. Q1. Нужно для БК и Метрики. Требует 10.10. -### 10.12. Layout direct для словарей. +### 10.12. Layout direct для словарей. {#layout-direct-dlia-slovarei} Артём Стрельцов, Николай Дегтеринский, Наталия Михненко, ВШЭ. Приступили к этой задаче. -### 10.13. Использование Join как generic layout для словарей. +### 10.13. Использование Join как generic layout для словарей. {#ispolzovanie-join-kak-generic-layout-dlia-slovarei} Артём Стрельцов, Николай Дегтеринский, Наталия Михненко, ВШЭ. -### 10.14. Поддержка всех типов в функции transform. +### 10.14. Поддержка всех типов в функции transform. {#podderzhka-vsekh-tipov-v-funktsii-transform} Задачу взяла Ольга Хвостикова. -### 10.15. Использование словарей как специализированного layout для Join. +### 10.15. Использование словарей как специализированного layout для Join. {#ispolzovanie-slovarei-kak-spetsializirovannogo-layout-dlia-join} -### 10.16. Словари на локальном SSD. +### 10.16. Словари на локальном SSD. {#slovari-na-lokalnom-ssd} Никита Васильев, ВШЭ и Яндекс. Есть pull request. @@ -843,252 +837,245 @@ Upd. Одну причину устранили, но ещё что-то неи Использовать эту структуру данных как отдельный вид словарей, как источник для cache словарей или как дополнительный уровень кэширования для cache словарей. -### 10.17. Локальный дамп состояния словаря для быстрого старта сервера. +### 10.17. Локальный дамп состояния словаря для быстрого старта сервера. {#lokalnyi-damp-sostoianiia-slovaria-dlia-bystrogo-starta-servera} -### 10.18. Таблица Join или словарь на удалённом сервере как key-value БД для cache словаря. +### 10.18. Таблица Join или словарь на удалённом сервере как key-value БД для cache словаря. {#tablitsa-join-ili-slovar-na-udalionnom-servere-kak-key-value-bd-dlia-cache-slovaria} -### 10.19. Возможность зарегистрировать некоторые функции, использующие словари, под пользовательскими именами. +### 10.19. Возможность зарегистрировать некоторые функции, использующие словари, под пользовательскими именами. {#vozmozhnost-zaregistrirovat-nekotorye-funktsii-ispolzuiushchie-slovari-pod-polzovatelskimi-imenami} +## 11. Интерфейсы. {#interfeisy} -## 11. Интерфейсы. +### 11.1. Вставка состояний агрегатных функций в виде кортежа аргументов или массива кортежей аргументов. {#vstavka-sostoianii-agregatnykh-funktsii-v-vide-kortezha-argumentov-ili-massiva-kortezhei-argumentov} -### 11.1. Вставка состояний агрегатных функций в виде кортежа аргументов или массива кортежей аргументов. - -### 11.2. Возможность использовать движок JDBC из коробки. +### 11.2. Возможность использовать движок JDBC из коробки. 
{#vozmozhnost-ispolzovat-dvizhok-jdbc-iz-korobki} Нужно разобраться, как упаковывать Java в статический бинарник, возможно AppImage. Или предоставить максимально простую инструкцию по установке jdbc-bridge. Может быть будет заинтересован Александр Крашенинников, Badoo, так как он разработал jdbc-bridge. -### 11.3. + Интеграционные тесты ODBC драйвера путём подключения ClickHouse к самому себе через ODBC. +### 11.3. + Интеграционные тесты ODBC драйвера путём подключения ClickHouse к самому себе через ODBC. {#integratsionnye-testy-odbc-draivera-putiom-podkliucheniia-clickhouse-k-samomu-sebe-cherez-odbc} Михаил Филимонов, Altinity. Готово. -### 11.4. Исправление упячек с типами Date и Decimal в clickhouse-cpp. +### 11.4. Исправление упячек с типами Date и Decimal в clickhouse-cpp. {#ispravlenie-upiachek-s-tipami-date-i-decimal-v-clickhouse-cpp} -### 11.5. Поддержка TLS в clickhouse-cpp. +### 11.5. Поддержка TLS в clickhouse-cpp. {#podderzhka-tls-v-clickhouse-cpp} А знаете ли вы, что библиотеку clickhouse-cpp разрабатывал один хороший человек в свободное время? -### 11.6. Интеграционные тесты clickhouse-cpp. +### 11.6. Интеграционные тесты clickhouse-cpp. {#integratsionnye-testy-clickhouse-cpp} -### 11.7. Интерактивный режим работы программы clickhouse-local. +### 11.7. Интерактивный режим работы программы clickhouse-local. {#interaktivnyi-rezhim-raboty-programmy-clickhouse-local} -### 11.8. Поддержка протокола PostgreSQL. +### 11.8. Поддержка протокола PostgreSQL. {#podderzhka-protokola-postgresql} Элбакян Мовсес Андраникович, ВШЭ. В ClickHouse в прошлом году добавили поддержку wire-протокола MySQL. PostgreSQL, так же как MySQL, использует несложный протокол общения между клиентом и сервером, но свой собственный. Поддержка этого протокола является востребованной и откроет новые возможности для ClickHouse. -### 11.9. Доработки ODBC драйвера. +### 11.9. Доработки ODBC драйвера. {#dorabotki-odbc-draivera} Денис Глазачев, Altinity. Хороший прогресс по этой задаче. -### 11.10. Преднастроенные HTTP handlers для запросов. +### 11.10. Преднастроенные HTTP handlers для запросов. {#prednastroennye-http-handlers-dlia-zaprosov} zhang2014, есть pull request. Возможность описать в конфигурационном файле handler (путь в URL) для HTTP запросов к серверу, которому соответствует некоторый параметризованный запрос. Пользователь может вызвать этот обработчик и не должен передавать SQL запрос. +## 12. Управление пользователями и доступом. {#upravlenie-polzovateliami-i-dostupom} -## 12. Управление пользователями и доступом. - -### 12.1. Role Based Access Control. +### 12.1. Role Based Access Control. {#role-based-access-control} [Виталий Баранов](https://github.com/vitlibar). Финальная стадия разработки, рабочая версия в начале февраля 2019. Q1. Сейчас сделаны все интерфейсы в коде и запросы, но не сделаны варианты хранения прав кроме прототипа. Upd. Сделано хранение прав. До готового к использованию состояния осталось несколько доработок. -### 12.2. + Управление пользователями и правами доступа с помощью SQL запросов. +### 12.2. + Управление пользователями и правами доступа с помощью SQL запросов. {#upravlenie-polzovateliami-i-pravami-dostupa-s-pomoshchiu-sql-zaprosov} [Виталий Баранов](https://github.com/vitlibar). Финальная стадия разработки, рабочая версия в декабре 2019. Q1. Сделано управление правами полностью, но не реализовано их хранение, см. 12.1. -### 12.3. Подключение справочника пользователей и прав доступа из LDAP. +### 12.3. Подключение справочника пользователей и прав доступа из LDAP. 
{#podkliuchenie-spravochnika-polzovatelei-i-prav-dostupa-iz-ldap} [Виталий Баранов](https://github.com/vitlibar). Требует 12.1. Q1/Q2. -### 12.4. Подключение IDM системы Яндекса как справочника пользователей и прав доступа. +### 12.4. Подключение IDM системы Яндекса как справочника пользователей и прав доступа. {#podkliuchenie-idm-sistemy-iandeksa-kak-spravochnika-polzovatelei-i-prav-dostupa} Пока низкий приоритет. Нужно для Метрики. Требует 12.3. -### 12.5. Pluggable аутентификация с помощью Kerberos (возможно, подключение GSASL). +### 12.5. Pluggable аутентификация с помощью Kerberos (возможно, подключение GSASL). {#pluggable-autentifikatsiia-s-pomoshchiu-kerberos-vozmozhno-podkliuchenie-gsasl} [Виталий Баранов](https://github.com/vitlibar). Требует 12.1. -### 12.6. Информация о пользователях и квотах в системной таблице. +### 12.6. Информация о пользователях и квотах в системной таблице. {#informatsiia-o-polzovateliakh-i-kvotakh-v-sistemnoi-tablitse} [Виталий Баранов](https://github.com/vitlibar). Требует 12.1. +## 13. Разделение ресурсов, multi-tenancy. {#razdelenie-resursov-multi-tenancy} -## 13. Разделение ресурсов, multi-tenancy. - -### 13.1. Overcommit запросов по памяти и вытеснение. +### 13.1. Overcommit запросов по памяти и вытеснение. {#overcommit-zaprosov-po-pamiati-i-vytesnenie} Требует 2.1. Способ реализации обсуждается. -### 13.2. Общий конвейер выполнения на сервер. +### 13.2. Общий конвейер выполнения на сервер. {#obshchii-konveier-vypolneniia-na-server} Требует 2.1. [Николай Кочетов](https://github.com/KochetovNicolai). -### 13.3. Пулы ресурсов. +### 13.3. Пулы ресурсов. {#puly-resursov} Требует 13.2 или сможем сделать более неудобную реализацию раньше. Обсуждается вариант неудобной реализации. Пока средний приоритет, целимся на Q1/Q2. Вариант реализации выбрал Александр Казаков. +## 14. Диалект SQL. {#dialekt-sql} -## 14. Диалект SQL. - -### 14.1. Исправление семантики CAST для Nullable. +### 14.1. Исправление семантики CAST для Nullable. {#ispravlenie-semantiki-cast-dlia-nullable} Нужно для DataLens. А также для внедрения в BI инструмент Looker. Павел Потёмкин, ВШЭ. -### 14.2. Поддержка WITH для подзапросов. +### 14.2. Поддержка WITH для подзапросов. {#podderzhka-with-dlia-podzaprosov} Михаил Коротов. -### 14.3. Поддержка подстановок для множеств в правой части IN. +### 14.3. Поддержка подстановок для множеств в правой части IN. {#podderzhka-podstanovok-dlia-mnozhestv-v-pravoi-chasti-in} -### 14.4. Поддержка подстановок для идентификаторов (имён) в SQL запросе. +### 14.4. Поддержка подстановок для идентификаторов (имён) в SQL запросе. {#podderzhka-podstanovok-dlia-identifikatorov-imion-v-sql-zaprose} zhang2014 -### 14.5. Поддержка задания множества как массива в правой части секции IN. +### 14.5. Поддержка задания множества как массива в правой части секции IN. {#podderzhka-zadaniia-mnozhestva-kak-massiva-v-pravoi-chasti-sektsii-in} Василий Немков, Altinity, делал эту задачу, но забросил её в пользу других задач. В результате, сейчас доделывает Антон Попов. -### 14.6. Глобальный scope для WITH. +### 14.6. Глобальный scope для WITH. {#globalnyi-scope-dlia-with} Павел Потёмкин, ВШЭ. -### 14.7. Nullable для WITH ROLLUP, WITH CUBE, WITH TOTALS. +### 14.7. Nullable для WITH ROLLUP, WITH CUBE, WITH TOTALS. {#nullable-dlia-with-rollup-with-cube-with-totals} Павел Потёмкин, ВШЭ. Простая задача. -### 14.8. Модификаторы DISTINCT, ORDER BY для агрегатных функций. +### 14.8. Модификаторы DISTINCT, ORDER BY для агрегатных функций. 
{#modifikatory-distinct-order-by-dlia-agregatnykh-funktsii} В ClickHouse поддерживается вычисление COUNT(DISTINCT x). Предлагается добавить возможность использования модификатора DISTINCT для всех агрегатных функций. Например, AVG(DISTINCT x) - вычислить среднее значение для всех различных значений x. Под вопросом вариант, в котором фильтрация уникальных значений выполняется по одному выражению, а агрегация по другому. Результат некоторых агрегатных функций зависит от порядка данных. Предлагается реализовать модификатор ORDER BY, задающий порядок явно. Пример: groupArray(x ORDER BY y, z). -### 14.9. Поддержка запроса EXPLAIN. +### 14.9. Поддержка запроса EXPLAIN. {#podderzhka-zaprosa-explain} Требует 2.1. [Николай Кочетов](https://github.com/KochetovNicolai). -### 14.10. arrayReduce как функция высшего порядка. +### 14.10. arrayReduce как функция высшего порядка. {#arrayreduce-kak-funktsiia-vysshego-poriadka} -### 14.11. Функции для grouping sets. +### 14.11. Функции для grouping sets. {#funktsii-dlia-grouping-sets} -### 14.12. Функции обработки временных рядов. +### 14.12. Функции обработки временных рядов. {#funktsii-obrabotki-vremennykh-riadov} Сложная задача, так как вводит новый класс функций и требует его обработку в оптимизаторе запросов. -В time-series СУБД нужны функции, которые зависят от последовательности значений. Или даже от последовательности значений и их меток времени. Примеры: moving average, exponential smoothing, derivative, Holt-Winters forecast. Вычисление таких функций поддерживается в ClickHouse лишь частично. Так, ClickHouse поддерживает тип данных "массив" и позволяет реализовать эти функции как функции, принимающие массивы. Но гораздо удобнее для пользователя было бы иметь возможность применить такие функции к таблице (промежуточному результату запроса после сортировки). +В time-series СУБД нужны функции, которые зависят от последовательности значений. Или даже от последовательности значений и их меток времени. Примеры: moving average, exponential smoothing, derivative, Holt-Winters forecast. Вычисление таких функций поддерживается в ClickHouse лишь частично. Так, ClickHouse поддерживает тип данных «массив» и позволяет реализовать эти функции как функции, принимающие массивы. Но гораздо удобнее для пользователя было бы иметь возможность применить такие функции к таблице (промежуточному результату запроса после сортировки). Это требует введение нового класса функций (помимо обычных и агрегатных функций) - такие функции будут иметь в коде ClickHouse свой собственный интерфейс, и их вычисление придётся отдельно учитывать в конвейере выполнения запросов. Для примера, вычисление обычных функций тривиально распараллеливается по процессорным ядрам и по серверам; вычисление агрегатных функций распараллеливается с некоторыми особенностями (работа с промежуточными состояниями вычислений, операция merge); а для функций по обработке временных рядов этот вопрос остаётся открытым - возможно, их придётся вычислять на одном сервере и в одном потоке. -### 14.13. Применимость функций высшего порядка для кортежей и Nested. +### 14.13. Применимость функций высшего порядка для кортежей и Nested. {#primenimost-funktsii-vysshego-poriadka-dlia-kortezhei-i-nested} -### 14.14. Неявные преобразования типов констант. +### 14.14. Неявные преобразования типов констант. {#neiavnye-preobrazovaniia-tipov-konstant} Требует 2.12. -### 14.15. Неявные преобразования типов под настройкой. +### 14.15. Неявные преобразования типов под настройкой. 
{#neiavnye-preobrazovaniia-tipov-pod-nastroikoi} Требует 2.12. Для внедрения в BI инструмент Looker. -### 14.16. Синонимы для функций из MySQL. +### 14.16. Синонимы для функций из MySQL. {#sinonimy-dlia-funktsii-iz-mysql} -### 14.17. Ввести понятие stateful функций. +### 14.17. Ввести понятие stateful функций. {#vvesti-poniatie-stateful-funktsii} zhang2014. Для runningDifference, neighbour - их учёт в оптимизаторе запросов. В интерфейсе уже сделано. Надо проверить, что учитывается в нужных местах (например, что работает predicate pushdown сквозь ORDER BY, если таких функций нет). -### 14.18. UNION DISTINCT и возможность включить его по-умолчанию. +### 14.18. UNION DISTINCT и возможность включить его по-умолчанию. {#union-distinct-i-vozmozhnost-vkliuchit-ego-po-umolchaniiu} Павел Потёмкин, ВШЭ. Для BI систем. -### 14.19. Совместимость парсера типов данных с SQL. +### 14.19. Совместимость парсера типов данных с SQL. {#sovmestimost-parsera-tipov-dannykh-s-sql} Павел Потёмкин, ВШЭ. Для BI систем. -### 14.20. Позиционные аргументы для GROUP BY и ORDER BY. +### 14.20. Позиционные аргументы для GROUP BY и ORDER BY. {#pozitsionnye-argumenty-dlia-group-by-i-order-by} Павел Потёмкин, ВШЭ. Тривиально и используется многими системами, но не входит в стандарт SQL. -### 14.21. Приведение типов для IN (подзапрос) и для JOIN. +### 14.21. Приведение типов для IN (подзапрос) и для JOIN. {#privedenie-tipov-dlia-in-podzapros-i-dlia-join} Павел Потёмкин, ВШЭ. +## 15. Улучшение поддержки JOIN. {#uluchshenie-podderzhki-join} -## 15. Улучшение поддержки JOIN. - -### 15.1. Доведение merge JOIN до продакшена. +### 15.1. Доведение merge JOIN до продакшена. {#dovedenie-merge-join-do-prodakshena} Артём Зуйков. Сейчас merge JOIN включается вручную опцией и всегда замедляет запросы. Хотим, чтобы он замедлял запросы только когда это неизбежно. Кстати, смысл merge JOIN появляется только совместно с 15.2 и 15.3. Q1. Сделали адаптивный вариант, но вроде он что-то всё-ещё замедляет. -### 15.1.1. Алгоритм two-level merge JOIN. +### 15.1.1. Алгоритм two-level merge JOIN. {#algoritm-two-level-merge-join} Александр Кузьменков. В очереди. -### 15.1.2. Тестирование реализации JOIN в Greenplum. +### 15.1.2. Тестирование реализации JOIN в Greenplum. {#testirovanie-realizatsii-join-v-greenplum} В очереди. -### 15.2. Прокидывание условий в OUTER JOIN. +### 15.2. Прокидывание условий в OUTER JOIN. {#prokidyvanie-uslovii-v-outer-join} Возможно, Артём Зуйков, но задача ещё не продумана до конца. Возможно, требует 2.1. -### 15.3. Логический вывод для цепочек вида ON t1.x = t2.y WHERE t1.x = 10 +### 15.3. Логический вывод для цепочек вида ON t1.x = t2.y WHERE t1.x = 10 {#logicheskii-vyvod-dlia-tsepochek-vida-on-t1-x-t2-y-where-t1-x-10} Возможно, Артём Зуйков. Для полноценной работы 15.2. -### 15.4. Distributed JOIN с помощью перераспределения данных. +### 15.4. Distributed JOIN с помощью перераспределения данных. {#distributed-join-s-pomoshchiu-pereraspredeleniia-dannykh} Артём Зуйков. -### 15.5. Использование ключа таблицы для оптимизации merge JOIN. +### 15.5. Использование ключа таблицы для оптимизации merge JOIN. {#ispolzovanie-kliucha-tablitsy-dlia-optimizatsii-merge-join} -### 15.6. + SEMI и ANTI JOIN. +### 15.6. + SEMI и ANTI JOIN. {#semi-i-anti-join} Артём Зуйков. +## 16. Типы данных и функции. {#tipy-dannykh-i-funktsii} -## 16. Типы данных и функции. - -### 16.1. + DateTime64. +### 16.1. + DateTime64. {#datetime64} Василий Немков, Altinity, декабрь 2019. -### 16.2. Тип данных для JSON. +### 16.2. Тип данных для JSON. 
{#tip-dannykh-dlia-json} zhang2014 -### 16.3. Поддержка неконстантных аргументов с регулярными выражениями в функциях. +### 16.3. Поддержка неконстантных аргументов с регулярными выражениями в функциях. {#podderzhka-nekonstantnykh-argumentov-s-reguliarnymi-vyrazheniiami-v-funktsiiakh} Данила Кутенин, но только после секретного изменения в работе. Upd. Секретного изменения в работе не будет, задачу будет делать другой человек. -### 16.4. Функция rowNumberForKey. +### 16.4. Функция rowNumberForKey. {#funktsiia-rownumberforkey} -### 16.5. Функции для XML и HTML escape. +### 16.5. Функции для XML и HTML escape. {#funktsii-dlia-xml-i-html-escape} -### 16.6. Функции нормализации и хэширования SQL запросов. +### 16.6. Функции нормализации и хэширования SQL запросов. {#funktsii-normalizatsii-i-kheshirovaniia-sql-zaprosov} +## 17. Работа с географическими данными. {#rabota-s-geograficheskimi-dannymi} -## 17. Работа с географическими данными. - -### 17.1. Гео-словари для определения региона по координатам. +### 17.1. Гео-словари для определения региона по координатам. {#geo-slovari-dlia-opredeleniia-regiona-po-koordinatam} [Андрей Чулков](https://github.com/achulkov2), Антон Кваша, Артур Петуховский, ВШЭ. Будет основано на коде от Арслана Урташева. @@ -1100,32 +1087,31 @@ ClickHouse не является geospatial СУБД. Тем не менее, в Upd. Андрей сделал прототип интерфейса и реализацию-заглушку внутри него. Upd. Андрей сделал прототип более оптимальной структуры данных. -### 17.2. GIS типы данных и операции. +### 17.2. GIS типы данных и операции. {#gis-tipy-dannykh-i-operatsii} Алексей Коряков, Алексей Илюхов, ВШЭ, Яндекс.Карты. Реализовать в ClickHouse типы данных для задач обработки геоинформационных данных: Point, Line, MultiLine, Polygon и операции над ними - проверка вхождения, пересечения. Вариантом минимум будет реализация этих операций в евклидовой системе координат. Дополнительно - на сфере и WGS84. -### 17.3. + Ускорение greatCircleDistance. +### 17.3. + Ускорение greatCircleDistance. {#uskorenie-greatcircledistance} [Ольга Хвостикова](https://github.com/stavrolia), основано на коде Андрея Аксёнова, получено разрешение на использование кода. -### 17.4. Ускорение geohash с помощью библиотеки из Аркадии. +### 17.4. Ускорение geohash с помощью библиотеки из Аркадии. {#uskorenie-geohash-s-pomoshchiu-biblioteki-iz-arkadii} Предположительно, [Андрей Чулков](https://github.com/achulkov2). Получено одобрение от руководства. -### 17.5. Проверки в функции pointInPolygon. +### 17.5. Проверки в функции pointInPolygon. {#proverki-v-funktsii-pointinpolygon} [Николай Кочетов](https://github.com/KochetovNicolai). Сейчас функция тихо не работает в случае полигонов с самопересечениями, надо кидать исключение. +## 18. Машинное обучение и статистика. {#mashinnoe-obuchenie-i-statistika} -## 18. Машинное обучение и статистика. - -### 18.1. Инкрементальная кластеризация данных. +### 18.1. Инкрементальная кластеризация данных. {#inkrementalnaia-klasterizatsiia-dannykh} Александр Кожихов, Максим Кузнецов. Обнаружена фундаментальная проблема в реализации, доделывает предположительно [Николай Кочетов](https://github.com/KochetovNicolai). Он может делегировать задачу кому угодно. -### 18.2. Агрегатные функции для статистических тестов. +### 18.2. Агрегатные функции для статистических тестов. {#agregatnye-funktsii-dlia-statisticheskikh-testov} Артём Цыганов, Руденский Константин Игоревич, Семёнов Денис, ВШЭ. @@ -1133,20 +1119,19 @@ Upd. Андрей сделал прототип более оптимально Сделали прототип одного теста, есть pull request. 
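Для иллюстрации ниже приведён набросок того, как мог бы выглядеть вызов такой агрегатной функции из SQL. Имя функции `welchTTest`, её сигнатура и таблица `ab_test_observations` — гипотетические: это предположение о возможном интерфейсе, а не описание реализации из упомянутого pull request.

``` sql
-- Гипотетический набросок интерфейса (предположение, не готовая реализация):
-- значения обеих выборок лежат в одном столбце metric_value,
-- номер выборки (0 или 1) — в столбце sample_id,
-- результат — кортеж (статистика теста, p-value).
SELECT welchTTest(metric_value, sample_id) AS test_result
FROM ab_test_observations
```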
-### 18.3. Инфраструктура для тренировки моделей в ClickHouse. +### 18.3. Инфраструктура для тренировки моделей в ClickHouse. {#infrastruktura-dlia-trenirovki-modelei-v-clickhouse} В очереди. Возможно, Александр Кожихов. У него сначала идёт задача 24.26. +## 19. Улучшение работы кластера. {#uluchshenie-raboty-klastera} -## 19. Улучшение работы кластера. - -### 19.1. Параллельные кворумные вставки без линеаризуемости. +### 19.1. Параллельные кворумные вставки без линеаризуемости. {#parallelnye-kvorumnye-vstavki-bez-linearizuemosti} Александра Латышева, ВШЭ и Яндекс. Репликация данных в ClickHouse по-умолчанию является асинхронной без выделенного мастера. Это значит, что клиент, осуществляющий вставку данных, получает успешный ответ после того, как данные попали на один сервер; репликация данных по остальным серверам осуществляется в другой момент времени. Это ненадёжно, потому что допускает потерю только что вставленных данных при потере лишь одного сервера. -Для решения этой проблемы, в ClickHouse есть возможность включить "кворумную" вставку. Это значит, что клиент, осуществляющий вставку данных, получает успешный ответ после того, как данные попали на несколько (кворум) серверов. Обеспечивается линеаризуемость: клиент, получает успешный ответ после того, как данные попали на несколько реплик, *которые содержат все предыдущие данные, вставленные с кворумом* (такие реплики можно называть "синхронными"), и при запросе SELECT можно выставить настройку, разрешающую только чтение с синхронных реплик. +Для решения этой проблемы, в ClickHouse есть возможность включить «кворумную» вставку. Это значит, что клиент, осуществляющий вставку данных, получает успешный ответ после того, как данные попали на несколько (кворум) серверов. Обеспечивается линеаризуемость: клиент, получает успешный ответ после того, как данные попали на несколько реплик, *которые содержат все предыдущие данные, вставленные с кворумом* (такие реплики можно называть «синхронными»), и при запросе SELECT можно выставить настройку, разрешающую только чтение с синхронных реплик. Если бы свойства линеаризуемости не было, то для трёх серверов A, B, C, значения кворума = 2, и для трёх вставок данных 1, 2, 3, возможна ситуация, что первая вставка прошла на серверы A и B, вторая прошла на серверы B и C, а третья - на серверы A и C, и теперь ни один из серверов не содержит полный набор данных 1, 2, 3. @@ -1154,7 +1139,7 @@ Upd. Андрей сделал прототип более оптимально Иногда пользователь хочет реализовать кворумную вставку вручную: просто соединиться с несколькими репликами и вставть на них одинаковые данные (чтобы обеспечить надёжную вставку, не ориентируясь на то, как работает механизм репликации). Сейчас ожидания пользователя не оправдываются. В ClickHouse есть механизм дедупликации для обеспечения идемпотентности вставок. Вторая вставка с такими же данными (пусть даже на другую реплику) будет проигнорирована. Надо сделать так, чтобы вместо этого, вставка одинаковых данных на другую реплику, имела такой же эффект, как если бы эти данные были получены с помощью механизма репликации. -### 19.2. Подключение Etcd или Consul как альтернативы ZooKeeper. +### 19.2. Подключение Etcd или Consul как альтернативы ZooKeeper. {#podkliuchenie-etcd-ili-consul-kak-alternativy-zookeeper} Алексей Лёвушкин, ВШЭ и Яндекс. @@ -1164,67 +1149,65 @@ Upd. Андрей сделал прототип более оптимально Upd. Алексей сделал какой-то вариант, но борется с тем, что ничего не работает. -### 19.3. Подключение YT Cypress или YDB как альтернативы ZooKeeper. +### 19.3. 
Подключение YT Cypress или YDB как альтернативы ZooKeeper. {#podkliuchenie-yt-cypress-ili-ydb-kak-alternativy-zookeeper} Hold. Полезно для заказчиков внутри Яндекса, но есть риски. Эту задачу никто не будет делать. -### 19.4. internal_replication = 'auto'. +### 19.4. internal\_replication = ‘auto’. {#internal-replication-auto} -### 19.5. Реплицируемые базы данных. +### 19.5. Реплицируемые базы данных. {#replitsiruemye-bazy-dannykh} В очереди, возможно Валерий Батурин, ВШЭ. Репликация в ClickHouse работает на уровне отдельных таблиц. Это является очень гибким решением: на одном сервере одна из таблиц может быть не реплицирована, другая иметь двухкратную репликацию, а третья - реплицирована по всем серверам. Но если все таблицы в базе данных реплицированы одинаковым образом. то это затрудняет управление кластером. Например, при восстановлени сервера, требуется отдельно создавать реплику для каждой таблицы. -Предлагается реализовать "движок баз данных", который осуществляет репликацию метаданных (множество имеющихся таблиц и лог DDL операций над ними: CREATE, DROP, RENAME, ALTER). Пользователь сможет создать реплицируемую базу данных; при её создании или восстановлении на другом сервере, все реплицируемые таблицы будут созданы автоматически. +Предлагается реализовать «движок баз данных», который осуществляет репликацию метаданных (множество имеющихся таблиц и лог DDL операций над ними: CREATE, DROP, RENAME, ALTER). Пользователь сможет создать реплицируемую базу данных; при её создании или восстановлении на другом сервере, все реплицируемые таблицы будут созданы автоматически. -### 19.6. Одновременный выбор кусков для слияния многими репликами, отказ от leader election в ZK. +### 19.6. Одновременный выбор кусков для слияния многими репликами, отказ от leader election в ZK. {#odnovremennyi-vybor-kuskov-dlia-sliianiia-mnogimi-replikami-otkaz-ot-leader-election-v-zk} Обсуждается. Возможно, будет делать Александр Казаков. -### 19.7. Возможность записи данных при недоступности ZK и отказ от линейного порядка кусков в большинстве случаев. +### 19.7. Возможность записи данных при недоступности ZK и отказ от линейного порядка кусков в большинстве случаев. {#vozmozhnost-zapisi-dannykh-pri-nedostupnosti-zk-i-otkaz-ot-lineinogo-poriadka-kuskov-v-bolshinstve-sluchaev} -### 19.8. Отказ от хранения в ZK множества кусков для каждой реплики отдельно. +### 19.8. Отказ от хранения в ZK множества кусков для каждой реплики отдельно. {#otkaz-ot-khraneniia-v-zk-mnozhestva-kuskov-dlia-kazhdoi-repliki-otdelno} -### 19.9. Отказ от хранения в ZK лога вставок и мержей. Обмен данными о кусках напрямую. +### 19.9. Отказ от хранения в ZK лога вставок и мержей. Обмен данными о кусках напрямую. {#otkaz-ot-khraneniia-v-zk-loga-vstavok-i-merzhei-obmen-dannymi-o-kuskakh-napriamuiu} -### 19.10. Облачные таблицы. +### 19.10. Облачные таблицы. {#oblachnye-tablitsy} Требует 1.6, 19.1, 19.6, 19.7, 19.8, 19.9. - -## 20. Мутации данных. +## 20. Мутации данных. {#mutatsii-dannykh} Пока все задачи по точечным UPDATE/DELETE имеют низкий приоритет, но ожидаем взять в работу в середине 2020. -### 20.1. Поддержка DELETE путём запоминания множества затронутых кусков и ключей. +### 20.1. Поддержка DELETE путём запоминания множества затронутых кусков и ключей. {#podderzhka-delete-putiom-zapominaniia-mnozhestva-zatronutykh-kuskov-i-kliuchei} -### 20.2. Поддержка DELETE путём преобразования множества ключей в множество row_numbers на реплике, столбца флагов и индекса по диапазонам. +### 20.2. 
Поддержка DELETE путём преобразования множества ключей в множество row\_numbers на реплике, столбца флагов и индекса по диапазонам. {#podderzhka-delete-putiom-preobrazovaniia-mnozhestva-kliuchei-v-mnozhestvo-row-numbers-na-replike-stolbtsa-flagov-i-indeksa-po-diapazonam} -### 20.3. Поддержка ленивых DELETE путём запоминания выражений и преобразования к множеству ключей в фоне. +### 20.3. Поддержка ленивых DELETE путём запоминания выражений и преобразования к множеству ключей в фоне. {#podderzhka-lenivykh-delete-putiom-zapominaniia-vyrazhenii-i-preobrazovaniia-k-mnozhestvu-kliuchei-v-fone} -### 20.4. Поддержка UPDATE с помощью преобразования в DELETE и вставок. +### 20.4. Поддержка UPDATE с помощью преобразования в DELETE и вставок. {#podderzhka-update-s-pomoshchiu-preobrazovaniia-v-delete-i-vstavok} +## 21. Оптимизации производительности. {#optimizatsii-proizvoditelnosti} -## 21. Оптимизации производительности. - -### 21.1. + Параллельный парсинг форматов. +### 21.1. + Параллельный парсинг форматов. {#parallelnyi-parsing-formatov} Начинал Олег Ершов, доделывает Никита Михайлов, помогает [Александр Кузьменков](https://github.com/akuzm). Готово. -### 21.1.1. Избавление от лишнего копирования при параллельном парсинге форматов, если возможен mmap файла целиком. +### 21.1.1. Избавление от лишнего копирования при параллельном парсинге форматов, если возможен mmap файла целиком. {#izbavlenie-ot-lishnego-kopirovaniia-pri-parallelnom-parsinge-formatov-esli-vozmozhen-mmap-faila-tselikom} -### 21.2. Параллельное форматирование форматов. +### 21.2. Параллельное форматирование форматов. {#parallelnoe-formatirovanie-formatov} После 21.1, предположительно Никита Михайлов. Задача сильно проще чем 21.1. -### 21.3. Исправление низкой производительности анализа индекса в случае большого множества в секции IN. +### 21.3. Исправление низкой производительности анализа индекса в случае большого множества в секции IN. {#ispravlenie-nizkoi-proizvoditelnosti-analiza-indeksa-v-sluchae-bolshogo-mnozhestva-v-sektsii-in} -Нужно всем (Zen, БК, DataLens, TestEnv...). Антон Попов, Q1/Q2. +Нужно всем (Zen, БК, DataLens, TestEnv…). Антон Попов, Q1/Q2. Upd. Антон делает эту задачу. Большая часть уже реализована. -### 21.4. Использование ORDER BY ключа для оптимизации GROUP BY и DISTINCT. +### 21.4. Использование ORDER BY ключа для оптимизации GROUP BY и DISTINCT. {#ispolzovanie-order-by-kliucha-dlia-optimizatsii-group-by-i-distinct} Дмитрий Рубашкин, ВШЭ. Помогает Антон Попов. @@ -1234,19 +1217,19 @@ Upd. Антон делает эту задачу. Большая часть уж В прошлом году, аналогичное решение сделали для операции ORDER BY. -### 21.5. + Распараллеливание INSERT при INSERT SELECT, если это необходимо. +### 21.5. + Распараллеливание INSERT при INSERT SELECT, если это необходимо. {#rasparallelivanie-insert-pri-insert-select-esli-eto-neobkhodimo} [Vxider](https://github.com/Vxider), ICT Есть pull request. -### 21.6. Уменьшение числа потоков для SELECT в случае тривиального INSERT SELECT. +### 21.6. Уменьшение числа потоков для SELECT в случае тривиального INSERT SELECT. {#umenshenie-chisla-potokov-dlia-select-v-sluchae-trivialnogo-insert-select} -### 21.7. Кэш результатов запросов. +### 21.7. Кэш результатов запросов. {#kesh-rezultatov-zaprosov} [Achimbab](https://github.com/achimbab). Есть pull request. Но это не совсем то. -### 21.8. Взаимная интеграция аллокатора и кэша. +### 21.8. Взаимная интеграция аллокатора и кэша. {#vzaimnaia-integratsiia-allokatora-i-kesha} Михаил Кот, ВШЭ. Задача сложная и рискованная. 
@@ -1256,23 +1239,23 @@ Upd. Антон делает эту задачу. Большая часть уж Для domain-specific кэшей (как например, кэш разжатых данных) выгодно, чтобы они использовали как можно больший объём свободной памяти. Но в этом случае, памяти может не хватить для других структур данных в программе. Если аллокатор памяти знает про кэш, то выделение памяти можно было бы делать путём вытеснения данных из кэша. -### 21.8.1. Отдельный аллокатор для кэшей с ASLR. +### 21.8.1. Отдельный аллокатор для кэшей с ASLR. {#otdelnyi-allokator-dlia-keshei-s-aslr} В прошлом году задачу пытался сделать Данила Кутенин с помощью lfalloc из Аркадии и mimalloc из Microsoft, но оба решения не были квалифицированы для использования в продакшене. Успешная реализация задачи 21.8 отменит необходимость в этой задаче, поэтому холд. -### 21.9. Исправить push-down выражений с помощью Processors. +### 21.9. Исправить push-down выражений с помощью Processors. {#ispravit-push-down-vyrazhenii-s-pomoshchiu-processors} [Николай Кочетов](https://github.com/KochetovNicolai). Требует 2.1. -### 21.10. + Улучшение эвристики PREWHERE. +### 21.10. + Улучшение эвристики PREWHERE. {#uluchshenie-evristiki-prewhere} Amos Bird. -### 21.11. Peephole оптимизации запросов. +### 21.11. Peephole оптимизации запросов. {#peephole-optimizatsii-zaprosov} Руслан Камалов, Михаил Малафеев, Виктор Гришанин, ВШЭ -Реализовать в ClickHouse оптимизации запросов, основанные на упрощении отдельных небольших кусков выражений (так называемые "peephole" оптимизации). Примеры: +Реализовать в ClickHouse оптимизации запросов, основанные на упрощении отдельных небольших кусков выражений (так называемые «peephole» оптимизации). Примеры: - Замена цепочек if на multiIf. - Удаление min/max/any-агрегатов от выражений от ключей GROUP BY. - Вынесение арифметических операций из агрегатных функций; @@ -1281,7 +1264,7 @@ Amos Bird. Сделана замена цепочек if на multiIf, но внезапно оказалось, что это является не оптимизацией, а наоборот. -### 21.12. Алгебраические оптимизации запросов. +### 21.12. Алгебраические оптимизации запросов. {#algebraicheskie-optimizatsii-zaprosov} Руслан Камалов, Михаил Малафеев, Виктор Гришанин, ВШЭ @@ -1293,250 +1276,247 @@ Amos Bird. - Удаление из GROUP BY функций от других ключей GROUP BY. - Удаление дублирующихся DISTINCT, ORDER BY из подзапросов. -### 21.13. Fusion агрегатных функций. +### 21.13. Fusion агрегатных функций. {#fusion-agregatnykh-funktsii} После или совместно с 21.11. -### 21.14. Оптимизация запросов с помощью constraints. +### 21.14. Оптимизация запросов с помощью constraints. {#optimizatsiia-zaprosov-s-pomoshchiu-constraints} Constraints позволяют задать выражение, истинность которого проверяется при вставке данных в таблицу. Предположение о том, что выражение истинно, может использоваться и для оптимизации запросов. Например, встретив в запросе точно такое же выражение, можно заменить его на константу 1. Если выражение содержит равенство, то встретив в запросе одну из частей равенства, её можно заменить на другую часть равенства, если это сделает проще чтение данных или вычисление выражения. Например, задан constraint: `URLDomain = domain(URL)`. Значит, выражение `domain(URL)` можно заменить на `URLDomain`. -### 21.15. Многоступенчатое чтение данных вместо PREWHERE. +### 21.15. Многоступенчатое чтение данных вместо PREWHERE. {#mnogostupenchatoe-chtenie-dannykh-vmesto-prewhere} Требует 2.1 и 21.10. -### 21.16. Оптимизация GROUP BY с большим количеством агрегатных функций путём вычисления в два прохода. +### 21.16. 
Оптимизация GROUP BY с большим количеством агрегатных функций путём вычисления в два прохода. {#optimizatsiia-group-by-s-bolshim-kolichestvom-agregatnykh-funktsii-putiom-vychisleniia-v-dva-prokhoda} Нужно для БК. -### 21.17. Оптимизация GROUP BY при наличии ORDER BY по тем же ключам с LIMIT. +### 21.17. Оптимизация GROUP BY при наличии ORDER BY по тем же ключам с LIMIT. {#optimizatsiia-group-by-pri-nalichii-order-by-po-tem-zhe-kliucham-s-limit} Нужно для БК. -### 21.18. Внутренняя параллелизация мержа больших состояний агрегатных функций. +### 21.18. Внутренняя параллелизация мержа больших состояний агрегатных функций. {#vnutrenniaia-parallelizatsiia-merzha-bolshikh-sostoianii-agregatnykh-funktsii} -### 21.19. Оптимизация сортировки. +### 21.19. Оптимизация сортировки. {#optimizatsiia-sortirovki} Василий Морозов, Арслан Гумеров, Альберт Кидрачев, ВШЭ. В прошлом году задачу начинал делать другой человек, но не добился достаточного прогресса. -1. Оптимизация top sort. +1. Оптимизация top sort. В ClickHouse используется неоптимальный вариант top sort. Суть его в том, что из каждого блока достаётся top N записей, а затем, все блоки мержатся. Но доставание top N записей у каждого следующего блока бессмысленно, если мы знаем, что из них в глобальный top N войдёт меньше. Конечно нужно реализовать вариацию на тему priority queue (heap) с быстрым пропуском целых блоков, если ни одна строка не попадёт в накопленный top. -2. Рекурсивный вариант сортировки по кортежам. +1. Рекурсивный вариант сортировки по кортежам. -Для сортировки по кортежам используется обычная сортировка с компаратором, который в цикле по элементам кортежа делает виртуальные вызовы `IColumn::compareAt`. Это неоптимально - как из-за короткого цикла по неизвестному в compile-time количеству элементов, так и из-за виртуальных вызовов. Чтобы обойтись без виртуальных вызовов, есть метод `IColumn::getPermutation`. Он используется в случае сортировки по одному столбцу. Есть вариант, что в случае сортировки по кортежу, что-то похожее тоже можно применить... например, сделать метод `updatePermutation`, принимающий аргументы offset и limit, и допереставляющий перестановку в диапазоне значений, в которых предыдущий столбец имел равные значения. +Для сортировки по кортежам используется обычная сортировка с компаратором, который в цикле по элементам кортежа делает виртуальные вызовы `IColumn::compareAt`. Это неоптимально - как из-за короткого цикла по неизвестному в compile-time количеству элементов, так и из-за виртуальных вызовов. Чтобы обойтись без виртуальных вызовов, есть метод `IColumn::getPermutation`. Он используется в случае сортировки по одному столбцу. Есть вариант, что в случае сортировки по кортежу, что-то похожее тоже можно применить… например, сделать метод `updatePermutation`, принимающий аргументы offset и limit, и допереставляющий перестановку в диапазоне значений, в которых предыдущий столбец имел равные значения. -3. RadixSort для сортировки. +1. RadixSort для сортировки. Один наш знакомый начал делать задачу по попытке использования RadixSort для сортировки столбцов. Был сделан вариант indirect сортировки (для `getPermutation`), но не оптимизирован до конца - есть лишние ненужные перекладывания элементов. Для того, чтобы его оптимизировать, придётся добавить немного шаблонной магии (на последнем шаге что-то не копировать, вместо перекладывания индексов - складывать их в готовое место). Также этот человек добавил метод MSD Radix Sort для реализации radix partial sort. Но даже не проверил производительность. 
-Наиболее содержательная часть задачи может состоять в применении Radix Sort для сортировки кортежей, расположенных в оперативке в виде Structure Of Arrays неизвестного в compile-time размера. Это может работать хуже, чем то, что описано в пункте 2... Но попробовать не помешает. +Наиболее содержательная часть задачи может состоять в применении Radix Sort для сортировки кортежей, расположенных в оперативке в виде Structure Of Arrays неизвестного в compile-time размера. Это может работать хуже, чем то, что описано в пункте 2… Но попробовать не помешает. -4. Three-way comparison sort. +1. Three-way comparison sort. Виртуальный метод `compareAt` возвращает -1, 0, 1. Но алгоритмы сортировки сравнениями обычно рассчитаны на `operator<` и не могут получить преимущества от three-way comparison. А можно ли написать так, чтобы преимущество было? -5. pdq partial sort +1. pdq partial sort -Хороший алгоритм сортировки сравнениями `pdqsort` не имеет варианта partial sort. Заметим, что на практике, почти все сортировки в запросах ClickHouse являются partial_sort, так как `ORDER BY` почти всегда идёт с `LIMIT`. Кстати, Данила Кутенин уже попробовал это и показал, что в тривиальном случае преимущества нет. Но не очевидно, что нельзя сделать лучше. +Хороший алгоритм сортировки сравнениями `pdqsort` не имеет варианта partial sort. Заметим, что на практике, почти все сортировки в запросах ClickHouse являются partial\_sort, так как `ORDER BY` почти всегда идёт с `LIMIT`. Кстати, Данила Кутенин уже попробовал это и показал, что в тривиальном случае преимущества нет. Но не очевидно, что нельзя сделать лучше. -### 21.20. Использование материализованных представлений для оптимизации запросов. +### 21.20. Использование материализованных представлений для оптимизации запросов. {#ispolzovanie-materializovannykh-predstavlenii-dlia-optimizatsii-zaprosov} В ByteDance есть готовая реализация, но они её боятся из-за, возможно, низкого качества кода. -### 21.21. + Чтение больших файлов с помощью mmap. +### 21.21. + Чтение больших файлов с помощью mmap. {#chtenie-bolshikh-failov-s-pomoshchiu-mmap} Сделан вариант, но достаточно топорный. Без тестирования в продакшене включать по-умолчанию нельзя. -### 21.22. Userspace page cache. +### 21.22. Userspace page cache. {#userspace-page-cache} Требует 21.8. -### 21.23. Ускорение работы с вторичными индексами. +### 21.23. Ускорение работы с вторичными индексами. {#uskorenie-raboty-s-vtorichnymi-indeksami} zhang2014. Есть pull request. +## 22. Долги и недоделанные возможности. {#dolgi-i-nedodelannye-vozmozhnosti} -## 22. Долги и недоделанные возможности. - -### 22.1. + Исправление неработающих таймаутов, если используется TLS. +### 22.1. + Исправление неработающих таймаутов, если используется TLS. {#ispravlenie-nerabotaiushchikh-taimautov-esli-ispolzuetsia-tls} Нужно для Яндекс.Облака. Сделал Алексей Миловидов. -### 22.2. + Убрать возможность изменить настройки в native протоколе в случае readonly. +### 22.2. + Убрать возможность изменить настройки в native протоколе в случае readonly. {#ubrat-vozmozhnost-izmenit-nastroiki-v-native-protokole-v-sluchae-readonly} N.Vartolomei. -### 22.3. Защита от абсурдно заданных пользователем кодеков. +### 22.3. Защита от абсурдно заданных пользователем кодеков. {#zashchita-ot-absurdno-zadannykh-polzovatelem-kodekov} В очереди, скорее всего [Ольга Хвостикова](https://github.com/stavrolia). -### 22.4. Исправление оставшихся deadlocks в табличных RWLock-ах. +### 22.4. Исправление оставшихся deadlocks в табличных RWLock-ах. 
{#ispravlenie-ostavshikhsia-deadlocks-v-tablichnykh-rwlock-akh} Александр Казаков. Нужно для Яндекс.Метрики и Datalens. Задача постепенно тащится и исправлениями в соседних местах стала менее актуальна. В Q1 будет сделана или отменена с учётом 1.2. и 1.3. -### 22.5. + Исправление редких срабатываний TSan в stress тестах в CI. +### 22.5. + Исправление редких срабатываний TSan в stress тестах в CI. {#ispravlenie-redkikh-srabatyvanii-tsan-v-stress-testakh-v-ci} Александр Казаков сделал эту задачу. -### 22.6. + Изменение только DEFAULT в ALTER TABLE может поменять тип столбца. +### 22.6. + Изменение только DEFAULT в ALTER TABLE может поменять тип столбца. {#izmenenie-tolko-default-v-alter-table-mozhet-pomeniat-tip-stolbtsa} Александр Сапин сделал эту задачу. -### 22.7. + Row-Level Security не работает в случае наличия в запросе IN подзапросов. +### 22.7. + Row-Level Security не работает в случае наличия в запросе IN подзапросов. {#row-level-security-ne-rabotaet-v-sluchae-nalichiia-v-zaprose-in-podzaprosov} Нужно для Метрики. Иван Лежанкин. -### 22.8. + Исправить десериализацию параметров для параметризованных запросов. +### 22.8. + Исправить десериализацию параметров для параметризованных запросов. {#ispravit-deserializatsiiu-parametrov-dlia-parametrizovannykh-zaprosov} Хотел исправить Василий Немков, Altinity, но есть маленькие затруднения, наверное переделает Алексей Миловидов. -### 22.9. Разобраться с десериализацией массивов со значениями по-умолчанию в Protobuf формате в случае protobuf 3. +### 22.9. Разобраться с десериализацией массивов со значениями по-умолчанию в Protobuf формате в случае protobuf 3. {#razobratsia-s-deserializatsiei-massivov-so-znacheniiami-po-umolchaniiu-v-protobuf-formate-v-sluchae-protobuf-3} [Виталий Баранов](https://github.com/vitlibar). Возможно, это - фундаментальная проблема и следует её только документировать. Кажется, отменяем, но пока ещё не всё ясно. -### 22.10. Исправление дрифта при отслеживании потребления памяти запросами. +### 22.10. Исправление дрифта при отслеживании потребления памяти запросами. {#ispravlenie-drifta-pri-otslezhivanii-potrebleniia-pamiati-zaprosami} Требует 6.3., но можно улучшить отдельными хаками. Нужно Метрике и БК. -### 22.11. + Более простая ser/de настроек запросов. +### 22.11. + Более простая ser/de настроек запросов. {#bolee-prostaia-serde-nastroek-zaprosov} И пропуск неизвестных настроек. Важно для Метрики для упрощения апгрейда без изменения конфига. [Виталий Баранов](https://github.com/vitlibar), готово. -### 22.12. + Исправление низкой производительности чтения из Kafka. +### 22.12. + Исправление низкой производительности чтения из Kafka. {#ispravlenie-nizkoi-proizvoditelnosti-chteniia-iz-kafka} Для ClickHouse нехарактерно наличие кода, обладающего столь низкой производительностью. Практики разработки не подразумевают, что такой код должен попасть в продакшен без надлежащего тестирования производительности. Изначально было назначено на [Ивана Лежанкина](https://github.com/abyss7), но по неизвестной причине было не сделано в течение нескольких месяцев. Сделал Михаил Филимонов, Altinity. -### 22.13. + Посмотреть, почему не работают некоторые collations. +### 22.13. + Посмотреть, почему не работают некоторые collations. {#posmotret-pochemu-ne-rabotaiut-nekotorye-collations} Изначально было назначено на [Ивана Лежанкина](https://github.com/abyss7), но в результате сделал Александр Сапин. -### 22.14. + Посмотреть, почему не работает StorageSet для MergeTree таблиц при некоторых условиях. +### 22.14. 
+ Посмотреть, почему не работает StorageSet для MergeTree таблиц при некоторых условиях. {#posmotret-pochemu-ne-rabotaet-storageset-dlia-mergetree-tablits-pri-nekotorykh-usloviiakh} Вроде бы сделал Никита Михайлов - проверить существующие issues на эту тему. - -### 22.15. Нормализация коммитов в Kafka и идемпотентности операций. +### 22.15. Нормализация коммитов в Kafka и идемпотентности операций. {#normalizatsiia-kommitov-v-kafka-i-idempotentnosti-operatsii} Altinity. -### 22.16. + Исправление низкой производительности кодека DoubleDelta. +### 22.16. + Исправление низкой производительности кодека DoubleDelta. {#ispravlenie-nizkoi-proizvoditelnosti-kodeka-doubledelta} Василий Немков, Altinity - в процессе. Можно считать, что сделано, хотя отсутствие SIMD оптимизаций для variable length кодеков - это ужасно. -### 22.17. Консистентно работающий POPULATE для MaterializedView. +### 22.17. Консистентно работающий POPULATE для MaterializedView. {#konsistentno-rabotaiushchii-populate-dlia-materializedview} -### 22.18. Исправление заметного падения производительности форматов после добавления доменов типов. +### 22.18. Исправление заметного падения производительности форматов после добавления доменов типов. {#ispravlenie-zametnogo-padeniia-proizvoditelnosti-formatov-posle-dobavleniia-domenov-tipov} Василий Немков, Altinity. -### 22.19. + Одновременное использование SAMPLE и PREWHERE. +### 22.19. + Одновременное использование SAMPLE и PREWHERE. {#odnovremennoe-ispolzovanie-sample-i-prewhere} Нужно для Метрики. [Николай Кочетов](https://github.com/KochetovNicolai), ноябрь 2019. -### 22.20. + Неправильная работа PREWHERE при некоторых условиях. +### 22.20. + Неправильная работа PREWHERE при некоторых условиях. {#nepravilnaia-rabota-prewhere-pri-nekotorykh-usloviiakh} [Николай Кочетов](https://github.com/KochetovNicolai), декабрь 2019. -### 22.21. Неправильное поведение DateTime в районе начала unix epoch. +### 22.21. Неправильное поведение DateTime в районе начала unix epoch. {#nepravilnoe-povedenie-datetime-v-raione-nachala-unix-epoch} Алексей Миловидов. -### 22.22. Nullable в функции transform и в CASE по множеству значений. +### 22.22. Nullable в функции transform и в CASE по множеству значений. {#nullable-v-funktsii-transform-i-v-case-po-mnozhestvu-znachenii} После 10.14. -[#7237](https://github.com/ClickHouse/ClickHouse/issues/7237) -[#2655](https://github.com/ClickHouse/ClickHouse/issues/2655) +[\#7237](https://github.com/ClickHouse/ClickHouse/issues/7237) +[\#2655](https://github.com/ClickHouse/ClickHouse/issues/2655) -### 22.23. Правильная обработка Nullable в функциях, которые кидают исключение на default значении: modulo, intDiv. +### 22.23. Правильная обработка Nullable в функциях, которые кидают исключение на default значении: modulo, intDiv. {#pravilnaia-obrabotka-nullable-v-funktsiiakh-kotorye-kidaiut-iskliuchenie-na-default-znachenii-modulo-intdiv} -### 22.24. Излишняя фильтрация ODBC connection string. +### 22.24. Излишняя фильтрация ODBC connection string. {#izlishniaia-filtratsiia-odbc-connection-string} Нужно для Метрики. Алексей Миловидов. -### 22.25. Избавиться от библиотеки btrie. +### 22.25. Избавиться от библиотеки btrie. {#izbavitsia-ot-biblioteki-btrie} Алексей Миловидов. Низкий приоритет. -### 22.26. Плохая производительность quantileTDigest. +### 22.26. Плохая производительность quantileTDigest. 
{#plokhaia-proizvoditelnost-quantiletdigest} -[#2668](https://github.com/ClickHouse/ClickHouse/issues/2668) +[\#2668](https://github.com/ClickHouse/ClickHouse/issues/2668) Алексей Миловидов или будет переназначено. -### 22.27. Проверить несколько PR, которые были закрыты zhang2014 и sundy-li. +### 22.27. Проверить несколько PR, которые были закрыты zhang2014 и sundy-li. {#proverit-neskolko-pr-kotorye-byli-zakryty-zhang2014-i-sundy-li} Алексей Миловидов. -### 22.28. Изучить и исправить поведение работы с Kafka при ребалансировке. +### 22.28. Изучить и исправить поведение работы с Kafka при ребалансировке. {#izuchit-i-ispravit-povedenie-raboty-s-kafka-pri-rebalansirovke} Altinity. -### 22.29. + Уязвимость DDL для словарей executable. +### 22.29. + Уязвимость DDL для словарей executable. {#uiazvimost-ddl-dlia-slovarei-executable} [Александр Сапин](https://github.com/alesapin) +## 23. Default Festival. {#default-festival} -## 23. Default Festival. - -### 23.1. + Включение minimalistic_part_header в ZooKeeper. +### 23.1. + Включение minimalistic\_part\_header в ZooKeeper. {#vkliuchenie-minimalistic-part-header-v-zookeeper} Сильно уменьшает объём данных в ZooKeeper. Уже год в продакшене в Яндекс.Метрике. Алексей Миловидов, ноябрь 2019. -### 23.2. Включение distributed_aggregation_memory_efficient. +### 23.2. Включение distributed\_aggregation\_memory\_efficient. {#vkliuchenie-distributed-aggregation-memory-efficient} Есть риски меньшей производительности лёгких запросов, хотя производительность тяжёлых запросов всегда увеличивается. -### 23.3. Включение min_bytes_to_external_sort и min_bytes_to_external_group_by. +### 23.3. Включение min\_bytes\_to\_external\_sort и min\_bytes\_to\_external\_group\_by. {#vkliuchenie-min-bytes-to-external-sort-i-min-bytes-to-external-group-by} Желательно 5.2. и 13.1. -### 23.4. Включение синхронной записи в Distributed таблицы по-умолчанию. +### 23.4. Включение синхронной записи в Distributed таблицы по-умолчанию. {#vkliuchenie-sinkhronnoi-zapisi-v-distributed-tablitsy-po-umolchaniiu} Есть гипотеза, что плохо работает на очень больших кластерах. -### 23.5. Включение compile_expressions. +### 23.5. Включение compile\_expressions. {#vkliuchenie-compile-expressions} Требует 7.2. Задачу изначально на 99% сделал Денис Скоробогатов, ВШЭ и Яндекс. Остальной процент доделывал Алексей Миловидов, а затем [Александр Сапин](https://github.com/alesapin). -### 23.6. Включение учёта порядка столбцов в CSV. +### 23.6. Включение учёта порядка столбцов в CSV. {#vkliuchenie-uchiota-poriadka-stolbtsov-v-csv} Просто аккуратно включить. -### 23.7. Включение NULL as Default в CSV. +### 23.7. Включение NULL as Default в CSV. {#vkliuchenie-null-as-default-v-csv} Просто аккуратно включить. -### 23.8. + Включение оптимизации VALUES. +### 23.8. + Включение оптимизации VALUES. {#vkliuchenie-optimizatsii-values} Просто аккуратно включить. -### 23.9. + Включение Processors. +### 23.9. + Включение Processors. {#vkliuchenie-processors} Q1. [Николай Кочетов](https://github.com/KochetovNicolai). -### 23.10. Включение mlock бинарника. +### 23.10. Включение mlock бинарника. {#vkliuchenie-mlock-binarnika} -Возможность mlock бинарника сделал Олег Алексеенков [#3553](https://github.com/ClickHouse/ClickHouse/pull/3553) -. Поможет, когда на серверах кроме ClickHouse работает много посторонних программ (мы иногда называем их в шутку "треш-программами"). +Возможность mlock бинарника сделал Олег Алексеенков [\#3553](https://github.com/ClickHouse/ClickHouse/pull/3553) +. 
Поможет, когда на серверах кроме ClickHouse работает много посторонних программ (мы иногда называем их в шутку «треш-программами»). -## 24. Экспериментальные задачи. +## 24. Экспериментальные задачи. {#eksperimentalnye-zadachi} -### 24.1. Веб-интерфейс для просмотра состояния кластера и профилирования запросов. +### 24.1. Веб-интерфейс для просмотра состояния кластера и профилирования запросов. {#veb-interfeis-dlia-prosmotra-sostoianiia-klastera-i-profilirovaniia-zaprosov} Антон Мамонов, УрФУ, Яндекс. @@ -1551,11 +1531,11 @@ Q1. [Николай Кочетов](https://github.com/KochetovNicolai). - просмотр метрик использования ресурсов, flame graph и pprof-граф для выбранных запросов; - отчёт по использованию кластера (пример: количество ядер CPU по пользователям за сегодня). -### 24.2. Экспериментальные алгоритмы сжатия. +### 24.2. Экспериментальные алгоритмы сжатия. {#eksperimentalnye-algoritmy-szhatiia} ClickHouse поддерживает LZ4 и ZSTD для сжатия данных. Эти алгоритмы являются парето-оптимальными по соотношению скорости и коэффициентам сжатия среди достаточно известных. Тем не менее, существуют менее известные алгоритмы сжатия, которые могут превзойти их по какому-либо критерию. Из потенциально более быстрых по сравнимом коэффициенте сжатия: Lizard, LZSSE, density. Из более сильных: bsc и csc. Необходимо изучить эти алгоритмы, добавить их поддержку в ClickHouse и исследовать их работу на тестовых датасетах. -### 24.3. Экспериментальные кодеки. +### 24.3. Экспериментальные кодеки. {#eksperimentalnye-kodeki} Вероника Фалчикова, Лада Торчик, ВШЭ. @@ -1563,34 +1543,34 @@ ClickHouse поддерживает LZ4 и ZSTD для сжатия данных Внедрить их в ClickHouse в виде кодеков и изучить их работу на тестовых датасетах. -### 24.4. Шифрование в ClickHouse на уровне VFS. +### 24.4. Шифрование в ClickHouse на уровне VFS. {#shifrovanie-v-clickhouse-na-urovne-vfs} Данные в ClickHouse хранятся без шифрования. При наличии доступа к дискам, злоумышленник может прочитать данные. Предлагается реализовать два подхода к шифрованию: -1. Шифрование на уровне VFS. +1. Шифрование на уровне VFS. -### 24.5. Поддержка функций шифрования для отдельных значений. +### 24.5. Поддержка функций шифрования для отдельных значений. {#podderzhka-funktsii-shifrovaniia-dlia-otdelnykh-znachenii} Смотрите также 24.5. -2. Шифрование отдельных значений. -Для этого требуется реализовать функции шифрования и расшифрования, доступные из SQL. Для шифрования реализовать возможность добавления нужного количества случайных бит для исключения одинаковых зашифрованных значений на одинаковых данных. Это позволит реализовать возможность "забывания" данных без удаления строк таблицы: можно шифровать данные разных клиентов разными ключами, и для того, чтобы забыть данные одного клиента, потребуется всего лишь удалить ключ. +1. Шифрование отдельных значений. + Для этого требуется реализовать функции шифрования и расшифрования, доступные из SQL. Для шифрования реализовать возможность добавления нужного количества случайных бит для исключения одинаковых зашифрованных значений на одинаковых данных. Это позволит реализовать возможность «забывания» данных без удаления строк таблицы: можно шифровать данные разных клиентов разными ключами, и для того, чтобы забыть данные одного клиента, потребуется всего лишь удалить ключ. -### 24.6. Userspace RAID. +### 24.6. Userspace RAID. {#userspace-raid} Глеб Новиков, ВШЭ. RAID позволяет одновременно увеличить надёжность хранения данных на дисках и увеличить скорость работы дискового массива. 
Обычно RAID настраивается с помощью встроенных возможностей ядра Linux (mdraid) или с помощью hardware контроллера. У этого есть следующие ограничения: -1. Иногда (в облачной инфраструктуре некоторых компаний) сервер предоставляется с отдельными дисками, подмонтированными в виде отдельных разделов (JBOD), без возможности создания RAID. +1. Иногда (в облачной инфраструктуре некоторых компаний) сервер предоставляется с отдельными дисками, подмонтированными в виде отдельных разделов (JBOD), без возможности создания RAID. -2. В ClickHouse для обеспечения избыточности обычно используется репликация между серверами. Но при восстановлении одного из дисков RAID не используются данные с реплик, а в случае отказа одного из дисков в RAID-0, приходится передавать с реплики все данные, а не только данные, соответствующие одному из дисков. Это происходит, потому что RAID не интегрирован в ClickHouse и "не знает" про его особенности. +2. В ClickHouse для обеспечения избыточности обычно используется репликация между серверами. Но при восстановлении одного из дисков RAID не используются данные с реплик, а в случае отказа одного из дисков в RAID-0, приходится передавать с реплики все данные, а не только данные, соответствующие одному из дисков. Это происходит, потому что RAID не интегрирован в ClickHouse и «не знает» про его особенности. -3. Отсутствуют продвинутые варианты обеспечения избыточности, как например, LRC. +3. Отсутствуют продвинутые варианты обеспечения избыточности, как например, LRC. Для преодоления этих ограничений, предлагается реализовать в ClickHouse встроенный алгоритм расположения данных на дисках. -### 24.7. Вероятностные структуры данных для фильтрации по подзапросам. +### 24.7. Вероятностные структуры данных для фильтрации по подзапросам. {#veroiatnostnye-struktury-dannykh-dlia-filtratsii-po-podzaprosam} Рузель Ибрагимов, ВШЭ и Яндекс. @@ -1600,39 +1580,39 @@ RAID позволяет одновременно увеличить надёжн Предлагается реализовать это в языке запросов ClickHouse с помощью специального синтаксиса, например `x IN BLOOM FILTER (n, m) (SELECT ...)`. -### 24.8. Специализация векторизованного кода для AVX/AVX2/AVX512 и ARM NEON. +### 24.8. Специализация векторизованного кода для AVX/AVX2/AVX512 и ARM NEON. {#spetsializatsiia-vektorizovannogo-koda-dlia-avxavx2avx512-i-arm-neon} -[#1017](https://github.com/ClickHouse/ClickHouse/issues/1017) +[\#1017](https://github.com/ClickHouse/ClickHouse/issues/1017) Дмитрий Ковальков, ВШЭ и Яндекс. -Подавляющее большинство кода ClickHouse написана для x86_64 с набором инструкций до SSE 4.2 включительно. Лишь отдельные редкие функции поддерживают AVX/AVX2/AVX512 с динамической диспетчеризацией. +Подавляющее большинство кода ClickHouse написана для x86\_64 с набором инструкций до SSE 4.2 включительно. Лишь отдельные редкие функции поддерживают AVX/AVX2/AVX512 с динамической диспетчеризацией. В первой части задачи, следует добавить в ClickHouse реализации некоторых примитивов, оптимизированные под более новый набор инструкций. Например, AVX2 реализацию генератора случайных чисел pcg: https://github.com/lemire/simdpcg Во второй части задачи, предлагается адаптировать существующие куски кода, использующие SSE intrinsics на AVX/AVX2 и сравнить производительность. Также рассматривается оптимизация под ARM NEON. -### 24.9. Общий подход к CPU dispatching в фабрике функций. +### 24.9. Общий подход к CPU dispatching в фабрике функций. {#obshchii-podkhod-k-cpu-dispatching-v-fabrike-funktsii} Дмитрий Ковальков, ВШЭ и Яндекс. Продолжение 24.8. -### 24.10. 
Поддержка типов half/bfloat16/unum. +### 24.10. Поддержка типов half/bfloat16/unum. {#podderzhka-tipov-halfbfloat16unum} -[#7657](https://github.com/ClickHouse/ClickHouse/issues/7657) +[\#7657](https://github.com/ClickHouse/ClickHouse/issues/7657) Рустам Гусейн-заде, ВШЭ. -### 24.11. User Defined Functions. +### 24.11. User Defined Functions. {#user-defined-functions} Игорь Минеев, ВШЭ. ClickHouse предоставляет достаточно богатый набор встроенных функций языка запросов, но не позволяет пользователю добавлять свои функции без редактировния исходников и перекомпиляции системы. Это мотивировано следующими потенциальными проблемами: -1. ClickHouse является array-oriented системой, и все функции внутри кода принимают для обработки целые массивы, а не отдельные значения. Это усложняет внутренний интерфейс и делает его менее удобным для пользователя. -2. Предоставление возможности подключения UDF в виде shared библиотек, потребовало бы фиксировать этот интерфейс или поддерживать обратную совместимость, тогда как мы бы хотели, при разработке ClickHouse, менять этот интерфейс по своему усмотрению без оглядки. -3. Сложность внутренних структур данных повышает вероятность ошибок типа buffer overflow и повреждения памяти, что сильно затруднит сопровождение ClickHouse с пользовательскими функциями. +1. ClickHouse является array-oriented системой, и все функции внутри кода принимают для обработки целые массивы, а не отдельные значения. Это усложняет внутренний интерфейс и делает его менее удобным для пользователя. +2. Предоставление возможности подключения UDF в виде shared библиотек, потребовало бы фиксировать этот интерфейс или поддерживать обратную совместимость, тогда как мы бы хотели, при разработке ClickHouse, менять этот интерфейс по своему усмотрению без оглядки. +3. Сложность внутренних структур данных повышает вероятность ошибок типа buffer overflow и повреждения памяти, что сильно затруднит сопровождение ClickHouse с пользовательскими функциями. Тем не менее, можно выбрать более аккуратный подход, избегающий непосредственной линковки с shared библиотеками. @@ -1644,7 +1624,7 @@ ClickHouse предоставляет достаточно богатый наб Также рассматривается возможность написания UDF на Rust, а также использование Web Assembly. Отдельно можно рассмотреть подключение NumPy и R и других технологий, которые предоставляют операции над целыми массивами. -### 24.12. GPU offloading. +### 24.12. GPU offloading. {#gpu-offloading} Риск состоит в том, что даже известные GPU базы, такие как OmniSci, работают медленнее, чем ClickHouse. Преимущество возможно только на полной сортировке и JOIN. @@ -1654,41 +1634,41 @@ ClickHouse предоставляет достаточно богатый наб Upd. В компании nVidia выложили прототип, теперь нужна интеграция в систему сборки. -### 24.13. Stream запросы. +### 24.13. Stream запросы. {#stream-zaprosy} Пререквизит для ClickHouse как CEP-системы. -### 24.14. Window функции. +### 24.14. Window функции. {#window-funktsii} -[#1469](https://github.com/ClickHouse/ClickHouse/issues/1469) +[\#1469](https://github.com/ClickHouse/ClickHouse/issues/1469) Требует 2.1. -### 24.15. Поддержка полуструктурированных данных. +### 24.15. Поддержка полуструктурированных данных. {#podderzhka-polustrukturirovannykh-dannykh} Требует 1.14 и 2.10. -### 24.16. Улучшение эвристики слияний. +### 24.16. Улучшение эвристики слияний. {#uluchshenie-evristiki-sliianii} В прошлом году исследование по этой задаче сделал Егор Соловьёв, ВШЭ и Яндекс.Такси. 
Его исследование показало, что алгоритм нельзя существенно улучшить путём изменения параметров. Но исследование лажовое, так как рассмотрен только уже использующийся алгоритм. То есть, задача остаётся открытой. -### 24.17. Экспериментальные способы ускорения параллельного GROUP BY. +### 24.17. Экспериментальные способы ускорения параллельного GROUP BY. {#eksperimentalnye-sposoby-uskoreniia-parallelnogo-group-by} Максим Серебряков -### 24.18. Не TCP протокол передачи файлов при репликации. +### 24.18. Не TCP протокол передачи файлов при репликации. {#ne-tcp-protokol-peredachi-failov-pri-replikatsii} -### 24.19. Промежуточное состояние GROUP BY как структура данных для key-value доступа. +### 24.19. Промежуточное состояние GROUP BY как структура данных для key-value доступа. {#promezhutochnoe-sostoianie-group-by-kak-struktura-dannykh-dlia-key-value-dostupa} -### 24.20. Short-circuit вычисления некоторых выражений. +### 24.20. Short-circuit вычисления некоторых выражений. {#short-circuit-vychisleniia-nekotorykh-vyrazhenii} Два года назад задачу попробовала сделать Анастасия Царькова, ВШЭ и Яндекс, но реализация получилась слишком неудобной и её удалили. -### 24.21. Реализация в ClickHouse протокола распределённого консенсуса. +### 24.21. Реализация в ClickHouse протокола распределённого консенсуса. {#realizatsiia-v-clickhouse-protokola-raspredelionnogo-konsensusa} Имеет смысл только после 19.2. -### 24.22. Вывод типов по блоку данных. Вывод формата данных по примеру. +### 24.22. Вывод типов по блоку данных. Вывод формата данных по примеру. {#vyvod-tipov-po-bloku-dannykh-vyvod-formata-dannykh-po-primeru} Задача отложена. @@ -1700,11 +1680,11 @@ ClickHouse также может использоваться для быстр Другая экспериментальная задача - реализация эвристик для обработки данных в неизвестном построчном текстовом формате. Детектирование CSV, TSV, JSON, детектирование разделителей и форматов значений. -### 24.23. Минимальная поддержка транзакций для множества вставок/чтений. +### 24.23. Минимальная поддержка транзакций для множества вставок/чтений. {#minimalnaia-podderzhka-tranzaktsii-dlia-mnozhestva-vstavokchtenii} Максим Кузнецов, ВШЭ. -Таблицы типа MergeTree состоят из набора независимых неизменяемых "кусков" данных. При вставках данных (INSERT), формируются новые куски. При модификациях данных (слияние кусков), формируются новые куски, а старые - становятся неактивными и перестают использоваться следующими запросами. Чтение данных (SELECT) производится из снэпшота множества кусков на некоторый момент времени. Таким образом, чтения и вставки не блокируют друг друга. +Таблицы типа MergeTree состоят из набора независимых неизменяемых «кусков» данных. При вставках данных (INSERT), формируются новые куски. При модификациях данных (слияние кусков), формируются новые куски, а старые - становятся неактивными и перестают использоваться следующими запросами. Чтение данных (SELECT) производится из снэпшота множества кусков на некоторый момент времени. Таким образом, чтения и вставки не блокируют друг друга. Если же выполняется несколько запросов SELECT, то чтение данных может осуществляться из снэпшотов по состоянию на несколько разных моментов времени и быть неконсистентным. Пример: пользователю отображается отчёт из нескольких графиков и таблиц, но из-за того, что между разными запросами, данные успели обновиться, отображаемые данные не соответствуют друг другу. 
@@ -1712,26 +1692,27 @@ ClickHouse также может использоваться для быстр Для решения этих проблем, предлагается ввести глобальные метки времени для кусков данных (сейчас уже есть инкрементальные номера кусков, но они выделяются в рамках одной таблицы). Первым шагом сделаем эти метки времени в рамках сервера. Вторым шагом сделаем метки времени в рамках всех серверов, но неточные на основе локальных часов. Третьим шагом сделаем метки времени, выдаваемые сервисом координации. -### 24.24. Реализация алгоритмов differential privacy. +### 24.24. Реализация алгоритмов differential privacy. {#realizatsiia-algoritmov-differential-privacy} -[#6874](https://github.com/ClickHouse/ClickHouse/issues/6874) +[\#6874](https://github.com/ClickHouse/ClickHouse/issues/6874) Артём Вишняков, ВШЭ. -### 24.25. Интеграция в ClickHouse функциональности обработки HTTP User Agent. -[#157](https://github.com/ClickHouse/ClickHouse/issues/157) +### 24.25. Интеграция в ClickHouse функциональности обработки HTTP User Agent. {#integratsiia-v-clickhouse-funktsionalnosti-obrabotki-http-user-agent} + +[\#157](https://github.com/ClickHouse/ClickHouse/issues/157) Есть хороший код в Яндекс.Метрике. Получено согласие от руководства. Михаил Филитов, ВШЭ. -### 24.26. Поддержка open tracing или аналогов. +### 24.26. Поддержка open tracing или аналогов. {#podderzhka-open-tracing-ili-analogov} -[#5182](https://github.com/ClickHouse/ClickHouse/issues/5182) +[\#5182](https://github.com/ClickHouse/ClickHouse/issues/5182) Александр Кожихов, ВШЭ и Яндекс.YT. -### 24.27. Реализация алгоритмов min-hash, sim-hash для нечёткого поиска полудубликатов. +### 24.27. Реализация алгоритмов min-hash, sim-hash для нечёткого поиска полудубликатов. {#realizatsiia-algoritmov-min-hash-sim-hash-dlia-nechiotkogo-poiska-poludublikatov} -[#7649](https://github.com/ClickHouse/ClickHouse/pull/7649) +[\#7649](https://github.com/ClickHouse/ClickHouse/pull/7649) ucasFL, ICT. @@ -1739,25 +1720,25 @@ ucasFL, ICT. Есть pull request, есть что доделывать. -### 24.28. Другой sketch для квантилей. +### 24.28. Другой sketch для квантилей. {#drugoi-sketch-dlia-kvantilei} Похоже на quantileTiming, но с логарифмическими корзинами. См. DDSketch. -### 24.29. Поддержка Arrow Flight. +### 24.29. Поддержка Arrow Flight. {#podderzhka-arrow-flight} -[#7554](https://github.com/ClickHouse/ClickHouse/issues/7554) +[\#7554](https://github.com/ClickHouse/ClickHouse/issues/7554) Жанна Зосимова, ВШЭ. -### 24.30. ClickHouse как графовая СУБД. +### 24.30. ClickHouse как графовая СУБД. {#clickhouse-kak-grafovaia-subd} Amos Bird, но его решение слишком громоздкое и пока не open-source. -### 24.31. Кореллированные подзапросы. +### 24.31. Кореллированные подзапросы. {#korellirovannye-podzaprosy} Перепиcывание в JOIN. Не раньше 21.11, 21.12, 21.9. Низкий приоритет. -### 24.32. Поддержка GRPC. +### 24.32. Поддержка GRPC. {#podderzhka-grpc} Мария Конькова, ВШЭ и Яндекс. Также смотрите 24.29. @@ -1769,71 +1750,70 @@ Amos Bird, но его решение слишком громоздкое и п Рассматривается вариант - поддержка GRPC в ClickHouse. Здесь есть неочевидные моменты, такие как - эффективная передача массивов данных в column-oriented формате - насколько удобно будет обернуть это в GRPC. +## 25. DevRel {#devrel} -## 25. DevRel - -### 25.1. + Перевод инструкции для начинающих разработчиков. +### 25.1. + Перевод инструкции для начинающих разработчиков. {#perevod-instruktsii-dlia-nachinaiushchikh-razrabotchikov} Александр Казаков, ноябрь 2019. -### 25.2. + Вычитка и выкладка статьи про обфускацию данных на английском. 
+### 25.2. + Вычитка и выкладка статьи про обфускацию данных на английском. {#vychitka-i-vykladka-stati-pro-obfuskatsiiu-dannykh-na-angliiskom} Эми, Александр Казаков, Алексей Миловидов, Q1. Готово к выкладке. -### 25.3. Подготовка статьи "Секреты оптимизации производительности ClickHouse". +### 25.3. Подготовка статьи «Секреты оптимизации производительности ClickHouse». {#podgotovka-stati-sekrety-optimizatsii-proizvoditelnosti-clickhouse} Алексей Миловидов, Леонид. -### 25.4. Подготовка статьи "Профайлер запросов: трудный путь". +### 25.4. Подготовка статьи «Профайлер запросов: трудный путь». {#podgotovka-stati-profailer-zaprosov-trudnyi-put} Алексей Миловидов, Леонид. -### 25.5. Подготовка статьи "Тестирование ClickHouse, которое мы заслужили". +### 25.5. Подготовка статьи «Тестирование ClickHouse, которое мы заслужили». {#podgotovka-stati-testirovanie-clickhouse-kotoroe-my-zasluzhili} -### 25.6. Перевод этих статей на английский. +### 25.6. Перевод этих статей на английский. {#perevod-etikh-statei-na-angliiskii} Требует 25.3, 25.4, 25.5. Эми -### 25.7. Перевод статьи Данилы Кутенина на английский. +### 25.7. Перевод статьи Данилы Кутенина на английский. {#perevod-stati-danily-kutenina-na-angliiskii} Эми -### 25.8. + Выступление keynote на BDTC. +### 25.8. + Выступление keynote на BDTC. {#vystuplenie-keynote-na-bdtc} Алексей Миловидов -### 25.9. Подготовка докладчиков: khvostikao, ilezhankin, nikitamikhailov, akuzm и другие. +### 25.9. Подготовка докладчиков: khvostikao, ilezhankin, nikitamikhailov, akuzm и другие. {#podgotovka-dokladchikov-khvostikao-ilezhankin-nikitamikhailov-akuzm-i-drugie} [Ольга Хвостикова](https://github.com/stavrolia), [Иван Лежанкин](https://github.com/abyss7), Никита Михайлов, [Александр Кузьменков](https://github.com/akuzm), Артём Зуйков. Уже готовые докладчики: Алексей Миловидов, [Николай Кочетов](https://github.com/KochetovNicolai), [Александр Сапин](https://github.com/alesapin). Получаем минимум 8 докладчиков в 2020 году. -### 25.10. Митапы в России и Беларуси: Москва x2 + митап для разработчиков или хакатон, Санкт-Петербург, Минск, Нижний Новгород, Екатеринбург, Новосибирск и/или Академгородок, Иннополис или Казань. +### 25.10. Митапы в России и Беларуси: Москва x2 + митап для разработчиков или хакатон, Санкт-Петербург, Минск, Нижний Новгород, Екатеринбург, Новосибирск и/или Академгородок, Иннополис или Казань. {#mitapy-v-rossii-i-belarusi-moskva-x2-mitap-dlia-razrabotchikov-ili-khakaton-sankt-peterburg-minsk-nizhnii-novgorod-ekaterinburg-novosibirsk-iili-akademgorodok-innopolis-ili-kazan} Екатерина - организация -### 25.11. Митапы зарубежные: восток США (Нью Йорк, возможно Raleigh), возможно северо-запад (Сиэтл), Китай (Пекин снова, возможно митап для разработчиков или хакатон), Лондон. +### 25.11. Митапы зарубежные: восток США (Нью Йорк, возможно Raleigh), возможно северо-запад (Сиэтл), Китай (Пекин снова, возможно митап для разработчиков или хакатон), Лондон. {#mitapy-zarubezhnye-vostok-ssha-niu-iork-vozmozhno-raleigh-vozmozhno-severo-zapad-sietl-kitai-pekin-snova-vozmozhno-mitap-dlia-razrabotchikov-ili-khakaton-london} [Иван Блинков](https://github.com/blinkov/) - организация. Две штуки в США запланированы. Upd. Два митапа в США и один в Европе проведены. -### 25.12. Статья "научная" - про устройство хранения данных и индексов или whitepaper по архитектуре. Есть вариант подать на VLDB. +### 25.12. Статья «научная» - про устройство хранения данных и индексов или whitepaper по архитектуре. Есть вариант подать на VLDB. 
{#statia-nauchnaia-pro-ustroistvo-khraneniia-dannykh-i-indeksov-ili-whitepaper-po-arkhitekture-est-variant-podat-na-vldb} Низкий приоритет. Алексей Миловидов. -### 25.13. Участие во всех мероприятиях Яндекса, которые связаны с разработкой бэкенда, C++ разработкой или с базами данных, возможно участие в DevRel мероприятиях. +### 25.13. Участие во всех мероприятиях Яндекса, которые связаны с разработкой бэкенда, C++ разработкой или с базами данных, возможно участие в DevRel мероприятиях. {#uchastie-vo-vsekh-meropriiatiiakh-iandeksa-kotorye-sviazany-s-razrabotkoi-bekenda-c-razrabotkoi-ili-s-bazami-dannykh-vozmozhno-uchastie-v-devrel-meropriiatiiakh} Алексей Миловидов и все подготовленные докладчики -### 25.14. Конференции в России: все HighLoad, возможно CodeFest, DUMP или UWDC, возможно C++ Russia. +### 25.14. Конференции в России: все HighLoad, возможно CodeFest, DUMP или UWDC, возможно C++ Russia. {#konferentsii-v-rossii-vse-highload-vozmozhno-codefest-dump-ili-uwdc-vozmozhno-c-russia} Алексей Миловидов и все подготовленные докладчики -### 25.15. Конференции зарубежные: Percona, DataOps, попытка попасть на более крупные. +### 25.15. Конференции зарубежные: Percona, DataOps, попытка попасть на более крупные. {#konferentsii-zarubezhnye-percona-dataops-popytka-popast-na-bolee-krupnye} Алексей Миловидов и все подготовленные докладчики -### 25.16. Сайт play.clickhouse. +### 25.16. Сайт play.clickhouse. {#sait-play-clickhouse} Цель состоит в реализации сайта, на котором можно попробовать задавать произвольные запросы к временному экземпляру ClickHouse и изучать его поведение. Из похожих проектов можно отметить: [Compiler Explorer](https://godbolt.org/), http://ideone.com/, [SQLFiddle](http://sqlfiddle.com/), [DB-Fiddle](https://www.db-fiddle.com/). @@ -1848,44 +1828,44 @@ Amos Bird, но его решение слишком громоздкое и п Есть минимальный прототип. Сделал Илья Яцишин. Этот прототип работает только с одной версией ClickHouse и не позволяет делиться ссылками на результаты запросов. -### 25.17. Взаимодействие с ВУЗами: ВШЭ, УрФУ, ICT Beijing. +### 25.17. Взаимодействие с ВУЗами: ВШЭ, УрФУ, ICT Beijing. {#vzaimodeistvie-s-vuzami-vshe-urfu-ict-beijing} Алексей Миловидов и вся группа разработки -### 25.18. Лекция в ШАД. +### 25.18. Лекция в ШАД. {#lektsiia-v-shad} Алексей Миловидов -### 25.19. Участие в курсе разработки на C++ в ШАД. +### 25.19. Участие в курсе разработки на C++ в ШАД. {#uchastie-v-kurse-razrabotki-na-c-v-shad} -### 25.20. Ещё одно сравнение производительности аналитических СУБД. +### 25.20. Ещё одно сравнение производительности аналитических СУБД. {#eshchio-odno-sravnenie-proizvoditelnosti-analiticheskikh-subd} Матвей Бубнов, УрФУ Существуют мало известные специализированные СУБД, способные конкурировать с ClickHouse по скорости обработки некоторых классов запросов. Пример: `TDEngine` и `DolphinDB`, `VictoriaMetrics`, а также `Apache Doris` и `LocustDB`. Предлагается изучить и классифицировать архитектурные особенности этих систем - их особенности и преимущества. Установить эти системы, загрузить тестовые данные, изучить производительность. Проанализировать, за счёт чего достигаются преимущества. -### 25.21. Повторное награждение контрибьюторов в Китае. +### 25.21. Повторное награждение контрибьюторов в Китае. {#povtornoe-nagrazhdenie-kontribiutorov-v-kitae} -### 25.22. On-site помощь с ClickHouse компаниям в дни рядом с мероприятиями. +### 25.22. On-site помощь с ClickHouse компаниям в дни рядом с мероприятиями. 
{#on-site-pomoshch-s-clickhouse-kompaniiam-v-dni-riadom-s-meropriiatiiami} [Иван Блинков](https://github.com/blinkov/) - организация -### 25.23. Новый мерч для ClickHouse. +### 25.23. Новый мерч для ClickHouse. {#novyi-merch-dlia-clickhouse} -### 25.24. Конкурсы bughunter или оптимизации кода на C++. +### 25.24. Конкурсы bughunter или оптимизации кода на C++. {#konkursy-bughunter-ili-optimizatsii-koda-na-c} Проведение конкурсов должно начинаться для сотрудников Яндекса, пока нет согласования. -### 25.25. Семинары для потенциальных клиентов Яндекс.Облака. +### 25.25. Семинары для потенциальных клиентов Яндекс.Облака. {#seminary-dlia-potentsialnykh-klientov-iandeks-oblaka} По мере необходимости. Алексей Миловидов, организация - Яндекс.Облако. -### 25.26. - Участие в GSoC. +### 25.26. - Участие в GSoC. {#uchastie-v-gsoc} Андрей Бородин пытается уговорить нас участвовать, но пока загружены задачей 25.17. UPD: не участвуем. -### 25.27. + Обновить сайт ClickHouse. +### 25.27. + Обновить сайт ClickHouse. {#obnovit-sait-clickhouse} Иван Блинков. Нет рисков. Нужно для Яндекс.Облака. Upd. Сделано. diff --git a/docs/ru/faq/general.md b/docs/ru/faq/general.md index 60e243d3ce3..e5a5e0c00fa 100644 --- a/docs/ru/faq/general.md +++ b/docs/ru/faq/general.md @@ -1,6 +1,6 @@ -# Общие вопросы +# Общие вопросы {#obshchie-voprosy} -## Почему бы не использовать системы типа MapReduce? +## Почему бы не использовать системы типа MapReduce? {#pochemu-by-ne-ispolzovat-sistemy-tipa-mapreduce} Системами типа MapReduce будем называть системы распределённых вычислений, в которых операция reduce сделана на основе распределённой сортировки. Наиболее распространённым opensource решением данного класса является [Apache Hadoop](http://hadoop.apache.org). Яндекс использует собственное решение — YT. @@ -13,23 +13,23 @@ ## Что делать, если у меня проблема с кодировками при использовании Oracle через ODBC? {#oracle-odbc-encodings} -Если вы используете Oracle через драйвер ODBC в качестве источника внешних словарей, необходимо задать правильное значение для переменной окружения `NLS_LANG` в `/etc/default/clickhouse`. Подробнее читайте в [Oracle NLS_LANG FAQ](https://www.oracle.com/technetwork/products/globalization/nls-lang-099431.html). +Если вы используете Oracle через драйвер ODBC в качестве источника внешних словарей, необходимо задать правильное значение для переменной окружения `NLS_LANG` в `/etc/default/clickhouse`. Подробнее читайте в [Oracle NLS\_LANG FAQ](https://www.oracle.com/technetwork/products/globalization/nls-lang-099431.html). **Пример** -```sql +``` sql NLS_LANG=RUSSIAN_RUSSIA.UTF8 ``` ## Как экспортировать данные из ClickHouse в файл? {#how-to-export-to-file} -### Секция INTO OUTFILE +### Секция INTO OUTFILE {#sektsiia-into-outfile} Добавьте секцию [INTO OUTFILE](../query_language/select/#into-outfile-clause) к своему запросу. Например: -```sql +``` sql SELECT * FROM table INTO OUTFILE 'file' ``` @@ -37,21 +37,20 @@ SELECT * FROM table INTO OUTFILE 'file' Например: -```sql +``` sql SELECT * FROM table INTO OUTFILE 'file' FORMAT CSV ``` -### Таблица с движком File +### Таблица с движком File {#tablitsa-s-dvizhkom-file} Смотрите [File](../operations/table_engines/file.md). -### Перенаправление в командой строке +### Перенаправление в командой строке {#perenapravlenie-v-komandoi-stroke} -```sql +``` sql $ clickhouse-client --query "SELECT * from table" --format FormatName > result.txt ``` Смотрите [clickhouse-client](../interfaces/cli.md). 
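Для варианта с движком File возможная схема выглядит примерно так (имена таблицы и столбцов здесь условные; файл появится в каталоге данных этой таблицы на сервере):

``` sql
-- Примерный набросок экспорта через движок File: имена условные.
-- Всё, что вставляется в такую таблицу, сервер записывает в файл data.CSV
-- внутри каталога данных таблицы.
CREATE TABLE csv_export (x UInt32, s String) ENGINE = File(CSV);

INSERT INTO csv_export
SELECT number AS x, toString(number) AS s
FROM system.numbers
LIMIT 10;
```

Такой способ удобен, когда файл нужен на самом сервере, а не на машине клиента.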
- -[Оригинальная статья ](https://clickhouse.tech/docs/en/faq/general/) +[Оригинальная статья](https://clickhouse.tech/docs/en/faq/general/) diff --git a/docs/ru/getting_started/example_datasets/amplab_benchmark.md b/docs/ru/getting_started/example_datasets/amplab_benchmark.md index 213ce406e05..2516dfefba7 100644 --- a/docs/ru/getting_started/example_datasets/amplab_benchmark.md +++ b/docs/ru/getting_started/example_datasets/amplab_benchmark.md @@ -1,13 +1,13 @@ -# AMPLab Big Data Benchmark +# AMPLab Big Data Benchmark {#amplab-big-data-benchmark} -См. +См. https://amplab.cs.berkeley.edu/benchmark/ -Зарегистрируйте бесплатную учетную запись на - понадобится кредитная карта, email и номер телефона -Получите новый ключ доступа на +Зарегистрируйте бесплатную учетную запись на https://aws.amazon.com - понадобится кредитная карта, email и номер телефона +Получите новый ключ доступа на https://console.aws.amazon.com/iam/home?nc2=h\_m\_sc\#security\_credential Выполните следующее в консоли: -```bash +``` bash $ sudo apt-get install s3cmd $ mkdir tiny; cd tiny; $ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ . @@ -86,7 +86,7 @@ CREATE TABLE uservisits_5nodes_on_single Возвращаемся в консоль: -```bash +``` bash $ for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done $ for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done $ for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done diff --git a/docs/ru/getting_started/example_datasets/criteo.md b/docs/ru/getting_started/example_datasets/criteo.md index 30858d80e64..4abb8c43f35 100644 --- a/docs/ru/getting_started/example_datasets/criteo.md +++ b/docs/ru/getting_started/example_datasets/criteo.md @@ -1,22 +1,22 @@ -# Терабайт логов кликов от Criteo +# Терабайт логов кликов от Criteo {#terabait-logov-klikov-ot-criteo} -Скачайте данные с +Скачайте данные с http://labs.criteo.com/downloads/download-terabyte-click-logs/ Создайте таблицу для импорта лога: -```sql +``` sql CREATE TABLE criteo_log (date Date, clicked UInt8, int1 Int32, int2 Int32, int3 Int32, int4 Int32, int5 Int32, int6 Int32, int7 Int32, int8 Int32, int9 Int32, int10 Int32, int11 Int32, int12 Int32, int13 Int32, cat1 String, cat2 String, cat3 String, cat4 String, cat5 String, cat6 String, cat7 String, cat8 String, cat9 String, cat10 String, cat11 String, cat12 String, cat13 String, cat14 String, cat15 String, cat16 String, cat17 String, cat18 String, cat19 String, cat20 String, cat21 String, cat22 String, cat23 String, cat24 String, cat25 String, cat26 String) ENGINE = Log ``` Загрузите данные: -```bash +``` bash $ for i in {00..23}; do echo $i; zcat datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done ``` Создайте таблицу для сконвертированных данных: -```sql +``` sql CREATE TABLE criteo ( date Date, @@ -65,7 +65,7 @@ CREATE TABLE criteo Преобразуем данные из сырого лога и положим во вторую таблицу: -```sql +``` sql INSERT INTO criteo SELECT date, clicked, int1, int2, int3, int4, int5, int6, int7, int8, int9, int10, int11, int12, int13, reinterpretAsUInt32(unhex(cat1)) AS icat1, 
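-- Категориальные признаки в исходном логе заданы шестнадцатеричными строками:
-- unhex() переводит такую строку в байты, а reinterpretAsUInt32() читает эти байты как целое число,
-- поэтому в итоговой таблице получаются компактные столбцы UInt32 (ниже то же самое для cat2..cat26).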
reinterpretAsUInt32(unhex(cat2)) AS icat2, reinterpretAsUInt32(unhex(cat3)) AS icat3, reinterpretAsUInt32(unhex(cat4)) AS icat4, reinterpretAsUInt32(unhex(cat5)) AS icat5, reinterpretAsUInt32(unhex(cat6)) AS icat6, reinterpretAsUInt32(unhex(cat7)) AS icat7, reinterpretAsUInt32(unhex(cat8)) AS icat8, reinterpretAsUInt32(unhex(cat9)) AS icat9, reinterpretAsUInt32(unhex(cat10)) AS icat10, reinterpretAsUInt32(unhex(cat11)) AS icat11, reinterpretAsUInt32(unhex(cat12)) AS icat12, reinterpretAsUInt32(unhex(cat13)) AS icat13, reinterpretAsUInt32(unhex(cat14)) AS icat14, reinterpretAsUInt32(unhex(cat15)) AS icat15, reinterpretAsUInt32(unhex(cat16)) AS icat16, reinterpretAsUInt32(unhex(cat17)) AS icat17, reinterpretAsUInt32(unhex(cat18)) AS icat18, reinterpretAsUInt32(unhex(cat19)) AS icat19, reinterpretAsUInt32(unhex(cat20)) AS icat20, reinterpretAsUInt32(unhex(cat21)) AS icat21, reinterpretAsUInt32(unhex(cat22)) AS icat22, reinterpretAsUInt32(unhex(cat23)) AS icat23, reinterpretAsUInt32(unhex(cat24)) AS icat24, reinterpretAsUInt32(unhex(cat25)) AS icat25, reinterpretAsUInt32(unhex(cat26)) AS icat26 FROM criteo_log; DROP TABLE criteo_log; diff --git a/docs/ru/getting_started/example_datasets/metrica.md b/docs/ru/getting_started/example_datasets/metrica.md index 4bb3dc9e4c6..7980d2936a2 100644 --- a/docs/ru/getting_started/example_datasets/metrica.md +++ b/docs/ru/getting_started/example_datasets/metrica.md @@ -1,10 +1,12 @@ -# Анонимизированные данные Яндекс.Метрики +# Анонимизированные данные Яндекс.Метрики {#anonimizirovannye-dannye-iandeks-metriki} Датасет состоит из двух таблиц, содержащих анонимизированные данные о хитах (`hits_v1`) и визитах (`visits_v1`) Яндекс.Метрики. Каждую из таблиц можно скачать в виде сжатого `.tsv.xz`-файла или в виде уже готовых партиций. Также можно скачать расширенную версию таблицы `hits`, содержащую 100 миллионов строк в виде [архива c файлами TSV](https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_100m_obfuscated_v1.tsv.xz) и в виде [готовых партиций](https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_100m_obfuscated_v1.tar.xz). 
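После загрузки таблиц любым из описанных ниже способов полезно выполнить пробный запрос; примерный вариант для `hits_v1` (предполагается база `datasets`, как в командах ниже):

``` sql
-- Примерная проверка загруженных данных: десять счётчиков с наибольшим числом хитов.
SELECT CounterID, count() AS hits
FROM datasets.hits_v1
GROUP BY CounterID
ORDER BY hits DESC
LIMIT 10
```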
-## Получение таблиц из партиций +## Получение таблиц из партиций {#poluchenie-tablits-iz-partitsii} + **Скачивание и импортирование партиций hits:** -```bash + +``` bash $ curl -O https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_v1.tar $ tar xvf hits_v1.tar -C /var/lib/clickhouse # путь к папке с данными ClickHouse $ # убедитесь, что установлены корректные права доступа на файлы @@ -13,7 +15,8 @@ $ clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1" ``` **Скачивание и импортирование партиций visits:** -```bash + +``` bash $ curl -O https://clickhouse-datasets.s3.yandex.net/visits/partitions/visits_v1.tar $ tar xvf visits_v1.tar -C /var/lib/clickhouse # путь к папке с данными ClickHouse $ # убедитесь, что установлены корректные права доступа на файлы @@ -21,10 +24,12 @@ $ sudo service clickhouse-server restart $ clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" ``` -## Получение таблиц из сжатых tsv-файлов +## Получение таблиц из сжатых tsv-файлов {#poluchenie-tablits-iz-szhatykh-tsv-failov} + **Скачивание и импортирование hits из сжатого tsv-файла** -```bash -$ curl https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv + +``` bash +$ curl https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv $ # теперь создадим таблицу $ clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" $ clickhouse-client --query "CREATE TABLE datasets.hits_v1 ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, 
FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192" @@ -36,8 +41,9 @@ $ clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1" ``` **Скачивание и импортирование visits из сжатого tsv-файла** -```bash -$ curl https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv + +``` bash +$ curl https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv $ # теперь создадим таблицу $ clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" $ clickhouse-client --query "CREATE TABLE datasets.visits_v1 ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), Goals Nested(ID UInt32, Serial UInt32, EventTime DateTime, Price Int64, OrderID String, CurrencyID UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, 
ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, TraficSource Nested(ID Int8, SearchEngineID UInt16, AdvEngineID UInt8, PlaceID UInt16, SocialSourceNetworkID UInt8, Domain String, SearchPhrase String, SocialSourcePage String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), Market Nested(Type UInt8, GoalID UInt32, OrderID String, OrderPrice Int64, PP UInt32, DirectPlaceID UInt32, DirectOrderID UInt32, DirectBannerID UInt32, GoodID String, GoodName String, GoodQuantity Int32, GoodPrice Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign)" @@ -48,5 +54,6 @@ $ clickhouse-client --query "OPTIMIZE TABLE datasets.visits_v1 FINAL" $ clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" ``` -## Запросы +## Запросы {#zaprosy} + Примеры запросов к этим таблицам (они называются `test.hits` и `test.visits`) можно найти среди [stateful тестов](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/tests/queries/1_stateful) и в некоторых [performance тестах](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/tests/performance) ClickHouse. diff --git a/docs/ru/getting_started/example_datasets/nyc_taxi.md b/docs/ru/getting_started/example_datasets/nyc_taxi.md index da675887a54..1873695f679 100644 --- a/docs/ru/getting_started/example_datasets/nyc_taxi.md +++ b/docs/ru/getting_started/example_datasets/nyc_taxi.md @@ -1,20 +1,20 @@ -# Данные о такси в Нью-Йорке +# Данные о такси в Нью-Йорке {#dannye-o-taksi-v-niu-iorke} Этот датасет может быть получен двумя способами: -- импорт из сырых данных; -- скачивание готовых партиций. +- импорт из сырых данных; +- скачивание готовых партиций. -## Как импортировать сырые данные +## Как импортировать сырые данные {#kak-importirovat-syrye-dannye} -См. и для описания набора данных и инструкций по загрузке. +См. https://github.com/toddwschneider/nyc-taxi-data и http://tech.marksblogg.com/billion-nyc-taxi-rides-redshift.html для описания набора данных и инструкций по загрузке. После скачивания получится порядка 227 Гб несжатых данных в CSV файлах. Скачивание занимает порядка часа на 1 Гбит соединении (параллельное скачивание с s3.amazonaws.com утилизирует как минимум половину гигабитного канала). 
Некоторые файлы могут скачаться не полностью. Проверьте размеры файлов и скачайте повторно подозрительные. Некоторые файлы могут содержать некорректные строки. Их можно скорректировать следующим образом: -```bash +``` bash sed -E '/(.*,){18,}/d' data/yellow_tripdata_2010-02.csv > data/yellow_tripdata_2010-02.csv_ sed -E '/(.*,){18,}/d' data/yellow_tripdata_2010-03.csv > data/yellow_tripdata_2010-03.csv_ mv data/yellow_tripdata_2010-02.csv_ data/yellow_tripdata_2010-02.csv @@ -29,7 +29,7 @@ mv data/yellow_tripdata_2010-03.csv_ data/yellow_tripdata_2010-03.csv Проверить количество загруженных строк можно следующим образом: -```bash +``` bash $ time psql nyc-taxi-data -c "SELECT count(*) FROM trips;" ## Count 1298979494 @@ -44,7 +44,7 @@ real 7m9.164s Экспорт данных из PostgreSQL: -```sql +``` sql COPY ( SELECT trips.id, @@ -119,7 +119,7 @@ COPY Создание временной таблицы в ClickHouse: -```sql +``` sql CREATE TABLE trips ( trip_id UInt32, @@ -176,9 +176,9 @@ dropoff_puma Nullable(String) ) ENGINE = Log; ``` -Она нужна для преобразование полей к более правильным типам данных и, если возможно, чтобы избавиться от NULL'ов. +Она нужна для преобразование полей к более правильным типам данных и, если возможно, чтобы избавиться от NULL’ов. -```bash +``` bash $ time clickhouse-client --query="INSERT INTO trips FORMAT TabSeparated" < trips.tsv real 75m56.214s @@ -190,13 +190,13 @@ real 75m56.214s (Импорт данных напрямую из Postgres также возможен с использованием `COPY ... TO PROGRAM`.) -К сожалению, все поля, связанные с погодой (precipitation...average_wind_speed) заполнены NULL. Из-за этого мы исключим их из финального набора данных. +К сожалению, все поля, связанные с погодой (precipitation…average\_wind\_speed) заполнены NULL. Из-за этого мы исключим их из финального набора данных. Для начала мы создадим таблицу на одном сервере. Позже мы сделаем таблицу распределенной. Создадим и заполним итоговую таблицу: -```sql +``` sql CREATE TABLE trips_mergetree ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) AS SELECT @@ -263,10 +263,11 @@ FROM trips Таблица заняла 126 Гб дискового пространства. -```sql +``` sql SELECT formatReadableSize(sum(bytes)) FROM system.parts WHERE table = 'trips_mergetree' AND active ``` -```text + +``` text ┌─formatReadableSize(sum(bytes))─┐ │ 126.18 GiB │ └────────────────────────────────┘ @@ -274,9 +275,9 @@ SELECT formatReadableSize(sum(bytes)) FROM system.parts WHERE table = 'trips_mer Между прочим, на MergeTree можно запустить запрос OPTIMIZE. Но это не обязательно, всё будет в порядке и без этого. -## Скачивание готовых партиций +## Скачивание готовых партиций {#skachivanie-gotovykh-partitsii} -```bash +``` bash $ curl -O https://clickhouse-datasets.s3.yandex.net/trips_mergetree/partitions/trips_mergetree.tar $ tar xvf trips_mergetree.tar -C /var/lib/clickhouse # путь к папке с данными ClickHouse $ # убедитесь, что установлены корректные права доступа на файлы @@ -284,11 +285,11 @@ $ sudo service clickhouse-server restart $ clickhouse-client --query "SELECT COUNT(*) FROM datasets.trips_mergetree" ``` -!!!info +!!! info "Info" Если вы собираетесь выполнять запросы, приведенные ниже, то к имени таблицы - нужно добавить имя базы, `datasets.trips_mergetree`. +нужно добавить имя базы, `datasets.trips_mergetree`. 
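Например, запрос Q1 из следующего раздела с учётом этого замечания выглядит так:

``` sql
-- Q1 с явным указанием базы datasets (см. замечание выше).
SELECT cab_type, count(*) FROM datasets.trips_mergetree GROUP BY cab_type
```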
-## Результаты на одном сервере +## Результаты на одном сервере {#rezultaty-na-odnom-servere} Q1: @@ -300,7 +301,7 @@ SELECT cab_type, count(*) FROM trips_mergetree GROUP BY cab_type Q2: -```sql +``` sql SELECT passenger_count, avg(total_amount) FROM trips_mergetree GROUP BY passenger_count ``` @@ -308,7 +309,7 @@ SELECT passenger_count, avg(total_amount) FROM trips_mergetree GROUP BY passenge Q3: -```sql +``` sql SELECT passenger_count, toYear(pickup_date) AS year, count(*) FROM trips_mergetree GROUP BY passenger_count, year ``` @@ -316,7 +317,7 @@ SELECT passenger_count, toYear(pickup_date) AS year, count(*) FROM trips_mergetr Q4: -```sql +``` sql SELECT passenger_count, toYear(pickup_date) AS year, round(trip_distance) AS distance, count(*) FROM trips_mergetree GROUP BY passenger_count, year, distance @@ -339,19 +340,19 @@ ORDER BY year, count(*) DESC На каждом сервере: -```sql +``` sql CREATE TABLE default.trips_mergetree_third ( trip_id UInt32, vendor_id Enum8('1' = 1, '2' = 2, 'CMT' = 3, 'VTS' = 4, 'DDS' = 5, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14), pickup_date Date, pickup_datetime DateTime, dropoff_date Date, dropoff_datetime DateTime, store_and_fwd_flag UInt8, rate_code_id UInt8, pickup_longitude Float64, pickup_latitude Float64, dropoff_longitude Float64, dropoff_latitude Float64, passenger_count UInt8, trip_distance Float64, fare_amount Float32, extra Float32, mta_tax Float32, tip_amount Float32, tolls_amount Float32, ehail_fee Float32, improvement_surcharge Float32, total_amount Float32, payment_type_ Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4), trip_type UInt8, pickup FixedString(25), dropoff FixedString(25), cab_type Enum8('yellow' = 1, 'green' = 2, 'uber' = 3), pickup_nyct2010_gid UInt8, pickup_ctlabel Float32, pickup_borocode UInt8, pickup_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), pickup_ct2010 FixedString(6), pickup_boroct2010 FixedString(7), pickup_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), pickup_ntacode FixedString(4), pickup_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New 
York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. 
Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), pickup_puma UInt16, dropoff_nyct2010_gid UInt8, dropoff_ctlabel Float32, dropoff_borocode UInt8, dropoff_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), dropoff_ct2010 FixedString(6), dropoff_boroct2010 FixedString(7), dropoff_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), dropoff_ntacode FixedString(4), dropoff_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), dropoff_puma UInt16) ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) ``` На исходном сервере: -```sql +``` sql CREATE TABLE trips_mergetree_x3 AS trips_mergetree_third ENGINE = Distributed(perftest, default, trips_mergetree_third, rand()) ``` Следующим запрос перераспределит данные: -```sql +``` sql INSERT INTO trips_mergetree_x3 SELECT * FROM trips_mergetree ``` @@ -376,12 +377,12 @@ Q4: 0.072 sec. В этом случае, время выполнения запросов определяется в первую очередь сетевыми задержками. Мы выполняли запросы с помощью клиента, расположенного в дата-центре Яндекса в Мянтсяля (Финляндия), на кластер в России, что добавляет порядка 20 мс задержки. -## Резюме +## Резюме {#reziume} -| серверов| Q1 | Q2 | Q3 | Q4 | -| ------- | ----- | ----- | ----- | ----- | -| 1 | 0.490 | 1.224 | 2.104 | 3.593 | -| 3 | 0.212 | 0.438 | 0.733 | 1.241 | -| 140 | 0.028 | 0.043 | 0.051 | 0.072 | +| серверов | Q1 | Q2 | Q3 | Q4 | +|----------|-------|-------|-------|-------| +| 1 | 0.490 | 1.224 | 2.104 | 3.593 | +| 3 | 0.212 | 0.438 | 0.733 | 1.241 | +| 140 | 0.028 | 0.043 | 0.051 | 0.072 | [Оригинальная статья](https://clickhouse.tech/docs/ru/getting_started/example_datasets/nyc_taxi/) diff --git a/docs/ru/getting_started/example_datasets/ontime.md b/docs/ru/getting_started/example_datasets/ontime.md index 2d7a8500bd3..7e7a6be2196 100644 --- a/docs/ru/getting_started/example_datasets/ontime.md +++ b/docs/ru/getting_started/example_datasets/ontime.md @@ -1,15 +1,15 @@ -# OnTime +# OnTime {#ontime} Этот датасет может быть получен двумя способами: -- импорт из сырых данных; -- скачивание готовых партиций. +- импорт из сырых данных; +- скачивание готовых партиций. -## Импорт из сырых данных +## Импорт из сырых данных {#import-iz-syrykh-dannykh} Скачивание данных (из `https://github.com/Percona-Lab/ontime-airline-performance/blob/master/download.sh`): -```bash +``` bash for s in `seq 1987 2018` do for m in `seq 1 12` @@ -21,7 +21,7 @@ done Создание таблицы: -```sql +``` sql CREATE TABLE `ontime` ( `Year` UInt16, `Quarter` UInt8, @@ -132,21 +132,21 @@ CREATE TABLE `ontime` ( `Div5LongestGTime` String, `Div5WheelsOff` String, `Div5TailNum` String -) ENGINE = MergeTree -PARTITION BY Year -ORDER BY (Carrier, FlightDate) +) ENGINE = MergeTree +PARTITION BY Year +ORDER BY (Carrier, FlightDate) SETTINGS index_granularity = 8192; ``` Загрузка данных: -```bash +``` bash $ for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done ``` -## Скачивание готовых партиций +## Скачивание готовых партиций {#skachivanie-gotovykh-partitsii} -```bash +``` bash $ curl -O https://clickhouse-datasets.s3.yandex.net/ontime/partitions/ontime.tar $ tar xvf ontime.tar -C /var/lib/clickhouse # путь к папке с данными ClickHouse $ # убедитесь, что установлены корректные права доступа на файлы @@ -154,15 +154,15 @@ $ sudo service clickhouse-server restart $ clickhouse-client --query "SELECT COUNT(*) FROM datasets.ontime" ``` -!!!info +!!! 
info "Info"
Если вы собираетесь выполнять запросы, приведенные ниже, то к имени таблицы
- нужно добавить имя базы, `datasets.ontime`.
+нужно добавить имя базы, `datasets.ontime`.

-## Запросы:
+## Запросы: {#zaprosy}

Q0.

-```sql
+``` sql
SELECT avg(c1)
FROM
(
@@ -174,7 +174,7 @@ FROM

Q1. Количество полетов в день с 2000 по 2008 года

-```sql
+``` sql
SELECT DayOfWeek, count(*) AS c
FROM ontime
WHERE Year>=2000 AND Year<=2008
@@ -184,7 +184,7 @@ ORDER BY c DESC;

Q2. Количество полетов, задержанных более чем на 10 минут, с группировкой по дням недели, за 2000-2008 года

-```sql
+``` sql
SELECT DayOfWeek, count(*) AS c
FROM ontime
WHERE DepDelay>10 AND Year>=2000 AND Year<=2008
@@ -194,7 +194,7 @@ ORDER BY c DESC;

Q3. Количество задержек по аэропортам за 2000-2008

-```sql
+``` sql
SELECT Origin, count(*) AS c
FROM ontime
WHERE DepDelay>10 AND Year>=2000 AND Year<=2008
@@ -205,7 +205,7 @@ LIMIT 10;

Q4. Количество задержек по перевозчикам за 2007 год

-```sql
+``` sql
SELECT Carrier, count(*)
FROM ontime
WHERE DepDelay>10 AND Year=2007
@@ -215,7 +215,7 @@ ORDER BY count(*) DESC;

Q5. Процент задержек по перевозчикам за 2007 год

-```sql
+``` sql
SELECT Carrier, c, c2, c*100/c2 as c3
FROM
(
@@ -241,7 +241,7 @@ ORDER BY c3 DESC;

Более оптимальная версия того же запроса:

-```sql
+``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime
WHERE Year=2007
@@ -251,7 +251,7 @@ ORDER BY c3 DESC

Q6. Предыдущий запрос за более широкий диапазон лет, 2000-2008

-```sql
+``` sql
SELECT Carrier, c, c2, c*100/c2 as c3
FROM
(
@@ -277,7 +277,7 @@ ORDER BY c3 DESC;

Более оптимальная версия того же запроса:

-```sql
+``` sql
SELECT Carrier, avg(DepDelay>10)*100 AS c3
FROM ontime
WHERE Year>=2000 AND Year<=2008
@@ -287,7 +287,7 @@ ORDER BY c3 DESC;

Q7. Процент полетов, задержанных более чем на 10 минут, в разбивке по годам

-```sql
+``` sql
SELECT Year, c1/c2
FROM
(
@@ -311,7 +311,7 @@ ORDER BY Year;

Более оптимальная версия того же запроса:

-```sql
+``` sql
SELECT Year, avg(DepDelay>10)*100
FROM ontime
GROUP BY Year
@@ -320,7 +320,7 @@ ORDER BY Year;

Q8. Самые популярные направления по количеству напрямую соединенных городов для различных диапазонов лет

-```sql
+``` sql
SELECT DestCityName, uniqExact(OriginCityName) AS u
FROM ontime
WHERE Year>=2000 and Year<=2010
@@ -331,7 +331,7 @@ LIMIT 10;

Q9.

-```sql
+``` sql
SELECT Year, count(*) AS c1
FROM ontime
GROUP BY Year;
@@ -339,7 +339,7 @@ GROUP BY Year;

Q10.
-```sql +``` sql SELECT min(Year), max(Year), Carrier, count(*) AS cnt, sum(ArrDelayMinutes>30) AS flights_delayed, @@ -357,7 +357,7 @@ LIMIT 1000; Бонус: -```sql +``` sql SELECT avg(cnt) FROM ( @@ -395,11 +395,11 @@ LIMIT 10; Данный тест производительности был создан Вадимом Ткаченко, статьи по теме: -- -- -- -- -- -- +- https://www.percona.com/blog/2009/10/02/analyzing-air-traffic-performance-with-infobright-and-monetdb/ +- https://www.percona.com/blog/2009/10/26/air-traffic-queries-in-luciddb/ +- https://www.percona.com/blog/2009/11/02/air-traffic-queries-in-infinidb-early-alpha/ +- https://www.percona.com/blog/2014/04/21/using-apache-hadoop-and-impala-together-with-mysql-for-data-analysis/ +- https://www.percona.com/blog/2016/01/07/apache-spark-with-air-ontime-performance-data/ +- http://nickmakos.blogspot.ru/2012/08/analyzing-air-traffic-performance-with.html [Оригинальная статья](https://clickhouse.tech/docs/ru/getting_started/example_datasets/ontime/) diff --git a/docs/ru/getting_started/example_datasets/star_schema.md b/docs/ru/getting_started/example_datasets/star_schema.md index 54346b7b79d..d938b1e8620 100644 --- a/docs/ru/getting_started/example_datasets/star_schema.md +++ b/docs/ru/getting_started/example_datasets/star_schema.md @@ -1,8 +1,8 @@ -# Star Schema Benchmark +# Star Schema Benchmark {#star-schema-benchmark} Компиляция dbgen: -```bash +``` bash $ git clone git@github.com:vadimtk/ssb-dbgen.git $ cd ssb-dbgen $ make @@ -11,11 +11,10 @@ $ make Генерация данных: !!! warning "Внимание" - -s 100 -- dbgen генерирует 600 миллионов строк (67 ГБ) - -s 1000 -- dbgen генерирует 6 миллиардов строк (занимает много времени) + -s 100 – dbgen генерирует 600 миллионов строк (67 ГБ) +-s 1000 – dbgen генерирует 6 миллиардов строк (занимает много времени) - -```bash +``` bash $ ./dbgen -s 1000 -T c $ ./dbgen -s 1000 -T l $ ./dbgen -s 1000 -T p @@ -25,7 +24,7 @@ $ ./dbgen -s 1000 -T d Создание таблиц в Кликхауз: -```sql +``` sql CREATE TABLE customer ( C_CUSTKEY UInt32, @@ -90,7 +89,7 @@ ENGINE = MergeTree ORDER BY S_SUPPKEY; Вставка данных: -```bash +``` bash $ clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl $ clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl $ clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl @@ -99,7 +98,7 @@ $ clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl Конвертация схемы-звезда в денормализованную плоскую схему: -```sql +``` sql SET max_memory_usage = 20000000000; CREATE TABLE lineorder_flat ENGINE = MergeTree @@ -148,32 +147,38 @@ FROM lineorder AS l INNER JOIN customer AS c ON c.C_CUSTKEY = l.LO_CUSTKEY INNER JOIN supplier AS s ON s.S_SUPPKEY = l.LO_SUPPKEY INNER JOIN part AS p ON p.P_PARTKEY = l.LO_PARTKEY; - ``` Running the queries: Q1.1 -```sql + +``` sql SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toYear(LO_ORDERDATE) = 1993 AND LO_DISCOUNT BETWEEN 1 AND 3 AND LO_QUANTITY < 25; ``` + Q1.2 -```sql + +``` sql SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toYYYYMM(LO_ORDERDATE) = 199401 AND LO_DISCOUNT BETWEEN 4 AND 6 AND LO_QUANTITY BETWEEN 26 AND 35; ``` + Q1.3 -```sql + +``` sql SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat -WHERE toISOWeek(LO_ORDERDATE) = 6 AND toYear(LO_ORDERDATE) = 1994 +WHERE toISOWeek(LO_ORDERDATE) = 6 AND toYear(LO_ORDERDATE) = 1994 AND LO_DISCOUNT BETWEEN 5 AND 7 AND LO_QUANTITY BETWEEN 26 AND 35; ``` + Q2.1 -```sql + +``` sql SELECT 
sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, @@ -187,8 +192,10 @@ ORDER BY year, P_BRAND; ``` + Q2.2 -```sql + +``` sql SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, @@ -202,8 +209,10 @@ ORDER BY year, P_BRAND; ``` + Q2.3 -```sql + +``` sql SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, @@ -217,8 +226,10 @@ ORDER BY year, P_BRAND; ``` + Q3.1 -```sql + +``` sql SELECT C_NATION, S_NATION, @@ -234,8 +245,10 @@ ORDER BY year ASC, revenue DESC; ``` + Q3.2 -```sql + +``` sql SELECT C_CITY, S_CITY, @@ -251,8 +264,10 @@ ORDER BY year ASC, revenue DESC; ``` + Q3.3 -```sql + +``` sql SELECT C_CITY, S_CITY, @@ -268,8 +283,10 @@ ORDER BY year ASC, revenue DESC; ``` + Q3.4 -```sql + +``` sql SELECT C_CITY, S_CITY, @@ -285,8 +302,10 @@ ORDER BY year ASC, revenue DESC; ``` + Q4.1 -```sql + +``` sql SELECT toYear(LO_ORDERDATE) AS year, C_NATION, @@ -300,8 +319,10 @@ ORDER BY year ASC, C_NATION ASC; ``` + Q4.2 -```sql + +``` sql SELECT toYear(LO_ORDERDATE) AS year, S_NATION, @@ -318,8 +339,10 @@ ORDER BY S_NATION ASC, P_CATEGORY ASC; ``` + Q4.3 -```sql + +``` sql SELECT toYear(LO_ORDERDATE) AS year, S_CITY, diff --git a/docs/ru/getting_started/example_datasets/wikistat.md b/docs/ru/getting_started/example_datasets/wikistat.md index dc236a403f1..0e3e269fe9f 100644 --- a/docs/ru/getting_started/example_datasets/wikistat.md +++ b/docs/ru/getting_started/example_datasets/wikistat.md @@ -1,10 +1,10 @@ -# WikiStat +# WikiStat {#wikistat} -См: +См: http://dumps.wikimedia.org/other/pagecounts-raw/ Создание таблицы: -```sql +``` sql CREATE TABLE wikistat ( date Date, @@ -19,7 +19,7 @@ CREATE TABLE wikistat Загрузка данных: -```bash +``` bash $ for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt $ cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done $ ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done diff --git a/docs/ru/getting_started/index.md b/docs/ru/getting_started/index.md index 33b3b6b3988..aa126f7b697 100644 --- a/docs/ru/getting_started/index.md +++ b/docs/ru/getting_started/index.md @@ -1,10 +1,10 @@ -# Начало работы +# Начало работы {#nachalo-raboty} Если вы новичок в ClickHouse и хотите вживую оценить его производительность, прежде всего нужно пройти через [процесс установки](install.md). 
После этого можно выбрать один из следующих вариантов: -* [Пройти подробное руководство для начинающих](tutorial.md) -* [Поэкспериментировать с тестовыми наборами данных](example_datasets/ontime.md) +- [Пройти подробное руководство для начинающих](tutorial.md) +- [Поэкспериментировать с тестовыми наборами данных](example_datasets/ontime.md) [Оригинальная статья](https://clickhouse.tech/docs/ru/getting_started/) diff --git a/docs/ru/getting_started/install.md b/docs/ru/getting_started/install.md index 9f37449fd34..6f48dd4fa55 100644 --- a/docs/ru/getting_started/install.md +++ b/docs/ru/getting_started/install.md @@ -1,6 +1,6 @@ -# Установка +# Установка {#ustanovka} -## Системные требования +## Системные требования {#sistemnye-trebovaniia} ClickHouse может работать на любой операционной системе Linux, FreeBSD или Mac OS X с архитектурой процессора x86\_64, AArch64 или PowerPC64LE. @@ -12,7 +12,7 @@ $ grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not Чтобы запустить ClickHouse на процессорах, которые не поддерживают SSE 4.2, либо имеют архитектуру AArch64 или PowerPC64LE, необходимо самостоятельно [собрать ClickHouse из исходного кода](#from-sources) с соответствующими настройками конфигурации. -## Доступные варианты установки +## Доступные варианты установки {#dostupnye-varianty-ustanovki} ### Из DEB пакетов {#install-from-deb-packages} @@ -20,29 +20,28 @@ $ grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not Чтобы установить официальные пакеты, пропишите репозиторий Яндекса в `/etc/apt/sources.list` или в отдельный файл `/etc/apt/sources.list.d/clickhouse.list`: -``` -deb http://repo.yandex.ru/clickhouse/deb/stable/ main/ -``` + deb http://repo.yandex.ru/clickhouse/deb/stable/ main/ Если вы хотите использовать наиболее свежую тестовую, замените `stable` на `testing` (не рекомендуется для production окружений). Затем для самой установки пакетов выполните: -```bash +``` bash sudo apt-get install dirmngr # optional sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 # optional sudo apt-get update sudo apt-get install clickhouse-client clickhouse-server ``` -Также эти пакеты можно скачать и установить вручную отсюда: . +Также эти пакеты можно скачать и установить вручную отсюда: https://repo.yandex.ru/clickhouse/deb/stable/main/. ### Из RPM пакетов {#from-rpm-packages} Команда ClickHouse в Яндексе рекомендует использовать официальные предкомпилированные `rpm` пакеты для CentOS, RedHad и всех остальных дистрибутивов Linux, основанных на rpm. Сначала нужно подключить официальный репозиторий: -```bash + +``` bash sudo yum install yum-utils sudo rpm --import https://repo.yandex.ru/clickhouse/CLICKHOUSE-KEY.GPG sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/x86_64 @@ -52,19 +51,20 @@ sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/ Для, собственно, установки пакетов необходимо выполнить следующие команды: -```bash +``` bash sudo yum install clickhouse-server clickhouse-client ``` -Также есть возможность установить пакеты вручную, скачав отсюда: . +Также есть возможность установить пакеты вручную, скачав отсюда: https://repo.yandex.ru/clickhouse/rpm/stable/x86\_64. ### Из tgz архивов {#from-tgz-archives} Команда ClickHouse в Яндексе рекомендует использовать предкомпилированные бинарники из `tgz` архивов для всех дистрибутивов, где невозможна установка `deb` и `rpm` пакетов. 
-Интересующую версию архивов можно скачать вручную с помощью `curl` или `wget` из репозитория . +Интересующую версию архивов можно скачать вручную с помощью `curl` или `wget` из репозитория https://repo.yandex.ru/clickhouse/tgz/. После этого архивы нужно распаковать и воспользоваться скриптами установки. Пример установки самой свежей версии: -```bash + +``` bash export LATEST_VERSION=`curl https://api.github.com/repos/ClickHouse/ClickHouse/tags 2>/dev/null | grep -Eo '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | head -n 1` curl -O https://repo.yandex.ru/clickhouse/tgz/clickhouse-common-static-$LATEST_VERSION.tgz curl -O https://repo.yandex.ru/clickhouse/tgz/clickhouse-common-static-dbg-$LATEST_VERSION.tgz @@ -97,21 +97,17 @@ sudo clickhouse-client-$LATEST_VERSION/install/doinst.sh Можно скомпилировать пакеты и установить их, либо использовать программы без установки пакетов. Также при ручой сборке можно отключить необходимость поддержки набора инструкций SSE 4.2 или собрать под процессоры архитектуры AArch64. -``` -Client: dbms/programs/clickhouse-client -Server: dbms/programs/clickhouse-server -``` + Client: dbms/programs/clickhouse-client + Server: dbms/programs/clickhouse-server Для работы собранного вручную сервера необходимо создать директории для данных и метаданных, а также сделать их `chown` для желаемого пользователя. Пути к этим директориям могут быть изменены в конфигурационном файле сервера (src/dbms/programs/server/config.xml), по умолчанию используются следующие: -``` -/opt/clickhouse/data/default/ -/opt/clickhouse/metadata/default/ -``` + /opt/clickhouse/data/default/ + /opt/clickhouse/metadata/default/ На Gentoo для установки ClickHouse из исходного кода можно использовать просто `emerge clickhouse`. -## Запуск +## Запуск {#zapusk} Для запуска сервера в качестве демона, выполните: diff --git a/docs/ru/guides/apply_catboost_model.md b/docs/ru/guides/apply_catboost_model.md index 69aa0faccb2..f6c9799a171 100644 --- a/docs/ru/guides/apply_catboost_model.md +++ b/docs/ru/guides/apply_catboost_model.md @@ -6,10 +6,10 @@ Чтобы применить модель CatBoost в ClickHouse: -1. [Создайте таблицу](#create-table). -2. [Вставьте данные в таблицу](#insert-data-to-table). -3. [Интегрируйте CatBoost в ClickHouse](#integrate-catboost-into-clickhouse) (Опциональный шаг). -4. [Запустите вывод модели из SQL](#run-model-inference). +1. [Создайте таблицу](#create-table). +2. [Вставьте данные в таблицу](#insert-data-to-table). +3. [Интегрируйте CatBoost в ClickHouse](#integrate-catboost-into-clickhouse) (Опциональный шаг). +4. [Запустите вывод модели из SQL](#run-model-inference). Подробнее об обучении моделей в CatBoost, см. [Обучение и применение моделей](https://catboost.ai/docs/features/training.html#training). 
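Итог (шаг 4) сводится к вызову функции `modelEvaluate` прямо в SELECT-запросе; подробный пример с таблицей `amazon_train` разобран ниже, а общий вид запроса примерно такой (имя модели и признаки здесь условные):

``` sql
-- Общий вид применения модели (шаг 4); имя 'my_model' и столбцы feature1..feature3 условные.
SELECT
    modelEvaluate('my_model', feature1, feature2, feature3) AS prediction
FROM some_table
LIMIT 10
```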
@@ -24,7 +24,7 @@ **1.** Скачайте [Docker-образ](https://hub.docker.com/r/yandex/tutorial-catboost-clickhouse) из реестра: -```bash +``` bash $ docker pull yandex/tutorial-catboost-clickhouse ``` @@ -32,7 +32,7 @@ $ docker pull yandex/tutorial-catboost-clickhouse **2.** Проверьте, что Docker-образ успешно скачался: -```bash +``` bash $ docker image ls REPOSITORY TAG IMAGE ID CREATED SIZE yandex/tutorial-catboost-clickhouse latest 622e4d17945b 22 hours ago 1.37GB @@ -40,7 +40,7 @@ yandex/tutorial-catboost-clickhouse latest 622e4d17945b 22 **3.** Запустите Docker-контейнер основанный на данном образе: -```bash +``` bash $ docker run -it -p 8888:8888 yandex/tutorial-catboost-clickhouse ``` @@ -50,7 +50,7 @@ $ docker run -it -p 8888:8888 yandex/tutorial-catboost-clickhouse **1.** Запустите клиент ClickHouse: -```bash +``` bash $ clickhouse client ``` @@ -59,19 +59,19 @@ $ clickhouse client **2.** Создайте таблицу в ClickHouse с помощью следующей команды: -```sql +``` sql :) CREATE TABLE amazon_train ( - date Date MATERIALIZED today(), - ACTION UInt8, - RESOURCE UInt32, - MGR_ID UInt32, - ROLE_ROLLUP_1 UInt32, - ROLE_ROLLUP_2 UInt32, - ROLE_DEPTNAME UInt32, - ROLE_TITLE UInt32, - ROLE_FAMILY_DESC UInt32, - ROLE_FAMILY UInt32, + date Date MATERIALIZED today(), + ACTION UInt8, + RESOURCE UInt32, + MGR_ID UInt32, + ROLE_ROLLUP_1 UInt32, + ROLE_ROLLUP_2 UInt32, + ROLE_DEPTNAME UInt32, + ROLE_TITLE UInt32, + ROLE_FAMILY_DESC UInt32, + ROLE_FAMILY UInt32, ROLE_CODE UInt32 ) ENGINE = MergeTree ORDER BY date @@ -79,7 +79,7 @@ ENGINE = MergeTree ORDER BY date **3.** Выйдите из клиента ClickHouse: -```sql +``` sql :) exit ``` @@ -89,19 +89,19 @@ ENGINE = MergeTree ORDER BY date **1.** Выполните следующую команду: -```bash +``` bash $ clickhouse client --host 127.0.0.1 --query 'INSERT INTO amazon_train FORMAT CSVWithNames' < ~/amazon/train.csv ``` **2.** Запустите клиент ClickHouse: -```bash +``` bash $ clickhouse client ``` **3.** Проверьте, что данные успешно загрузились: -```sql +``` sql :) SELECT count() FROM amazon_train SELECT count() @@ -114,8 +114,8 @@ FROM amazon_train ## 3. Интегрируйте CatBoost в ClickHouse {#integrate-catboost-into-clickhouse} -!!! note "Примечание" - **Опциональный шаг.** Docker-образ содержит все необходимое для запуска CatBoost и ClickHouse. +!!! note "Примечание" + **Опциональный шаг.** Docker-образ содержит все необходимое для запуска CatBoost и ClickHouse. Чтобы интегрировать CatBoost в ClickHouse: @@ -131,7 +131,7 @@ FROM amazon_train **5.** Опишите конфигурацию модели: -```xml +``` xml @@ -148,7 +148,7 @@ FROM amazon_train **6.** Добавьте в конфигурацию ClickHouse путь к CatBoost и конфигурации модели: -```xml +``` xml /home/catboost/data/libcatboostmodel.so /home/catboost/models/*_model.xml @@ -160,9 +160,9 @@ FROM amazon_train Проверьте, что модель работает: -```sql -:) SELECT - modelEvaluate('amazon', +``` sql +:) SELECT + modelEvaluate('amazon', RESOURCE, MGR_ID, ROLE_ROLLUP_1, @@ -171,20 +171,20 @@ FROM amazon_train ROLE_TITLE, ROLE_FAMILY_DESC, ROLE_FAMILY, - ROLE_CODE) > 0 AS prediction, + ROLE_CODE) > 0 AS prediction, ACTION AS target FROM amazon_train LIMIT 10 ``` -!!! note "Примечание" +!!! note "Примечание" Функция [modelEvaluate](../query_language/functions/other_functions.md#function-modelevaluate) возвращает кортежи (tuple) с исходными прогнозами по классам для моделей с несколькими классами. 
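A hedged, non-interactive variant of the check above, for cases where it is run from a shell script instead of the `:)` prompt; the query mirrors the guide, while the one-liner form itself is an assumption:

``` bash
# Same evaluation as above, executed through clickhouse-client in batch mode.
clickhouse-client --host 127.0.0.1 --query "
SELECT
    modelEvaluate('amazon',
        RESOURCE, MGR_ID, ROLE_ROLLUP_1, ROLE_ROLLUP_2, ROLE_DEPTNAME,
        ROLE_TITLE, ROLE_FAMILY_DESC, ROLE_FAMILY, ROLE_CODE) > 0 AS prediction,
    ACTION AS target
FROM amazon_train
LIMIT 10"
```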
Спрогнозируйте вероятность: -```sql -:) SELECT - modelEvaluate('amazon', +``` sql +:) SELECT + modelEvaluate('amazon', RESOURCE, MGR_ID, ROLE_ROLLUP_1, @@ -194,23 +194,23 @@ LIMIT 10 ROLE_FAMILY_DESC, ROLE_FAMILY, ROLE_CODE) AS prediction, - 1. / (1 + exp(-prediction)) AS probability, + 1. / (1 + exp(-prediction)) AS probability, ACTION AS target FROM amazon_train LIMIT 10 ``` -!!! note "Примечание" +!!! note "Примечание" Подробнее про функцию [exp()](../query_language/functions/math_functions.md). Посчитайте логистическую функцию потерь (LogLoss) на всей выборке: -```sql +``` sql :) SELECT -avg(tg * log(prob) + (1 - tg) * log(1 - prob)) AS logloss -FROM +FROM ( - SELECT - modelEvaluate('amazon', + SELECT + modelEvaluate('amazon', RESOURCE, MGR_ID, ROLE_ROLLUP_1, @@ -220,11 +220,11 @@ FROM ROLE_FAMILY_DESC, ROLE_FAMILY, ROLE_CODE) AS prediction, - 1. / (1. + exp(-prediction)) AS prob, + 1. / (1. + exp(-prediction)) AS prob, ACTION AS tg FROM amazon_train ) ``` -!!! note "Примечание" +!!! note "Примечание" Подробнее про функции [avg()](../query_language/agg_functions/reference.md#agg_function-avg), [log()](../query_language/functions/math_functions.md). diff --git a/docs/ru/guides/index.md b/docs/ru/guides/index.md index d8bad70482d..1ece4b22b85 100644 --- a/docs/ru/guides/index.md +++ b/docs/ru/guides/index.md @@ -1,5 +1,5 @@ -# Руководства +# Руководства {#rukovodstva} Подробные пошаговые инструкции, которые помогут вам решать различные задачи с помощью ClickHouse. -- [Применение модели CatBoost в ClickHouse](apply_catboost_model.md) \ No newline at end of file +- [Применение модели CatBoost в ClickHouse](apply_catboost_model.md) diff --git a/docs/ru/images/logo.svg b/docs/ru/images/logo.svg index 865b96d98c7..b5ab923ff65 100644 --- a/docs/ru/images/logo.svg +++ b/docs/ru/images/logo.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/ru/index.md b/docs/ru/index.md index c1c864d921c..c49c33adf6f 100644 --- a/docs/ru/index.md +++ b/docs/ru/index.md @@ -1,15 +1,15 @@ -# Что такое ClickHouse +# Что такое ClickHouse {#chto-takoe-clickhouse} ClickHouse - столбцовая система управления базами данных (СУБД) для онлайн обработки аналитических запросов (OLAP). -В обычной, "строковой" СУБД, данные хранятся в таком порядке: +В обычной, «строковой» СУБД, данные хранятся в таком порядке: | Строка | WatchID | JavaEnable | Title | GoodEvent | EventTime | -| ------ | ----------- | ---------- | ------------------ | --------- | ------------------- | -| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | -| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | -| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | -| #N | ... | ... | ... | ... | ... | +|--------|-------------|------------|--------------------|-----------|---------------------| +| \#0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | +| \#1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | +| \#2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | +| \#N | … | … | … | … | … | То есть, значения, относящиеся к одной строке, физически хранятся рядом. @@ -18,14 +18,13 @@ ClickHouse - столбцовая система управления базам В столбцовых СУБД, данные хранятся в таком порядке: -| Строка: | #0 | #1 | #2 | #N | -| ----------- | ------------------- | ------------------- | ------------------- | ------------------- | -| WatchID: | 89354350662 | 90329509958 | 89953706054 | ... | -| JavaEnable: | 1 | 0 | 1 | ... 
| -| Title: | Investor Relations | Contact us | Mission | ... | -| GoodEvent: | 1 | 1 | 1 | ... | -| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | ... | - +| Строка: | \#0 | \#1 | \#2 | \#N | +|-------------|---------------------|---------------------|---------------------|-----| +| WatchID: | 89354350662 | 90329509958 | 89953706054 | … | +| JavaEnable: | 1 | 0 | 1 | … | +| Title: | Investor Relations | Contact us | Mission | … | +| GoodEvent: | 1 | 1 | 1 | … | +| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | … | В примерах изображён только порядок расположения данных. То есть, значения из разных столбцов хранятся отдельно, а данные одного столбца - вместе. @@ -38,13 +37,13 @@ ClickHouse - столбцовая система управления базам Чем больше нагрузка на систему, тем более важной становится специализация под сценарий работы, и тем более конкретной становится эта специализация. Не существует системы, одинаково хорошо подходящей под существенно различные сценарии работы. Если система подходит под широкое множество сценариев работы, то при достаточно большой нагрузке, система будет справляться со всеми сценариями работы плохо, или справляться хорошо только с одним из сценариев работы. -## Ключевые особенности OLAP сценария работы +## Ключевые особенности OLAP сценария работы {#kliuchevye-osobennosti-olap-stsenariia-raboty} - подавляющее большинство запросов - на чтение; -- данные обновляются достаточно большими пачками (> 1000 строк), а не по одной строке, или не обновляются вообще; +- данные обновляются достаточно большими пачками (\> 1000 строк), а не по одной строке, или не обновляются вообще; - данные добавляются в БД, но не изменяются; - при чтении, вынимается достаточно большое количество строк из БД, но только небольшое подмножество столбцов; -- таблицы являются "широкими", то есть, содержат большое количество столбцов; +- таблицы являются «широкими», то есть, содержат большое количество столбцов; - запросы идут сравнительно редко (обычно не более сотни в секунду на сервер); - при выполнении простых запросов, допустимы задержки в районе 50 мс; - значения в столбцах достаточно мелкие - числа и небольшие строки (пример - 60 байт на URL); @@ -54,9 +53,9 @@ ClickHouse - столбцовая система управления базам - в запросе одна большая таблица, все таблицы кроме одной маленькие; - результат выполнения запроса существенно меньше исходных данных - то есть, данные фильтруются или агрегируются; результат выполнения помещается в оперативку на одном сервере. -Легко видеть, что OLAP сценарий работы существенно отличается от других распространённых сценариев работы (например, OLTP или Key-Value сценариев работы). Таким образом, не имеет никакого смысла пытаться использовать OLTP или Key-Value БД для обработки аналитических запросов, если вы хотите получить приличную производительность ("выше плинтуса"). Например, если вы попытаетесь использовать для аналитики MongoDB или Redis - вы получите анекдотически низкую производительность по сравнению с OLAP-СУБД. +Легко видеть, что OLAP сценарий работы существенно отличается от других распространённых сценариев работы (например, OLTP или Key-Value сценариев работы). Таким образом, не имеет никакого смысла пытаться использовать OLTP или Key-Value БД для обработки аналитических запросов, если вы хотите получить приличную производительность («выше плинтуса»). 
Например, если вы попытаетесь использовать для аналитики MongoDB или Redis - вы получите анекдотически низкую производительность по сравнению с OLAP-СУБД. -## Причины, по которым столбцовые СУБД лучше подходят для OLAP сценария +## Причины, по которым столбцовые СУБД лучше подходят для OLAP сценария {#prichiny-po-kotorym-stolbtsovye-subd-luchshe-podkhodiat-dlia-olap-stsenariia} Столбцовые СУБД лучше (от 100 раз по скорости обработки большинства запросов) подходят для OLAP сценария работы. Причины в деталях будут разъяснены ниже, а сам факт проще продемонстрировать визуально: @@ -70,25 +69,30 @@ ClickHouse - столбцовая система управления базам Видите разницу? -### По вводу-выводу +### По вводу-выводу {#po-vvodu-vyvodu} -1. Для выполнения аналитического запроса, требуется прочитать небольшое количество столбцов таблицы. В столбцовой БД для этого можно читать только нужные данные. Например, если вам требуется только 5 столбцов из 100, то следует рассчитывать на 20-кратное уменьшение ввода-вывода. -2. Так как данные читаются пачками, то их проще сжимать. Данные, лежащие по столбцам также лучше сжимаются. За счёт этого, дополнительно уменьшается объём ввода-вывода. -3. За счёт уменьшения ввода-вывода, больше данных влезает в системный кэш. +1. Для выполнения аналитического запроса, требуется прочитать небольшое количество столбцов таблицы. В столбцовой БД для этого можно читать только нужные данные. Например, если вам требуется только 5 столбцов из 100, то следует рассчитывать на 20-кратное уменьшение ввода-вывода. +2. Так как данные читаются пачками, то их проще сжимать. Данные, лежащие по столбцам также лучше сжимаются. За счёт этого, дополнительно уменьшается объём ввода-вывода. +3. За счёт уменьшения ввода-вывода, больше данных влезает в системный кэш. -Например, для запроса "посчитать количество записей для каждой рекламной системы", требуется прочитать один столбец "идентификатор рекламной системы", который занимает 1 байт в несжатом виде. Если большинство переходов было не с рекламных систем, то можно рассчитывать хотя бы на десятикратное сжатие этого столбца. При использовании быстрого алгоритма сжатия, возможно разжатие данных со скоростью более нескольких гигабайт несжатых данных в секунду. То есть, такой запрос может выполняться со скоростью около нескольких миллиардов строк в секунду на одном сервере. На практике, такая скорость действительно достигается. +Например, для запроса «посчитать количество записей для каждой рекламной системы», требуется прочитать один столбец «идентификатор рекламной системы», который занимает 1 байт в несжатом виде. Если большинство переходов было не с рекламных систем, то можно рассчитывать хотя бы на десятикратное сжатие этого столбца. При использовании быстрого алгоритма сжатия, возможно разжатие данных со скоростью более нескольких гигабайт несжатых данных в секунду. То есть, такой запрос может выполняться со скоростью около нескольких миллиардов строк в секунду на одном сервере. На практике, такая скорость действительно достигается. -
Пример -```bash +
+ +Пример + +``` bash $ clickhouse-client ClickHouse client version 0.0.52053. Connecting to localhost:9000. Connected to ClickHouse server version 0.0.52053. ``` -```sql + +``` sql SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 ``` -```text + +``` text ┌─CounterID─┬──count()─┐ │ 114208 │ 56057344 │ │ 115080 │ 51619590 │ @@ -112,20 +116,21 @@ SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIM │ 337234 │ 8205961 │ └───────────┴──────────┘ ``` +
-### По вычислениям +### По вычислениям {#po-vychisleniiam} Так как для выполнения запроса надо обработать достаточно большое количество строк, становится актуальным диспетчеризовывать все операции не для отдельных строк, а для целых векторов, или реализовать движок выполнения запроса так, чтобы издержки на диспетчеризацию были примерно нулевыми. Если этого не делать, то при любой не слишком плохой дисковой подсистеме, интерпретатор запроса неизбежно упрётся в CPU. Имеет смысл не только хранить данные по столбцам, но и обрабатывать их, по возможности, тоже по столбцам. Есть два способа это сделать: -1. Векторный движок. Все операции пишутся не для отдельных значений, а для векторов. То есть, вызывать операции надо достаточно редко, и издержки на диспетчеризацию становятся пренебрежимо маленькими. Код операции содержит в себе хорошо оптимизированный внутренний цикл. +1. Векторный движок. Все операции пишутся не для отдельных значений, а для векторов. То есть, вызывать операции надо достаточно редко, и издержки на диспетчеризацию становятся пренебрежимо маленькими. Код операции содержит в себе хорошо оптимизированный внутренний цикл. -2. Кодогенерация. Для запроса генерируется код, в котором подставлены все косвенные вызовы. +2. Кодогенерация. Для запроса генерируется код, в котором подставлены все косвенные вызовы. -В "обычных" БД этого не делается, так как не имеет смысла при выполнении простых запросов. Хотя есть исключения. Например, в MemSQL кодогенерация используется для уменьшения latency при выполнении SQL запросов. Для сравнения, в аналитических СУБД требуется оптимизация throughput, а не latency. +В «обычных» БД этого не делается, так как не имеет смысла при выполнении простых запросов. Хотя есть исключения. Например, в MemSQL кодогенерация используется для уменьшения latency при выполнении SQL запросов. Для сравнения, в аналитических СУБД требуется оптимизация throughput, а не latency. Стоит заметить, что для эффективности по CPU требуется, чтобы язык запросов был декларативным (SQL, MDX) или хотя бы векторным (J, K). То есть, чтобы запрос содержал циклы только в неявном виде, открывая возможности для оптимизации. diff --git a/docs/ru/interfaces/cli.md b/docs/ru/interfaces/cli.md index a0da96451ba..5d64f081dfb 100644 --- a/docs/ru/interfaces/cli.md +++ b/docs/ru/interfaces/cli.md @@ -1,10 +1,10 @@ -# Клиент командной строки +# Клиент командной строки {#klient-komandnoi-stroki} ClickHouse предоставляет собственный клиент командной строки: `clickhouse-client`. Клиент поддерживает запуск с аргументами командной строки и с конфигурационными файлами. Подробнее читайте в разделе [Конфигурирование](#interfaces_cli_configuration). Клиент [устанавливается](../getting_started/index.md) пакетом `clickhouse-client` и запускается командой `clickhouse-client`. -```bash +``` bash $ clickhouse-client ClickHouse client version 19.17.1.1579 (official build). Connecting to localhost:9000 as user default. @@ -15,11 +15,9 @@ Connected to ClickHouse server version 19.17.1 revision 54428. Клиенты и серверы различных версий совместимы, однако если клиент старее сервера, то некоторые новые функции могут быть недоступны. Мы рекомендуем использовать одинаковые версии клиента и сервера. При подключении клиента к более новому серверу `clickhouse-client` выводит сообщение: -``` -ClickHouse client version is older than ClickHouse server. It may lack support for new features. -``` + ClickHouse client version is older than ClickHouse server. It may lack support for new features. 
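A small, hedged sketch of checking the version match discussed above before relying on newer features; both commands are assumed to be available in these builds:

``` bash
# Compare the local client build with the server it talks to.
clickhouse-client --version
clickhouse-client --query "SELECT version()"
```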
-## Использование {#cli_usage} +## Использование {#cli-usage} Клиент может быть использован в интерактивном и не интерактивном (batch) режиме. Чтобы использовать batch режим, укажите параметр query, или отправьте данные в stdin (проверяется, что stdin - не терминал), или и то, и другое. @@ -27,7 +25,7 @@ ClickHouse client version is older than ClickHouse server. It may lack support f Примеры использования клиента для вставки данных: -```bash +``` bash $ echo -ne "1, 'some text', '2016-08-14 00:00:00'\n2, 'some more text', '2016-08-14 00:00:01'" | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; $ cat <<_EOF | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; @@ -40,7 +38,7 @@ $ cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FOR В batch режиме в качестве формата данных по умолчанию используется формат TabSeparated. Формат может быть указан в секции FORMAT запроса. -По умолчанию, в batch режиме вы можете выполнить только один запрос. Чтобы выполнить несколько запросов из "скрипта", используйте параметр --multiquery. Это работает для всех запросов кроме INSERT. Результаты запросов выводятся подряд без дополнительных разделителей. +По умолчанию, в batch режиме вы можете выполнить только один запрос. Чтобы выполнить несколько запросов из «скрипта», используйте параметр –multiquery. Это работает для всех запросов кроме INSERT. Результаты запросов выводятся подряд без дополнительных разделителей. Также, при необходимости выполнить много запросов, вы можете запускать clickhouse-client на каждый запрос. Заметим, что запуск программы clickhouse-client может занимать десятки миллисекунд. В интерактивном режиме, вы получите командную строку, в которую можно вводить запросы. @@ -60,24 +58,24 @@ $ cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FOR По умолчанию, в качестве формата, используется формат PrettyCompact (красивые таблички). Вы можете изменить формат с помощью секции FORMAT запроса, или с помощью указания `\G` на конце запроса, с помощью аргумента командной строки `--format` или `--vertical`, или с помощью конфигурационного файла клиента. -Чтобы выйти из клиента, нажмите Ctrl+D (или Ctrl+C), или наберите вместо запроса одно из: "exit", "quit", "logout", "учше", "йгше", "дщпщге", "exit;", "quit;", "logout;", "учшеж", "йгшеж", "дщпщгеж", "q", "й", "q", "Q", ":q", "й", "Й", "Жй" +Чтобы выйти из клиента, нажмите Ctrl+D (или Ctrl+C), или наберите вместо запроса одно из: «exit», «quit», «logout», «учше», «йгше», «дщпщге», «exit;», «quit;», «logout;», «учшеж», «йгшеж», «дщпщгеж», «q», «й», «q», «Q», «:q», «й», «Й», «Жй» При выполнении запроса, клиент показывает: -1. Прогресс выполнение запроса, который обновляется не чаще, чем 10 раз в секунду (по умолчанию). При быстрых запросах, прогресс может не успеть отобразиться. -2. Отформатированный запрос после его парсинга - для отладки. -3. Результат в заданном формате. -4. Количество строк результата, прошедшее время, а также среднюю скорость выполнения запроса. +1. Прогресс выполнение запроса, который обновляется не чаще, чем 10 раз в секунду (по умолчанию). При быстрых запросах, прогресс может не успеть отобразиться. +2. Отформатированный запрос после его парсинга - для отладки. +3. Результат в заданном формате. +4. Количество строк результата, прошедшее время, а также среднюю скорость выполнения запроса. Вы можете прервать длинный запрос, нажав Ctrl+C. При этом вам всё равно придётся чуть-чуть подождать, пока сервер остановит запрос. 
На некоторых стадиях выполнения, запрос невозможно прервать. Если вы не дождётесь и нажмёте Ctrl+C второй раз, то клиент будет завершён. -Клиент командной строки позволяет передать внешние данные (внешние временные таблицы) для использования запроса. Подробнее смотрите раздел "Внешние данные для обработки запроса" +Клиент командной строки позволяет передать внешние данные (внешние временные таблицы) для использования запроса. Подробнее смотрите раздел «Внешние данные для обработки запроса» ### Запросы с параметрами {#cli-queries-with-parameters} Вы можете создать запрос с параметрами и передавать в них значения из приложения. Это позволяет избежать форматирования запросов на стороне клиента, если известно, какие из параметров запроса динамически меняются. Например: -```bash +``` bash clickhouse-client --param_parName="[1, 2]" -q "SELECT * FROM table WHERE a = {parName:Array(UInt16)}" ``` @@ -85,61 +83,61 @@ clickhouse-client --param_parName="[1, 2]" -q "SELECT * FROM table WHERE a = {p Отформатируйте запрос обычным способом. Представьте значения, которые вы хотите передать из параметров приложения в запрос в следующем формате: -```sql +``` sql {:} ``` -- `name` — идентификатор подстановки. В консольном клиенте его следует использовать как часть имени параметра `--param_ = value`. -- `data type` — [тип данных](../data_types/index.md) значения. Например, структура данных `(integer, ('string', integer))` может иметь тип данных `Tuple(UInt8, Tuple(String, UInt8))` ([целочисленный](../data_types/int_uint.md) тип может быть и другим). +- `name` — идентификатор подстановки. В консольном клиенте его следует использовать как часть имени параметра `--param_ = value`. +- `data type` — [тип данных](../data_types/index.md) значения. Например, структура данных `(integer, ('string', integer))` может иметь тип данных `Tuple(UInt8, Tuple(String, UInt8))` ([целочисленный](../data_types/int_uint.md) тип может быть и другим). -#### Пример +#### Пример {#primer} -```bash +``` bash $ clickhouse-client --param_tuple_in_tuple="(10, ('dt', 10))" -q "SELECT * FROM table WHERE val = {tuple_in_tuple:Tuple(UInt8, Tuple(String, UInt8))}" ``` -## Конфигурирование {#interfaces_cli_configuration} +## Конфигурирование {#interfaces-cli-configuration} В `clickhouse-client` можно передавать различные параметры (все параметры имеют значения по умолчанию) с помощью: -- Командной строки. +- Командной строки. - Параметры командной строки переопределяют значения по умолчанию и параметры конфигурационных файлов. + Параметры командной строки переопределяют значения по умолчанию и параметры конфигурационных файлов. -- Конфигурационных файлов. +- Конфигурационных файлов. - Параметры в конфигурационных файлах переопределяют значения по умолчанию. + Параметры в конфигурационных файлах переопределяют значения по умолчанию. -### Параметры командной строки +### Параметры командной строки {#parametry-komandnoi-stroki} -- `--host, -h` — имя сервера, по умолчанию — localhost. Вы можете использовать как имя, так и IPv4 или IPv6 адрес. -- `--port` — порт, к которому соединяться, по умолчанию — 9000. Замечу, что для HTTP и родного интерфейса используются разные порты. -- `--user, -u` — имя пользователя, по умолчанию — default. -- `--password` — пароль, по умолчанию — пустая строка. -- `--query, -q` — запрос для выполнения, при использовании в неинтерактивном режиме. -- `--database, -d` — выбрать текущую БД, по умолчанию — текущая БД из настроек сервера (по умолчанию — БД default). 
-- `--multiline, -m` — если указано — разрешить многострочные запросы, не отправлять запрос по нажатию Enter. -- `--multiquery, -n` — если указано — разрешить выполнять несколько запросов, разделённых точкой с запятой. -- `--format, -f` — использовать указанный формат по умолчанию для вывода результата. -- `--vertical, -E` — если указано, использовать формат Vertical по умолчанию для вывода результата. То же самое, что --format=Vertical. В этом формате каждое значение выводится на отдельной строке, что удобно для отображения широких таблиц. -- `--time, -t` — если указано, в неинтерактивном режиме вывести время выполнения запроса в stderr. -- `--stacktrace` — если указано, в случае исключения, выводить также его стек трейс. -- `--config-file` — имя конфигурационного файла. -- `--secure` — если указано, будет использован безопасный канал. -- `--param_` — значение параметра для [запроса с параметрами](#cli-queries-with-parameters). +- `--host, -h` — имя сервера, по умолчанию — localhost. Вы можете использовать как имя, так и IPv4 или IPv6 адрес. +- `--port` — порт, к которому соединяться, по умолчанию — 9000. Замечу, что для HTTP и родного интерфейса используются разные порты. +- `--user, -u` — имя пользователя, по умолчанию — default. +- `--password` — пароль, по умолчанию — пустая строка. +- `--query, -q` — запрос для выполнения, при использовании в неинтерактивном режиме. +- `--database, -d` — выбрать текущую БД, по умолчанию — текущая БД из настроек сервера (по умолчанию — БД default). +- `--multiline, -m` — если указано — разрешить многострочные запросы, не отправлять запрос по нажатию Enter. +- `--multiquery, -n` — если указано — разрешить выполнять несколько запросов, разделённых точкой с запятой. +- `--format, -f` — использовать указанный формат по умолчанию для вывода результата. +- `--vertical, -E` — если указано, использовать формат Vertical по умолчанию для вывода результата. То же самое, что –format=Vertical. В этом формате каждое значение выводится на отдельной строке, что удобно для отображения широких таблиц. +- `--time, -t` — если указано, в неинтерактивном режиме вывести время выполнения запроса в stderr. +- `--stacktrace` — если указано, в случае исключения, выводить также его стек трейс. +- `--config-file` — имя конфигурационного файла. +- `--secure` — если указано, будет использован безопасный канал. +- `--param_` — значение параметра для [запроса с параметрами](#cli-queries-with-parameters). -### Конфигурационные файлы +### Конфигурационные файлы {#konfiguratsionnye-faily} `clickhouse—client` использует первый существующий файл из: -- Определенного параметром `--config-file`. -- `./clickhouse-client.xml` -- `~/.clickhouse-client/config.xml` -- `/etc/clickhouse-client/config.xml` +- Определенного параметром `--config-file`. +- `./clickhouse-client.xml` +- `~/.clickhouse-client/config.xml` +- `/etc/clickhouse-client/config.xml` Пример конфигурационного файла: -```xml +``` xml username password diff --git a/docs/ru/interfaces/cpp.md b/docs/ru/interfaces/cpp.md index f910aa0eab2..7136af3197b 100644 --- a/docs/ru/interfaces/cpp.md +++ b/docs/ru/interfaces/cpp.md @@ -1,4 +1,4 @@ -# C++ клиентская библиотека +# C++ клиентская библиотека {#c-klientskaia-biblioteka} См. README в репозитории [clickhouse-cpp](https://github.com/ClickHouse/clickhouse-cpp). 
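To tie the command-line options and configuration files described above together, a hedged usage sketch; the host, parameter name and config file path are illustrative only:

``` bash
# Flags override values taken from the configuration file.
clickhouse-client --config-file=./clickhouse-client.xml \
    --host 127.0.0.1 --format Vertical \
    --param_num="42" \
    --query "SELECT {num:UInt32} AS num, version() AS server"
```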
diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index db95bb5036f..8d5a38a3a63 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -4,37 +4,37 @@ ClickHouse может принимать (`INSERT`) и отдавать (`SELECT Поддерживаемые форматы и возможность использовать их в запросах `INSERT` и `SELECT` перечислены в таблице ниже. -| Формат | INSERT | SELECT | -| ------- | -------- | -------- | -| [TabSeparated](#tabseparated) | ✔ | ✔ | -| [TabSeparatedRaw](#tabseparatedraw) | ✗ | ✔ | -| [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ | -| [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ | -| [Template](#format-template) | ✔ | ✔ | -| [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | -| [CSV](#csv) | ✔ | ✔ | -| [CSVWithNames](#csvwithnames) | ✔ | ✔ | -| [CustomSeparated](#format-customseparated) | ✔ | ✔ | -| [Values](#data-format-values) | ✔ | ✔ | -| [Vertical](#vertical) | ✗ | ✔ | -| [JSON](#json) | ✗ | ✔ | -| [JSONCompact](#jsoncompact) | ✗ | ✔ | -| [JSONEachRow](#jsoneachrow) | ✔ | ✔ | -| [TSKV](#tskv) | ✔ | ✔ | -| [Pretty](#pretty) | ✗ | ✔ | -| [PrettyCompact](#prettycompact) | ✗ | ✔ | -| [PrettyCompactMonoBlock](#prettycompactmonoblock) | ✗ | ✔ | -| [PrettyNoEscapes](#prettynoescapes) | ✗ | ✔ | -| [PrettySpace](#prettyspace) | ✗ | ✔ | -| [Protobuf](#protobuf) | ✔ | ✔ | -| [Parquet](#data-format-parquet) | ✔ | ✔ | -| [ORC](#data-format-orc) | ✔ | ✗ | -| [RowBinary](#rowbinary) | ✔ | ✔ | -| [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | -| [Native](#native) | ✔ | ✔ | -| [Null](#null) | ✗ | ✔ | -| [XML](#xml) | ✗ | ✔ | -| [CapnProto](#capnproto) | ✔ | ✗ | +| Формат | INSERT | SELECT | +|-----------------------------------------------------------------|--------|--------| +| [TabSeparated](#tabseparated) | ✔ | ✔ | +| [TabSeparatedRaw](#tabseparatedraw) | ✗ | ✔ | +| [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ | +| [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ | +| [Template](#format-template) | ✔ | ✔ | +| [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | +| [CSV](#csv) | ✔ | ✔ | +| [CSVWithNames](#csvwithnames) | ✔ | ✔ | +| [CustomSeparated](#format-customseparated) | ✔ | ✔ | +| [Values](#data-format-values) | ✔ | ✔ | +| [Vertical](#vertical) | ✗ | ✔ | +| [JSON](#json) | ✗ | ✔ | +| [JSONCompact](#jsoncompact) | ✗ | ✔ | +| [JSONEachRow](#jsoneachrow) | ✔ | ✔ | +| [TSKV](#tskv) | ✔ | ✔ | +| [Pretty](#pretty) | ✗ | ✔ | +| [PrettyCompact](#prettycompact) | ✗ | ✔ | +| [PrettyCompactMonoBlock](#prettycompactmonoblock) | ✗ | ✔ | +| [PrettyNoEscapes](#prettynoescapes) | ✗ | ✔ | +| [PrettySpace](#prettyspace) | ✗ | ✔ | +| [Protobuf](#protobuf) | ✔ | ✔ | +| [Parquet](#data-format-parquet) | ✔ | ✔ | +| [ORC](#data-format-orc) | ✔ | ✗ | +| [RowBinary](#rowbinary) | ✔ | ✔ | +| [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | +| [Native](#native) | ✔ | ✔ | +| [Null](#null) | ✗ | ✔ | +| [XML](#xml) | ✗ | ✔ | +| [CapnProto](#capnproto) | ✔ | ✗ | Вы можете регулировать некоторые параметры работы с форматами с помощью настроек ClickHouse. За дополнительной информацией обращайтесь к разделу [Настройки](../operations/settings/settings.md). @@ -48,11 +48,11 @@ ClickHouse может принимать (`INSERT`) и отдавать (`SELECT Формат `TabSeparated` поддерживает вывод тотальных значений (при использовании WITH TOTALS) и экстремальных значений (при настройке extremes выставленной в 1). 
В этих случаях, после основных данных выводятся тотальные значения, и экстремальные значения. Основной результат, тотальные значения и экстремальные значения, отделяются друг от друга пустой строкой. Пример: -```sql +``` sql SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT TabSeparated`` ``` -```text +``` text 2014-03-17 1406958 2014-03-18 1383658 2014-03-19 1405797 @@ -67,9 +67,9 @@ SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORD 2014-03-23 1406958 ``` -### Форматирование данных +### Форматирование данных {#formatirovanie-dannykh} -Целые числа пишутся в десятичной форме. Числа могут содержать лишний символ "+" в начале (игнорируется при парсинге, а при форматировании не пишется). Неотрицательные числа не могут содержать знак отрицания. При чтении допустим парсинг пустой строки, как числа ноль, или (для знаковых типов) строки, состоящей из одного минуса, как числа ноль. Числа, не помещающиеся в соответствующий тип данных, могут парсится, как некоторое другое число, без сообщения об ошибке. +Целые числа пишутся в десятичной форме. Числа могут содержать лишний символ «+» в начале (игнорируется при парсинге, а при форматировании не пишется). Неотрицательные числа не могут содержать знак отрицания. При чтении допустим парсинг пустой строки, как числа ноль, или (для знаковых типов) строки, состоящей из одного минуса, как числа ноль. Числа, не помещающиеся в соответствующий тип данных, могут парсится, как некоторое другое число, без сообщения об ошибке. Числа с плавающей запятой пишутся в десятичной форме. При этом, десятичный разделитель - точка. Поддерживается экспоненциальная запись, а также inf, +inf, -inf, nan. Запись числа с плавающей запятой может начинаться или заканчиваться на десятичную точку. При форматировании возможна потеря точности чисел с плавающей запятой. @@ -84,7 +84,7 @@ SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORD Строки выводятся с экранированием спецсимволов с помощью обратного слеша. При выводе, используются следующие escape-последовательности: `\b`, `\f`, `\r`, `\n`, `\t`, `\0`, `\'`, `\\`. Парсер также поддерживает последовательности `\a`, `\v`, и `\xHH` (последовательности hex escape) и любые последовательности вида `\c`, где `c` — любой символ (такие последовательности преобразуются в `c`). Таким образом, при чтении поддерживаются форматы, где перевод строки может быть записан как `\n` и как `\` и перевод строки. Например, строка `Hello world`, где между словами вместо пробела стоит перевод строки, может быть считана в любом из следующих вариантов: -```text +``` text Hello\nworld Hello\ @@ -105,27 +105,29 @@ world Например: -```sql +``` sql CREATE TABLE nestedt ( - `id` UInt8, + `id` UInt8, `aux` Nested( - a UInt8, + a UInt8, b String ) ) ENGINE = TinyLog ``` -```sql + +``` sql INSERT INTO nestedt Values ( 1, [1], ['a']) ``` -```sql + +``` sql SELECT * FROM nestedt FORMAT TSV ``` -```text -1 [1] ['a'] -``` +``` text +1 [1] ['a'] +``` ## TabSeparatedRaw {#tabseparatedraw} @@ -157,74 +159,77 @@ SELECT * FROM nestedt FORMAT TSV Настройка `format_template_row` задаёт путь к файлу, содержащему форматную строку для строк таблицы, которая должна иметь вид: - `delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`, +`delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... 
delimiter_N`, - где `delimiter_i` - разделители между значениями (символ `$` в разделителе экранируется как `$$`), - `column_i` - имена или номера столбцов, значения которых должны быть выведены или считаны (если имя не указано - столбец пропускается), - `serializeAs_i` - тип экранирования для значений соответствующего столбца. Поддерживаются следующие типы экранирования: + где `delimiter_i` - разделители между значениями (символ `$` в разделителе экранируется как `$$`), + `column_i` - имена или номера столбцов, значения которых должны быть выведены или считаны (если имя не указано - столбец пропускается), + `serializeAs_i` - тип экранирования для значений соответствующего столбца. Поддерживаются следующие типы экранирования: - - `CSV`, `JSON`, `XML` (как в одноимённых форматах) - - `Escaped` (как в `TSV`) - - `Quoted` (как в `Values`) - - `Raw` (без экранирования, как в `TSVRaw`) - - `None` (тип экранирования отсутствует, см. далее) + - `CSV`, `JSON`, `XML` (как в одноимённых форматах) + - `Escaped` (как в `TSV`) + - `Quoted` (как в `Values`) + - `Raw` (без экранирования, как в `TSVRaw`) + - `None` (тип экранирования отсутствует, см. далее) - Если для столбца не указан тип экранирования, используется `None`. `XML` и `Raw` поддерживаются только для вывода. + Если для столбца не указан тип экранирования, используется `None`. `XML` и `Raw` поддерживаются только для вывода. - Так, в форматной строке + Так, в форматной строке - `Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}, ad price: $$${price:JSON};` + `Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}, ad price: $$${price:JSON};` - между разделителями `Search phrase: `, `, count: `, `, ad price: $` и `;` при выводе будут подставлены (при вводе - будут ожидаться) значения столбцов `SearchPhrase`, `c` и `price`, сериализованные как `Quoted`, `Escaped` и `JSON` соответственно, например: + между разделителями `Search phrase: `, `, count: `, `, ad price: $` и `;` при выводе будут подставлены (при вводе - будут ожидаться) значения столбцов `SearchPhrase`, `c` и `price`, сериализованные как `Quoted`, `Escaped` и `JSON` соответственно, например: - `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` + `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` - Настройка `format_template_rows_between_delimiter` задаёт разделитель между строками, который выводится (или ожмдается при вводе) после каждой строки, кроме последней. По умолчанию `\n`. +Настройка `format_template_rows_between_delimiter` задаёт разделитель между строками, который выводится (или ожмдается при вводе) после каждой строки, кроме последней. По умолчанию `\n`. Настройка `format_template_resultset` задаёт путь к файлу, содержащему форматную строку для результата. Форматная строка для результата имеет синтаксис аналогичный форматной строке для строк таблицы и позволяет указать префикс, суффикс и способ вывода дополнительной информации. Вместо имён столбцов в ней указываются следующие имена подстановок: - - `data` - строки с данными в формате `format_template_row`, разделённые `format_template_rows_between_delimiter`. Эта подстановка должна быть первой подстановкой в форматной строке. 
- - `totals` - строка с тотальными значениями в формате `format_template_row` (при использовании WITH TOTALS) - - `min` - строка с минимальными значениями в формате `format_template_row` (при настройке extremes, выставленной в 1) - - `max` - строка с максимальными значениями в формате `format_template_row` (при настройке extremes, выставленной в 1) - - `rows` - общее количество выведенных стрчек - - `rows_before_limit` - не менее скольких строчек получилось бы, если бы не было LIMIT-а. Выводится только если запрос содержит LIMIT. В случае, если запрос содержит GROUP BY, `rows_before_limit` - точное число строк, которое получилось бы, если бы не было LIMIT-а. - - `time` - время выполнения запроса в секундах - - `rows_read` - сколько строк было прочитано при выполнении запроса - - `bytes_read` - сколько байт (несжатых) было прочитано при выполнении запроса +- `data` - строки с данными в формате `format_template_row`, разделённые `format_template_rows_between_delimiter`. Эта подстановка должна быть первой подстановкой в форматной строке. +- `totals` - строка с тотальными значениями в формате `format_template_row` (при использовании WITH TOTALS) +- `min` - строка с минимальными значениями в формате `format_template_row` (при настройке extremes, выставленной в 1) +- `max` - строка с максимальными значениями в формате `format_template_row` (при настройке extremes, выставленной в 1) +- `rows` - общее количество выведенных стрчек +- `rows_before_limit` - не менее скольких строчек получилось бы, если бы не было LIMIT-а. Выводится только если запрос содержит LIMIT. В случае, если запрос содержит GROUP BY, `rows_before_limit` - точное число строк, которое получилось бы, если бы не было LIMIT-а. +- `time` - время выполнения запроса в секундах +- `rows_read` - сколько строк было прочитано при выполнении запроса +- `bytes_read` - сколько байт (несжатых) было прочитано при выполнении запроса - У подстановок `data`, `totals`, `min` и `max` не должны быть указаны типы экранирования (или должен быть указан `None`). Остальные подстановки - это отдельные значения, для них может быть указан любой тип экранирования. - Если строка `format_template_resultset` пустая, то по-умолчанию используется `${data}`. - Из всех перечисленных подстановок форматная строка `format_template_resultset` для ввода может содержать только `data`. - Также при вводе формат поддерживает пропуск значений столбцов и пропуск значений в префиксе и суффиксе (см. пример). +У подстановок `data`, `totals`, `min` и `max` не должны быть указаны типы экранирования (или должен быть указан `None`). Остальные подстановки - это отдельные значения, для них может быть указан любой тип экранирования. +Если строка `format_template_resultset` пустая, то по-умолчанию используется `${data}`. +Из всех перечисленных подстановок форматная строка `format_template_resultset` для ввода может содержать только `data`. +Также при вводе формат поддерживает пропуск значений столбцов и пропуск значений в префиксе и суффиксе (см. пример). - Пример вывода: -```sql +Пример вывода: + +``` sql SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 5 FORMAT Template SETTINGS format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format', format_template_rows_between_delimiter = '\n ' ``` + `/some/path/resultset.format`: -``` - - Search phrases - - - - ${data} -
Search phrases
Search phrase Count
- - ${max} -
Max
- Processed ${rows_read:XML} rows in ${time:XML} sec - - -``` + + + Search phrases + + + + ${data} +
Search phrases
Search phrase Count
+ + ${max} +
Max
+ Processed ${rows_read:XML} rows in ${time:XML} sec + + + `/some/path/row.format`: -``` - ${0:XML} ${1:XML} -``` + + ${0:XML} ${1:XML} + Резутьтат: -```html + +``` html Search phrases @@ -245,31 +250,35 @@ format_template_resultset = '/some/path/resultset.format', format_template_row = ``` Пример ввода: -```text + +``` text Some header Page views: 5, User id: 4324182021466249494, Useless field: hello, Duration: 146, Sign: -1 Page views: 6, User id: 4324182021466249494, Useless field: world, Duration: 185, Sign: 1 Total rows: 2 ``` -```sql + +``` sql INSERT INTO UserActivity FORMAT Template SETTINGS format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format' ``` + `/some/path/resultset.format`: -``` -Some header\n${data}\nTotal rows: ${:CSV}\n -``` + + Some header\n${data}\nTotal rows: ${:CSV}\n + `/some/path/row.format`: -``` -Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV} -``` -`PageViews`, `UserID`, `Duration` и `Sign` внутри подстановок - имена столбцов в таблице, в которую вставляются данные. Значения после `Useless field` в строках и значение после `\nTotal rows: ` в суффиксе будут проигнорированы. + + Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV} + +`PageViews`, `UserID`, `Duration` и `Sign` внутри подстановок - имена столбцов в таблице, в которую вставляются данные. Значения после `Useless field` в строках и значение после `\nTotal rows:` в суффиксе будут проигнорированы. Все разделители во входных данных должны строго соответствовать разделителям в форматных строках. ## TemplateIgnoreSpaces {#templateignorespaces} Подходит только для ввода. Отличается от формата `Template` тем, что пропускает пробельные символы между разделителями и значениями во входном потоке. Также в этом формате можно указать пустые подстановки с типом экранирования `None` (`${}` или `${:None}`), чтобы разбить разделители на несколько частей, пробелы между которыми должны игнорироваться. Такие подстановки используются только для пропуска пробелов. С помощью этого формата можно считывать `JSON`, если значения столбцов в нём всегда идут в одном порядке в каждой строке. Например, для вставки данных из примера вывода формата [JSON](#json) в таблицу со столбцами `phrase` и `cnt` можно использовать следующий запрос: -```sql + +``` sql INSERT INTO table_name FORMAT TemplateIgnoreSpaces SETTINGS format_schema = '{${}"meta"${}:${:JSON},${}"data"${}:${}[${data}]${},${}"totals"${}:${:JSON},${}"extremes"${}:${:JSON},${}"rows"${}:${:JSON},${}"rows_before_limit_at_least"${}:${:JSON}${}}', format_schema_rows = '{${}"SearchPhrase"${}:${}${phrase:JSON}${},${}"c"${}:${}${cnt:JSON}${}}', @@ -280,7 +289,7 @@ format_schema_rows_between_delimiter = ',' Похож на TabSeparated, но выводит значения в формате name=value. Имена экранируются так же, как строки в формате TabSeparated и, дополнительно, экранируется также символ =. -```text +``` text SearchPhrase= count()=8267016 SearchPhrase=интерьер ванной комнаты count()=2166 SearchPhrase=яндекс count()=1655 @@ -295,12 +304,12 @@ SearchPhrase=баку count()=1000 [NULL](../query_language/syntax.md) форматируется как `\N`. -```sql +``` sql SELECT * FROM t_null FORMAT TSKV ``` -```text -x=1 y=\N +``` text +x=1 y=\N ``` При большом количестве маленьких столбцов, этот формат существенно неэффективен, и обычно нет причин его использовать. Впрочем, он не хуже формата JSONEachRow по производительности. 
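A hedged sketch of the opposite direction for TSKV, i.e. inserting rather than selecting; it reuses the `t_null` table from the example above and assumes `\N` is accepted as NULL on input:

``` bash
# One TSKV row piped into clickhouse-client in batch mode.
echo -e "x=1\ty=\\N" | clickhouse-client --query "INSERT INTO t_null FORMAT TSKV"
```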
@@ -313,23 +322,23 @@ x=1 y=\N Формат Comma Separated Values ([RFC](https://tools.ietf.org/html/rfc4180)). -При форматировании, строки выводятся в двойных кавычках. Двойная кавычка внутри строки выводится как две двойные кавычки подряд. Других правил экранирования нет. Даты и даты-с-временем выводятся в двойных кавычках. Числа выводятся без кавычек. Значения разделяются символом-разделителем, по умолчанию — `,`. Символ-разделитель определяется настройкой [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter). Строки разделяются unix переводом строки (LF). Массивы сериализуются в CSV следующим образом: сначала массив сериализуется в строку, как в формате TabSeparated, а затем полученная строка выводится в CSV в двойных кавычках. Кортежи в формате CSV сериализуются, как отдельные столбцы (то есть, теряется их вложенность в кортеж). +При форматировании, строки выводятся в двойных кавычках. Двойная кавычка внутри строки выводится как две двойные кавычки подряд. Других правил экранирования нет. Даты и даты-с-временем выводятся в двойных кавычках. Числа выводятся без кавычек. Значения разделяются символом-разделителем, по умолчанию — `,`. Символ-разделитель определяется настройкой [format\_csv\_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter). Строки разделяются unix переводом строки (LF). Массивы сериализуются в CSV следующим образом: сначала массив сериализуется в строку, как в формате TabSeparated, а затем полученная строка выводится в CSV в двойных кавычках. Кортежи в формате CSV сериализуются, как отдельные столбцы (то есть, теряется их вложенность в кортеж). -```bash +``` bash $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV" < data.csv ``` -*По умолчанию — `,`. См. настройку [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter) для дополнительной информации. +\*По умолчанию — `,`. См. настройку [format\_csv\_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter) для дополнительной информации. При парсинге, все значения могут парситься как в кавычках, так и без кавычек. Поддерживаются как двойные, так и одинарные кавычки. Строки также могут быть без кавычек. В этом случае они парсятся до символа-разделителя или перевода строки (CR или LF). В нарушение RFC, в случае парсинга строк не в кавычках, начальные и конечные пробелы и табы игнорируются. В качестве перевода строки, поддерживаются как Unix (LF), так и Windows (CR LF) и Mac OS Classic (LF CR) варианты. -`NULL` форматируется в виде `\N` или `NULL` или пустой неэкранированной строки (см. настройки [input_format_csv_unquoted_null_literal_as_null](../operations/settings/settings.md#settings-input_format_csv_unquoted_null_literal_as_null) и [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields)). +`NULL` форматируется в виде `\N` или `NULL` или пустой неэкранированной строки (см. настройки [input\_format\_csv\_unquoted\_null\_literal\_as\_null](../operations/settings/settings.md#settings-input_format_csv_unquoted_null_literal_as_null) и [input\_format\_defaults\_for\_omitted\_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields)). 
-Если установлена настройка [input_format_defaults_for_omitted_fields = 1](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) и тип столбца не `Nullable(T)`, то пустые значения без кавычек заменяются значениями по умолчанию для типа данных столбца. +Если установлена настройка [input\_format\_defaults\_for\_omitted\_fields = 1](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) и тип столбца не `Nullable(T)`, то пустые значения без кавычек заменяются значениями по умолчанию для типа данных столбца. Формат CSV поддерживает вывод totals и extremes аналогично `TabSeparated`. -## CSVWithNames +## CSVWithNames {#csvwithnames} Выводит также заголовок, аналогично `TabSeparatedWithNames`. @@ -342,11 +351,11 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR Выводит данные в формате JSON. Кроме таблицы с данными, также выводятся имена и типы столбцов, и некоторая дополнительная информация - общее количество выведенных строк, а также количество строк, которое могло бы быть выведено, если бы не было LIMIT-а. Пример: -```sql +``` sql SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTALS ORDER BY c DESC LIMIT 5 FORMAT JSON ``` -```json +``` json { "meta": [ @@ -410,12 +419,12 @@ SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTA } ``` -JSON совместим с JavaScript. Для этого, дополнительно экранируются некоторые символы: символ прямого слеша `/` экранируется в виде `\/`; альтернативные переводы строк `U+2028`, `U+2029`, на которых ломаются некоторые браузеры, экранируются в виде `\uXXXX`-последовательностей. Экранируются ASCII control characters: backspace, form feed, line feed, carriage return, horizontal tab в виде `\b`, `\f`, `\n`, `\r`, `\t` соответственно, а также остальные байты из диапазона 00-1F с помощью `\uXXXX`-последовательностей. Невалидные UTF-8 последовательности заменяются на replacement character � и, таким образом, выводимый текст будет состоять из валидных UTF-8 последовательностей. Числа типа UInt64 и Int64, для совместимости с JavaScript, по умолчанию выводятся в двойных кавычках. Чтобы они выводились без кавычек, можно установить конфигурационный параметр [output_format_json_quote_64bit_integers](../operations/settings/settings.md#session_settings-output_format_json_quote_64bit_integers) равным 0. +JSON совместим с JavaScript. Для этого, дополнительно экранируются некоторые символы: символ прямого слеша `/` экранируется в виде `\/`; альтернативные переводы строк `U+2028`, `U+2029`, на которых ломаются некоторые браузеры, экранируются в виде `\uXXXX`-последовательностей. Экранируются ASCII control characters: backspace, form feed, line feed, carriage return, horizontal tab в виде `\b`, `\f`, `\n`, `\r`, `\t` соответственно, а также остальные байты из диапазона 00-1F с помощью `\uXXXX`-последовательностей. Невалидные UTF-8 последовательности заменяются на replacement character � и, таким образом, выводимый текст будет состоять из валидных UTF-8 последовательностей. Числа типа UInt64 и Int64, для совместимости с JavaScript, по умолчанию выводятся в двойных кавычках. Чтобы они выводились без кавычек, можно установить конфигурационный параметр [output\_format\_json\_quote\_64bit\_integers](../operations/settings/settings.md#session_settings-output_format_json_quote_64bit_integers) равным 0. `rows` - общее количество выведенных строчек. 
`rows_before_limit_at_least` - не менее скольких строчек получилось бы, если бы не было LIMIT-а. Выводится только если запрос содержит LIMIT. -В случае, если запрос содержит GROUP BY, rows_before_limit_at_least - точное число строк, которое получилось бы, если бы не было LIMIT-а. +В случае, если запрос содержит GROUP BY, rows\_before\_limit\_at\_least - точное число строк, которое получилось бы, если бы не было LIMIT-а. `totals` - тотальные значения (при использовании WITH TOTALS). @@ -433,7 +442,7 @@ ClickHouse поддерживает [NULL](../query_language/syntax.md), кот Пример: -```json +``` json { "meta": [ @@ -477,7 +486,7 @@ ClickHouse поддерживает [NULL](../query_language/syntax.md), кот При использовании этого формата, ClickHouse выводит каждую запись как объект JSON (каждый объект отдельной строкой), при этом данные в целом — невалидный JSON. -```json +``` json {"SearchPhrase":"дизайн штор","count()":"1064"} {"SearchPhrase":"баку","count()":"1000"} {"SearchPhrase":"","count":"8267016"} @@ -485,16 +494,14 @@ ClickHouse поддерживает [NULL](../query_language/syntax.md), кот При вставке данных необходимо каждую запись передавать как отдельный объект JSON. -### Вставка данных +### Вставка данных {#vstavka-dannykh} -``` -INSERT INTO UserActivity FORMAT JSONEachRow {"PageViews":5, "UserID":"4324182021466249494", "Duration":146,"Sign":-1} {"UserID":"4324182021466249494","PageViews":6,"Duration":185,"Sign":1} -``` + INSERT INTO UserActivity FORMAT JSONEachRow {"PageViews":5, "UserID":"4324182021466249494", "Duration":146,"Sign":-1} {"UserID":"4324182021466249494","PageViews":6,"Duration":185,"Sign":1} ClickHouse допускает: -- Любой порядок пар ключ-значение в объекте. -- Пропуск отдельных значений. +- Любой порядок пар ключ-значение в объекте. +- Пропуск отдельных значений. ClickHouse игнорирует пробелы между элементами и запятые после объектов. Вы можете передать все объекты одной строкой. Вам не нужно разделять их переносами строк. @@ -502,11 +509,11 @@ ClickHouse игнорирует пробелы между элементами ClickHouse заменяет опущенные значения значениями по умолчанию для соответствующих [data types](../data_types/index.md). -Если указано `DEFAULT expr`, то ClickHouse использует различные правила подстановки в зависимости от настройки [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields). +Если указано `DEFAULT expr`, то ClickHouse использует различные правила подстановки в зависимости от настройки [input\_format\_defaults\_for\_omitted\_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields). Рассмотрим следующую таблицу: -```sql +``` sql CREATE TABLE IF NOT EXISTS example_table ( x UInt32, @@ -514,17 +521,17 @@ CREATE TABLE IF NOT EXISTS example_table ) ENGINE = Memory; ``` -- Если `input_format_defaults_for_omitted_fields = 0`, то значение по умолчанию для `x` и `a` равняется `0` (поскольку это значение по умолчанию для типа данных `UInt32`.) -- Если `input_format_defaults_for_omitted_fields = 1`, то значение по умолчанию для `x` равно `0`, а значение по умолчанию `a` равно `x * 2`. +- Если `input_format_defaults_for_omitted_fields = 0`, то значение по умолчанию для `x` и `a` равняется `0` (поскольку это значение по умолчанию для типа данных `UInt32`.) +- Если `input_format_defaults_for_omitted_fields = 1`, то значение по умолчанию для `x` равно `0`, а значение по умолчанию `a` равно `x * 2`. !!! 
note "Предупреждение" Если `insert_sample_with_metadata = 1`, то при обработке запросов ClickHouse потребляет больше вычислительных ресурсов, чем если `insert_sample_with_metadata = 0`. -### Выборка данных +### Выборка данных {#vyborka-dannykh} Рассмотрим в качестве примера таблицу `UserActivity`: -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ -1 │ │ 4324182021466249494 │ 6 │ 185 │ 1 │ @@ -533,7 +540,7 @@ CREATE TABLE IF NOT EXISTS example_table Запрос `SELECT * FROM UserActivity FORMAT JSONEachRow` возвращает: -```text +``` text {"UserID":"4324182021466249494","PageViews":5,"Duration":146,"Sign":-1} {"UserID":"4324182021466249494","PageViews":6,"Duration":185,"Sign":1} ``` @@ -545,23 +552,23 @@ CREATE TABLE IF NOT EXISTS example_table ### Использование вложенных структур {#jsoneachrow-nested} -Если у вас есть таблица со столбцами типа [Nested](../data_types/nested_data_structures/nested.md), то в неё можно вставить данные из JSON-документа с такой же структурой. Функциональность включается настройкой [input_format_import_nested_json](../operations/settings/settings.md#settings-input_format_import_nested_json). +Если у вас есть таблица со столбцами типа [Nested](../data_types/nested_data_structures/nested.md), то в неё можно вставить данные из JSON-документа с такой же структурой. Функциональность включается настройкой [input\_format\_import\_nested\_json](../operations/settings/settings.md#settings-input_format_import_nested_json). Например, рассмотрим следующую таблицу: -```sql +``` sql CREATE TABLE json_each_row_nested (n Nested (s String, i Int32) ) ENGINE = Memory ``` Из описания типа данных `Nested` видно, что ClickHouse трактует каждый компонент вложенной структуры как отдельный столбец (для нашей таблицы `n.s` и `n.i`). Можно вставить данные следующим образом: -```sql +``` sql INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n.s": ["abc", "def"], "n.i": [1, 23]} ``` -Чтобы вставить данные как иерархический объект JSON, установите [input_format_import_nested_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json). +Чтобы вставить данные как иерархический объект JSON, установите [input\_format\_import\_nested\_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json). -```json +``` json { "n": { "s": ["abc", "def"], @@ -572,31 +579,31 @@ INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n.s": ["abc", "def"], "n.i Без этой настройки ClickHouse сгенерирует исключение. -```sql +``` sql SELECT name, value FROM system.settings WHERE name = 'input_format_import_nested_json' ``` -```text +``` text ┌─name────────────────────────────┬─value─┐ │ input_format_import_nested_json │ 0 │ └─────────────────────────────────┴───────┘ ``` -```sql +``` sql INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} ``` -```text +``` text Code: 117. DB::Exception: Unknown field found while parsing JSONEachRow format: n: (at row 1) ``` -```sql +``` sql SET input_format_import_nested_json=1 INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} SELECT * FROM json_each_row_nested ``` -```text +``` text ┌─n.s───────────┬─n.i────┐ │ ['abc','def'] │ [1,23] │ └───────────────┴────────┘ @@ -604,7 +611,7 @@ SELECT * FROM json_each_row_nested ## Native {#native} -Самый эффективный формат. Данные пишутся и читаются блоками в бинарном виде. 
Для каждого блока пишется количество строк, количество столбцов, имена и типы столбцов, а затем кусочки столбцов этого блока, один за другим. То есть, этот формат является "столбцовым" - не преобразует столбцы в строки. Именно этот формат используется в родном интерфейсе - при межсерверном взаимодействии, при использовании клиента командной строки, при работе клиентов, написанных на C++. +Самый эффективный формат. Данные пишутся и читаются блоками в бинарном виде. Для каждого блока пишется количество строк, количество столбцов, имена и типы столбцов, а затем кусочки столбцов этого блока, один за другим. То есть, этот формат является «столбцовым» - не преобразует столбцы в строки. Именно этот формат используется в родном интерфейсе - при межсерверном взаимодействии, при использовании клиента командной строки, при работе клиентов, написанных на C++. Вы можете использовать этот формат для быстрой генерации дампов, которые могут быть прочитаны только СУБД ClickHouse. Вряд ли имеет смысл работать с этим форматом самостоятельно. @@ -621,10 +628,11 @@ SELECT * FROM json_each_row_nested [NULL](../query_language/syntax.md) выводится как `ᴺᵁᴸᴸ`. -```sql +``` sql SELECT * FROM t_null ``` -```text + +``` text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ └───┴──────┘ @@ -632,26 +640,26 @@ SELECT * FROM t_null В форматах `Pretty*` строки выводятся без экранирования. Ниже приведен пример для формата [PrettyCompact](#prettycompact): -```sql +``` sql SELECT 'String with \'quotes\' and \t character' AS Escaping_test ``` -```text +``` text ┌─Escaping_test────────────────────────┐ -│ String with 'quotes' and character │ +│ String with 'quotes' and character │ └──────────────────────────────────────┘ ``` -Для защиты от вываливания слишком большого количества данных в терминал, выводится только первые 10 000 строк. Если строк больше или равно 10 000, то будет написано "Showed first 10 000." +Для защиты от вываливания слишком большого количества данных в терминал, выводится только первые 10 000 строк. Если строк больше или равно 10 000, то будет написано «Showed first 10 000.» Этот формат подходит только для вывода результата выполнения запроса, но не для парсинга (приёма данных для вставки в таблицу). Формат `Pretty` поддерживает вывод тотальных значений (при использовании WITH TOTALS) и экстремальных значений (при настройке extremes выставленной в 1). В этих случаях, после основных данных выводятся тотальные значения, и экстремальные значения, в отдельных табличках. Пример (показан для формата [PrettyCompact](#prettycompact)): -```sql +``` sql SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT PrettyCompact ``` -```text +``` text ┌──EventDate─┬───────c─┐ │ 2014-03-17 │ 1406958 │ │ 2014-03-18 │ 1383658 │ @@ -689,17 +697,17 @@ Extremes: Пример: -```bash +``` bash $ watch -n1 "clickhouse-client --query='SELECT event, value FROM system.events FORMAT PrettyCompactNoEscapes'" ``` Для отображения в браузере, вы можете использовать HTTP интерфейс. -### PrettyCompactNoEscapes +### PrettyCompactNoEscapes {#prettycompactnoescapes} Аналогично. -### PrettySpaceNoEscapes +### PrettySpaceNoEscapes {#prettyspacenoescapes} Аналогично. @@ -726,9 +734,9 @@ Array представлены как длина в формате varint (unsig То же самое что [RowBinary](#rowbinary), но добавляется заголовок: - * Количество колонок - N, закодированное [LEB128](https://en.wikipedia.org/wiki/LEB128), - * N строк (`String`) с именами колонок, - * N строк (`String`) с типами колонок. 
+- Количество колонок - N, закодированное [LEB128](https://en.wikipedia.org/wiki/LEB128), +- N строк (`String`) с именами колонок, +- N строк (`String`) с типами колонок. ## Values {#data-format-values} @@ -746,10 +754,11 @@ Array представлены как длина в формате varint (unsig Пример: -```sql +``` sql SELECT * FROM t_null FORMAT Vertical ``` -```text + +``` text Row 1: ────── x: 1 @@ -758,14 +767,14 @@ y: ᴺᵁᴸᴸ В формате `Vertical` строки выводятся без экранирования. Например: -```sql +``` sql SELECT 'string with \'quotes\' and \t with some special \n characters' AS test FORMAT Vertical ``` -```text +``` text Row 1: ────── -test: string with 'quotes' and with some special +test: string with 'quotes' and with some special characters ``` @@ -775,7 +784,7 @@ test: string with 'quotes' and with some special Формат XML подходит только для вывода данных, не для парсинга. Пример: -```xml +``` xml @@ -847,17 +856,17 @@ test: string with 'quotes' and with some special ## CapnProto {#capnproto} -Cap'n Proto - формат бинарных сообщений, похож на Protocol Buffers и Thrift, но не похож на JSON или MessagePack. +Cap’n Proto - формат бинарных сообщений, похож на Protocol Buffers и Thrift, но не похож на JSON или MessagePack. -Сообщения Cap'n Proto строго типизированы и не самоописывающиеся, т.е. нуждаются во внешнем описании схемы. Схема применяется "на лету" и кешируется между запросами. +Сообщения Cap’n Proto строго типизированы и не самоописывающиеся, т.е. нуждаются во внешнем описании схемы. Схема применяется «на лету» и кешируется между запросами. -```bash +``` bash $ cat capnproto_messages.bin | clickhouse-client --query "INSERT INTO test.hits FORMAT CapnProto SETTINGS format_schema='schema:Message'" ``` Где `schema.capnp` выглядит следующим образом: -```capnp +``` capnp struct Message { SearchPhrase @0 :Text; c @1 :Uint64; @@ -877,19 +886,19 @@ ClickHouse поддерживает как синтаксис `proto2`, так Пример использования формата: -```sql +``` sql SELECT * FROM test.table FORMAT Protobuf SETTINGS format_schema = 'schemafile:MessageType' ``` или -```bash +``` bash $ cat protobuf_messages.bin | clickhouse-client --query "INSERT INTO test.table FORMAT Protobuf SETTINGS format_schema='schemafile:MessageType'" ``` Где файл `schemafile.proto` может выглядеть так: -```capnp +``` capnp syntax = "proto3"; message MessageType { @@ -906,7 +915,7 @@ message MessageType { Вложенные сообщения поддерживаются, например, для поля `z` в таком сообщении -```capnp +``` capnp message MessageType { message XType { message YType { @@ -923,7 +932,7 @@ ClickHouse попытается найти столбец с именем `x.y.z Значения по умолчанию, определённые в схеме `proto2`, например, -```capnp +``` capnp syntax = "proto2"; message MessageType { @@ -940,27 +949,27 @@ ClickHouse пишет и читает сообщения `Protocol Buffers` в [Apache Parquet](http://parquet.apache.org/) — формат поколоночного хранения данных, который распространён в экосистеме Hadoop. Для формата `Parquet` ClickHouse поддерживает операции чтения и записи. -### Соответствие типов данных +### Соответствие типов данных {#sootvetstvie-tipov-dannykh} Таблица ниже содержит поддерживаемые типы данных и их соответствие [типам данных](../data_types/index.md) ClickHouse для запросов `INSERT` и `SELECT`. 
-| Тип данных Parquet (`INSERT`) | Тип данных ClickHouse | Тип данных Parquet (`SELECT`) | -| -------------------- | ------------------ | ---- | -| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) | `UINT8` | -| `INT8` | [Int8](../data_types/int_uint.md) | `INT8` | -| `UINT16` | [UInt16](../data_types/int_uint.md) | `UINT16` | -| `INT16` | [Int16](../data_types/int_uint.md) | `INT16` | -| `UINT32` | [UInt32](../data_types/int_uint.md) | `UINT32` | -| `INT32` | [Int32](../data_types/int_uint.md) | `INT32` | -| `UINT64` | [UInt64](../data_types/int_uint.md) | `UINT64` | -| `INT64` | [Int64](../data_types/int_uint.md) | `INT64` | -| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) | `FLOAT` | -| `DOUBLE` | [Float64](../data_types/float.md) | `DOUBLE` | -| `DATE32` | [Date](../data_types/date.md) | `UINT16` | -| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) | `UINT32` | -| `STRING`, `BINARY` | [String](../data_types/string.md) | `STRING` | -| — | [FixedString](../data_types/fixedstring.md) | `STRING` | -| `DECIMAL` | [Decimal](../data_types/decimal.md) | `DECIMAL` | +| Тип данных Parquet (`INSERT`) | Тип данных ClickHouse | Тип данных Parquet (`SELECT`) | +|-------------------------------|---------------------------------------------|-------------------------------| +| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) | `UINT8` | +| `INT8` | [Int8](../data_types/int_uint.md) | `INT8` | +| `UINT16` | [UInt16](../data_types/int_uint.md) | `UINT16` | +| `INT16` | [Int16](../data_types/int_uint.md) | `INT16` | +| `UINT32` | [UInt32](../data_types/int_uint.md) | `UINT32` | +| `INT32` | [Int32](../data_types/int_uint.md) | `INT32` | +| `UINT64` | [UInt64](../data_types/int_uint.md) | `UINT64` | +| `INT64` | [Int64](../data_types/int_uint.md) | `INT64` | +| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) | `FLOAT` | +| `DOUBLE` | [Float64](../data_types/float.md) | `DOUBLE` | +| `DATE32` | [Date](../data_types/date.md) | `UINT16` | +| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) | `UINT32` | +| `STRING`, `BINARY` | [String](../data_types/string.md) | `STRING` | +| — | [FixedString](../data_types/fixedstring.md) | `STRING` | +| `DECIMAL` | [Decimal](../data_types/decimal.md) | `DECIMAL` | ClickHouse поддерживает настраиваемую точность для формата `Decimal`. При обработке запроса `INSERT`, ClickHouse обрабатывает тип данных Parquet `DECIMAL` как `Decimal128`. @@ -968,47 +977,46 @@ ClickHouse поддерживает настраиваемую точность Типы данных столбцов в ClickHouse могут отличаться от типов данных соответствующих полей файла в формате Parquet. При вставке данных, ClickHouse интерпретирует типы данных в соответствии с таблицей выше, а затем [приводит](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) данные к тому типу, который установлен для столбца таблицы. -### Вставка и выборка данных +### Вставка и выборка данных {#vstavka-i-vyborka-dannykh} Чтобы вставить в ClickHouse данные из файла в формате Parquet, выполните команду следующего вида: -```bash +``` bash $ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet" ``` Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата Parquet, используйте команду следующего вида: -```bash +``` bash $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_file.pq} ``` Для обмена данными с экосистемой Hadoop можно использовать движки таблиц [HDFS](../operations/table_engines/hdfs.md). 
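As a hedged illustration of the Hadoop exchange mentioned just above (editorial sketch, not part of the original patch): the HDFS table engine can expose a Parquet file directly. The namenode address, path, table name and column set below are assumptions chosen for the example.

``` bash
# Illustrative sketch: wrap a Parquet file stored in HDFS as a ClickHouse table
# (the HDFS URI, table name and columns are assumptions, not values from the original docs).
$ clickhouse-client --query="CREATE TABLE hdfs_parquet_example (UserID UInt64, PageViews UInt8) ENGINE = HDFS('hdfs://namenode:9000/data/example.parquet', 'Parquet')"

# The file can then be queried like a regular table.
$ clickhouse-client --query="SELECT UserID, PageViews FROM hdfs_parquet_example LIMIT 10"
```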
- ## ORC {#data-format-orc} [Apache ORC](https://orc.apache.org/) - это column-oriented формат данных, распространённый в экосистеме Hadoop. Вы можете только вставлять данные этого формата в ClickHouse. -### Соответствие типов данных +### Соответствие типов данных {#sootvetstvie-tipov-dannykh-1} Таблица показывает поддержанные типы данных и их соответствие [типам данных](../data_types/index.md) ClickHouse для запросов `INSERT`. -| Тип данных ORC (`INSERT`) | Тип данных ClickHouse | -| -------------------- | ------------------ | -| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) | -| `INT8` | [Int8](../data_types/int_uint.md) | -| `UINT16` | [UInt16](../data_types/int_uint.md) | -| `INT16` | [Int16](../data_types/int_uint.md) | -| `UINT32` | [UInt32](../data_types/int_uint.md) | -| `INT32` | [Int32](../data_types/int_uint.md) | -| `UINT64` | [UInt64](../data_types/int_uint.md) | -| `INT64` | [Int64](../data_types/int_uint.md) | -| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) | -| `DOUBLE` | [Float64](../data_types/float.md) | -| `DATE32` | [Date](../data_types/date.md) | -| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) | -| `STRING`, `BINARY` | [String](../data_types/string.md) | -| `DECIMAL` | [Decimal](../data_types/decimal.md) | +| Тип данных ORC (`INSERT`) | Тип данных ClickHouse | +|---------------------------|---------------------------------------| +| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) | +| `INT8` | [Int8](../data_types/int_uint.md) | +| `UINT16` | [UInt16](../data_types/int_uint.md) | +| `INT16` | [Int16](../data_types/int_uint.md) | +| `UINT32` | [UInt32](../data_types/int_uint.md) | +| `INT32` | [Int32](../data_types/int_uint.md) | +| `UINT64` | [UInt64](../data_types/int_uint.md) | +| `INT64` | [Int64](../data_types/int_uint.md) | +| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) | +| `DOUBLE` | [Float64](../data_types/float.md) | +| `DATE32` | [Date](../data_types/date.md) | +| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) | +| `STRING`, `BINARY` | [String](../data_types/string.md) | +| `DECIMAL` | [Decimal](../data_types/decimal.md) | ClickHouse поддерживает настраиваемую точность для формата `Decimal`. При обработке запроса `INSERT`, ClickHouse обрабатывает тип данных Parquet `DECIMAL` как `Decimal128`. @@ -1016,11 +1024,11 @@ ClickHouse поддерживает настраиваемую точность Типы данных столбцов в таблицах ClickHouse могут отличаться от типов данных для соответствующих полей ORC. При вставке данных, ClickHouse интерпретирует типы данных ORC согласно таблице соответствия, а затем [приводит](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) данные к типу, установленному для столбца таблицы ClickHouse. -### Вставка данных +### Вставка данных {#vstavka-dannykh-1} Данные ORC можно вставить в таблицу ClickHouse командой: -```bash +``` bash $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` @@ -1037,6 +1045,6 @@ $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT OR относительно текущей директории на клиенте. Если клиент используется в [batch режиме](../interfaces/cli.md#cli_usage), то в записи схемы допускается только относительный путь, из соображений безопасности. 
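For example (an illustrative sketch reusing the Protobuf command shown earlier in this page; the file and table names are the same assumed ones), in batch mode the schema file is addressed by a path relative to the client's current directory:

``` bash
# schemafile.proto is assumed to sit in the client's current directory;
# in batch mode only such a relative path is accepted for the format schema.
$ cat protobuf_messages.bin | clickhouse-client --query "INSERT INTO test.table FORMAT Protobuf SETTINGS format_schema='schemafile:MessageType'"
```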
Если для ввода/вывода данных используется [HTTP-интерфейс](../interfaces/http.md), то файл со схемой должен располагаться на сервере в каталоге, -указанном в параметре [format_schema_path](../operations/server_settings/settings.md#server_settings-format_schema_path) конфигурации сервера. +указанном в параметре [format\_schema\_path](../operations/server_settings/settings.md#server_settings-format_schema_path) конфигурации сервера. [Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/formats/) diff --git a/docs/ru/interfaces/http.md b/docs/ru/interfaces/http.md index f51689054e3..9a89a25bec7 100644 --- a/docs/ru/interfaces/http.md +++ b/docs/ru/interfaces/http.md @@ -1,21 +1,23 @@ -# HTTP-интерфейс {#http_interface} +# HTTP-интерфейс {#http-interface} HTTP интерфейс позволяет использовать ClickHouse на любой платформе, из любого языка программирования. У нас он используется для работы из Java и Perl, а также из shell-скриптов. В других отделах, HTTP интерфейс используется из Perl, Python и Go. HTTP интерфейс более ограничен по сравнению с родным интерфейсом, но является более совместимым. По умолчанию, clickhouse-server слушает HTTP на порту 8123 (это можно изменить в конфиге). -Если запросить GET / без параметров, то вернётся строка заданная с помощью настройки [http_server_default_response](../operations/server_settings/settings.md#server_settings-http_server_default_response). Значение по умолчанию "Ok." (с переводом строки на конце). -```bash +Если запросить GET / без параметров, то вернётся строка заданная с помощью настройки [http\_server\_default\_response](../operations/server_settings/settings.md#server_settings-http_server_default_response). Значение по умолчанию «Ok.» (с переводом строки на конце). + +``` bash $ curl 'http://localhost:8123/' Ok. ``` -В скриптах проверки доступности вы можете использовать GET /ping без параметров. Если сервер доступен всегда возвращается "Ok." (с переводом строки на конце). -```bash +В скриптах проверки доступности вы можете использовать GET /ping без параметров. Если сервер доступен всегда возвращается «Ok.» (с переводом строки на конце). + +``` bash $ curl 'http://localhost:8123/ping' Ok. ``` -Запрос отправляется в виде URL параметра с именем query. Или как тело запроса при использовании метода POST. +Запрос отправляется в виде URL параметра с именем query. Или как тело запроса при использовании метода POST. Или начало запроса в URL параметре query, а продолжение POST-ом (зачем это нужно, будет объяснено ниже). Размер URL ограничен 16KB, это следует учитывать при отправке больших запросов. В случае успеха, вам вернётся код ответа 200 и результат обработки запроса в теле ответа. @@ -25,7 +27,7 @@ Ok. Примеры: -```bash +``` bash $ curl 'http://localhost:8123/?query=SELECT%201' 1 @@ -47,7 +49,7 @@ X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","writ Как видно, curl немного неудобен тем, что надо URL-эскейпить пробелы. Хотя wget сам всё эскейпит, но его не рекомендуется использовать, так как он плохо работает по HTTP 1.1 при использовании keep-alive и Transfer-Encoding: chunked. -```bash +``` bash $ echo 'SELECT 1' | curl 'http://localhost:8123/' --data-binary @- 1 @@ -61,17 +63,17 @@ $ echo '1' | curl 'http://localhost:8123/?query=SELECT' --data-binary @- Если часть запроса отправляется в параметре, а часть POST-ом, то между этими двумя кусками данных ставится перевод строки. 
Пример (так работать не будет): -```bash +``` bash $ echo 'ECT 1' | curl 'http://localhost:8123/?query=SEL' --data-binary @- Code: 59, e.displayText() = DB::Exception: Syntax error: failed at position 0: SEL ECT 1 , expected One of: SHOW TABLES, SHOW DATABASES, SELECT, INSERT, CREATE, ATTACH, RENAME, DROP, DETACH, USE, SET, OPTIMIZE., e.what() = DB::Exception ``` -По умолчанию, данные возвращаются в формате TabSeparated (подробнее смотри раздел "Форматы"). +По умолчанию, данные возвращаются в формате TabSeparated (подробнее смотри раздел «Форматы»). Можно попросить любой другой формат - с помощью секции FORMAT запроса. -```bash +``` bash $ echo 'SELECT 1 FORMAT Pretty' | curl 'http://localhost:8123/?' --data-binary @- ┏━━━┓ ┃ 1 ┃ @@ -85,37 +87,37 @@ $ echo 'SELECT 1 FORMAT Pretty' | curl 'http://localhost:8123/?' --data-binary @ Примеры: Создаём таблицу: -```bash +``` bash $ echo 'CREATE TABLE t (a UInt8) ENGINE = Memory' | curl 'http://localhost:8123/' --data-binary @- ``` Используем привычный запрос INSERT для вставки данных: -```bash +``` bash $ echo 'INSERT INTO t VALUES (1),(2),(3)' | curl 'http://localhost:8123/' --data-binary @- ``` Данные можно отправить отдельно от запроса: -```bash +``` bash $ echo '(4),(5),(6)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20VALUES' --data-binary @- ``` Можно указать любой формат для данных. Формат Values - то же, что используется при записи INSERT INTO t VALUES: -```bash +``` bash $ echo '(7),(8),(9)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20Values' --data-binary @- ``` Можно вставить данные из tab-separated дампа, указав соответствующий формат: -```bash +``` bash $ echo -ne '10\n11\n12\n' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20TabSeparated' --data-binary @- ``` Прочитаем содержимое таблицы. Данные выводятся в произвольном порядке из-за параллельной обработки запроса: -```bash +``` bash $ curl 'http://localhost:8123/?query=SELECT%20a%20FROM%20t' 7 8 @@ -133,24 +135,24 @@ $ curl 'http://localhost:8123/?query=SELECT%20a%20FROM%20t' Удаляем таблицу. -```bash +``` bash $ echo 'DROP TABLE t' | curl 'http://localhost:8123/' --data-binary @- ``` Для запросов, которые не возвращают таблицу с данными, в случае успеха, выдаётся пустое тело ответа. -Вы можете использовать внутренний формат сжатия Clickhouse при передаче данных. Формат сжатых данных нестандартный, и вам придётся использовать для работы с ним специальную программу `clickhouse-compressor` (устанавливается вместе с пакетом `clickhouse-client`). Для повышения эффективности вставки данных можно отключить проверку контрольной суммы на стороне сервера с помощью настройки[http_native_compression_disable_checksumming_on_decompress](../operations/settings/settings.md#settings-http_native_compression_disable_checksumming_on_decompress). +Вы можете использовать внутренний формат сжатия Clickhouse при передаче данных. Формат сжатых данных нестандартный, и вам придётся использовать для работы с ним специальную программу `clickhouse-compressor` (устанавливается вместе с пакетом `clickhouse-client`). Для повышения эффективности вставки данных можно отключить проверку контрольной суммы на стороне сервера с помощью настройки[http\_native\_compression\_disable\_checksumming\_on\_decompress](../operations/settings/settings.md#settings-http_native_compression_disable_checksumming_on_decompress). Если вы указали `compress = 1` в URL, то сервер сжимает данные, которые он отправляет. 
Если вы указали `decompress = 1` в URL, сервер распаковывает те данные, которые вы передаёте методом `POST`. -Также, можно использовать [HTTP compression](https://en.wikipedia.org/wiki/HTTP_compression). Для отправки сжатого запроса `POST`, добавьте заголовок `Content-Encoding: compression_method`. Чтобы ClickHouse сжимал ответ, добавьте заголовок `Accept-Encoding: compression_method`. ClickHouse поддерживает следующие [методы сжатия](https://en.wikipedia.org/wiki/HTTP_compression#Content-Encoding_tokens): `gzip`, `br`, and `deflate`. Чтобы включить HTTP compression, используйте настройку ClickHouse [enable_http_compression](../operations/settings/settings.md#settings-enable_http_compression). Уровень сжатия данных для всех методов сжатия можно настроить с помощью настройки [http_zlib_compression_level](#settings-http_zlib_compression_level). +Также, можно использовать [HTTP compression](https://en.wikipedia.org/wiki/HTTP_compression). Для отправки сжатого запроса `POST`, добавьте заголовок `Content-Encoding: compression_method`. Чтобы ClickHouse сжимал ответ, добавьте заголовок `Accept-Encoding: compression_method`. ClickHouse поддерживает следующие [методы сжатия](https://en.wikipedia.org/wiki/HTTP_compression#Content-Encoding_tokens): `gzip`, `br`, and `deflate`. Чтобы включить HTTP compression, используйте настройку ClickHouse [enable\_http\_compression](../operations/settings/settings.md#settings-enable_http_compression). Уровень сжатия данных для всех методов сжатия можно настроить с помощью настройки [http\_zlib\_compression\_level](#settings-http_zlib_compression_level). Это может быть использовано для уменьшения трафика по сети при передаче большого количества данных, а также для создания сразу сжатых дампов. Примеры отправки данных со сжатием: -```bash +``` bash $ #Отправка данных на сервер: $ curl -vsS "http://localhost:8123/?enable_http_compression=1" -d 'SELECT number FROM system.numbers LIMIT 10' -H 'Accept-Encoding: gzip' @@ -163,7 +165,7 @@ $ echo "SELECT 1" | gzip -c | curl -sS --data-binary @- -H 'Content-Encoding: gz В параметре URL database может быть указана БД по умолчанию. -```bash +``` bash $ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?database=system' --data-binary @- 0 1 @@ -181,30 +183,36 @@ $ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?data Имя пользователя и пароль могут быть указаны в одном из трёх вариантов: -1. С использованием HTTP Basic Authentication. Пример: +1. С использованием HTTP Basic Authentication. Пример: -```bash + + +``` bash $ echo 'SELECT 1' | curl 'http://user:password@localhost:8123/' -d @- ``` -2. В параметрах URL user и password. Пример: +1. В параметрах URL user и password. Пример: -```bash + + +``` bash $ echo 'SELECT 1' | curl 'http://localhost:8123/?user=user&password=password' -d @- ``` -3. С использованием заголовков ‘X-ClickHouse-User’ и ‘X-ClickHouse-Key’. Пример: +1. С использованием заголовков ‘X-ClickHouse-User’ и ‘X-ClickHouse-Key’. Пример: -```bash + + +``` bash $ echo 'SELECT 1' | curl -H 'X-ClickHouse-User: user' -H 'X-ClickHouse-Key: password' 'http://localhost:8123/' -d @- ``` Если пользователь не задан,то используется `default`. Если пароль не задан, то используется пустой пароль. -Также в параметрах URL вы можете указать любые настройки, которые будут использованы для обработки одного запроса, или целые профили настроек. 
Пример:http://localhost:8123/?profile=web&max_rows_to_read=1000000000&query=SELECT+1 +Также в параметрах URL вы можете указать любые настройки, которые будут использованы для обработки одного запроса, или целые профили настроек. Пример:http://localhost:8123/?profile=web&max\_rows\_to\_read=1000000000&query=SELECT+1 Подробнее смотрите в разделе [Настройки](../operations/settings/index.md). -```bash +``` bash $ echo 'SELECT number FROM system.numbers LIMIT 10' | curl 'http://localhost:8123/?' --data-binary @- 0 1 @@ -218,13 +226,13 @@ $ echo 'SELECT number FROM system.numbers LIMIT 10' | curl 'http://localhost:812 9 ``` -Об остальных параметрах смотри раздел "SET". +Об остальных параметрах смотри раздел «SET». Аналогично можно использовать ClickHouse-сессии в HTTP-протоколе. Для этого необходимо добавить к запросу GET параметр `session_id`. В качестве идентификатора сессии можно использовать произвольную строку. По умолчанию через 60 секунд бездействия сессия будет прервана. Можно изменить этот таймаут, изменяя настройку `default_session_timeout` в конфигурации сервера, или добавив к запросу GET параметр `session_timeout`. Статус сессии можно проверить с помощью параметра `session_check=1`. В рамках одной сессии одновременно может исполняться только один запрос. -Прогресс выполнения запроса можно отслеживать с помощью заголовков ответа `X-ClickHouse-Progress`. Для этого включите [send_progress_in_http_headers](../operations/settings/settings.md#settings-send_progress_in_http_headers). Пример последовательности заголовков: +Прогресс выполнения запроса можно отслеживать с помощью заголовков ответа `X-ClickHouse-Progress`. Для этого включите [send\_progress\_in\_http\_headers](../operations/settings/settings.md#settings-send_progress_in_http_headers). Пример последовательности заголовков: -```text +``` text X-ClickHouse-Progress: {"read_rows":"2752512","read_bytes":"240570816","total_rows_to_read":"8880128"} X-ClickHouse-Progress: {"read_rows":"5439488","read_bytes":"482285394","total_rows_to_read":"8880128"} X-ClickHouse-Progress: {"read_rows":"8783786","read_bytes":"819092887","total_rows_to_read":"8880128"} @@ -232,20 +240,20 @@ X-ClickHouse-Progress: {"read_rows":"8783786","read_bytes":"819092887","total_ro Возможные поля заголовка: -- `read_rows` — количество прочитанных строк. -- `read_bytes` — объём прочитанных данных в байтах. -- `total_rows_to_read` — общее количество строк для чтения. -- `written_rows` — количество записанных строк. -- `written_bytes` — объём прочитанных данных в байтах. +- `read_rows` — количество прочитанных строк. +- `read_bytes` — объём прочитанных данных в байтах. +- `total_rows_to_read` — общее количество строк для чтения. +- `written_rows` — количество записанных строк. +- `written_bytes` — объём прочитанных данных в байтах. Запущенные запросы не останавливаются автоматически при разрыве HTTP соединения. Парсинг и форматирование данных производится на стороне сервера и использование сети может быть неэффективным. -Может быть передан необязательный параметр query_id - идентификатор запроса, произвольная строка. Подробнее смотрите раздел "Настройки, replace_running_query". +Может быть передан необязательный параметр query\_id - идентификатор запроса, произвольная строка. Подробнее смотрите раздел «Настройки, replace\_running\_query». -Может быть передан необязательный параметр quota_key - ключ квоты, произвольная строка. Подробнее смотрите раздел "Квоты". +Может быть передан необязательный параметр quota\_key - ключ квоты, произвольная строка. 
Подробнее смотрите раздел «Квоты». -HTTP интерфейс позволяет передать внешние данные (внешние временные таблицы) для использования запроса. Подробнее смотрите раздел "Внешние данные для обработки запроса" +HTTP интерфейс позволяет передать внешние данные (внешние временные таблицы) для использования запроса. Подробнее смотрите раздел «Внешние данные для обработки запроса» -## Буферизация ответа +## Буферизация ответа {#buferizatsiia-otveta} Существует возможность включить буферизацию ответа на стороне сервера. Для этого предусмотрены параметры URL `buffer_size` и `wait_end_of_query`. @@ -255,7 +263,7 @@ HTTP интерфейс позволяет передать внешние да Пример: -```bash +``` bash $ curl -sS 'http://localhost:8123/?max_result_bytes=4000000&buffer_size=3000000&wait_end_of_query=1' -d 'SELECT toUInt8(number) FROM system.numbers LIMIT 9000000 FORMAT RowBinary' ``` @@ -265,11 +273,10 @@ $ curl -sS 'http://localhost:8123/?max_result_bytes=4000000&buffer_size=3000000& Можно создать запрос с параметрами и передать для них значения из соответствующих параметров HTTP-запроса. Дополнительную информацию смотрите в [Запросы с параметрами для консольного клиента](cli.md#cli-queries-with-parameters). -### Пример +### Пример {#primer} -```bash +``` bash $ curl -sS "
?param_id=2¶m_phrase=test" -d "SELECT * FROM table WHERE int_column = {id:UInt8} and string_column = {phrase:String}" ``` [Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/http_interface/) - diff --git a/docs/ru/interfaces/index.md b/docs/ru/interfaces/index.md index 3718c499f3e..8f5a2e5f073 100644 --- a/docs/ru/interfaces/index.md +++ b/docs/ru/interfaces/index.md @@ -2,20 +2,20 @@ ClickHouse предоставляет два сетевых интерфейса (оба могут быть дополнительно обернуты в TLS для дополнительной безопасности): -* [HTTP](http.md), который задокументирован и прост для использования напрямую; -* [Native TCP](tcp.md), который имеет меньше накладных расходов. +- [HTTP](http.md), который задокументирован и прост для использования напрямую; +- [Native TCP](tcp.md), который имеет меньше накладных расходов. В большинстве случаев рекомендуется использовать подходящий инструмент или библиотеку, а не напрямую взаимодействовать с ClickHouse по сути. Официально поддерживаемые Яндексом: -* [Консольный клиент](cli.md); -* [JDBC-драйвер](jdbc.md); -* [ODBC-драйвер](odbc.md); -* [C++ клиентская библиотека](cpp.md). +- [Консольный клиент](cli.md); +- [JDBC-драйвер](jdbc.md); +- [ODBC-драйвер](odbc.md); +- [C++ клиентская библиотека](cpp.md). Существует также широкий спектр сторонних библиотек для работы с ClickHouse: -* [Клиентские библиотеки](third-party/client_libraries.md); -* [Библиотеки для интеграции](third-party/integrations.md); -* [Визуальные интерфейсы](third-party/gui.md). +- [Клиентские библиотеки](third-party/client_libraries.md); +- [Библиотеки для интеграции](third-party/integrations.md); +- [Визуальные интерфейсы](third-party/gui.md). [Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/) diff --git a/docs/ru/interfaces/jdbc.md b/docs/ru/interfaces/jdbc.md index 3dbdc28d49b..e83cd2cfacf 100644 --- a/docs/ru/interfaces/jdbc.md +++ b/docs/ru/interfaces/jdbc.md @@ -1,8 +1,8 @@ -# JDBC-драйвер +# JDBC-драйвер {#jdbc-draiver} -* **[Официальный драйвер](https://github.com/ClickHouse/clickhouse-jdbc)** -* Драйверы от сторонних организаций: - * [ClickHouse-Native-JDBC](https://github.com/housepower/ClickHouse-Native-JDBC) - * [clickhouse4j](https://github.com/blynkkk/clickhouse4j) +- **[Официальный драйвер](https://github.com/ClickHouse/clickhouse-jdbc)** +- Драйверы от сторонних организаций: + - [ClickHouse-Native-JDBC](https://github.com/housepower/ClickHouse-Native-JDBC) + - [clickhouse4j](https://github.com/blynkkk/clickhouse4j) [Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/jdbc/) diff --git a/docs/ru/interfaces/mysql.md b/docs/ru/interfaces/mysql.md index d0a36174d2b..146947342cd 100644 --- a/docs/ru/interfaces/mysql.md +++ b/docs/ru/interfaces/mysql.md @@ -1,17 +1,20 @@ -# MySQL-интерфейс {#mysql_interface} +# MySQL-интерфейс {#mysql-interface} -ClickHouse поддерживает взаимодействие по протоколу MySQL. Данная функция включается настройкой [mysql_port](../operations/server_settings/settings.md#server_settings-mysql_port) в конфигурационном файле: -```xml +ClickHouse поддерживает взаимодействие по протоколу MySQL. Данная функция включается настройкой [mysql\_port](../operations/server_settings/settings.md#server_settings-mysql_port) в конфигурационном файле: + +``` xml 9004 ``` Пример подключения с помощью стандартного клиента mysql: -```bash + +``` bash $ mysql --protocol tcp -u default -P 9004 ``` Вывод в случае успешного подключения: -```text + +``` text Welcome to the MySQL monitor. Commands end with ; or \g. 
Your MySQL connection id is 4 Server version: 20.2.1.1-ClickHouse @@ -24,7 +27,7 @@ owners. Type 'help;' or '\h' for help. Type '\c' to clear the current input statement. -mysql> +mysql> ``` Для совместимости со всеми клиентами рекомендуется задавать пароль пользователя в конфигурационном файле с помощью двойного хэша [SHA1](../operations/settings/settings_users.md#password_double_sha1_hex). @@ -32,6 +35,6 @@ mysql> Ограничения: -- не поддерживаются подготовленные запросы +- не поддерживаются подготовленные запросы -- некоторые типы данных отправляются как строки +- некоторые типы данных отправляются как строки diff --git a/docs/ru/interfaces/odbc.md b/docs/ru/interfaces/odbc.md index e6a3d2826b2..0f4a6279539 100644 --- a/docs/ru/interfaces/odbc.md +++ b/docs/ru/interfaces/odbc.md @@ -1,5 +1,5 @@ -# ODBC-драйвер +# ODBC-драйвер {#odbc-draiver} -- [Официальный драйвер](https://github.com/ClickHouse/clickhouse-odbc). +- [Официальный драйвер](https://github.com/ClickHouse/clickhouse-odbc). [Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/odbc/) diff --git a/docs/ru/interfaces/tcp.md b/docs/ru/interfaces/tcp.md index b18012d1120..3bb99e2450b 100644 --- a/docs/ru/interfaces/tcp.md +++ b/docs/ru/interfaces/tcp.md @@ -1,4 +1,4 @@ -# Родной интерфейс (TCP) +# Родной интерфейс (TCP) {#rodnoi-interfeis-tcp} Нативный протокол используется в [клиенте командной строки](cli.md), для взаимодействия между серверами во время обработки распределенных запросов, а также в других программах на C++. К сожалению, у родного протокола ClickHouse пока нет формальной спецификации, но в нем можно разобраться с использованием исходного кода ClickHouse (начиная с [примерно этого места](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/src/Client)) и/или путем перехвата и анализа TCP трафика. diff --git a/docs/ru/interfaces/third-party/client_libraries.md b/docs/ru/interfaces/third-party/client_libraries.md index a9a50a8d742..3ec3901666a 100644 --- a/docs/ru/interfaces/third-party/client_libraries.md +++ b/docs/ru/interfaces/third-party/client_libraries.md @@ -1,48 +1,50 @@ -# Клиентские библиотеки от сторонних разработчиков +# Клиентские библиотеки от сторонних разработчиков {#klientskie-biblioteki-ot-storonnikh-razrabotchikov} !!! warning "Disclaimer" Яндекс не поддерживает перечисленные ниже библиотеки и не проводит тщательного тестирования для проверки их качества. 
-- Python: - - [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm) - - [clickhouse-driver](https://github.com/mymarilyn/clickhouse-driver) - - [clickhouse-client](https://github.com/yurial/clickhouse-client) - - [aiochclient](https://github.com/maximdanilchenko/aiochclient) -- PHP - - [phpClickHouse](https://github.com/smi2/phpClickHouse) - - [clickhouse-php-client](https://github.com/8bitov/clickhouse-php-client) - - [clickhouse-client](https://github.com/bozerkins/clickhouse-client) - - [PhpClickHouseClient](https://github.com/SevaCode/PhpClickHouseClient) -- Go - - [clickhouse](https://github.com/kshvakov/clickhouse/) - - [go-clickhouse](https://github.com/roistat/go-clickhouse) - - [mailrugo-clickhouse](https://github.com/mailru/go-clickhouse) - - [golang-clickhouse](https://github.com/leprosus/golang-clickhouse) -- NodeJs - - [clickhouse (NodeJs)](https://github.com/TimonKK/clickhouse) - - [node-clickhouse](https://github.com/apla/node-clickhouse) -- Perl - - [perl-DBD-ClickHouse](https://github.com/elcamlost/perl-DBD-ClickHouse) - - [HTTP-ClickHouse](https://metacpan.org/release/HTTP-ClickHouse) - - [AnyEvent-ClickHouse](https://metacpan.org/release/AnyEvent-ClickHouse) -- Ruby - - [ClickHouse (Ruby)](https://github.com/shlima/click_house) -- R - - [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r) - - [RClickhouse](https://github.com/IMSMWU/RClickhouse) -- Java - - [clickhouse-client-java](https://github.com/VirtusAI/clickhouse-client-java) -- Scala - - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) -- Kotlin - - [AORM](https://github.com/TanVD/AORM) -- C# - - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) - - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) - - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) -- Elixir - - [clickhousex](https://github.com/appodeal/clickhousex/) -- Nim - - [nim-clickhouse](https://github.com/leonardoce/nim-clickhouse) +- Python: + - [infi.clickhouse\_orm](https://github.com/Infinidat/infi.clickhouse_orm) + - [clickhouse-driver](https://github.com/mymarilyn/clickhouse-driver) + - [clickhouse-client](https://github.com/yurial/clickhouse-client) + - [aiochclient](https://github.com/maximdanilchenko/aiochclient) +- PHP + - [smi2/phpclickhouse](https://packagist.org/packages/smi2/phpClickHouse) + - [8bitov/clickhouse-php-client](https://packagist.org/packages/8bitov/clickhouse-php-client) + - [bozerkins/clickhouse-client](https://packagist.org/packages/bozerkins/clickhouse-client) + - [simpod/clickhouse-client](https://packagist.org/packages/simpod/clickhouse-client) + - [seva-code/php-click-house-client](https://packagist.org/packages/seva-code/php-click-house-client) + - [SeasClick C++ client](https://github.com/SeasX/SeasClick) +- Go + - [clickhouse](https://github.com/kshvakov/clickhouse/) + - [go-clickhouse](https://github.com/roistat/go-clickhouse) + - [mailrugo-clickhouse](https://github.com/mailru/go-clickhouse) + - [golang-clickhouse](https://github.com/leprosus/golang-clickhouse) +- NodeJs + - [clickhouse (NodeJs)](https://github.com/TimonKK/clickhouse) + - [node-clickhouse](https://github.com/apla/node-clickhouse) +- Perl + - [perl-DBD-ClickHouse](https://github.com/elcamlost/perl-DBD-ClickHouse) + - [HTTP-ClickHouse](https://metacpan.org/release/HTTP-ClickHouse) + - [AnyEvent-ClickHouse](https://metacpan.org/release/AnyEvent-ClickHouse) +- Ruby + - [ClickHouse (Ruby)](https://github.com/shlima/click_house) +- R + - 
[clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r) + - [RClickhouse](https://github.com/IMSMWU/RClickhouse) +- Java + - [clickhouse-client-java](https://github.com/VirtusAI/clickhouse-client-java) +- Scala + - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) +- Kotlin + - [AORM](https://github.com/TanVD/AORM) +- C\# + - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) + - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) + - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) +- Elixir + - [clickhousex](https://github.com/appodeal/clickhousex/) +- Nim + - [nim-clickhouse](https://github.com/leonardoce/nim-clickhouse) [Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/third-party/client_libraries/) diff --git a/docs/ru/interfaces/third-party/gui.md b/docs/ru/interfaces/third-party/gui.md index 2217ab8a913..8c8550690ef 100644 --- a/docs/ru/interfaces/third-party/gui.md +++ b/docs/ru/interfaces/third-party/gui.md @@ -1,57 +1,57 @@ -# Визуальные интерфейсы от сторонних разработчиков +# Визуальные интерфейсы от сторонних разработчиков {#vizualnye-interfeisy-ot-storonnikh-razrabotchikov} -## С открытым исходным кодом +## С открытым исходным кодом {#s-otkrytym-iskhodnym-kodom} -### Tabix +### Tabix {#tabix} Веб-интерфейс для ClickHouse в проекте [Tabix](https://github.com/tabixio/tabix). Основные возможности: -- Работает с ClickHouse напрямую из браузера, без необходимости установки дополнительного ПО; -- Редактор запросов с подсветкой синтаксиса; -- Автодополнение команд; -- Инструменты графического анализа выполнения запросов; -- Цветовые схемы на выбор. +- Работает с ClickHouse напрямую из браузера, без необходимости установки дополнительного ПО; +- Редактор запросов с подсветкой синтаксиса; +- Автодополнение команд; +- Инструменты графического анализа выполнения запросов; +- Цветовые схемы на выбор. [Документация Tabix](https://tabix.io/doc/). -### HouseOps +### HouseOps {#houseops} [HouseOps](https://github.com/HouseOps/HouseOps) — UI/IDE для OSX, Linux и Windows. Основные возможности: -- Построение запросов с подсветкой синтаксиса; -- Просмотр ответа в табличном или JSON представлении; -- Экспортирование результатов запроса в формате CSV или JSON; -- Список процессов с описанием; -- Режим записи; -- Возможность остановки (`KILL`) запроса; -- Граф базы данных. Показывает все таблицы и их столбцы с дополнительной информацией; -- Быстрый просмотр размера столбца; -- Конфигурирование сервера. +- Построение запросов с подсветкой синтаксиса; +- Просмотр ответа в табличном или JSON представлении; +- Экспортирование результатов запроса в формате CSV или JSON; +- Список процессов с описанием; +- Режим записи; +- Возможность остановки (`KILL`) запроса; +- Граф базы данных. Показывает все таблицы и их столбцы с дополнительной информацией; +- Быстрый просмотр размера столбца; +- Конфигурирование сервера. Планируется разработка следующих возможностей: -- Управление базами; -- Управление пользователями; -- Анализ данных в режиме реального времени; -- Мониторинг кластера; -- Управление кластером; -- Мониторинг реплицированных и Kafka таблиц. +- Управление базами; +- Управление пользователями; +- Анализ данных в режиме реального времени; +- Мониторинг кластера; +- Управление кластером; +- Мониторинг реплицированных и Kafka таблиц. -### LightHouse +### LightHouse {#lighthouse} [LightHouse](https://github.com/VKCOM/lighthouse) — это легковесный веб-интерфейс для ClickHouse. 
Основные возможности: -- Список таблиц с фильтрацией и метаданными; -- Предварительный просмотр таблицы с фильтрацией и сортировкой; -- Выполнение запросов только для чтения. +- Список таблиц с фильтрацией и метаданными; +- Предварительный просмотр таблицы с фильтрацией и сортировкой; +- Выполнение запросов только для чтения. -### Redash +### Redash {#redash} [Redash](https://github.com/getredash/redash) — платформа для отображения данных. @@ -59,50 +59,50 @@ Основные возможности: -- Мощный редактор запросов. -- Проводник по базе данных. -- Инструменты визуализации, позволяющие представить данные в различных формах. +- Мощный редактор запросов. +- Проводник по базе данных. +- Инструменты визуализации, позволяющие представить данные в различных формах. -### DBeaver +### DBeaver {#dbeaver} [DBeaver](https://dbeaver.io/) - универсальный desktop клиент баз данных с поддержкой ClickHouse. Основные возможности: -- Построение запросов с подсветкой синтаксиса; -- Просмотр таблиц; -- Автодополнение команд; -- Полнотекстовый поиск. +- Построение запросов с подсветкой синтаксиса; +- Просмотр таблиц; +- Автодополнение команд; +- Полнотекстовый поиск. -### clickhouse-cli +### clickhouse-cli {#clickhouse-cli} [clickhouse-cli](https://github.com/hatarist/clickhouse-cli) - это альтернативный клиент командной строки для ClickHouse, написанный на Python 3. Основные возможности: -- Автодополнение; -- Подсветка синтаксиса для запросов и вывода данных; -- Поддержка постраничного просмотра для результирующих данных; -- Дополнительные PostgreSQL-подобные команды. +- Автодополнение; +- Подсветка синтаксиса для запросов и вывода данных; +- Поддержка постраничного просмотра для результирующих данных; +- Дополнительные PostgreSQL-подобные команды. -### clickhouse-flamegraph +### clickhouse-flamegraph {#clickhouse-flamegraph} [clickhouse-flamegraph](https://github.com/Slach/clickhouse-flamegraph) — специализированный инструмент для визуализации `system.trace_log` в виде [flamegraph](http://www.brendangregg.com/flamegraphs.html). -## Коммерческие +## Коммерческие {#kommercheskie} -### DataGrip +### DataGrip {#datagrip} [DataGrip](https://www.jetbrains.com/datagrip/) — это IDE для баз данных о JetBrains с выделенной поддержкой ClickHouse. Он также встроен в другие инструменты на основе IntelliJ: PyCharm, IntelliJ IDEA, GoLand, PhpStorm и другие. Основные возможности: -- Очень быстрое дополнение кода. -- Подсветка синтаксиса для SQL диалекта ClickHouse. -- Поддержка функций, специфичных для ClickHouse, например вложенных столбцов, движков таблиц. -- Редактор данных. -- Рефакторинги. -- Поиск и навигация. +- Очень быстрое дополнение кода. +- Подсветка синтаксиса для SQL диалекта ClickHouse. +- Поддержка функций, специфичных для ClickHouse, например вложенных столбцов, движков таблиц. +- Редактор данных. +- Рефакторинги. +- Поиск и навигация. ### Yandex DataLens {#yandex-datalens} @@ -110,41 +110,42 @@ Основные возможности: -- Широкий выбор инструментов визуализации, от простых столбчатых диаграмм до сложных дашбордов. -- Возможность опубликовать дашборды на широкую аудиторию. -- Поддержка множества источников данных, включая ClickHouse. -- Хранение материализованных данных в кластере ClickHouse DataLens. +- Широкий выбор инструментов визуализации, от простых столбчатых диаграмм до сложных дашбордов. +- Возможность опубликовать дашборды на широкую аудиторию. +- Поддержка множества источников данных, включая ClickHouse. +- Хранение материализованных данных в кластере ClickHouse DataLens. 
Для небольших проектов DataLens [доступен бесплатно](https://cloud.yandex.ru/docs/datalens/pricing), в том числе и для коммерческого использования. -- [Документация DataLens](https://cloud.yandex.ru/docs/datalens/). -- [Пособие по визуализации данных из ClickHouse](https://cloud.yandex.ru/docs/solutions/datalens/data-from-ch-visualization). +- [Документация DataLens](https://cloud.yandex.ru/docs/datalens/). +- [Пособие по визуализации данных из ClickHouse](https://cloud.yandex.ru/docs/solutions/datalens/data-from-ch-visualization). -### Holistics Software +### Holistics Software {#holistics-software} [Holistics](https://www.holistics.io/) — full-stack платформа для обработки данных и бизнес-аналитики. Основные возможности: -- Автоматизированные отчёты на почту, Slack, и Google Sheet. -- Редактор SQL c визуализацией, контролем версий, автодополнением, повторным использованием частей запроса и динамическими фильтрами. -- Встроенные инструменты анализа отчётов и всплывающие (iframe) дашборды. -- Подготовка данных и возможности ETL. -- Моделирование данных с помощью SQL для их реляционного отображения. +- Автоматизированные отчёты на почту, Slack, и Google Sheet. +- Редактор SQL c визуализацией, контролем версий, автодополнением, повторным использованием частей запроса и динамическими фильтрами. +- Встроенные инструменты анализа отчётов и всплывающие (iframe) дашборды. +- Подготовка данных и возможности ETL. +- Моделирование данных с помощью SQL для их реляционного отображения. [Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/third-party/gui/) -### Looker -[Looker](https://looker.com) — платформа для обработки данных и бизнес-аналитики. Поддерживает более 50 диалектов баз данных, включая ClickHouse. Looker можно установить самостоятельно или воспользоваться готовой платформой SaaS. +### Looker {#looker} + +[Looker](https://looker.com) — платформа для обработки данных и бизнес-аналитики. Поддерживает более 50 диалектов баз данных, включая ClickHouse. Looker можно установить самостоятельно или воспользоваться готовой платформой SaaS. Просмотр данных, построение отображений и дашбордов, планирование отчётов и обмен данными с коллегами доступны с помощью браузера. Также, Looker предоставляет ряд инструментов, позволяющих встраивать сервис в другие приложения и API для обмена данными. Основные возможности: -- Язык LookML, поддерживающий [моделирование данных](https://looker.com/platform/data-modeling). -- Интеграция с различными системами с помощью [Data Actions](https://looker.com/platform/actions). -- Инструменты для встраивания сервиса в приложения. -- API. +- Язык LookML, поддерживающий [моделирование данных](https://looker.com/platform/data-modeling). +- Интеграция с различными системами с помощью [Data Actions](https://looker.com/platform/actions). +- Инструменты для встраивания сервиса в приложения. +- API. [Как сконфигурировать ClickHouse в Looker.](https://docs.looker.com/setup-and-management/database-config/clickhouse) diff --git a/docs/ru/interfaces/third-party/integrations.md b/docs/ru/interfaces/third-party/integrations.md index fd784d85c03..1eb43c5e666 100644 --- a/docs/ru/interfaces/third-party/integrations.md +++ b/docs/ru/interfaces/third-party/integrations.md @@ -1,87 +1,88 @@ -# Библиотеки для интеграции от сторонних разработчиков +# Библиотеки для интеграции от сторонних разработчиков {#biblioteki-dlia-integratsii-ot-storonnikh-razrabotchikov} !!! 
warning "Disclaimer" Яндекс не занимается поддержкой перечисленных ниже инструментов и библиотек и не проводит тщательного тестирования для проверки их качества. -## Инфраструктурные продукты -- Реляционные системы управления базами данных - - [MySQL](https://www.mysql.com) - - [ProxySQL](https://github.com/sysown/proxysql/wiki/ClickHouse-Support) - - [clickhouse-mysql-data-reader](https://github.com/Altinity/clickhouse-mysql-data-reader) - - [horgh-replicator](https://github.com/larsnovikov/horgh-replicator) - - [PostgreSQL](https://www.postgresql.org) - - [clickhousedb_fdw](https://github.com/Percona-Lab/clickhousedb_fdw) - - [infi.clickhouse_fdw](https://github.com/Infinidat/infi.clickhouse_fdw) (использует [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm)) - - [pg2ch](https://github.com/mkabilov/pg2ch) - - [clickhouse_fdw](https://github.com/adjust/clickhouse_fdw) - - [MSSQL](https://en.wikipedia.org/wiki/Microsoft_SQL_Server) - - [ClickHouseMightrator](https://github.com/zlzforever/ClickHouseMigrator) -- Очереди сообщений - - [Kafka](https://kafka.apache.org) - - [clickhouse_sinker](https://github.com/housepower/clickhouse_sinker) (использует [Go client](https://github.com/kshvakov/clickhouse/)) -- Хранилища объектов - - [S3](https://en.wikipedia.org/wiki/Amazon_S3) - - [clickhouse-backup](https://github.com/AlexAkulov/clickhouse-backup) -- Оркестрация контейнеров - - [Kubernetes](https://kubernetes.io) - - [clickhouse-operator](https://github.com/Altinity/clickhouse-operator) -- Системы управления конфигурацией - - [puppet](https://puppet.com) - - [innogames/clickhouse](https://forge.puppet.com/innogames/clickhouse) - - [mfedotov/clickhouse](https://forge.puppet.com/mfedotov/clickhouse) -- Мониторинг - - [Graphite](https://graphiteapp.org) - - [graphouse](https://github.com/yandex/graphouse) - - [carbon-clickhouse](https://github.com/lomik/carbon-clickhouse) + - - [graphite-clickhouse](https://github.com/lomik/graphite-clickhouse) - - [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer) - оптимизирует партиции таблиц [\*GraphiteMergeTree](../../operations/table_engines/graphitemergetree.md#graphitemergetree) согласно правилам в [конфигурации rollup](../../operations/table_engines/graphitemergetree.md#rollup-configuration) - - [Grafana](https://grafana.com/) - - [clickhouse-grafana](https://github.com/Vertamedia/clickhouse-grafana) - - [Prometheus](https://prometheus.io/) - - [clickhouse_exporter](https://github.com/f1yegor/clickhouse_exporter) - - [PromHouse](https://github.com/Percona-Lab/PromHouse) - - [clickhouse_exporter](https://github.com/hot-wifi/clickhouse_exporter) (использует [Go client](https://github.com/kshvakov/clickhouse/)) - - [Nagios](https://www.nagios.org/) - - [check_clickhouse](https://github.com/exogroup/check_clickhouse/) - - [Zabbix](https://www.zabbix.com) - - [clickhouse-zabbix-template](https://github.com/Altinity/clickhouse-zabbix-template) - - [Sematext](https://sematext.com/) - - [clickhouse интеграция](https://github.com/sematext/sematext-agent-integrations/tree/master/clickhouse) -- Логирование - - [rsyslog](https://www.rsyslog.com/) - - [omclickhouse](https://www.rsyslog.com/doc/master/configuration/modules/omclickhouse.html) - - [fluentd](https://www.fluentd.org) - - [loghouse](https://github.com/flant/loghouse) (для [Kubernetes](https://kubernetes.io)) - - [logagent](https://www.sematext.com/logagent) - - [logagent output-plugin-clickhouse](https://sematext.com/docs/logagent/output-plugin-clickhouse/) -- 
Гео - - [MaxMind](https://dev.maxmind.com/geoip/) - - [clickhouse-maxmind-geoip](https://github.com/AlexeyKupershtokh/clickhouse-maxmind-geoip) +## Инфраструктурные продукты {#infrastrukturnye-produkty} -## Экосистемы вокруг языков программирования +- Реляционные системы управления базами данных + - [MySQL](https://www.mysql.com) + - [ProxySQL](https://github.com/sysown/proxysql/wiki/ClickHouse-Support) + - [clickhouse-mysql-data-reader](https://github.com/Altinity/clickhouse-mysql-data-reader) + - [horgh-replicator](https://github.com/larsnovikov/horgh-replicator) + - [PostgreSQL](https://www.postgresql.org) + - [clickhousedb\_fdw](https://github.com/Percona-Lab/clickhousedb_fdw) + - [infi.clickhouse\_fdw](https://github.com/Infinidat/infi.clickhouse_fdw) (использует [infi.clickhouse\_orm](https://github.com/Infinidat/infi.clickhouse_orm)) + - [pg2ch](https://github.com/mkabilov/pg2ch) + - [clickhouse\_fdw](https://github.com/adjust/clickhouse_fdw) + - [MSSQL](https://en.wikipedia.org/wiki/Microsoft_SQL_Server) + - [ClickHouseMightrator](https://github.com/zlzforever/ClickHouseMigrator) +- Очереди сообщений + - [Kafka](https://kafka.apache.org) + - [clickhouse\_sinker](https://github.com/housepower/clickhouse_sinker) (использует [Go client](https://github.com/kshvakov/clickhouse/)) +- Хранилища объектов + - [S3](https://en.wikipedia.org/wiki/Amazon_S3) + - [clickhouse-backup](https://github.com/AlexAkulov/clickhouse-backup) +- Оркестрация контейнеров + - [Kubernetes](https://kubernetes.io) + - [clickhouse-operator](https://github.com/Altinity/clickhouse-operator) +- Системы управления конфигурацией + - [puppet](https://puppet.com) + - [innogames/clickhouse](https://forge.puppet.com/innogames/clickhouse) + - [mfedotov/clickhouse](https://forge.puppet.com/mfedotov/clickhouse) +- Мониторинг + - [Graphite](https://graphiteapp.org) + - [graphouse](https://github.com/yandex/graphouse) + - [carbon-clickhouse](https://github.com/lomik/carbon-clickhouse) + + - [graphite-clickhouse](https://github.com/lomik/graphite-clickhouse) + - [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer) - оптимизирует партиции таблиц [\*GraphiteMergeTree](../../operations/table_engines/graphitemergetree.md#graphitemergetree) согласно правилам в [конфигурации rollup](../../operations/table_engines/graphitemergetree.md#rollup-configuration) + - [Grafana](https://grafana.com/) + - [clickhouse-grafana](https://github.com/Vertamedia/clickhouse-grafana) + - [Prometheus](https://prometheus.io/) + - [clickhouse\_exporter](https://github.com/f1yegor/clickhouse_exporter) + - [PromHouse](https://github.com/Percona-Lab/PromHouse) + - [clickhouse\_exporter](https://github.com/hot-wifi/clickhouse_exporter) (использует [Go client](https://github.com/kshvakov/clickhouse/)) + - [Nagios](https://www.nagios.org/) + - [check\_clickhouse](https://github.com/exogroup/check_clickhouse/) + - [Zabbix](https://www.zabbix.com) + - [clickhouse-zabbix-template](https://github.com/Altinity/clickhouse-zabbix-template) + - [Sematext](https://sematext.com/) + - [clickhouse интеграция](https://github.com/sematext/sematext-agent-integrations/tree/master/clickhouse) +- Логирование + - [rsyslog](https://www.rsyslog.com/) + - [omclickhouse](https://www.rsyslog.com/doc/master/configuration/modules/omclickhouse.html) + - [fluentd](https://www.fluentd.org) + - [loghouse](https://github.com/flant/loghouse) (для [Kubernetes](https://kubernetes.io)) + - [logagent](https://www.sematext.com/logagent) + - [logagent 
output-plugin-clickhouse](https://sematext.com/docs/logagent/output-plugin-clickhouse/) +- Гео + - [MaxMind](https://dev.maxmind.com/geoip/) + - [clickhouse-maxmind-geoip](https://github.com/AlexeyKupershtokh/clickhouse-maxmind-geoip) -- Python - - [SQLAlchemy](https://www.sqlalchemy.org) - - [sqlalchemy-clickhouse](https://github.com/cloudflare/sqlalchemy-clickhouse) (использует [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm)) - - [pandas](https://pandas.pydata.org) - - [pandahouse](https://github.com/kszucs/pandahouse) -- R - - [dplyr](https://db.rstudio.com/dplyr/) - - [RClickhouse](https://github.com/IMSMWU/RClickhouse) (использует [clickhouse-cpp](https://github.com/artpaul/clickhouse-cpp)) -- Java - - [Hadoop](http://hadoop.apache.org) - - [clickhouse-hdfs-loader](https://github.com/jaykelin/clickhouse-hdfs-loader) (использует [JDBC](../../query_language/table_functions/jdbc.md)) -- Scala - - [Akka](https://akka.io) - - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) -- C# - - [ADO.NET](https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/ado-net-overview) - - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) - - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) - - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) - - [ClickHouse.Net.Migrations](https://github.com/ilyabreev/ClickHouse.Net.Migrations) -- Elixir - - [Ecto](https://github.com/elixir-ecto/ecto) - - [clickhouse_ecto](https://github.com/appodeal/clickhouse_ecto) +## Экосистемы вокруг языков программирования {#ekosistemy-vokrug-iazykov-programmirovaniia} + +- Python + - [SQLAlchemy](https://www.sqlalchemy.org) + - [sqlalchemy-clickhouse](https://github.com/cloudflare/sqlalchemy-clickhouse) (использует [infi.clickhouse\_orm](https://github.com/Infinidat/infi.clickhouse_orm)) + - [pandas](https://pandas.pydata.org) + - [pandahouse](https://github.com/kszucs/pandahouse) +- R + - [dplyr](https://db.rstudio.com/dplyr/) + - [RClickhouse](https://github.com/IMSMWU/RClickhouse) (использует [clickhouse-cpp](https://github.com/artpaul/clickhouse-cpp)) +- Java + - [Hadoop](http://hadoop.apache.org) + - [clickhouse-hdfs-loader](https://github.com/jaykelin/clickhouse-hdfs-loader) (использует [JDBC](../../query_language/table_functions/jdbc.md)) +- Scala + - [Akka](https://akka.io) + - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) +- C\# + - [ADO.NET](https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/ado-net-overview) + - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) + - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) + - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) + - [ClickHouse.Net.Migrations](https://github.com/ilyabreev/ClickHouse.Net.Migrations) +- Elixir + - [Ecto](https://github.com/elixir-ecto/ecto) + - [clickhouse\_ecto](https://github.com/appodeal/clickhouse_ecto) [Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/third-party/integrations/) diff --git a/docs/ru/interfaces/third-party/proxy.md b/docs/ru/interfaces/third-party/proxy.md index 1bce6b12631..225c3fee150 100644 --- a/docs/ru/interfaces/third-party/proxy.md +++ b/docs/ru/interfaces/third-party/proxy.md @@ -1,38 +1,38 @@ -# Прокси-серверы от сторонних разработчиков +# Прокси-серверы от сторонних разработчиков {#proksi-servery-ot-storonnikh-razrabotchikov} -## chproxy +## chproxy {#chproxy} [chproxy](https://github.com/Vertamedia/chproxy) - это 
http-прокси и балансировщик нагрузки для базы данных ClickHouse. Основные возможности: -* Индивидуальная маршрутизация и кэширование ответов; -* Гибкие ограничения; -* Автоматическое продление SSL сертификатов. +- Индивидуальная маршрутизация и кэширование ответов; +- Гибкие ограничения; +- Автоматическое продление SSL сертификатов. Реализован на Go. -## KittenHouse +## KittenHouse {#kittenhouse} [KittenHouse](https://github.com/VKCOM/kittenhouse) предназначен для использования в качестве локального прокси-сервера между ClickHouse и вашим сервером приложений в случае, если буферизовать данные INSERT на стороне приложения не представляется возможным или не удобно. Основные возможности: -* Буферизация данных в памяти и на диске; -* Маршрутизация по таблицам; -* Балансировка нагрузки и проверка работоспособности. +- Буферизация данных в памяти и на диске; +- Маршрутизация по таблицам; +- Балансировка нагрузки и проверка работоспособности. Реализован на Go. -## ClickHouse-Bulk +## ClickHouse-Bulk {#clickhouse-bulk} [ClickHouse-Bulk](https://github.com/nikepan/clickhouse-bulk) - простой сборщик вставок ClickHouse. Особенности: -* Группировка запросов и отправка по порогу или интервалу; -* Несколько удаленных серверов; -* Базовая аутентификация. +- Группировка запросов и отправка по порогу или интервалу; +- Несколько удаленных серверов; +- Базовая аутентификация. Реализован на Go. diff --git a/docs/ru/introduction/distinctive_features.md b/docs/ru/introduction/distinctive_features.md index 7570c0e58c9..093053a3b89 100644 --- a/docs/ru/introduction/distinctive_features.md +++ b/docs/ru/introduction/distinctive_features.md @@ -1,61 +1,61 @@ -# Отличительные возможности ClickHouse +# Отличительные возможности ClickHouse {#otlichitelnye-vozmozhnosti-clickhouse} -## По-настоящему столбцовая СУБД +## По-настоящему столбцовая СУБД {#po-nastoiashchemu-stolbtsovaia-subd} -В по-настоящему столбцовой СУБД рядом со значениями не хранится никаких лишних данных. Например, должны поддерживаться значения постоянной длины, чтобы не хранить рядом со значениями типа "число" их длины. Для примера, миллиард значений типа UInt8 должен действительно занимать в несжатом виде около 1GB, иначе это сильно ударит по эффективности использования CPU. Очень важно хранить данные компактно (без "мусора") в том числе в несжатом виде, так как скорость разжатия (использование CPU) зависит, в основном, от объёма несжатых данных. +В по-настоящему столбцовой СУБД рядом со значениями не хранится никаких лишних данных. Например, должны поддерживаться значения постоянной длины, чтобы не хранить рядом со значениями типа «число» их длины. Для примера, миллиард значений типа UInt8 должен действительно занимать в несжатом виде около 1GB, иначе это сильно ударит по эффективности использования CPU. Очень важно хранить данные компактно (без «мусора») в том числе в несжатом виде, так как скорость разжатия (использование CPU) зависит, в основном, от объёма несжатых данных. Этот пункт пришлось выделить, так как существуют системы, которые могут хранить значения отдельных столбцов по отдельности, но не могут эффективно выполнять аналитические запросы в силу оптимизации под другой сценарий работы. Примеры: HBase, BigTable, Cassandra, HyperTable. В этих системах вы получите пропускную способность в районе сотен тысяч строк в секунду, но не сотен миллионов строк в секунду. Также стоит заметить, что ClickHouse является системой управления базами данных, а не одной базой данных. 
То есть, ClickHouse позволяет создавать таблицы и базы данных в runtime, загружать данные и выполнять запросы без переконфигурирования и перезапуска сервера. -## Сжатие данных +## Сжатие данных {#szhatie-dannykh} Некоторые столбцовые СУБД (InfiniDB CE, MonetDB) не используют сжатие данных. Однако сжатие данных действительно играет одну из ключевых ролей в демонстрации отличной производительности. -## Хранение данных на диске +## Хранение данных на диске {#khranenie-dannykh-na-diske} Многие столбцовые СУБД (SAP HANA, Google PowerDrill) могут работать только в оперативной памяти. Такой подход стимулирует выделять больший бюджет на оборудование, чем фактически требуется для анализа в реальном времени. ClickHouse спроектирован для работы на обычных жестких дисках, что обеспечивает низкую стоимость хранения на гигабайт данных, но SSD и дополнительная оперативная память тоже полноценно используются, если доступны. -## Параллельная обработка запроса на многих процессорных ядрах +## Параллельная обработка запроса на многих процессорных ядрах {#parallelnaia-obrabotka-zaprosa-na-mnogikh-protsessornykh-iadrakh} Большие запросы естественным образом распараллеливаются, используя все необходимые ресурсы из доступных на сервере. -## Распределённая обработка запроса на многих серверах +## Распределённая обработка запроса на многих серверах {#raspredelionnaia-obrabotka-zaprosa-na-mnogikh-serverakh} Почти все перечисленные ранее столбцовые СУБД не поддерживают распределённую обработку запроса. В ClickHouse данные могут быть расположены на разных шардах. Каждый шард может представлять собой группу реплик, которые используются для отказоустойчивости. Запрос будет выполнен на всех шардах параллельно. Это делается прозрачно для пользователя. -## Поддержка SQL +## Поддержка SQL {#podderzhka-sql} ClickHouse поддерживает декларативный язык запросов на основе SQL и во многих случаях совпадающий с SQL стандартом. Поддерживаются GROUP BY, ORDER BY, подзапросы в секциях FROM, IN, JOIN, а также скалярные подзапросы. Зависимые подзапросы и оконные функции не поддерживаются. -## Векторный движок +## Векторный движок {#vektornyi-dvizhok} Данные не только хранятся по столбцам, но и обрабатываются по векторам - кусочкам столбцов. За счёт этого достигается высокая эффективность по CPU. -## Обновление данных в реальном времени +## Обновление данных в реальном времени {#obnovlenie-dannykh-v-realnom-vremeni} ClickHouse поддерживает таблицы с первичным ключом. Для того, чтобы можно было быстро выполнять запросы по диапазону первичного ключа, данные инкрементально сортируются с помощью merge дерева. За счёт этого, поддерживается постоянное добавление данных в таблицу. Блокировки при добавлении данных отсутствуют. -## Наличие индекса +## Наличие индекса {#nalichie-indeksa} Физическая сортировка данных по первичному ключу позволяет получать данные для конкретных его значений или их диапазонов с низкими задержками - менее десятков миллисекунд. -## Подходит для онлайн запросов +## Подходит для онлайн запросов {#podkhodit-dlia-onlain-zaprosov} Низкие задержки позволяют не откладывать выполнение запроса и не подготавливать ответ заранее, а выполнять его именно в момент загрузки страницы пользовательского интерфейса. То есть, в режиме онлайн. -## Поддержка приближённых вычислений +## Поддержка приближённых вычислений {#podderzhka-priblizhionnykh-vychislenii} ClickHouse предоставляет различные способы разменять точность вычислений на производительность: -1. 
Система содержит агрегатные функции для приближённого вычисления количества различных значений, медианы и квантилей. -2. Поддерживается возможность выполнить запрос на основе части (выборки) данных и получить приближённый результат. При этом, с диска будет считано пропорционально меньше данных. -3. Поддерживается возможность выполнить агрегацию не для всех ключей, а для ограниченного количества первых попавшихся ключей. При выполнении некоторых условий на распределение ключей в данных, это позволяет получить достаточно точный результат с использованием меньшего количества ресурсов. +1. Система содержит агрегатные функции для приближённого вычисления количества различных значений, медианы и квантилей. +2. Поддерживается возможность выполнить запрос на основе части (выборки) данных и получить приближённый результат. При этом, с диска будет считано пропорционально меньше данных. +3. Поддерживается возможность выполнить агрегацию не для всех ключей, а для ограниченного количества первых попавшихся ключей. При выполнении некоторых условий на распределение ключей в данных, это позволяет получить достаточно точный результат с использованием меньшего количества ресурсов. -## Репликация данных и поддержка целостности +## Репликация данных и поддержка целостности {#replikatsiia-dannykh-i-podderzhka-tselostnosti} Используется асинхронная multimaster репликация. После записи на любую доступную реплику, данные распространяются на все остальные реплики в фоне. Система поддерживает полную идентичность данных на разных репликах. Восстановление после большинства сбоев осуществляется автоматически, а в сложных случаях — полуавтоматически. При необходимости, можно [включить кворумную запись](../operations/settings/settings.md) данных. diff --git a/docs/ru/introduction/features_considered_disadvantages.md b/docs/ru/introduction/features_considered_disadvantages.md index 440311a199d..0cd4838d908 100644 --- a/docs/ru/introduction/features_considered_disadvantages.md +++ b/docs/ru/introduction/features_considered_disadvantages.md @@ -1,8 +1,8 @@ -# Особенности ClickHouse, которые могут считаться недостатками +# Особенности ClickHouse, которые могут считаться недостатками {#osobennosti-clickhouse-kotorye-mogut-schitatsia-nedostatkami} -1. Отсутствие полноценных транзакций. -2. Возможность изменять или удалять ранее записанные данные с низкими задержками и высокой частотой запросов не предоставляется. Есть массовое удаление и изменение данных для очистки более не нужного или соответствия [GDPR](https://gdpr-info.eu). -3. Разреженный индекс делает ClickHouse плохо пригодным для точечных чтений одиночных строк по своим -ключам. +1. Отсутствие полноценных транзакций. +2. Возможность изменять или удалять ранее записанные данные с низкими задержками и высокой частотой запросов не предоставляется. Есть массовое удаление и изменение данных для очистки более не нужного или соответствия [GDPR](https://gdpr-info.eu). +3. Разреженный индекс делает ClickHouse плохо пригодным для точечных чтений одиночных строк по своим + ключам. 
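To illustrate the bulk delete/update capability mentioned in item 2 of the list above, here is a minimal, hedged sketch; the table name `db.visits` and the `user_id` filter are assumptions for illustration, not part of the original text. Such mutations run asynchronously and rewrite whole data parts, so they suit occasional cleanup (for example, GDPR erasure requests) rather than frequent point updates.

``` bash
# Hedged sketch of a bulk cleanup mutation; the table and column names are hypothetical.
# The DELETE runs asynchronously in the background and rewrites the affected data parts.
clickhouse-client --query "ALTER TABLE db.visits DELETE WHERE user_id = 42"

# Progress of the mutation can be checked in the system.mutations table.
clickhouse-client --query "SELECT mutation_id, is_done FROM system.mutations WHERE table = 'visits'"
```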
[Оригинальная статья](https://clickhouse.tech/docs/ru/introduction/features_considered_disadvantages/) diff --git a/docs/ru/introduction/history.md b/docs/ru/introduction/history.md index c2d2a85ea09..856263704e5 100644 --- a/docs/ru/introduction/history.md +++ b/docs/ru/introduction/history.md @@ -1,4 +1,4 @@ -# История ClickHouse +# История ClickHouse {#istoriia-clickhouse} ClickHouse изначально разрабатывался для обеспечения работы [Яндекс.Метрики](https://metrika.yandex.ru/), [второй крупнейшей в мире](http://w3techs.com/technologies/overview/traffic_analysis/all) платформы для веб аналитики, и продолжает быть её ключевым компонентом. При более 13 триллионах записей в базе данных и более 20 миллиардах событий в сутки, ClickHouse позволяет генерировать индивидуально настроенные отчёты на лету напрямую из неагрегированных данных. Данная статья вкратце демонстрирует какие цели исторически стояли перед ClickHouse на ранних этапах его развития. @@ -6,35 +6,35 @@ ClickHouse изначально разрабатывался для обеспе На апрель 2014, в Яндекс.Метрику поступало около 12 миллиардов событий (показов страниц и кликов мыши) ежедневно. Все эти события должны быть сохранены для возможности строить произвольные отчёты. Один запрос может потребовать просканировать миллионы строк за время не более нескольких сотен миллисекунд, или сотни миллионов строк за время не более нескольких секунд. -## Использование в Яндекс.Метрике и других отделах Яндекса +## Использование в Яндекс.Метрике и других отделах Яндекса {#ispolzovanie-v-iandeks-metrike-i-drugikh-otdelakh-iandeksa} В Яндекс.Метрике ClickHouse используется для нескольких задач. Основная задача - построение отчётов в режиме онлайн по неагрегированным данным. Для решения этой задачи используется кластер из 374 серверов, хранящий более 20,3 триллионов строк в базе данных. Объём сжатых данных, без учёта дублирования и репликации, составляет около 2 ПБ. Объём несжатых данных (в формате tsv) составил бы, приблизительно, 17 ПБ. Также ClickHouse используется: -- для хранения данных Вебвизора; -- для обработки промежуточных данных; -- для построения глобальных отчётов Аналитиками; -- для выполнения запросов в целях отладки движка Метрики; -- для анализа логов работы API и пользовательского интерфейса. +- для хранения данных Вебвизора; +- для обработки промежуточных данных; +- для построения глобальных отчётов Аналитиками; +- для выполнения запросов в целях отладки движка Метрики; +- для анализа логов работы API и пользовательского интерфейса. ClickHouse имеет более десятка инсталляций в других отделах Яндекса: в Вертикальных сервисах, Маркете, Директе, БК, Бизнес аналитике, Мобильной разработке, AdFox, Персональных сервисах и т п. -## Агрегированные и неагрегированные данные +## Агрегированные и неагрегированные данные {#agregirovannye-i-neagregirovannye-dannye} Существует мнение, что для того, чтобы эффективно считать статистику, данные нужно агрегировать, так как это позволяет уменьшить объём данных. 
Но агрегированные данные являются очень ограниченным решением, по следующим причинам: -- вы должны заранее знать перечень отчётов, необходимых пользователю; -- то есть, пользователь не может построить произвольный отчёт; -- при агрегации по большому количеству ключей, объём данных не уменьшается и агрегация бесполезна; -- при большом количестве отчётов, получается слишком много вариантов агрегации (комбинаторный взрыв); -- при агрегации по ключам высокой кардинальности (например, URL) объём данных уменьшается не сильно (менее чем в 2 раза); -- из-за этого, объём данных при агрегации может не уменьшиться, а вырасти; -- пользователи будут смотреть не все отчёты, которые мы для них посчитаем - то есть, большая часть вычислений бесполезна; -- возможно нарушение логической целостности данных для разных агрегаций; +- вы должны заранее знать перечень отчётов, необходимых пользователю; +- то есть, пользователь не может построить произвольный отчёт; +- при агрегации по большому количеству ключей, объём данных не уменьшается и агрегация бесполезна; +- при большом количестве отчётов, получается слишком много вариантов агрегации (комбинаторный взрыв); +- при агрегации по ключам высокой кардинальности (например, URL) объём данных уменьшается не сильно (менее чем в 2 раза); +- из-за этого, объём данных при агрегации может не уменьшиться, а вырасти; +- пользователи будут смотреть не все отчёты, которые мы для них посчитаем - то есть, большая часть вычислений бесполезна; +- возможно нарушение логической целостности данных для разных агрегаций; Как видно, если ничего не агрегировать, и работать с неагрегированными данными, то это даже может уменьшить объём вычислений. diff --git a/docs/ru/introduction/info.md b/docs/ru/introduction/info.md index 7f4e7c9e479..bd95c825f21 100644 --- a/docs/ru/introduction/info.md +++ b/docs/ru/introduction/info.md @@ -1,8 +1,9 @@ -# Информационная поддержка +# Информационная поддержка {#informatsionnaia-podderzhka} + Информационная поддержка ClickHouse осуществляется на всей территории Российской Федерации без ограничений посредством использования телефонной связи и средств электронной почты на русском языке в круглосуточном режиме: -* Адрес электронной почты: -* Телефон: 8-800-250-96-39 (звонки бесплатны из всех регионов России) +- Адрес электронной почты: +- Телефон: 8-800-250-96-39 (звонки бесплатны из всех регионов России)
diff --git a/docs/ru/introduction/performance.md b/docs/ru/introduction/performance.md index b4ea0f24bd1..568fd835261 100644 --- a/docs/ru/introduction/performance.md +++ b/docs/ru/introduction/performance.md @@ -1,24 +1,24 @@ -# Производительность +# Производительность {#proizvoditelnost} По результатам внутреннего тестирования в Яндексе, ClickHouse обладает наиболее высокой производительностью (как наиболее высокой пропускной способностью на длинных запросах, так и наиболее низкой задержкой на коротких запросах), при соответствующем сценарии работы, среди доступных для тестирования систем подобного класса. Результаты тестирования можно посмотреть на [отдельной странице](https://clickhouse.tech/benchmark.html). Также это подтверждают многочисленные независимые бенчмарки. Их не сложно найти в Интернете самостоятельно, либо можно воспользоваться [небольшой коллекцией ссылок по теме](https://clickhouse.tech/#independent-benchmarks). -## Пропускная способность при обработке одного большого запроса +## Пропускная способность при обработке одного большого запроса {#propusknaia-sposobnost-pri-obrabotke-odnogo-bolshogo-zaprosa} -Пропускную способность можно измерять в строчках в секунду и в мегабайтах в секунду. При условии, что данные помещаются в page cache, не слишком сложный запрос обрабатывается на современном железе со скоростью около 2-10 GB/sec. несжатых данных на одном сервере (в простейшем случае скорость может достигать 30 GB/sec). Если данные не помещаются в page cache, то скорость работы зависит от скорости дисковой подсистемы и коэффициента сжатия данных. Например, если дисковая подсистема позволяет читать данные со скоростью 400 MB/sec., а коэффициент сжатия данных составляет 3, то скорость будет около 1.2GB/sec. Для получения скорости в строчках в секунду, следует поделить скорость в байтах в секунду на суммарный размер используемых в запросе столбцов. Например, если вынимаются столбцы на 10 байт, то скорость будет в районе 100-200 млн. строк в секунду. +Пропускную способность можно измерять в строчках в секунду и в мегабайтах в секунду. При условии, что данные помещаются в page cache, не слишком сложный запрос обрабатывается на современном железе со скоростью около 2-10 GB/sec. несжатых данных на одном сервере (в простейшем случае скорость может достигать 30 GB/sec). Если данные не помещаются в page cache, то скорость работы зависит от скорости дисковой подсистемы и коэффициента сжатия данных. Например, если дисковая подсистема позволяет читать данные со скоростью 400 MB/sec., а коэффициент сжатия данных составляет 3, то скорость будет около 1.2GB/sec. Для получения скорости в строчках в секунду, следует поделить скорость в байтах в секунду на суммарный размер используемых в запросе столбцов. Например, если вынимаются столбцы на 10 байт, то скорость будет в районе 100-200 млн. строк в секунду. При распределённой обработке запроса, скорость обработки запроса растёт почти линейно, но только при условии, что в результате агрегации или при сортировке получается не слишком большое множество строчек. -## Задержки при обработке коротких запросов +## Задержки при обработке коротких запросов {#zaderzhki-pri-obrabotke-korotkikh-zaprosov} Если запрос использует первичный ключ, и выбирает для обработки не слишком большое количество строчек (сотни тысяч), и использует не слишком большое количество столбцов, то вы можете рассчитывать на latency менее 50 миллисекунд (от единиц миллисекунд в лучшем случае), при условии, что данные помещаются в page cache. 
Иначе latency вычисляется из количества seek-ов. Если вы используйте вращающиеся диски, то на не слишком сильно нагруженной системе, latency вычисляется по формуле: seek time (10 мс.) \* количество столбцов в запросе \* количество кусков с данными. -## Пропускная способность при обработке многочисленных коротких запросов +## Пропускная способность при обработке многочисленных коротких запросов {#propusknaia-sposobnost-pri-obrabotke-mnogochislennykh-korotkikh-zaprosov} При тех же условиях, ClickHouse может обработать несколько сотен (до нескольких тысяч в лучшем случае) запросов в секунду на одном сервере. Так как такой сценарий работы не является типичным для аналитических СУБД, рекомендуется рассчитывать не более чем на 100 запросов в секунду. -## Производительность при вставке данных +## Производительность при вставке данных {#proizvoditelnost-pri-vstavke-dannykh} Данные рекомендуется вставлять пачками не менее 1000 строк или не более одного запроса в секунду. При вставке в таблицу типа MergeTree из tab-separated дампа, скорость вставки будет в районе 50-200 МБ/сек. Если вставляются строчки размером около 1 КБ, то скорость будет в районе 50 000 - 200 000 строчек в секунду. Если строчки маленькие - производительность в строчках в секунду будет выше (на данных БК - `>` 500 000 строк в секунду, на данных Graphite - `>` 1 000 000 строк в секунду). Для увеличения производительности, можно производить несколько запросов INSERT параллельно - при этом производительность растёт линейно. diff --git a/docs/ru/operations/access_rights.md b/docs/ru/operations/access_rights.md index ac99c2b0449..d4cd7793bf1 100644 --- a/docs/ru/operations/access_rights.md +++ b/docs/ru/operations/access_rights.md @@ -1,10 +1,10 @@ -# Права доступа +# Права доступа {#prava-dostupa} Пользователи и права доступа настраиваются в конфиге пользователей. Обычно это `users.xml`. Пользователи прописаны в секции `users`. Рассмотрим фрагмент файла `users.xml`: -```xml +``` xml @@ -69,7 +69,7 @@ Указывается список сетей, из которых разрешён доступ. В этом примере, список сетей для обеих пользователей, загружается из отдельного файла (`/etc/metrika.xml`), содержащего подстановку `networks`. Вот его фрагмент: -```xml +``` xml ... @@ -81,14 +81,14 @@ ``` -Можно было бы указать этот список сетей непосредственно в `users.xml`, или в файле в директории `users.d` (подробнее смотрите раздел "[Конфигурационные файлы](configuration_files.md#configuration_files)"). +Можно было бы указать этот список сетей непосредственно в `users.xml`, или в файле в директории `users.d` (подробнее смотрите раздел «[Конфигурационные файлы](configuration_files.md#configuration_files)»). В конфиге приведён комментарий, указывающий, как можно открыть доступ отовсюду. Для продакшен использования, указывайте только элементы вида `ip` (IP-адреса и их маски), так как использование `host` и `host_regexp` может вызывать лишние задержки. -Далее указывается используемый профиль настроек пользователя (смотрите раздел "[Профили настроек](settings/settings_profiles.md)"). Вы можете указать профиль по умолчанию - `default`. Профиль может называться как угодно; один и тот же профиль может быть указан для разных пользователей. Наиболее важная вещь, которую вы можете прописать в профиле настроек `readonly=1`, что обеспечивает доступ только на чтение. -Затем указывается используемая квота (смотрите раздел "[Квоты](quotas.md#quotas)"). Вы можете указать квоту по умолчанию — `default`. 
Она настроена в конфиге по умолчанию так, что только считает использование ресурсов, но никак их не ограничивает. Квота может называться как угодно. Одна и та же квота может быть указана для разных пользователей, в этом случае подсчёт использования ресурсов делается для каждого пользователя по отдельности. +Далее указывается используемый профиль настроек пользователя (смотрите раздел «[Профили настроек](settings/settings_profiles.md)»). Вы можете указать профиль по умолчанию - `default`. Профиль может называться как угодно; один и тот же профиль может быть указан для разных пользователей. Наиболее важная вещь, которую вы можете прописать в профиле настроек `readonly=1`, что обеспечивает доступ только на чтение. +Затем указывается используемая квота (смотрите раздел «[Квоты](quotas.md#quotas)»). Вы можете указать квоту по умолчанию — `default`. Она настроена в конфиге по умолчанию так, что только считает использование ресурсов, но никак их не ограничивает. Квота может называться как угодно. Одна и та же квота может быть указана для разных пользователей, в этом случае подсчёт использования ресурсов делается для каждого пользователя по отдельности. Также, в необязательном разделе `` можно указать перечень баз, к которым у пользователя будет доступ. По умолчанию пользователю доступны все базы. Можно указать базу данных `default`, в этом случае пользователь получит доступ к базе данных по умолчанию. diff --git a/docs/ru/operations/backup.md b/docs/ru/operations/backup.md index 04875c3f35d..dd01ea9057f 100644 --- a/docs/ru/operations/backup.md +++ b/docs/ru/operations/backup.md @@ -1,6 +1,6 @@ -# Резервное копирование данных +# Резервное копирование данных {#rezervnoe-kopirovanie-dannykh} -[Репликация](table_engines/replication.md) обеспечивает защиту от аппаратных сбоев, но не защищает от человеческих ошибок: случайного удаления данных, удаления не той таблицы, которую надо было, или таблицы на не том кластере, а также программных ошибок, которые приводят к неправильной обработке данных или их повреждению. Во многих случаях подобные ошибки влияют на все реплики. ClickHouse имеет встроенные средства защиты для предотвращения некоторых типов ошибок — например, по умолчанию [не получится удалить таблицы *MergeTree, содержащие более 50 Гб данных, одной командой](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/dbms/programs/server/config.xml#L322-L330). Однако эти средства защиты не охватывают все возможные случаи и могут быть обойдены. +[Репликация](table_engines/replication.md) обеспечивает защиту от аппаратных сбоев, но не защищает от человеческих ошибок: случайного удаления данных, удаления не той таблицы, которую надо было, или таблицы на не том кластере, а также программных ошибок, которые приводят к неправильной обработке данных или их повреждению. Во многих случаях подобные ошибки влияют на все реплики. ClickHouse имеет встроенные средства защиты для предотвращения некоторых типов ошибок — например, по умолчанию [не получится удалить таблицы \*MergeTree, содержащие более 50 Гб данных, одной командой](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/dbms/programs/server/config.xml#L322-L330). Однако эти средства защиты не охватывают все возможные случаи и могут быть обойдены. Для того чтобы эффективно уменьшить возможные человеческие ошибки, следует тщательно подготовить стратегию резервного копирования и восстановления данных **заранее**. @@ -9,21 +9,21 @@ !!! 
note "Примечание" Имейте в виду, что если вы создали резервную копию чего-то и никогда не пытались восстановить её, скорее всего, восстановление не будет работать должным образом, когда вам это действительно понадобится (или, по крайней мере, это займет больше времени, чем будет приемлемо для бизнеса). Поэтому, какой бы подход к резервному копированию вы ни выбрали, обязательно автоматизируйте процесс восстановления и регулярно запускайте его на резервном кластере ClickHouse. -## Дублирование данных +## Дублирование данных {#dublirovanie-dannykh} Часто данные, которые поступают в ClickHouse, доставляются через некоторую отказоустойчивую очередь, например [Apache Kafka](https://kafka.apache.org). В этом случае можно настроить дополнительный набор подписчиков, которые будут считывать один и тот же поток данных во время записи в ClickHouse и хранить его в холодном хранилище. Большинство компаний уже имеют некоторые рекомендуемые по умолчанию холодные хранилища, которые могут быть хранилищем объектов или распределенной файловой системой, например [HDFS](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html). -## Снимки файловой системы +## Снимки файловой системы {#snimki-failovoi-sistemy} Некоторые локальные файловые системы позволяют делать снимки (например, [ZFS](https://en.wikipedia.org/wiki/ZFS)), но они могут быть не лучшим выбором для обслуживания живых запросов. Возможным решением является создание дополнительных реплик с такой файловой системой и исключение их из [Distributed](table_engines/distributed.md) таблиц, используемых для запросов `SELECT`. Снимки на таких репликах будут недоступны для запросов, изменяющих данные. В качестве бонуса, эти реплики могут иметь особые конфигурации оборудования с большим количеством дисков, подключенных к серверу, что будет экономически эффективным. -## clickhouse-copier +## clickhouse-copier {#clickhouse-copier} [clickhouse-copier](utils/clickhouse-copier.md) — это универсальный инструмент, который изначально был создан для перешардирования таблиц с петабайтами данных. Его также можно использовать для резервного копирования и восстановления, поскольку он надёжно копирует данные между таблицами и кластерами ClickHouse. Для небольших объёмов данных можно применять `INSERT INTO ... SELECT ...` в удалённые таблицы. -## Манипуляции с партициями +## Манипуляции с партициями {#manipuliatsii-s-partitsiiami} ClickHouse позволяет использовать запрос `ALTER TABLE ... FREEZE PARTITION ...` для создания локальной копии партиций таблицы. Это реализуется с помощью жестких ссылок (hardlinks) на каталог `/var/lib/clickhouse/shadow/`, поэтому такая копия обычно не занимает дополнительное место на диске для старых данных. Созданные копии файлов не обрабатываются сервером ClickHouse, поэтому вы можете просто оставить их там: у вас будет простая резервная копия, которая не требует дополнительной внешней системы, однако при аппаратных проблемах вы можете утратить и актуальные данные и сохраненную копию. По этой причине, лучше удаленно скопировать их в другое место, а затем удалить локальную копию. Распределенные файловые системы и хранилища объектов по-прежнему являются хорошими вариантами для этого, однако можно использовать и обычные присоединенные файловые серверы с достаточно большой ёмкостью (в этом случае передача будет происходить через сетевую файловую систему или, возможно, [rsync](https://en.wikipedia.org/wiki/Rsync)). 
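A minimal sketch of this procedure, assuming a hypothetical table `db.visits` partitioned by month and a hypothetical backup host `backup-host` (all paths and names below are illustrative, not from the original text):

``` bash
# Hedged sketch: freeze one partition, then copy the hard-link copy that
# appears under /var/lib/clickhouse/shadow/ to remote storage.
clickhouse-client --query "ALTER TABLE db.visits FREEZE PARTITION 201902"

# Ship the frozen files off the server (the rsync target is an assumption).
rsync -a /var/lib/clickhouse/shadow/ backup-host:/backups/visits/$(date +%F)/

# After the remote copy has been verified, the local copy can be deleted.
rm -rf /var/lib/clickhouse/shadow/*
```

To restore, the saved files can be copied back into the table's `detached/` directory and attached with `ALTER TABLE ... ATTACH PARTITION` (see the ALTER documentation for details).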
@@ -31,4 +31,4 @@ ClickHouse позволяет использовать запрос `ALTER TABLE Для автоматизации этого подхода доступен инструмент от сторонних разработчиков: [clickhouse-backup](https://github.com/AlexAkulov/clickhouse-backup). -[Оригинальная статья ](https://clickhouse.tech/docs/ru/operations/backup/) +[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/backup/) diff --git a/docs/ru/operations/configuration_files.md b/docs/ru/operations/configuration_files.md index e5fc0fc0cec..88d165e9637 100644 --- a/docs/ru/operations/configuration_files.md +++ b/docs/ru/operations/configuration_files.md @@ -1,4 +1,4 @@ -# Конфигурационные файлы {#configuration_files} +# Конфигурационные файлы {#configuration-files} Основной конфигурационный файл сервера - `config.xml`. Он расположен в директории `/etc/clickhouse-server/`. @@ -12,17 +12,19 @@ Если указано `remove` - удалить элемент. -Также в конфиге могут быть указаны "подстановки". Если у элемента присутствует атрибут `incl`, то в качестве значения будет использована соответствующая подстановка из файла. По умолчанию, путь к файлу с подстановками - `/etc/metrika.xml`. Он может быть изменён в конфигурации сервера в элементе [include_from](server_settings/settings.md#server_settings-include_from). Значения подстановок указываются в элементах `/yandex/имя_подстановки` этого файла. Если подстановка, заданная в `incl` отсутствует, то в лог попадает соответствующая запись. Чтобы ClickHouse не писал в лог об отсутствии подстановки, необходимо указать атрибут `optional="true"` (например, настройка [macros](server_settings/settings.md)). +Также в конфиге могут быть указаны «подстановки». Если у элемента присутствует атрибут `incl`, то в качестве значения будет использована соответствующая подстановка из файла. По умолчанию, путь к файлу с подстановками - `/etc/metrika.xml`. Он может быть изменён в конфигурации сервера в элементе [include\_from](server_settings/settings.md#server_settings-include_from). Значения подстановок указываются в элементах `/yandex/имя_подстановки` этого файла. Если подстановка, заданная в `incl` отсутствует, то в лог попадает соответствующая запись. Чтобы ClickHouse не писал в лог об отсутствии подстановки, необходимо указать атрибут `optional="true"` (например, настройка [macros](server_settings/settings.md)). Подстановки могут также выполняться из ZooKeeper. Для этого укажите у элемента атрибут `from_zk = "/path/to/node"`. Значение элемента заменится на содержимое узла `/path/to/node` в ZooKeeper. В ZooKeeper-узел также можно положить целое XML-поддерево, оно будет целиком вставлено в исходный элемент. -В `config.xml` может быть указан отдельный конфиг с настройками пользователей, профилей и квот. Относительный путь к нему указывается в элементе users_config. По умолчанию - `users.xml`. Если `users_config` не указан, то настройки пользователей, профилей и квот, указываются непосредственно в `config.xml`. +В `config.xml` может быть указан отдельный конфиг с настройками пользователей, профилей и квот. Относительный путь к нему указывается в элементе users\_config. По умолчанию - `users.xml`. Если `users_config` не указан, то настройки пользователей, профилей и квот, указываются непосредственно в `config.xml`. Для `users_config` могут также существовать переопределения в файлах из директории `users_config.d` (например, `users.d`) и подстановки. 
Например, можно иметь по отдельному конфигурационному файлу для каждого пользователя: -```bash + +``` bash $ cat /etc/clickhouse-server/users.d/alice.xml ``` -```xml + +``` xml diff --git a/docs/ru/operations/index.md b/docs/ru/operations/index.md index 7e936a4f063..2e85f1bf816 100644 --- a/docs/ru/operations/index.md +++ b/docs/ru/operations/index.md @@ -1,20 +1,20 @@ -# Эксплуатация +# Эксплуатация {#ekspluatatsiia} Руководство по эксплуатации ClickHouse состоит из следующих основных разделов: - - [Требования](requirements.md) - - [Мониторинг](monitoring.md) - - [Решение проблем](troubleshooting.md) - - [Советы по эксплуатации](tips.md) - - [Процедура обновления](update.md) - - [Права доступа](access_rights.md) - - [Резервное копирование](backup.md) - - [Конфигурационные файлы](configuration_files.md) - - [Квоты](quotas.md) - - [Системные таблицы](system_tables.md) - - [Конфигурационные параметры сервера](server_settings/index.md) - - [Тестирование севреров с помощью ClickHouse](performance_test.md) - - [Настройки](settings/index.md) - - [Утилиты](utils/index.md) + - [Требования](requirements.md) + - [Мониторинг](monitoring.md) + - [Решение проблем](troubleshooting.md) + - [Советы по эксплуатации](tips.md) + - [Процедура обновления](update.md) + - [Права доступа](access_rights.md) + - [Резервное копирование](backup.md) + - [Конфигурационные файлы](configuration_files.md) + - [Квоты](quotas.md) + - [Системные таблицы](system_tables.md) + - [Конфигурационные параметры сервера](server_settings/index.md) + - [Тестирование севреров с помощью ClickHouse](performance_test.md) + - [Настройки](settings/index.md) + - [Утилиты](utils/index.md) [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/) diff --git a/docs/ru/operations/monitoring.md b/docs/ru/operations/monitoring.md index 4467ef134d6..8681261bf6f 100644 --- a/docs/ru/operations/monitoring.md +++ b/docs/ru/operations/monitoring.md @@ -1,23 +1,23 @@ -# Мониторинг +# Мониторинг {#monitoring} Вы можете отслеживать: -- Использование аппаратных ресурсов. -- Метрики сервера ClickHouse. +- Использование аппаратных ресурсов. +- Метрики сервера ClickHouse. -## Использование ресурсов +## Использование ресурсов {#ispolzovanie-resursov} ClickHouse не отслеживает состояние аппаратных ресурсов самостоятельно. Рекомендуем контролировать: -- Загрузку и температуру процессоров. +- Загрузку и температуру процессоров. - Можно использовать [dmesg](https://en.wikipedia.org/wiki/Dmesg), [turbostat](https://www.linux.org/docs/man8/turbostat.html) или другие инструменты. + Можно использовать [dmesg](https://en.wikipedia.org/wiki/Dmesg), [turbostat](https://www.linux.org/docs/man8/turbostat.html) или другие инструменты. -- Использование системы хранения, оперативной памяти и сети. +- Использование системы хранения, оперативной памяти и сети. -## Метрики сервера ClickHouse +## Метрики сервера ClickHouse {#metriki-servera-clickhouse} Сервер ClickHouse имеет встроенные инструменты мониторинга. @@ -25,13 +25,13 @@ ClickHouse не отслеживает состояние аппаратных ClickHouse собирает: -- Различные метрики того, как сервер использует вычислительные ресурсы. -- Общую статистику обработки запросов. +- Различные метрики того, как сервер использует вычислительные ресурсы. +- Общую статистику обработки запросов. -Метрики находятся в таблицах [system.metrics](system_tables.md#system_tables-metrics), [system.events](system_tables.md#system_tables-events) и [system.asynchronous_metrics](system_tables.md#system_tables-asynchronous_metrics). 
+Метрики находятся в таблицах [system.metrics](system_tables.md#system_tables-metrics), [system.events](system_tables.md#system_tables-events) и [system.asynchronous\_metrics](system_tables.md#system_tables-asynchronous_metrics). Можно настроить экспорт метрик из ClickHouse в [Graphite](https://github.com/graphite-project). Смотрите секцию [graphite](server_settings/settings.md#server_settings-graphite) конфигурационного файла ClickHouse. Перед настройкой экспорта метрик необходимо настроить Graphite, как указано в [официальном руководстве](https://graphite.readthedocs.io/en/latest/install.html). Также, можно отслеживать доступность сервера через HTTP API. Отправьте `HTTP GET` к ресурсу `/ping`. Если сервер доступен, он отвечает `200 OK`. -Для мониторинга серверов в кластерной конфигурации необходимо установить параметр [max_replica_delay_for_distributed_queries](settings/settings.md#settings-max_replica_delay_for_distributed_queries) и использовать HTTP ресурс `/replicas_status`. Если реплика доступна и не отстаёт от других реплик, то запрос к `/replicas_status` возвращает `200 OK`. Если реплика отстаёт, то запрос возвращает `503 HTTP_SERVICE_UNAVAILABLE`, включая информацию о размере отставания. +Для мониторинга серверов в кластерной конфигурации необходимо установить параметр [max\_replica\_delay\_for\_distributed\_queries](settings/settings.md#settings-max_replica_delay_for_distributed_queries) и использовать HTTP ресурс `/replicas_status`. Если реплика доступна и не отстаёт от других реплик, то запрос к `/replicas_status` возвращает `200 OK`. Если реплика отстаёт, то запрос возвращает `503 HTTP_SERVICE_UNAVAILABLE`, включая информацию о размере отставания. diff --git a/docs/ru/operations/performance/sampling_query_profiler_example_result.txt b/docs/ru/operations/performance/sampling_query_profiler_example_result.txt new file mode 120000 index 00000000000..58c5abe7122 --- /dev/null +++ b/docs/ru/operations/performance/sampling_query_profiler_example_result.txt @@ -0,0 +1 @@ +../../../en/operations/performance/sampling_query_profiler_example_result.txt \ No newline at end of file diff --git a/docs/ru/operations/quotas.md b/docs/ru/operations/quotas.md index 46750231161..f109b889350 100644 --- a/docs/ru/operations/quotas.md +++ b/docs/ru/operations/quotas.md @@ -3,16 +3,16 @@ Квоты позволяют ограничить использование ресурсов за некоторый интервал времени, или просто подсчитывать использование ресурсов. Квоты настраиваются в конфиге пользователей. Обычно это users.xml. -В системе есть возможность ограничить сложность одного запроса. Для этого смотрите раздел "Ограничения на сложность запроса". +В системе есть возможность ограничить сложность одного запроса. Для этого смотрите раздел «Ограничения на сложность запроса». В отличие от них, квоты: -- ограничивают не один запрос, а множество запросов, которые могут быть выполнены за интервал времени; -- при распределённой обработке запроса, учитывают ресурсы, потраченные на всех удалённых серверах. +- ограничивают не один запрос, а множество запросов, которые могут быть выполнены за интервал времени; +- при распределённой обработке запроса, учитывают ресурсы, потраченные на всех удалённых серверах. Рассмотрим фрагмент файла users.xml, описывающего квоты. -```xml +``` xml @@ -35,7 +35,7 @@ Видно, что квота по умолчанию просто считает использование ресурсов за каждый час, но не ограничивает их. Подсчитанное использование ресурсов за каждый интервал, выводится в лог сервера после каждого запроса. 
-```xml +``` xml @@ -79,9 +79,9 @@ Если за хотя бы один интервал, ограничение превышено, то кидается исключение с текстом о том, какая величина превышена, за какой интервал, и когда начнётся новый интервал (когда снова можно будет задавать запросы). -Для квоты может быть включена возможность указывать "ключ квоты", чтобы производить учёт ресурсов для многих ключей независимо. Рассмотрим это на примере: +Для квоты может быть включена возможность указывать «ключ квоты», чтобы производить учёт ресурсов для многих ключей независимо. Рассмотрим это на примере: -```xml +``` xml format_schemas/ ``` -## graphite {#server_settings-graphite} +## graphite {#server-settings-graphite} Отправка данных в [Graphite](https://github.com/graphite-project). Настройки: -- host – Сервер Graphite. -- port – Порт сервера Graphite. -- interval – Период отправки в секундах. -- timeout – Таймаут отправки данных в секундах. -- root_path – Префикс для ключей. -- metrics – Отправка данных из таблицы [system.metrics](../system_tables.md#system_tables-metrics). -- events – Отправка дельты данных, накопленной за промежуток времени из таблицы [system.events](../system_tables.md#system_tables-events). -- events_cumulative – Отправка суммарных данных из таблицы [system.events](../system_tables.md#system_tables-events). -- asynchronous_metrics – Отправка данных из таблицы [system.asynchronous_metrics](../system_tables.md#system_tables-asynchronous_metrics). +- host – Сервер Graphite. +- port – Порт сервера Graphite. +- interval – Период отправки в секундах. +- timeout – Таймаут отправки данных в секундах. +- root\_path – Префикс для ключей. +- metrics – Отправка данных из таблицы [system.metrics](../system_tables.md#system_tables-metrics). +- events – Отправка дельты данных, накопленной за промежуток времени из таблицы [system.events](../system_tables.md#system_tables-events). +- events\_cumulative – Отправка суммарных данных из таблицы [system.events](../system_tables.md#system_tables-events). +- asynchronous\_metrics – Отправка данных из таблицы [system.asynchronous\_metrics](../system_tables.md#system_tables-asynchronous_metrics). Можно определить несколько секций ``, например, для передачи различных данных с различной частотой. **Пример** -```xml +``` xml localhost 42000 @@ -162,8 +156,7 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat ``` - -## graphite_rollup {#server_settings-graphite_rollup} +## graphite\_rollup {#server-settings-graphite-rollup} Настройка прореживания данных для Graphite. @@ -171,7 +164,7 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat **Пример** -```xml +``` xml max @@ -191,8 +184,7 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat ``` - -## http_port/https_port +## http\_port/https\_port {#http-porthttps-port} Порт для обращений к серверу по протоколу HTTP(s). @@ -202,51 +194,48 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat **Пример** -```xml +``` xml 0000 ``` - -## http_server_default_response {#server_settings-http_server_default_response} +## http\_server\_default\_response {#server-settings-http-server-default-response} Страница, показываемая по умолчанию, при обращении к HTTP(s) серверу ClickHouse. -Значение по умолчанию "Ok." (с переводом строки на конце). +Значение по умолчанию «Ok.» (с переводом строки на конце). **Пример** Показывает `https://tabix.io/` при обращении к `http://localhost:http_port`. -```xml +``` xml
]]>
``` -## include_from {#server_settings-include_from} +## include\_from {#server-settings-include-from} Путь к файлу с подстановками. -Подробности смотрите в разделе "[Конфигурационный файлы](../configuration_files.md#configuration_files)". +Подробности смотрите в разделе «[Конфигурационный файлы](../configuration_files.md#configuration_files)». **Пример** -```xml +``` xml /etc/metrica.xml ``` - -## interserver_http_port +## interserver\_http\_port {#interserver-http-port} Порт для обмена между серверами ClickHouse. **Пример** -```xml +``` xml 9009 ``` - -## interserver_http_host +## interserver\_http\_host {#interserver-http-host} Имя хоста, которое могут использовать другие серверы для обращения к этому. @@ -256,69 +245,65 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat **Пример** -```xml +``` xml example.yandex.ru ``` +## interserver\_http\_credentials {#server-settings-interserver-http-credentials} -## interserver_http_credentials {#server-settings-interserver_http_credentials} - -Имя пользователя и пароль, использующиеся для аутентификации при [репликации](../table_engines/replication.md) движками Replicated*. Это имя пользователя и пароль используются только для взаимодействия между репликами кластера и никак не связаны с аутентификацией клиентов ClickHouse. Сервер проверяет совпадение имени и пароля для соединяющихся с ним реплик, а также использует это же имя и пароль для соединения с другими репликами. Соответственно, эти имя и пароль должны быть прописаны одинаковыми для всех реплик кластера. +Имя пользователя и пароль, использующиеся для аутентификации при [репликации](../table_engines/replication.md) движками Replicated\*. Это имя пользователя и пароль используются только для взаимодействия между репликами кластера и никак не связаны с аутентификацией клиентов ClickHouse. Сервер проверяет совпадение имени и пароля для соединяющихся с ним реплик, а также использует это же имя и пароль для соединения с другими репликами. Соответственно, эти имя и пароль должны быть прописаны одинаковыми для всех реплик кластера. По умолчанию аутентификация не используется. Раздел содержит следующие параметры: -- `user` — имя пользователя. -- `password` — пароль. +- `user` — имя пользователя. +- `password` — пароль. **Пример конфигурации** -```xml +``` xml admin 222 ``` - -## keep_alive_timeout +## keep\_alive\_timeout {#keep-alive-timeout} Время в секундах, в течение которого ClickHouse ожидает входящих запросов прежде, чем закрыть соединение. **Пример** -```xml +``` xml 3 ``` - -## listen_host {#server_settings-listen_host} +## listen\_host {#server-settings-listen-host} Ограничение по хостам, с которых может прийти запрос. Если необходимо, чтобы сервер отвечал всем, то надо указать `::`. Примеры: -```xml +``` xml ::1 127.0.0.1 ``` - -## logger {#server_settings-logger} +## logger {#server-settings-logger} Настройки логирования. Ключи: -- level - Уровень логирования. Допустимые значения: ``trace``, ``debug``, ``information``, ``warning``, ``error``. -- log - Файл лога. Содержит все записи согласно ``level``. -- errorlog - Файл лога ошибок. -- size - Размер файла. Действует для ``log`` и ``errorlog``. Как только файл достиг размера ``size``, ClickHouse архивирует и переименовывает его, а на его месте создает новый файл лога. -- count - Количество заархивированных файлов логов, которые сохраняет ClickHouse. +- level - Уровень логирования. Допустимые значения: `trace`, `debug`, `information`, `warning`, `error`. +- log - Файл лога. Содержит все записи согласно `level`. 
+- errorlog - Файл лога ошибок. +- size - Размер файла. Действует для `log` и `errorlog`. Как только файл достиг размера `size`, ClickHouse архивирует и переименовывает его, а на его месте создает новый файл лога. +- count - Количество заархивированных файлов логов, которые сохраняет ClickHouse. **Пример** -```xml +``` xml trace /var/log/clickhouse-server/clickhouse-server.log @@ -329,7 +314,8 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat ``` Также, существует поддержка записи в syslog. Пример конфига: -```xml + +``` xml 1 @@ -342,68 +328,61 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat ``` Ключи: -- use_syslog - обязательная настройка, если требуется запись в syslog -- address - хост[:порт] демона syslogd. Если не указан, используется локальный +- use\_syslog - обязательная настройка, если требуется запись в syslog +- address - хост\[:порт\] демона syslogd. Если не указан, используется локальный - hostname - опционально, имя хоста, с которого отсылаются логи - facility - [категория syslog](https://en.wikipedia.org/wiki/Syslog#Facility), -записанная в верхнем регистре, с префиксом "LOG_": (``LOG_USER``, ``LOG_DAEMON``, ``LOG_LOCAL3`` и прочие). -Значения по умолчанию: при указанном ``address`` - ``LOG_USER``, иначе - ``LOG_DAEMON`` -- format - формат сообщений. Возможные значения - ``bsd`` и ``syslog`` +записанная в верхнем регистре, с префиксом «LOG\_»: (`LOG_USER`, `LOG_DAEMON`, `LOG_LOCAL3` и прочие). +Значения по умолчанию: при указанном `address` - `LOG_USER`, иначе - `LOG_DAEMON` +- format - формат сообщений. Возможные значения - `bsd` и `syslog` - - -## macros +## macros {#macros} Подстановки параметров реплицируемых таблиц. Можно не указывать, если реплицируемых таблицы не используются. -Подробнее смотрите в разделе "[Создание реплицируемых таблиц](../../operations/table_engines/replication.md)". +Подробнее смотрите в разделе «[Создание реплицируемых таблиц](../../operations/table_engines/replication.md)». **Пример** -```xml +``` xml ``` - -## mark_cache_size {#server-mark-cache-size} +## mark\_cache\_size {#server-mark-cache-size} Приблизительный размер (в байтах) кэша засечек, используемых движками таблиц семейства [MergeTree](../table_engines/mergetree.md). Кэш общий для сервера, память выделяется по мере необходимости. - **Пример** -```xml +``` xml 5368709120 ``` - -## max_concurrent_queries +## max\_concurrent\_queries {#max-concurrent-queries} Максимальное количество одновременно обрабатываемых запросов. **Пример** -```xml +``` xml 100 ``` - -## max_connections +## max\_connections {#max-connections} Максимальное количество входящих соединений. **Пример** -```xml +``` xml 4096 ``` - -## max_open_files +## max\_open\_files {#max-open-files} Максимальное количество открытых файлов. @@ -413,12 +392,11 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat **Пример** -```xml +``` xml 262144 ``` - -## max_table_size_to_drop +## max\_table\_size\_to\_drop {#max-table-size-to-drop} Ограничение на удаление таблиц. @@ -432,12 +410,11 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat **Пример** -```xml +``` xml 0 ``` - -## merge_tree {#server_settings-merge_tree} +## merge\_tree {#server-settings-merge-tree} Тонкая настройка таблиц семейства [MergeTree](../table_engines/mergetree.md). @@ -445,45 +422,44 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat **Пример** -```xml +``` xml 5 ``` - -## openSSL {#server_settings-openssl} +## openSSL {#server-settings-openssl} Настройки клиента/сервера SSL. 
-Поддержку SSL обеспечивает библиотека ``libpoco``. Описание интерфейса находится в файле [SSLManager.h](https://github.com/ClickHouse-Extras/poco/blob/master/NetSSL_OpenSSL/include/Poco/Net/SSLManager.h) +Поддержку SSL обеспечивает библиотека `libpoco`. Описание интерфейса находится в файле [SSLManager.h](https://github.com/ClickHouse-Extras/poco/blob/master/NetSSL_OpenSSL/include/Poco/Net/SSLManager.h) Ключи настроек сервера/клиента: -- privateKeyFile - Путь к файлу с секретным ключом сертификата в формате PEM. Файл может содержать ключ и сертификат одновременно. -- certificateFile - Путь к файлу сертификата клиента/сервера в формате PEM. Можно не указывать, если ``privateKeyFile`` содержит сертификат. -- caConfig - Путь к файлу или каталогу, которые содержат доверенные корневые сертификаты. -- verificationMode - Способ проверки сертификатов узла. Подробности находятся в описании класса [Context](https://github.com/ClickHouse-Extras/poco/blob/master/NetSSL_OpenSSL/include/Poco/Net/Context.h). Допустимые значения: ``none``, ``relaxed``, ``strict``, ``once``. -- verificationDepth - Максимальная длина верификационной цепи. Верификация завершится ошибкой, если длина цепи сертификатов превысит установленное значение. -- loadDefaultCAFile - Признак того, что будут использоваться встроенные CA-сертификаты для OpenSSL. Допустимые значения: ``true``, ``false``. | -- cipherList - Поддерживаемые OpenSSL-шифры. Например, ``ALL:!ADH:!LOW:!EXP:!MD5:@STRENGTH``. -- cacheSessions - Включение/выключение кеширования сессии. Использовать обязательно вместе с ``sessionIdContext``. Допустимые значения: ``true``, ``false``. -- sessionIdContext - Уникальный набор произвольных символов, которые сервер добавляет к каждому сгенерированному идентификатору. Длина строки не должна превышать ``SSL_MAX_SSL_SESSION_ID_LENGTH``. Рекомендуется к использованию всегда, поскольку позволяет избежать проблем как в случае, если сервер кеширует сессию, так и если клиент затребовал кеширование. По умолчанию ``${application.name}``. -- sessionCacheSize - Максимальное количество сессий, которые кэширует сервер. По умолчанию - 1024\*20. 0 - неограниченное количество сессий. -- sessionTimeout - Время кеширования сессии на сервере. -- extendedVerification - Автоматическая расширенная проверка сертификатов после завершении сессии. Допустимые значения: ``true``, ``false``. -- requireTLSv1 - Требование соединения TLSv1. Допустимые значения: ``true``, ``false``. -- requireTLSv1_1 - Требование соединения TLSv1.1. Допустимые значения: ``true``, ``false``. -- requireTLSv1_2 - Требование соединения TLSv1.2. Допустимые значения: ``true``, ``false``. -- fips - Активация режима OpenSSL FIPS. Поддерживается, если версия OpenSSL, с которой собрана библиотека поддерживает fips. -- privateKeyPassphraseHandler - Класс (подкласс PrivateKeyPassphraseHandler)запрашивающий кодовую фразу доступа к секретному ключу. Например, ````, ``KeyFileHandler``, ``test``, ````. -- invalidCertificateHandler - Класс (подкласс CertificateHandler) для подтверждения не валидных сертификатов. Например, `` ConsoleCertificateHandler ``. -- disableProtocols - Запрещенные к использованию протоколы. -- preferServerCiphers - Предпочтение серверных шифров на клиенте. +- privateKeyFile - Путь к файлу с секретным ключом сертификата в формате PEM. Файл может содержать ключ и сертификат одновременно. +- certificateFile - Путь к файлу сертификата клиента/сервера в формате PEM. Можно не указывать, если `privateKeyFile` содержит сертификат. 
+- caConfig - Путь к файлу или каталогу, которые содержат доверенные корневые сертификаты. +- verificationMode - Способ проверки сертификатов узла. Подробности находятся в описании класса [Context](https://github.com/ClickHouse-Extras/poco/blob/master/NetSSL_OpenSSL/include/Poco/Net/Context.h). Допустимые значения: `none`, `relaxed`, `strict`, `once`. +- verificationDepth - Максимальная длина верификационной цепи. Верификация завершится ошибкой, если длина цепи сертификатов превысит установленное значение. +- loadDefaultCAFile - Признак того, что будут использоваться встроенные CA-сертификаты для OpenSSL. Допустимые значения: `true`, `false`. \| +- cipherList - Поддерживаемые OpenSSL-шифры. Например, `ALL:!ADH:!LOW:!EXP:!MD5:@STRENGTH`. +- cacheSessions - Включение/выключение кеширования сессии. Использовать обязательно вместе с `sessionIdContext`. Допустимые значения: `true`, `false`. +- sessionIdContext - Уникальный набор произвольных символов, которые сервер добавляет к каждому сгенерированному идентификатору. Длина строки не должна превышать `SSL_MAX_SSL_SESSION_ID_LENGTH`. Рекомендуется к использованию всегда, поскольку позволяет избежать проблем как в случае, если сервер кеширует сессию, так и если клиент затребовал кеширование. По умолчанию `${application.name}`. +- sessionCacheSize - Максимальное количество сессий, которые кэширует сервер. По умолчанию - 1024\*20. 0 - неограниченное количество сессий. +- sessionTimeout - Время кеширования сессии на сервере. +- extendedVerification - Автоматическая расширенная проверка сертификатов после завершении сессии. Допустимые значения: `true`, `false`. +- requireTLSv1 - Требование соединения TLSv1. Допустимые значения: `true`, `false`. +- requireTLSv1\_1 - Требование соединения TLSv1.1. Допустимые значения: `true`, `false`. +- requireTLSv1\_2 - Требование соединения TLSv1.2. Допустимые значения: `true`, `false`. +- fips - Активация режима OpenSSL FIPS. Поддерживается, если версия OpenSSL, с которой собрана библиотека поддерживает fips. +- privateKeyPassphraseHandler - Класс (подкласс PrivateKeyPassphraseHandler)запрашивающий кодовую фразу доступа к секретному ключу. Например, ``, `KeyFileHandler`, `test`, ``. +- invalidCertificateHandler - Класс (подкласс CertificateHandler) для подтверждения не валидных сертификатов. Например, ` ConsoleCertificateHandler `. +- disableProtocols - Запрещенные к использованию протоколы. +- preferServerCiphers - Предпочтение серверных шифров на клиенте. **Пример настройки:** -```xml +``` xml @@ -511,24 +487,22 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat ``` - -## part_log {#server_settings-part-log} +## part\_log {#server-settings-part-log} Логирование событий, связанных с данными типа [MergeTree](../table_engines/mergetree.md). Например, события добавления или мержа данных. Лог можно использовать для симуляции алгоритмов слияния, чтобы сравнивать их характеристики. Также, можно визуализировать процесс слияния. -Запросы логируются не в отдельный файл, а в таблицу [system.part_log](../system_tables.md#system_tables-part-log). Вы можете изменить название этой таблицы в параметре `table` (см. ниже). +Запросы логируются не в отдельный файл, а в таблицу [system.part\_log](../system_tables.md#system_tables-part-log). Вы можете изменить название этой таблицы в параметре `table` (см. ниже). 
При настройке логирования используются следующие параметры: -- `database` — имя базы данных; -- `table` — имя таблицы; -- `partition_by` — устанавливает [произвольный ключ партиционирования](../../operations/table_engines/custom_partitioning_key.md); -- `flush_interval_milliseconds` — период сброса данных из буфера в памяти в таблицу. - +- `database` — имя базы данных; +- `table` — имя таблицы; +- `partition_by` — устанавливает [произвольный ключ партиционирования](../../operations/table_engines/custom_partitioning_key.md); +- `flush_interval_milliseconds` — период сброса данных из буфера в памяти в таблицу. **Пример** -```xml +``` xml system part_log
@@ -537,8 +511,7 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat
``` - -## path {#server_settings-path} +## path {#server-settings-path} Путь к каталогу с данными. @@ -547,29 +520,28 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat **Пример** -```xml +``` xml /var/lib/clickhouse/ ``` +## query\_log {#server-settings-query-log} -## query_log {#server_settings-query-log} +Настройка логирования запросов, принятых с настройкой [log\_queries=1](../settings/settings.md). -Настройка логирования запросов, принятых с настройкой [log_queries=1](../settings/settings.md). - -Запросы логируются не в отдельный файл, а в системную таблицу [system.query_log](../system_tables.md#system_tables-query-log). Вы можете изменить название этой таблицы в параметре `table` (см. ниже). +Запросы логируются не в отдельный файл, а в системную таблицу [system.query\_log](../system_tables.md#system_tables-query-log). Вы можете изменить название этой таблицы в параметре `table` (см. ниже). При настройке логирования используются следующие параметры: -- `database` — имя базы данных; -- `table` — имя таблицы, куда будет записываться лог; -- `partition_by` — [произвольный ключ партиционирования](../../operations/table_engines/custom_partitioning_key.md) для таблицы с логами; -- `flush_interval_milliseconds` — период сброса данных из буфера в памяти в таблицу. +- `database` — имя базы данных; +- `table` — имя таблицы, куда будет записываться лог; +- `partition_by` — [произвольный ключ партиционирования](../../operations/table_engines/custom_partitioning_key.md) для таблицы с логами; +- `flush_interval_milliseconds` — период сброса данных из буфера в памяти в таблицу. Если таблица не существует, то ClickHouse создаст её. Если структура журнала запросов изменилась при обновлении сервера ClickHouse, то таблица со старой структурой переименовывается, а новая таблица создается автоматически. **Пример** -```xml +``` xml system query_log
@@ -578,25 +550,24 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat
``` +## query\_thread\_log {#server-settings-query-thread-log} -## query_thread_log {#server_settings-query-thread-log} +Настройка логирования потоков выполнения запросов, принятых с настройкой [log\_query\_threads=1](../settings/settings.md#settings-log-query-threads). -Настройка логирования потоков выполнения запросов, принятых с настройкой [log_query_threads=1](../settings/settings.md#settings-log-query-threads). - -Запросы логируются не в отдельный файл, а в системную таблицу [system.query_thread_log](../system_tables.md#system_tables-query-thread-log). Вы можете изменить название этой таблицы в параметре `table` (см. ниже). +Запросы логируются не в отдельный файл, а в системную таблицу [system.query\_thread\_log](../system_tables.md#system_tables-query-thread-log). Вы можете изменить название этой таблицы в параметре `table` (см. ниже). При настройке логирования используются следующие параметры: -- `database` — имя базы данных; -- `table` — имя таблицы, куда будет записываться лог; -- `partition_by` — [произвольный ключ партиционирования](../../operations/table_engines/custom_partitioning_key.md) для таблицы с логами; -- `flush_interval_milliseconds` — период сброса данных из буфера в памяти в таблицу. +- `database` — имя базы данных; +- `table` — имя таблицы, куда будет записываться лог; +- `partition_by` — [произвольный ключ партиционирования](../../operations/table_engines/custom_partitioning_key.md) для таблицы с логами; +- `flush_interval_milliseconds` — период сброса данных из буфера в памяти в таблицу. Если таблица не существует, то ClickHouse создаст её. Если структура журнала запросов изменилась при обновлении сервера ClickHouse, то таблица со старой структурой переименовывается, а новая таблица создается автоматически. **Пример** -```xml +``` xml system query_thread_log
@@ -605,20 +576,20 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat
``` -## trace_log {#server_settings-trace_log} +## trace\_log {#server-settings-trace-log} -Settings for the [trace_log](../system_tables.md#system_tables-trace_log) system table operation. +Settings for the [trace\_log](../system_tables.md#system_tables-trace_log) system table operation. Parameters: -- `database` — Database for storing a table. -- `table` — Table name. -- `partition_by` — [Custom partitioning key](../../operations/table_engines/custom_partitioning_key.md) for a system table. -- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table. +- `database` — Database for storing a table. +- `table` — Table name. +- `partition_by` — [Custom partitioning key](../../operations/table_engines/custom_partitioning_key.md) for a system table. +- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table. The default server configuration file `config.xml` contains the following settings section: -```xml +``` xml system trace_log
@@ -627,24 +598,23 @@ The default server configuration file `config.xml` contains the following settin
``` -## remote_servers {#server_settings_remote_servers} +## remote\_servers {#server-settings-remote-servers} Конфигурация кластеров, которые использует движок таблиц [Distributed](../../operations/table_engines/distributed.md) и табличная функция `cluster`. **Пример** -```xml +``` xml ``` -Значение атрибута `incl` смотрите в разделе "[Конфигурационные файлы](../configuration_files.md#configuration_files)". - +Значение атрибута `incl` смотрите в разделе «[Конфигурационные файлы](../configuration_files.md#configuration_files)». **Смотрите также** - - [skip_unavailable_shards](../settings/settings.md#settings-skip_unavailable_shards) +- [skip\_unavailable\_shards](../settings/settings.md#settings-skip_unavailable_shards) -## timezone {#server_settings-timezone} +## timezone {#server-settings-timezone} Временная зона сервера. @@ -654,22 +624,21 @@ The default server configuration file `config.xml` contains the following settin **Пример** -```xml +``` xml Europe/Moscow ``` - -## tcp_port {#server_settings-tcp_port} +## tcp\_port {#server-settings-tcp-port} Порт для взаимодействия с клиентами по протоколу TCP. **Пример** -```xml +``` xml 9000 ``` -## tcp_port_secure {#server_settings-tcp_port_secure} +## tcp\_port\_secure {#server-settings-tcp-port-secure} TCP порт для защищённого обмена данными с клиентами. Используйте с настройкой [OpenSSL](#server_settings-openssl). @@ -679,21 +648,21 @@ TCP порт для защищённого обмена данными с кли **Значение по умолчанию** -```xml +``` xml 9440 ``` -## mysql_port {#server_settings-mysql_port} +## mysql\_port {#server-settings-mysql-port} Порт для взаимодействия с клиентами по протоколу MySQL. Пример -```xml +``` xml 9004 ``` -## tmp_path +## tmp\_path {#tmp-path} Путь ко временным данным для обработки больших запросов. @@ -702,52 +671,50 @@ TCP порт для защищённого обмена данными с кли **Пример** -```xml +``` xml /var/lib/clickhouse/tmp/ ``` - -## uncompressed_cache_size {#server-settings-uncompressed_cache_size} +## uncompressed\_cache\_size {#server-settings-uncompressed-cache-size} Размер кеша (в байтах) для несжатых данных, используемых движками таблиц семейства [MergeTree](../table_engines/mergetree.md). -Кеш единый для сервера. Память выделяется по требованию. Кеш используется в том случае, если включена опция [use_uncompressed_cache](../settings/settings.md). +Кеш единый для сервера. Память выделяется по требованию. Кеш используется в том случае, если включена опция [use\_uncompressed\_cache](../settings/settings.md). Несжатый кеш выгодно использовать для очень коротких запросов в отдельных случаях. **Пример** -```xml +``` xml 8589934592 ``` -## user_files_path {#server_settings-user_files_path} +## user\_files\_path {#server-settings-user-files-path} Каталог с пользовательскими файлами. Используется в табличной функции [file()](../../query_language/table_functions/file.md). **Пример** -```xml +``` xml /var/lib/clickhouse/user_files/ ``` - -## users_config +## users\_config {#users-config} Путь к файлу, который содержит: -- Конфигурации пользователей. -- Права доступа. -- Профили настроек. -- Настройки квот. +- Конфигурации пользователей. +- Права доступа. +- Профили настроек. +- Настройки квот. **Пример** -```xml +``` xml users.xml ``` -## zookeeper {#server-settings_zookeeper} +## zookeeper {#server-settings-zookeeper} Содержит параметры, позволяющие ClickHouse взаимодействовать с кластером [ZooKeeper](http://zookeeper.apache.org/). 
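Stepping back to the `remote_servers` example above, whose body is elided in the diff: a cluster definition referenced by the `incl` attribute usually has roughly this shape. The cluster and host names below are hypothetical.

``` xml
<remote_servers>
    <!-- hypothetical cluster: one shard with two replicas -->
    <example_cluster>
        <shard>
            <replica>
                <host>example01-1</host>
                <port>9000</port>
            </replica>
            <replica>
                <host>example01-2</host>
                <port>9000</port>
            </replica>
        </shard>
    </example_cluster>
</remote_servers>
```

Such a block can live either directly in `config.xml` or in the substituted file referenced by the `incl` attribute mentioned above.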
@@ -755,26 +722,28 @@ ClickHouse использует ZooKeeper для хранения метадан Раздел содержит следующие параметры: -- `node` — адрес ноды (сервера) ZooKeeper. Можно сконфигурировать несколько нод. +- `node` — адрес ноды (сервера) ZooKeeper. Можно сконфигурировать несколько нод. - Например: + Например: -```xml + + +``` xml example_host 2181 ``` - Атрибут `index` задает порядок опроса нод при попытках подключиться к кластеру ZooKeeper. + Атрибут `index` задает порядок опроса нод при попытках подключиться к кластеру ZooKeeper. -- `session_timeout` — максимальный таймаут клиентской сессии в миллисекундах. -- `root` — [znode](http://zookeeper.apache.org/doc/r3.5.5/zookeeperOver.html#Nodes+and+ephemeral+nodes), который используется как корневой для всех znode, которые использует сервер ClickHouse. Необязательный. -- `identity` — пользователь и пароль, которые может потребовать ZooKeeper для доступа к запрошенным znode. Необязательный. +- `session_timeout` — максимальный таймаут клиентской сессии в миллисекундах. +- `root` — [znode](http://zookeeper.apache.org/doc/r3.5.5/zookeeperOver.html#Nodes+and+ephemeral+nodes), который используется как корневой для всех znode, которые использует сервер ClickHouse. Необязательный. +- `identity` — пользователь и пароль, которые может потребовать ZooKeeper для доступа к запрошенным znode. Необязательный. **Пример конфигурации** -```xml +``` xml example1 @@ -794,45 +763,45 @@ ClickHouse использует ZooKeeper для хранения метадан **Смотрите также** -- [Репликация](../../operations/table_engines/replication.md) -- [ZooKeeper Programmer's Guide](http://zookeeper.apache.org/doc/current/zookeeperProgrammers.html) +- [Репликация](../../operations/table_engines/replication.md) +- [ZooKeeper Programmer’s Guide](http://zookeeper.apache.org/doc/current/zookeeperProgrammers.html) -## use_minimalistic_part_header_in_zookeeper {#server-settings-use_minimalistic_part_header_in_zookeeper} +## use\_minimalistic\_part\_header\_in\_zookeeper {#server-settings-use-minimalistic-part-header-in-zookeeper} Способ хранения заголовков кусков данных в ZooKeeper. Параметр применяется только к семейству таблиц `MergeTree`. Его можно установить: -- Глобально в разделе [merge_tree](#server_settings-merge_tree) файла `config.xml`. +- Глобально в разделе [merge\_tree](#server_settings-merge_tree) файла `config.xml`. - ClickHouse использует этот параметр для всех таблиц на сервере. Вы можете изменить настройку в любое время. Существующие таблицы изменяют свое поведение при изменении параметра. + ClickHouse использует этот параметр для всех таблиц на сервере. Вы можете изменить настройку в любое время. Существующие таблицы изменяют свое поведение при изменении параметра. -- Для каждой отдельной таблицы. +- Для каждой отдельной таблицы. - При создании таблицы укажите соответствующую [настройку движка](../table_engines/mergetree.md#table_engine-mergetree-creating-a-table). Поведение существующей таблицы с установленным параметром не изменяется даже при изменении глобального параметра. + При создании таблицы укажите соответствующую [настройку движка](../table_engines/mergetree.md#table_engine-mergetree-creating-a-table). Поведение существующей таблицы с установленным параметром не изменяется даже при изменении глобального параметра. **Возможные значения** -- 0 — функциональность выключена. -- 1 — функциональность включена. +- 0 — функциональность выключена. +- 1 — функциональность включена. 
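As a sketch of the global option described above, enabling the setting for every MergeTree table on the server via the `merge_tree` section of `config.xml` could look like this:

``` xml
<merge_tree>
    <!-- store part headers as a single compact znode -->
    <use_minimalistic_part_header_in_zookeeper>1</use_minimalistic_part_header_in_zookeeper>
</merge_tree>
```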
Если `use_minimalistic_part_header_in_zookeeper = 1`, то [реплицированные](../table_engines/replication.md) таблицы хранят заголовки кусков данных в компактном виде, используя только одну `znode`. Если таблица содержит много столбцов, этот метод хранения значительно уменьшает объём данных, хранящихся в Zookeeper. !!! attention "Внимание" После того как вы установили `use_minimalistic_part_header_in_zookeeper = 1`, невозможно откатить ClickHouse до версии, которая не поддерживает этот параметр. Будьте осторожны при обновлении ClickHouse на серверах в кластере. Не обновляйте все серверы сразу. Безопаснее проверять новые версии ClickHouse в тестовой среде или только на некоторых серверах кластера. - Заголовки частей данных, ранее сохранённые с этим параметром, не могут быть восстановлены в их предыдущем (некомпактном) представлении. + Заголовки частей данных, ранее сохранённые с этим параметром, не могут быть восстановлены в их предыдущем (некомпактном) представлении. **Значение по умолчанию**: 0. -## disable_internal_dns_cache {#server-settings-disable_internal_dns_cache} +## disable\_internal\_dns\_cache {#server-settings-disable-internal-dns-cache} Отключает внутренний кеш DNS записей. Используется при эксплуатации ClickHouse в системах с часто меняющейся инфраструктурой, таких как Kubernetes. **Значение по умолчанию**: 0. -## dns_cache_update_period {#server-settings-dns_cache_update_period} +## dns\_cache\_update\_period {#server-settings-dns-cache-update-period} Период обновления IP адресов у записей во внутреннем DNS кеше ClickHouse (в секундах). Обновление выполняется асинхронно, отдельным системным потоком. diff --git a/docs/ru/operations/settings/constraints_on_settings.md b/docs/ru/operations/settings/constraints_on_settings.md index 687c8d7f9a4..71ae9cb3bb4 100644 --- a/docs/ru/operations/settings/constraints_on_settings.md +++ b/docs/ru/operations/settings/constraints_on_settings.md @@ -1,9 +1,9 @@ -# Ограничения на изменение настроек +# Ограничения на изменение настроек {#ogranicheniia-na-izmenenie-nastroek} Ограничения на изменение настроек могут находиться внутри секции `profiles` файла `user.xml` и запрещают пользователю менять некоторые настройки с помощью запроса `SET`. Выглядит это следующим образом: -```xml +``` xml <имя_пользователя> @@ -31,7 +31,7 @@ **Пример:** Пусть файл `users.xml` содержит строки: -```xml +``` xml 10000000000 @@ -52,13 +52,13 @@ Каждый из следующих запросов кинет исключение: -```sql +``` sql SET max_memory_usage=20000000001; SET max_memory_usage=4999999999; SET force_index_by_date=1; ``` -```text +``` text Code: 452, e.displayText() = DB::Exception: Setting max_memory_usage should not be greater than 20000000000. Code: 452, e.displayText() = DB::Exception: Setting max_memory_usage should not be less than 5000000000. Code: 452, e.displayText() = DB::Exception: Setting force_index_by_date should not be changed. diff --git a/docs/ru/operations/settings/index.md b/docs/ru/operations/settings/index.md index 6c74054f933..f60b7837acc 100644 --- a/docs/ru/operations/settings/index.md +++ b/docs/ru/operations/settings/index.md @@ -5,19 +5,19 @@ Способы задания настроек, упорядоченные по приоритету: -- Настройки в конфигурационном файле сервера `users.xml`. +- Настройки в конфигурационном файле сервера `users.xml`. - Устанавливаются в элементе ``. + Устанавливаются в элементе ``. -- Настройки для сессии. +- Настройки для сессии. - Из консольного клиента ClickHouse в интерактивном режиме отправьте запрос `SET setting=value`. 
- Аналогично можно использовать ClickHouse-сессии в HTTP-протоколе, для этого необходимо указывать HTTP-параметр `session_id`. + Из консольного клиента ClickHouse в интерактивном режиме отправьте запрос `SET setting=value`. + Аналогично можно использовать ClickHouse-сессии в HTTP-протоколе, для этого необходимо указывать HTTP-параметр `session_id`. -- Настройки для запроса. +- Настройки для запроса. - - При запуске консольного клиента ClickHouse в не интерактивном режиме установите параметр запуска `--setting=value`. - - При использовании HTTP API передавайте cgi-параметры (`URL?setting_1=value&setting_2=value...`). + - При запуске консольного клиента ClickHouse в не интерактивном режиме установите параметр запуска `--setting=value`. + - При использовании HTTP API передавайте cgi-параметры (`URL?setting_1=value&setting_2=value...`). Настройки, которые можно задать только в конфигурационном файле сервера, в разделе не рассматриваются. diff --git a/docs/ru/operations/settings/permissions_for_queries.md b/docs/ru/operations/settings/permissions_for_queries.md index 04b5d798bab..b6ce51dbe59 100644 --- a/docs/ru/operations/settings/permissions_for_queries.md +++ b/docs/ru/operations/settings/permissions_for_queries.md @@ -1,21 +1,21 @@ -# Разрешения для запросов {#permissions_for_queries} +# Разрешения для запросов {#permissions-for-queries} Запросы в ClickHouse можно разделить на несколько типов: -1. Запросы на чтение данных: `SELECT`, `SHOW`, `DESCRIBE`, `EXISTS`. -1. Запросы за запись данных: `INSERT`, `OPTIMIZE`. -1. Запросы на изменение настроек: `SET`, `USE`. -1. [Запросы DDL](https://ru.wikipedia.org/wiki/Data_Definition_Language): `CREATE`, `ALTER`, `RENAME`, `ATTACH`, `DETACH`, `DROP` `TRUNCATE`. -1. `KILL QUERY`. +1. Запросы на чтение данных: `SELECT`, `SHOW`, `DESCRIBE`, `EXISTS`. +2. Запросы за запись данных: `INSERT`, `OPTIMIZE`. +3. Запросы на изменение настроек: `SET`, `USE`. +4. [Запросы DDL](https://ru.wikipedia.org/wiki/Data_Definition_Language): `CREATE`, `ALTER`, `RENAME`, `ATTACH`, `DETACH`, `DROP` `TRUNCATE`. +5. `KILL QUERY`. Разрешения пользователя по типу запроса регулируются параметрами: -- [readonly](#settings_readonly) — ограничивает разрешения для всех типов запросов, кроме DDL. -- [allow_ddl](#settings_allow_ddl) — ограничивает разрешения для DDL запросов. +- [readonly](#settings_readonly) — ограничивает разрешения для всех типов запросов, кроме DDL. +- [allow\_ddl](#settings_allow_ddl) — ограничивает разрешения для DDL запросов. `KILL QUERY` выполняется с любыми настройками. -## readonly {#settings_readonly} +## readonly {#settings-readonly} Ограничивает разрешения для запросов на чтение данных, запись данных и изменение параметров. @@ -23,9 +23,9 @@ **Возможные значения** -- 0 — разрешены все запросы. -- 1 — разрешены только запросы на чтение данных. -- 2 — разрешены запросы на чтение данных и изменение настроек. +- 0 — разрешены все запросы. +- 1 — разрешены только запросы на чтение данных. +- 2 — разрешены запросы на чтение данных и изменение настроек. После установки `readonly = 1` или `2` пользователь не может изменить настройки `readonly` и `allow_ddl` в текущей сессии. @@ -37,7 +37,7 @@ 0 -## allow_ddl {#settings_allow_ddl} +## allow\_ddl {#settings-allow-ddl} Разрешает/запрещает [DDL](https://ru.wikipedia.org/wiki/Data_Definition_Language) запросы. @@ -45,8 +45,8 @@ **Возможные значения** -- 0 — DDL запросы не разрешены. -- 1 — DDL запросы разрешены. +- 0 — DDL запросы не разрешены. +- 1 — DDL запросы разрешены. 
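A minimal sketch of how `readonly` and `allow_ddl` are typically pinned in a `users.xml` profile; the profile name is hypothetical:

``` xml
<profiles>
    <!-- hypothetical profile: read-only access, no DDL -->
    <readonly_profile>
        <readonly>1</readonly>
        <allow_ddl>0</allow_ddl>
    </readonly_profile>
</profiles>
```

With such a profile, an attempt to run `SET readonly = 0` in the same session is rejected, per the rules described above.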
Если `allow_ddl = 0`, то невозможно выполнить `SET allow_ddl = 1` для текущей сессии. diff --git a/docs/ru/operations/settings/query_complexity.md b/docs/ru/operations/settings/query_complexity.md index 01ab8745840..3c128c414bc 100644 --- a/docs/ru/operations/settings/query_complexity.md +++ b/docs/ru/operations/settings/query_complexity.md @@ -1,4 +1,4 @@ -# Ограничения на сложность запроса +# Ограничения на сложность запроса {#ogranicheniia-na-slozhnost-zaprosa} Ограничения на сложность запроса - часть настроек. Используются, чтобы обеспечить более безопасное исполнение запросов из пользовательского интерфейса. @@ -7,9 +7,9 @@ Ограничения проверяются на каждый блок обработанных данных, а не на каждую строку. В связи с этим, ограничения могут быть превышены на размер блока. -Ограничения вида "максимальное количество чего-нибудь" могут принимать значение 0, которое обозначает "не ограничено". -Для большинства ограничений также присутствует настройка вида overflow_mode - что делать, когда ограничение превышено. -Оно может принимать одно из двух значений: `throw` или `break`; а для ограничения на агрегацию (group_by_overflow_mode) есть ещё значение `any`. +Ограничения вида «максимальное количество чего-нибудь» могут принимать значение 0, которое обозначает «не ограничено». +Для большинства ограничений также присутствует настройка вида overflow\_mode - что делать, когда ограничение превышено. +Оно может принимать одно из двух значений: `throw` или `break`; а для ограничения на агрегацию (group\_by\_overflow\_mode) есть ещё значение `any`. `throw` - кинуть исключение (по умолчанию). @@ -17,7 +17,7 @@ `any (только для group_by_overflow_mode)` - продолжить агрегацию по ключам, которые успели войти в набор, но не добавлять новые ключи в набор. -## max_memory_usage {#settings_max_memory_usage} +## max\_memory\_usage {#settings-max-memory-usage} Максимальный возможный объём оперативной памяти для выполнения запроса на одном сервере. @@ -34,86 +34,87 @@ Потребление памяти ограничивается также параметрами `max_memory_usage_for_user` и `max_memory_usage_for_all_queries`. -## max_memory_usage_for_user +## max\_memory\_usage\_for\_user {#max-memory-usage-for-user} Максимальный возможный объём оперативной памяти для запросов пользователя на одном сервере. Значения по умолчанию определены в файле [Settings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Core/Settings.h#L288). По умолчанию размер не ограничен (`max_memory_usage_for_user = 0`). -Смотрите также описание настройки [max_memory_usage](#settings_max_memory_usage). +Смотрите также описание настройки [max\_memory\_usage](#settings_max_memory_usage). -## max_memory_usage_for_all_queries +## max\_memory\_usage\_for\_all\_queries {#max-memory-usage-for-all-queries} Максимальный возможный объём оперативной памяти для всех запросов на одном сервере. Значения по умолчанию определены в файле [Settings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Core/Settings.h#L289). По умолчанию размер не ограничен (`max_memory_usage_for_all_queries = 0`). -Смотрите также описание настройки [max_memory_usage](#settings_max_memory_usage). +Смотрите также описание настройки [max\_memory\_usage](#settings_max_memory_usage). -## max_rows_to_read +## max\_rows\_to\_read {#max-rows-to-read} Следующие ограничения могут проверяться на каждый блок (а не на каждую строку). То есть, ограничения могут быть немного нарушены. При выполнении запроса в несколько потоков, следующие ограничения действуют в каждом потоке по отдельности. 
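Before the read limits that follow, here is a sketch of how the memory limits above can be combined in a `users.xml` profile; the numbers are purely illustrative:

``` xml
<profiles>
    <default>
        <!-- per query / per user / whole server, in bytes -->
        <max_memory_usage>10000000000</max_memory_usage>
        <max_memory_usage_for_user>20000000000</max_memory_usage_for_user>
        <max_memory_usage_for_all_queries>30000000000</max_memory_usage_for_all_queries>
    </default>
</profiles>
```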
Максимальное количество строчек, которое можно прочитать из таблицы при выполнении запроса. -## max_bytes_to_read +## max\_bytes\_to\_read {#max-bytes-to-read} Максимальное количество байт (несжатых данных), которое можно прочитать из таблицы при выполнении запроса. -## read_overflow_mode +## read\_overflow\_mode {#read-overflow-mode} Что делать, когда количество прочитанных данных превысило одно из ограничений: throw или break. По умолчанию: throw. -## max_rows_to_group_by {#settings-max_rows_to_group_by} +## max\_rows\_to\_group\_by {#settings-max-rows-to-group-by} Максимальное количество уникальных ключей, получаемых в процессе агрегации. Позволяет ограничить потребление оперативки при агрегации. -## group_by_overflow_mode +## group\_by\_overflow\_mode {#group-by-overflow-mode} Что делать, когда количество уникальных ключей при агрегации превысило ограничение: throw, break или any. По умолчанию: throw. Использование значения any позволяет выполнить GROUP BY приближённо. Качество такого приближённого вычисления сильно зависит от статистических свойств данных. -## max_bytes_before_external_group_by {#settings-max_bytes_before_external_group_by} +## max\_bytes\_before\_external\_group\_by {#settings-max-bytes-before-external-group-by} Включает или отключает выполнение секций `GROUP BY` во внешней памяти. Смотрите [GROUP BY во внешней памяти](../../query_language/select.md#select-group-by-in-external-memory). Возможные значения: -- Максимальный объём RAM (в байтах), который может использовать отдельная операция [GROUP BY](../../query_language/select.md#select-group-by-clause). -- 0 — `GROUP BY` во внешней памяти отключен. +- Максимальный объём RAM (в байтах), который может использовать отдельная операция [GROUP BY](../../query_language/select.md#select-group-by-clause). +- 0 — `GROUP BY` во внешней памяти отключен. Значение по умолчанию — 0. -## max_rows_to_sort +## max\_rows\_to\_sort {#max-rows-to-sort} Максимальное количество строк до сортировки. Позволяет ограничить потребление оперативки при сортировке. -## max_bytes_to_sort +## max\_bytes\_to\_sort {#max-bytes-to-sort} Максимальное количество байт до сортировки. -## sort_overflow_mode +## sort\_overflow\_mode {#sort-overflow-mode} Что делать, если количество строк, полученное перед сортировкой, превысило одно из ограничений: throw или break. По умолчанию: throw. -## max_result_rows {#setting-max_result_rows} +## max\_result\_rows {#setting-max-result-rows} Ограничение на количество строк результата. Проверяются также для подзапросов и на удалённых серверах при выполнении части распределённого запроса. -## max_result_bytes +## max\_result\_bytes {#max-result-bytes} Ограничение на количество байт результата. Аналогично. -## result_overflow_mode +## result\_overflow\_mode {#result-overflow-mode} Что делать, если объём результата превысил одно из ограничений: throw или break. По умолчанию: throw. -Использование break по смыслу похоже на LIMIT. Break прерывает выполнение только на уровне блока. Т.е. число строк которые вернет запрос будет больше чем ограничение [max_result_rows](#setting-max_result_rows), кратно [max_block_size](settings.md#setting-max_block_size) и зависит от [max_threads](settings.md#settings-max_threads). +Использование break по смыслу похоже на LIMIT. Break прерывает выполнение только на уровне блока. Т.е. 
число строк которые вернет запрос будет больше чем ограничение [max\_result\_rows](#setting-max_result_rows), кратно [max\_block\_size](settings.md#setting-max_block_size) и зависит от [max\_threads](settings.md#settings-max_threads). Пример: -```sql -SET max_threads = 3, max_block_size = 3333; + +``` sql +SET max_threads = 3, max_block_size = 3333; SET max_result_rows = 3334, result_overflow_mode = 'break'; SELECT * @@ -123,165 +124,165 @@ FORMAT Null; Результат: -```text +``` text 6666 rows in set. ... ``` -## max_execution_time +## max\_execution\_time {#max-execution-time} Максимальное время выполнения запроса в секундах. На данный момент не проверяется при одной из стадий сортировки а также при слиянии и финализации агрегатных функций. -## timeout_overflow_mode +## timeout\_overflow\_mode {#timeout-overflow-mode} -Что делать, если запрос выполняется дольше max_execution_time: throw или break. По умолчанию: throw. +Что делать, если запрос выполняется дольше max\_execution\_time: throw или break. По умолчанию: throw. -## min_execution_speed +## min\_execution\_speed {#min-execution-speed} -Минимальная скорость выполнения запроса в строчках в секунду. Проверяется на каждый блок данных по истечении timeout_before_checking_execution_speed. Если скорость выполнения запроса оказывается меньше, то кидается исключение. +Минимальная скорость выполнения запроса в строчках в секунду. Проверяется на каждый блок данных по истечении timeout\_before\_checking\_execution\_speed. Если скорость выполнения запроса оказывается меньше, то кидается исключение. -## min_execution_speed_bytes +## min\_execution\_speed\_bytes {#min-execution-speed-bytes} -Минимальная скорость выполнения запроса в строках на байт. Он проверяется для каждого блока данных после timeout_before_checking_execution_speed. Если скорость выполнения запроса меньше, исключение. +Минимальная скорость выполнения запроса в строках на байт. Он проверяется для каждого блока данных после timeout\_before\_checking\_execution\_speed. Если скорость выполнения запроса меньше, исключение. -## max_execution_speed +## max\_execution\_speed {#max-execution-speed} -Максимальная скорость выполнения запроса в строках в секунду. Он проверяется для каждого блока данных после timeout_before_checking_execution_speed. Если скорость выполнения запроса выше, скорость будет снижена. +Максимальная скорость выполнения запроса в строках в секунду. Он проверяется для каждого блока данных после timeout\_before\_checking\_execution\_speed. Если скорость выполнения запроса выше, скорость будет снижена. -## max_execution_speed_bytes +## max\_execution\_speed\_bytes {#max-execution-speed-bytes} -Максимальная скорость выполнения запроса в байтах в секунду. Он проверяется для каждого блока данных после timeout_before_checking_execution_speed. Если скорость выполнения запроса выше, скорость будет снижена. +Максимальная скорость выполнения запроса в байтах в секунду. Он проверяется для каждого блока данных после timeout\_before\_checking\_execution\_speed. Если скорость выполнения запроса выше, скорость будет снижена. -## timeout_before_checking_execution_speed +## timeout\_before\_checking\_execution\_speed {#timeout-before-checking-execution-speed} -Проверять, что скорость выполнения запроса не слишком низкая (не меньше min_execution_speed), после прошествия указанного времени в секундах. +Проверять, что скорость выполнения запроса не слишком низкая (не меньше min\_execution\_speed), после прошествия указанного времени в секундах. 
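A sketch combining the execution-time and execution-speed limits above in a settings profile; the values are illustrative, not recommendations:

``` xml
<profiles>
    <default>
        <max_execution_time>600</max_execution_time>                 <!-- seconds -->
        <timeout_overflow_mode>break</timeout_overflow_mode>         <!-- stop instead of throwing -->
        <min_execution_speed>1000000</min_execution_speed>           <!-- rows per second -->
        <timeout_before_checking_execution_speed>15</timeout_before_checking_execution_speed> <!-- seconds -->
    </default>
</profiles>
```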
-## max_columns_to_read +## max\_columns\_to\_read {#max-columns-to-read} Максимальное количество столбцов, которых можно читать из таблицы в одном запросе. Если запрос требует чтения большего количества столбцов - кинуть исключение. -## max_temporary_columns +## max\_temporary\_columns {#max-temporary-columns} Максимальное количество временных столбцов, которых необходимо одновременно держать в оперативке, в процессе выполнения запроса, включая константные столбцы. Если временных столбцов оказалось больше - кидается исключение. -## max_temporary_non_const_columns +## max\_temporary\_non\_const\_columns {#max-temporary-non-const-columns} -То же самое, что и max_temporary_columns, но без учёта столбцов-констант. +То же самое, что и max\_temporary\_columns, но без учёта столбцов-констант. Стоит заметить, что столбцы-константы довольно часто образуются в процессе выполнения запроса, но расходуют примерно нулевое количество вычислительных ресурсов. -## max_subquery_depth +## max\_subquery\_depth {#max-subquery-depth} Максимальная вложенность подзапросов. Если подзапросы более глубокие - кидается исключение. По умолчанию: 100. -## max_pipeline_depth +## max\_pipeline\_depth {#max-pipeline-depth} Максимальная глубина конвейера выполнения запроса. Соответствует количеству преобразований, которое проходит каждый блок данных в процессе выполнения запроса. Считается в пределах одного сервера. Если глубина конвейера больше - кидается исключение. По умолчанию: 1000. -## max_ast_depth +## max\_ast\_depth {#max-ast-depth} Максимальная вложенность синтаксического дерева запроса. Если превышена - кидается исключение. На данный момент, проверяются не во время парсинга а уже после парсинга запроса. То есть, во время парсинга может быть создано слишком глубокое синтаксическое дерево, но запрос не будет выполнен. По умолчанию: 1000. -## max_ast_elements +## max\_ast\_elements {#max-ast-elements} Максимальное количество элементов синтаксического дерева запроса. Если превышено - кидается исключение. Аналогично, проверяется уже после парсинга запроса. По умолчанию: 50 000. -## max_rows_in_set +## max\_rows\_in\_set {#max-rows-in-set} Максимальное количество строчек для множества в секции IN, создаваемого из подзапроса. -## max_bytes_in_set +## max\_bytes\_in\_set {#max-bytes-in-set} Максимальное количество байт (несжатых данных), занимаемое множеством в секции IN, создаваемым из подзапроса. -## set_overflow_mode +## set\_overflow\_mode {#set-overflow-mode} Что делать, когда количество данных превысило одно из ограничений: throw или break. По умолчанию: throw. -## max_rows_in_distinct +## max\_rows\_in\_distinct {#max-rows-in-distinct} Максимальное количество различных строчек при использовании DISTINCT. -## max_bytes_in_distinct +## max\_bytes\_in\_distinct {#max-bytes-in-distinct} Максимальное количество байт, занимаемых хэш-таблицей, при использовании DISTINCT. -## distinct_overflow_mode +## distinct\_overflow\_mode {#distinct-overflow-mode} Что делать, когда количество данных превысило одно из ограничений: throw или break. По умолчанию: throw. -## max_rows_to_transfer +## max\_rows\_to\_transfer {#max-rows-to-transfer} Максимальное количество строчек, которых можно передать на удалённый сервер или сохранить во временную таблицу, при использовании GLOBAL IN. -## max_bytes_to_transfer +## max\_bytes\_to\_transfer {#max-bytes-to-transfer} Максимальное количество байт (несжатых данных), которых можно передать на удалённый сервер или сохранить во временную таблицу, при использовании GLOBAL IN. 
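As a sketch, the IN / DISTINCT / GLOBAL IN limits above might be set together in a profile like this (illustrative values):

``` xml
<profiles>
    <default>
        <max_rows_in_set>1000000</max_rows_in_set>
        <set_overflow_mode>throw</set_overflow_mode>
        <max_rows_in_distinct>10000000</max_rows_in_distinct>
        <max_rows_to_transfer>1000000</max_rows_to_transfer>
        <max_bytes_to_transfer>1000000000</max_bytes_to_transfer>
    </default>
</profiles>
```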
-## transfer_overflow_mode +## transfer\_overflow\_mode {#transfer-overflow-mode} Что делать, когда количество данных превысило одно из ограничений: throw или break. По умолчанию: throw. -## max_rows_in_join {#settings-max_rows_in_join} +## max\_rows\_in\_join {#settings-max-rows-in-join} Ограничивает количество строк в хэш-таблице, используемой при соединении таблиц. -Параметр применяется к операциям [SELECT... JOIN](../../query_language/select.md#select-join) и к движку таблиц [Join](../table_engines/join.md). +Параметр применяется к операциям [SELECT… JOIN](../../query_language/select.md#select-join) и к движку таблиц [Join](../table_engines/join.md). Если запрос содержит несколько `JOIN`, то ClickHouse проверяет значение настройки для каждого промежуточного результата. -При достижении предела ClickHouse может выполнять различные действия. Используйте настройку [join_overflow_mode](#settings-join_overflow_mode) для выбора действия. +При достижении предела ClickHouse может выполнять различные действия. Используйте настройку [join\_overflow\_mode](#settings-join_overflow_mode) для выбора действия. Возможные значения: -- Положительное целое число. -- 0 — неограниченное количество строк. +- Положительное целое число. +- 0 — неограниченное количество строк. Значение по умолчанию — 0. -## max_bytes_in_join {#settings-max_bytes_in_join} +## max\_bytes\_in\_join {#settings-max-bytes-in-join} Ограничивает размер (в байтах) хэш-таблицы, используемой при объединении таблиц. -Параметр применяется к операциям [SELECT... JOIN](../../query_language/select.md#select-join) и к движку таблиц [Join](../table_engines/join.md). +Параметр применяется к операциям [SELECT… JOIN](../../query_language/select.md#select-join) и к движку таблиц [Join](../table_engines/join.md). Если запрос содержит несколько `JOIN`, то ClickHouse проверяет значение настройки для каждого промежуточного результата. -При достижении предела ClickHouse может выполнять различные действия. Используйте настройку [join_overflow_mode](#settings-join_overflow_mode) для выбора действия. +При достижении предела ClickHouse может выполнять различные действия. Используйте настройку [join\_overflow\_mode](#settings-join_overflow_mode) для выбора действия. Возможные значения: -- Положительное целое число. -- 0 — контроль памяти отключен. +- Положительное целое число. +- 0 — контроль памяти отключен. Значение по умолчанию — 0. -## join_overflow_mode {#settings-join_overflow_mode} +## join\_overflow\_mode {#settings-join-overflow-mode} Определяет, какое действие ClickHouse выполняет при достижении любого из следующих ограничений для `JOIN`: -- [max_bytes_in_join](#settings-max_bytes_in_join) -- [max_rows_in_join](#settings-max_rows_in_join) +- [max\_bytes\_in\_join](#settings-max_bytes_in_join) +- [max\_rows\_in\_join](#settings-max_rows_in_join) Возможные значения: -- `THROW` — ClickHouse генерирует исключение и прерывает операцию. -- `BREAK` — ClickHouse прерывает операцию, но не генерирует исключение. +- `THROW` — ClickHouse генерирует исключение и прерывает операцию. +- `BREAK` — ClickHouse прерывает операцию, но не генерирует исключение. Значение по умолчанию — `THROW`. 
**Смотрите также** -- [Секция JOIN](../../query_language/select.md#select-join) -- [Движок таблиц Join](../table_engines/join.md) +- [Секция JOIN](../../query_language/select.md#select-join) +- [Движок таблиц Join](../table_engines/join.md) -## max_partitions_per_insert_block +## max\_partitions\_per\_insert\_block {#max-partitions-per-insert-block} Ограничивает максимальное количество партиций в одном вставленном блоке. -- Положительное целое число. -- 0 — неограниченное количество разделов. +- Положительное целое число. +- 0 — неограниченное количество разделов. Значение по умолчанию: 100. @@ -289,6 +290,6 @@ FORMAT Null; При вставке данных, ClickHouse вычисляет количество партиций во вставленном блоке. Если число партиций больше, чем `max_partitions_per_insert_block`, ClickHouse генерирует исключение со следующим текстом: -> "Too many partitions for single INSERT block (more than " + toString(max_parts) + "). The limit is controlled by 'max_partitions_per_insert_block' setting. Large number of partitions is a common misconception. It will lead to severe negative performance impact, including slow server startup, slow INSERT queries and slow SELECT queries. Recommended total number of partitions for a table is under 1000..10000. Please note, that partitioning is not intended to speed up SELECT queries (ORDER BY key is sufficient to make range queries fast). Partitions are intended for data manipulation (DROP PARTITION, etc)." +> «Too many partitions for single INSERT block (more than» + toString(max\_parts) + «). The limit is controlled by ‘max\_partitions\_per\_insert\_block’ setting. Large number of partitions is a common misconception. It will lead to severe negative performance impact, including slow server startup, slow INSERT queries and slow SELECT queries. Recommended total number of partitions for a table is under 1000..10000. Please note, that partitioning is not intended to speed up SELECT queries (ORDER BY key is sufficient to make range queries fast). Partitions are intended for data manipulation (DROP PARTITION, etc).» [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/settings/query_complexity/) diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 2f7d5ac0abb..bc2b5f1acf8 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -1,6 +1,6 @@ -# Настройки +# Настройки {#nastroiki} -## distributed_product_mode +## distributed\_product\_mode {#distributed-product-mode} Изменяет поведение [распределенных подзапросов](../../query_language/select.md). @@ -8,19 +8,19 @@ ClickHouse применяет настройку в тех случаях, ко Условия применения: -- Только подзапросы для IN, JOIN. -- Только если в секции FROM используется распределённая таблица, содержащая более одного шарда. -- Если подзапрос касается распределенной таблицы, содержащей более одного шарда. -- Не используется в случае табличной функции [remote](../../query_language/table_functions/remote.md). +- Только подзапросы для IN, JOIN. +- Только если в секции FROM используется распределённая таблица, содержащая более одного шарда. +- Если подзапрос касается распределенной таблицы, содержащей более одного шарда. +- Не используется в случае табличной функции [remote](../../query_language/table_functions/remote.md). Возможные значения: -- `deny` — значение по умолчанию. 
Запрещает использование таких подзапросов (При попытке использование вернет исключение "Double-distributed IN/JOIN subqueries is denied"); -- `local` — заменяет базу данных и таблицу в подзапросе на локальные для конечного сервера (шарда), оставив обычный `IN`/`JOIN.` -- `global` — заменяет запрос `IN`/`JOIN` на `GLOBAL IN`/`GLOBAL JOIN.` -- `allow` — разрешает использование таких подзапросов. +- `deny` — значение по умолчанию. Запрещает использование таких подзапросов (При попытке использование вернет исключение «Double-distributed IN/JOIN subqueries is denied»); +- `local` — заменяет базу данных и таблицу в подзапросе на локальные для конечного сервера (шарда), оставив обычный `IN`/`JOIN.` +- `global` — заменяет запрос `IN`/`JOIN` на `GLOBAL IN`/`GLOBAL JOIN.` +- `allow` — разрешает использование таких подзапросов. -## enable_optimize_predicate_expression +## enable\_optimize\_predicate\_expression {#enable-optimize-predicate-expression} Включает пробрасывание предикатов в подзапросы для запросов `SELECT`. @@ -28,8 +28,8 @@ ClickHouse применяет настройку в тех случаях, ко Возможные значения: -- 0 — выключена. -- 1 — включена. +- 0 — выключена. +- 1 — включена. Значение по умолчанию: 1. @@ -37,14 +37,14 @@ ClickHouse применяет настройку в тех случаях, ко Рассмотрим следующие запросы: -1. `SELECT count() FROM test_table WHERE date = '2018-10-10'` -2. `SELECT count() FROM (SELECT * FROM test_table) WHERE date = '2018-10-10'` +1. `SELECT count() FROM test_table WHERE date = '2018-10-10'` +2. `SELECT count() FROM (SELECT * FROM test_table) WHERE date = '2018-10-10'` Если `enable_optimize_predicate_expression = 1`, то время выполнения запросов одинаковое, так как ClickHouse применяет `WHERE` к подзапросу сразу при его обработке. Если `enable_optimize_predicate_expression = 0`, то время выполнения второго запроса намного больше, потому что секция `WHERE` применяется к данным уже после завершения подзапроса. -## fallback_to_stale_replicas_for_distributed_queries {#settings-fallback_to_stale_replicas_for_distributed_queries} +## fallback\_to\_stale\_replicas\_for\_distributed\_queries {#settings-fallback-to-stale-replicas-for-distributed-queries} Форсирует запрос в устаревшую реплику в случае, если актуальные данные недоступны. См. [Репликация](../../operations/table_engines/replication.md). @@ -54,7 +54,7 @@ ClickHouse применяет настройку в тех случаях, ко По умолчанию - 1 (включена). -## force_index_by_date {#settings-force_index_by_date} +## force\_index\_by\_date {#settings-force-index-by-date} Запрещает выполнение запросов, если использовать индекс по дате невозможно. @@ -62,7 +62,7 @@ ClickHouse применяет настройку в тех случаях, ко При `force_index_by_date=1` ClickHouse проверяет, есть ли в запросе условие на ключ даты, которое может использоваться для отсечения диапазонов данных. Если подходящего условия нет - кидается исключение. При этом не проверяется, действительно ли условие уменьшает объём данных для чтения. Например, условие `Date != '2000-01-01'` подходит даже в том случае, когда соответствует всем данным в таблице (т.е. для выполнения запроса требуется full scan). Подробнее про диапазоны данных в таблицах MergeTree читайте в разделе [MergeTree](../table_engines/mergetree.md). -## force_primary_key {#settings-force_primary_key} +## force\_primary\_key {#settings-force-primary-key} Запрещает выполнение запросов, если использовать индекс по первичному ключу невозможно. 
@@ -70,17 +70,17 @@ ClickHouse применяет настройку в тех случаях, ко При `force_primary_key=1` ClickHouse проверяет, есть ли в запросе условие на первичный ключ, которое может использоваться для отсечения диапазонов данных. Если подходящего условия нет - кидается исключение. При этом не проверяется, действительно ли условие уменьшает объём данных для чтения. Подробнее про диапазоны данных в таблицах MergeTree читайте в разделе [MergeTree](../table_engines/mergetree.md). -## format_schema +## format\_schema {#format-schema} -Параметр применяется в том случае, когда используются форматы, требующие определения схемы, например [Cap'n Proto](https://capnproto.org/) или [Protobuf](https://developers.google.com/protocol-buffers/). Значение параметра зависит от формата. +Параметр применяется в том случае, когда используются форматы, требующие определения схемы, например [Cap’n Proto](https://capnproto.org/) или [Protobuf](https://developers.google.com/protocol-buffers/). Значение параметра зависит от формата. -## fsync_metadata +## fsync\_metadata {#fsync-metadata} Включает или отключает [fsync](http://pubs.opengroup.org/onlinepubs/9699919799/functions/fsync.html) при записи `.sql` файлов. По умолчанию включено. Имеет смысл выключать, если на сервере миллионы мелких таблиц-чанков, которые постоянно создаются и уничтожаются. -## enable_http_compression {#settings-enable_http_compression} +## enable\_http\_compression {#settings-enable-http-compression} Включает или отключает сжатие данных в ответе на HTTP-запрос. @@ -88,20 +88,20 @@ ClickHouse применяет настройку в тех случаях, ко Возможные значения: -- 0 — выключена. -- 1 — включена. +- 0 — выключена. +- 1 — включена. Значение по умолчанию: 0. -## http_zlib_compression_level {#settings-http_zlib_compression_level} +## http\_zlib\_compression\_level {#settings-http-zlib-compression-level} -Задаёт уровень сжатия данных в ответе на HTTP-запрос, если [enable_http_compression = 1](#settings-enable_http_compression). +Задаёт уровень сжатия данных в ответе на HTTP-запрос, если [enable\_http\_compression = 1](#settings-enable_http_compression). Возможные значения: числа от 1 до 9. Значение по умолчанию: 3. -## http_native_compression_disable_checksumming_on_decompress {#settings-http_native_compression_disable_checksumming_on_decompress} +## http\_native\_compression\_disable\_checksumming\_on\_decompress {#settings-http-native-compression-disable-checksumming-on-decompress} Включает или отключает проверку контрольной суммы при распаковке данных HTTP POST от клиента. Используется только для собственного (`Navite`) формата сжатия ClickHouse (ни `gzip`, ни `deflate`). @@ -109,12 +109,12 @@ ClickHouse применяет настройку в тех случаях, ко Возможные значения: -- 0 — выключена. -- 1 — включена. +- 0 — выключена. +- 1 — включена. Значение по умолчанию: 0. -## send_progress_in_http_headers {#settings-send_progress_in_http_headers} +## send\_progress\_in\_http\_headers {#settings-send-progress-in-http-headers} Включает или отключает HTTP-заголовки `X-ClickHouse-Progress` в ответах `clickhouse-server`. @@ -122,23 +122,23 @@ ClickHouse применяет настройку в тех случаях, ко Возможные значения: -- 0 — выключена. -- 1 — включена. +- 0 — выключена. +- 1 — включена. Значение по умолчанию: 0. -## max_http_get_redirects {#setting-max_http_get_redirects} +## max\_http\_get\_redirects {#setting-max-http-get-redirects} Ограничивает максимальное количество переходов по редиректам в таблицах с движком [URL](../table_engines/url.md) при выполнении HTTP запросов методом GET. 
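Before continuing with `max_http_get_redirects` below, a quick sketch of the HTTP-related settings described above as they might appear in a `users.xml` profile; the values are illustrative:

``` xml
<profiles>
    <default>
        <enable_http_compression>1</enable_http_compression>
        <http_zlib_compression_level>3</http_zlib_compression_level>       <!-- used only when compression is enabled -->
        <send_progress_in_http_headers>1</send_progress_in_http_headers>
    </default>
</profiles>
```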
Настройка применяется для обоих типов таблиц: созданных запросом [CREATE TABLE](../../query_language/create/#create-table-query) и с помощью табличной функции [url](../../query_language/table_functions/url.md). Возможные значения: -- Положительное целое число переходов. -- 0 — переходы запрещены. +- Положительное целое число переходов. +- 0 — переходы запрещены. Значение по умолчанию: 0. -## input_format_allow_errors_num +## input\_format\_allow\_errors\_num {#input-format-allow-errors-num} Устанавливает максимальное количество допустимых ошибок при чтении из текстовых форматов (CSV, TSV и т.п.). @@ -150,7 +150,7 @@ ClickHouse применяет настройку в тех случаях, ко В случае превышения `input_format_allow_errors_num` ClickHouse генерирует исключение. -## input_format_allow_errors_ratio +## input\_format\_allow\_errors\_ratio {#input-format-allow-errors-ratio} Устанавливает максимальную долю допустимых ошибок при чтении из текстовых форматов (CSV, TSV и т.п.). Доля ошибок задаётся в виде числа с плавающей запятой от 0 до 1. @@ -163,19 +163,19 @@ ClickHouse применяет настройку в тех случаях, ко В случае превышения `input_format_allow_errors_ratio` ClickHouse генерирует исключение. -## input_format_values_interpret_expressions {#settings-input_format_values_interpret_expressions} +## input\_format\_values\_interpret\_expressions {#settings-input-format-values-interpret-expressions} Включает или отключает парсер SQL, если потоковый парсер не может проанализировать данные. Этот параметр используется только для формата [Values](../../interfaces/formats.md#data-format-values) при вставке данных. Дополнительные сведения о парсерах читайте в разделе [Синтаксис](../../query_language/syntax.md). Возможные значения: -- 0 — выключена. +- 0 — выключена. - В этом случае необходимо вставлять форматированные данные. Смотрите раздел [Форматы](../../interfaces/formats.md). + В этом случае необходимо вставлять форматированные данные. Смотрите раздел [Форматы](../../interfaces/formats.md). -- 1 — включена. +- 1 — включена. - В этом случае вы можете использовать выражение SQL в качестве значения, но вставка данных намного медленнее. Если вы вставляете только форматированные данные, ClickHouse ведет себя так, как будто значение параметра равно 0. + В этом случае вы можете использовать выражение SQL в качестве значения, но вставка данных намного медленнее. Если вы вставляете только форматированные данные, ClickHouse ведет себя так, как будто значение параметра равно 0. Значение по умолчанию: 1. @@ -183,58 +183,61 @@ ClickHouse применяет настройку в тех случаях, ко Вставим значение типа [DateTime](../../data_types/datetime.md) при разных значения настройки. -```sql +``` sql SET input_format_values_interpret_expressions = 0; INSERT INTO datetime_t VALUES (now()) ``` -```text + +``` text Exception on client: Code: 27. DB::Exception: Cannot parse input: expected ) before: now()): (at row 1) ``` -```sql +``` sql SET input_format_values_interpret_expressions = 1; INSERT INTO datetime_t VALUES (now()) ``` -```text + +``` text Ok. ``` Последний запрос эквивалентен следующему: -```sql +``` sql SET input_format_values_interpret_expressions = 0; INSERT INTO datetime_t SELECT now() ``` -```text + +``` text Ok. 
``` -## input_format_defaults_for_omitted_fields {#session_settings-input_format_defaults_for_omitted_fields} +## input\_format\_defaults\_for\_omitted\_fields {#session-settings-input-format-defaults-for-omitted-fields} При вставке данных запросом `INSERT`, заменяет пропущенные поля значениям по умолчанию для типа данных столбца. Поддерживаемые форматы вставки: -- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) -- [CSV](../../interfaces/formats.md#csv) -- [TabSeparated](../../interfaces/formats.md#tabseparated) +- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) +- [CSV](../../interfaces/formats.md#csv) +- [TabSeparated](../../interfaces/formats.md#tabseparated) !!! note "Примечание" Когда опция включена, сервер отправляет клиенту расширенные метаданные. Это требует дополнительных вычислительных ресурсов на сервере и может снизить производительность. Возможные значения: -- 0 — выключена. -- 1 — включена. +- 0 — выключена. +- 1 — включена. Значение по умолчанию: 1. -## input_format_null_as_default {#settings-input_format_null_as_default} +## input\_format\_null\_as\_default {#settings-input-format-null-as-default} Включает или отключает использование значений по умолчанию в случаях, когда во входных данных содержится `NULL`, но тип соответствующего столбца не `Nullable(T)` (для текстовых форматов). -## input_format_skip_unknown_fields {#settings-input_format_skip_unknown_fields} +## input\_format\_skip\_unknown\_fields {#settings-input-format-skip-unknown-fields} Включает или отключает пропускание вставки неизвестных данных. @@ -242,38 +245,38 @@ Ok. Поддерживаемые форматы: -- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) -- [CSVWithNames](../../interfaces/formats.md#csvwithnames) -- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames) -- [TSKV](../../interfaces/formats.md#tskv) +- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) +- [CSVWithNames](../../interfaces/formats.md#csvwithnames) +- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames) +- [TSKV](../../interfaces/formats.md#tskv) Возможные значения: -- 0 — выключена. -- 1 — включена. +- 0 — выключена. +- 1 — включена. Значение по умолчанию: 0. -## input_format_import_nested_json {#settings-input_format_import_nested_json} +## input\_format\_import\_nested\_json {#settings-input-format-import-nested-json} Включает или отключает вставку данных JSON с вложенными объектами. Поддерживаемые форматы: -- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) +- [JSONEachRow](../../interfaces/formats.md#jsoneachrow) Возможные значения: -- 0 — выключена. -- 1 — включена. +- 0 — выключена. +- 1 — включена. Значение по умолчанию: 0. См. также: -- [Использование вложенных структур](../../interfaces/formats.md#jsoneachrow-nested) with the `JSONEachRow` format. +- [Использование вложенных структур](../../interfaces/formats.md#jsoneachrow-nested) with the `JSONEachRow` format. -## input_format_with_names_use_header {#settings-input_format_with_names_use_header} +## input\_format\_with\_names\_use\_header {#settings-input-format-with-names-use-header} Включает или отключает проверку порядка столбцов при вставке данных. @@ -281,17 +284,17 @@ Ok. Поддерживаемые форматы: -- [CSVWithNames](../../interfaces/formats.md#csvwithnames) -- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames) +- [CSVWithNames](../../interfaces/formats.md#csvwithnames) +- [TabSeparatedWithNames](../../interfaces/formats.md#tabseparatedwithnames) Возможные значения: -- 0 — выключена. 
-- 1 — включена. +- 0 — выключена. +- 1 — включена. Значение по умолчанию: 1. -## date_time_input_format {#settings-date_time_input_format} +## date\_time\_input\_format {#settings-date-time-input-format} Выбор парсера для текстового представления дат и времени при обработке входного формата. @@ -299,34 +302,34 @@ Ok. Возможные значения: -- `'best_effort'` — включает расширенный парсинг. +- `'best_effort'` — включает расширенный парсинг. - ClickHouse может парсить базовый формат `YYYY-MM-DD HH:MM:SS` и все форматы [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601). Например, `'2018-06-08T01:02:03.000Z'`. + ClickHouse может парсить базовый формат `YYYY-MM-DD HH:MM:SS` и все форматы [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601). Например, `'2018-06-08T01:02:03.000Z'`. -- `'basic'` — используется базовый парсер. +- `'basic'` — используется базовый парсер. - ClickHouse может парсить только базовый формат `YYYY-MM-DD HH:MM:SS`. Например, `'2019-08-20 10:18:56'`. + ClickHouse может парсить только базовый формат `YYYY-MM-DD HH:MM:SS`. Например, `'2019-08-20 10:18:56'`. Значение по умолчанию: `'basic'`. См. также: -- [Тип данных DateTime.](../../data_types/datetime.md) -- [Функции для работы с датой и временем.](../../query_language/functions/date_time_functions.md) +- [Тип данных DateTime.](../../data_types/datetime.md) +- [Функции для работы с датой и временем.](../../query_language/functions/date_time_functions.md) -## join_default_strictness {#settings-join_default_strictness} +## join\_default\_strictness {#settings-join-default-strictness} Устанавливает строгость по умолчанию для [JOIN](../../query_language/select.md#select-join). Возможные значения -- `ALL` — если в правой таблице несколько совпадающих строк, данные умножаются на количество этих строк. Это нормальное поведение `JOIN` как в стандартном SQL. -- `ANY` — если в правой таблице несколько соответствующих строк, то соединяется только первая найденная. Если в "правой" таблице есть не более одной подходящей строки, то результаты `ANY` и `ALL` совпадают. -- `Пустая строка` — если `ALL` или `ANY` не указаны в запросе, то ClickHouse генерирует исключение. +- `ALL` — если в правой таблице несколько совпадающих строк, данные умножаются на количество этих строк. Это нормальное поведение `JOIN` как в стандартном SQL. +- `ANY` — если в правой таблице несколько соответствующих строк, то соединяется только первая найденная. Если в «правой» таблице есть не более одной подходящей строки, то результаты `ANY` и `ALL` совпадают. +- `Пустая строка` — если `ALL` или `ANY` не указаны в запросе, то ClickHouse генерирует исключение. Значение по умолчанию: `ALL`. -## join_any_take_last_row {#settings-join_any_take_last_row} +## join\_any\_take\_last\_row {#settings-join-any-take-last-row} Изменяет поведение операций, выполняемых со строгостью `ANY`. @@ -335,29 +338,29 @@ Ok. Возможные значения: -- 0 — если в правой таблице несколько соответствующих строк, то присоединяется только первая найденная строка. -- 1 — если в правой таблице несколько соответствующих строк, то присоединяется только последняя найденная строка. +- 0 — если в правой таблице несколько соответствующих строк, то присоединяется только первая найденная строка. +- 1 — если в правой таблице несколько соответствующих строк, то присоединяется только последняя найденная строка. Значение по умолчанию: 0. См. 
также: -- [Секция JOIN](../../query_language/select.md#select-join) -- [Движок таблиц Join](../table_engines/join.md) -- [join_default_strictness](#settings-join_default_strictness) +- [Секция JOIN](../../query_language/select.md#select-join) +- [Движок таблиц Join](../table_engines/join.md) +- [join\_default\_strictness](#settings-join_default_strictness) -## join_use_nulls {#join_use_nulls} +## join\_use\_nulls {#join-use-nulls} Устанавливает тип поведения [JOIN](../../query_language/select.md). При объединении таблиц могут появиться пустые ячейки. ClickHouse заполняет их по-разному в зависимости от настроек. Возможные значения -- 0 — пустые ячейки заполняются значением по умолчанию соответствующего типа поля. -- 1 — `JOIN` ведёт себя как в стандартном SQL. Тип соответствующего поля преобразуется в [Nullable](../../data_types/nullable.md#data_type-nullable), а пустые ячейки заполняются значениями [NULL](../../query_language/syntax.md). +- 0 — пустые ячейки заполняются значением по умолчанию соответствующего типа поля. +- 1 — `JOIN` ведёт себя как в стандартном SQL. Тип соответствующего поля преобразуется в [Nullable](../../data_types/nullable.md#data_type-nullable), а пустые ячейки заполняются значениями [NULL](../../query_language/syntax.md). Значение по умолчанию: 0. -## max_block_size {#setting-max_block_size} +## max\_block\_size {#setting-max-block-size} Данные в ClickHouse обрабатываются по блокам (наборам кусочков столбцов). Внутренние циклы обработки для одного блока достаточно эффективны, но есть заметные издержки на каждый блок. Настройка `max_block_size` — это рекомендация, какой размер блока (в количестве строк) загружать из таблиц. Размер блока не должен быть слишком маленьким, чтобы затраты на каждый блок были заметны, но не слишком велики, чтобы запрос с LIMIT, который завершается после первого блока, обрабатывался быстро. Цель состоит в том, чтобы не использовалось слишком много оперативки при вынимании большого количества столбцов в несколько потоков; чтобы оставалась хоть какая-нибудь кэш-локальность. @@ -365,98 +368,98 @@ Ok. Из таблицы не всегда загружаются блоки размера `max_block_size`. Если ясно, что нужно прочитать меньше данных, то будет считан блок меньшего размера. -## preferred_block_size_bytes +## preferred\_block\_size\_bytes {#preferred-block-size-bytes} Служит для тех же целей что и `max_block_size`, но задает рекомендуемый размер блоков в байтах, выбирая адаптивное количество строк в блоке. При этом размер блока не может быть более `max_block_size` строк. По умолчанию: 1,000,000. Работает только при чтении из MergeTree-движков. -## merge_tree_uniform_read_distribution {#setting-merge_tree_uniform_read_distribution} +## merge\_tree\_uniform\_read\_distribution {#setting-merge-tree-uniform-read-distribution} При чтении из таблиц [MergeTree](../table_engines/mergetree.md) ClickHouse использует несколько потоков. Этот параметр включает/выключает равномерное распределение заданий по рабочим потокам. Алгоритм равномерного распределения стремится сделать время выполнения всех потоков примерно равным для одного запроса `SELECT`. Возможные значения: -- 0 — не использовать равномерное распределение заданий на чтение. -- 1 — использовать равномерное распределение заданий на чтение. +- 0 — не использовать равномерное распределение заданий на чтение. +- 1 — использовать равномерное распределение заданий на чтение. Значение по умолчанию: 1. 
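The block-size and MergeTree read settings described above are ordinary session settings, so their current values can be inspected in `system.settings` and overridden per session. A minimal sketch; the override value is only an illustration, not a recommendation:

``` sql
-- inspect the current values of the read/block-size settings discussed above
SELECT name, value, changed
FROM system.settings
WHERE name IN ('max_block_size', 'preferred_block_size_bytes', 'merge_tree_uniform_read_distribution');

-- override one of them for the current session only
SET preferred_block_size_bytes = 2000000;
```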
-## merge_tree_min_rows_for_concurrent_read {#setting-merge_tree_min_rows_for_concurrent_read} +## merge\_tree\_min\_rows\_for\_concurrent\_read {#setting-merge-tree-min-rows-for-concurrent-read} Если количество строк, считываемых из файла таблицы [MergeTree](../table_engines/mergetree.md) превышает `merge_tree_min_rows_for_concurrent_read`, то ClickHouse пытается выполнить одновременное чтение из этого файла в несколько потоков. Возможные значения: -- Любое положительное целое число. +- Любое положительное целое число. Значение по умолчанию: 163840. -## merge_tree_min_bytes_for_concurrent_read {#setting-merge_tree_min_bytes_for_concurrent_read} +## merge\_tree\_min\_bytes\_for\_concurrent\_read {#setting-merge-tree-min-bytes-for-concurrent-read} Если число байтов, которое должно быть прочитано из одного файла таблицы с движком [MergeTree](../table_engines/mergetree.md), превышает значение `merge_tree_min_bytes_for_concurrent_read`, то ClickHouse выполняет одновременное чтение в несколько потоков из этого файла. Возможное значение: -- Положительное целое число. +- Положительное целое число. Значение по умолчанию: 251658240. -## merge_tree_min_rows_for_seek {#setting-merge_tree_min_rows_for_seek} +## merge\_tree\_min\_rows\_for\_seek {#setting-merge-tree-min-rows-for-seek} Если расстояние между двумя блоками данных для чтения в одном файле меньше, чем `merge_tree_min_rows_for_seek` строк, то ClickHouse не перескакивает (seek) через блоки, а считывает данные последовательно. Возможные значения: -- Положительное целое число. +- Положительное целое число. Значение по умолчанию: 0. -## merge_tree_min_bytes_for_seek {#setting-merge_tree_min_bytes_for_seek} +## merge\_tree\_min\_bytes\_for\_seek {#setting-merge-tree-min-bytes-for-seek} Если расстояние между двумя блоками данных для чтения в одном файле меньше, чем `merge_tree_min_bytes_for_seek` байтов, то ClickHouse не перескакивает (seek) через блоки, а считывает данные последовательно. Возможные значения: -- Положительное целое число. +- Положительное целое число. Значение по умолчанию: 0. -## merge_tree_coarse_index_granularity {#setting-merge_tree_coarse_index_granularity} +## merge\_tree\_coarse\_index\_granularity {#setting-merge-tree-coarse-index-granularity} При поиске данных ClickHouse проверяет засечки данных в файле индекса. Если ClickHouse обнаруживает, что требуемые ключи находятся в некотором диапазоне, он делит этот диапазон на `merge_tree_coarse_index_granularity` поддиапазонов и выполняет в них рекурсивный поиск нужных ключей. Возможные значения: -- Положительное целое число. +- Положительное целое число. Значение по умолчанию: 8. -## merge_tree_max_rows_to_use_cache {#setting-merge_tree_max_rows_to_use_cache} +## merge\_tree\_max\_rows\_to\_use\_cache {#setting-merge-tree-max-rows-to-use-cache} Если требуется прочитать более, чем `merge_tree_max_rows_to_use_cache` строк в одном запросе, ClickHouse не используют кэш несжатых блоков. -Кэш несжатых блоков хранит данные, извлечённые при выполнении запросов. ClickHouse использует этот кэш для ускорения ответов на повторяющиеся небольшие запросы. Настройка защищает кэш от замусоривания запросами, для выполнения которых необходимо извлечь большое количество данных. Настройка сервера [uncompressed_cache_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) определяет размер кэша несжатых блоков. +Кэш несжатых блоков хранит данные, извлечённые при выполнении запросов. ClickHouse использует этот кэш для ускорения ответов на повторяющиеся небольшие запросы. 
Настройка защищает кэш от замусоривания запросами, для выполнения которых необходимо извлечь большое количество данных. Настройка сервера [uncompressed\_cache\_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) определяет размер кэша несжатых блоков. Возможные значения: -- Положительное целое число. +- Положительное целое число. Значение по умолчанию: 128 ✕ 8192. -## merge_tree_max_bytes_to_use_cache {#setting-merge_tree_max_bytes_to_use_cache} +## merge\_tree\_max\_bytes\_to\_use\_cache {#setting-merge-tree-max-bytes-to-use-cache} Если требуется прочитать более, чем `merge_tree_max_bytes_to_use_cache` байтов в одном запросе, ClickHouse не используют кэш несжатых блоков. -Кэш несжатых блоков хранит данные, извлечённые при выполнении запросов. ClickHouse использует кэш для ускорения ответов на повторяющиеся небольшие запросы. Настройка защищает кэш от переполнения. Настройка сервера [uncompressed_cache_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) определяет размер кэша несжатых блоков. +Кэш несжатых блоков хранит данные, извлечённые при выполнении запросов. ClickHouse использует кэш для ускорения ответов на повторяющиеся небольшие запросы. Настройка защищает кэш от переполнения. Настройка сервера [uncompressed\_cache\_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) определяет размер кэша несжатых блоков. Возможное значение: -- Положительное целое число. +- Положительное целое число. Значение по умолчанию: 2013265920. -## min_bytes_to_use_direct_io {#settings-min_bytes_to_use_direct_io} +## min\_bytes\_to\_use\_direct\_io {#settings-min-bytes-to-use-direct-io} Минимальный объём данных, необходимый для прямого (небуферизованного) чтения/записи (direct I/O) на диск. @@ -464,48 +467,48 @@ ClickHouse использует этот параметр при чтении д Возможные значения: -- 0 — прямой ввод-вывод отключен. -- Положительное целое число. +- 0 — прямой ввод-вывод отключен. +- Положительное целое число. Значение по умолчанию: 0. -## log_queries {#settings-log-queries} +## log\_queries {#settings-log-queries} Установка логирования запроса. -Запросы, переданные в ClickHouse с этой установкой, логируются согласно правилам конфигурационного параметра сервера [query_log](../../operations/server_settings/settings.md#server_settings-query-log). +Запросы, переданные в ClickHouse с этой установкой, логируются согласно правилам конфигурационного параметра сервера [query\_log](../../operations/server_settings/settings.md#server_settings-query-log). Пример: -```text +``` text log_queries=1 ``` -## log_query_threads {#settings-log-query-threads} +## log\_query\_threads {#settings-log-query-threads} Установка логирования информации о потоках выполнения запроса. -Лог информации о потоках выполнения запросов, переданных в ClickHouse с этой установкой, записывается согласно правилам конфигурационного параметра сервера [query_thread_log](../server_settings/settings.md#server_settings-query-thread-log). +Лог информации о потоках выполнения запросов, переданных в ClickHouse с этой установкой, записывается согласно правилам конфигурационного параметра сервера [query\_thread\_log](../server_settings/settings.md#server_settings-query-thread-log). Пример: -```text +``` text log_query_threads=1 ``` -## max_insert_block_size {#settings-max_insert_block_size} +## max\_insert\_block\_size {#settings-max-insert-block-size} Формировать блоки указанного размера, при вставке в таблицу. 
Эта настройка действует только в тех случаях, когда сервер сам формирует такие блоки. Например, при INSERT-е через HTTP интерфейс, сервер парсит формат данных, и формирует блоки указанного размера. -А при использовании clickhouse-client, клиент сам парсит данные, и настройка max_insert_block_size на сервере не влияет на размер вставляемых блоков. +А при использовании clickhouse-client, клиент сам парсит данные, и настройка max\_insert\_block\_size на сервере не влияет на размер вставляемых блоков. При использовании INSERT SELECT, настройка так же не имеет смысла, так как данные будут вставляться теми блоками, которые вышли после SELECT-а. Значение по умолчанию: 1,048,576. Это значение намного больше, чем `max_block_size`. Это сделано, потому что некоторые движки таблиц (`*MergeTree`) будут на каждый вставляемый блок формировать кусок данных на диске, что является довольно большой сущностью. Также, в таблицах типа `*MergeTree`, данные сортируются при вставке, и достаточно большой размер блока позволяет отсортировать больше данных в оперативке. -## max_replica_delay_for_distributed_queries {#settings-max_replica_delay_for_distributed_queries} +## max\_replica\_delay\_for\_distributed\_queries {#settings-max-replica-delay-for-distributed-queries} Отключает отстающие реплики при распределенных запросах. См. [Репликация](../../operations/table_engines/replication.md). @@ -515,87 +518,87 @@ log_query_threads=1 Используется при выполнении `SELECT` из распределенной таблицы, которая указывает на реплицированные таблицы. -## max_threads {#settings-max_threads} +## max\_threads {#settings-max-threads} -Максимальное количество потоков обработки запроса без учёта потоков для чтения данных с удалённых серверов (смотрите параметр max_distributed_connections). +Максимальное количество потоков обработки запроса без учёта потоков для чтения данных с удалённых серверов (смотрите параметр max\_distributed\_connections). Этот параметр относится к потокам, которые выполняют параллельно одни стадии конвейера выполнения запроса. -Например, при чтении из таблицы, если есть возможность вычислять выражения с функциями, фильтровать с помощью WHERE и предварительно агрегировать для GROUP BY параллельно, используя хотя бы количество потоков max_threads, то используются max_threads. +Например, при чтении из таблицы, если есть возможность вычислять выражения с функциями, фильтровать с помощью WHERE и предварительно агрегировать для GROUP BY параллельно, используя хотя бы количество потоков max\_threads, то используются max\_threads. Значение по умолчанию: количество процессорных ядер без учёта Hyper-Threading. Если на сервере обычно исполняется менее одного запроса SELECT одновременно, то выставите этот параметр в значение чуть меньше количества реальных процессорных ядер. -Для запросов, которые быстро завершаются из-за LIMIT-а, имеет смысл выставить max_threads поменьше. Например, если нужное количество записей находится в каждом блоке, то при max_threads = 8 будет считано 8 блоков, хотя достаточно было прочитать один. +Для запросов, которые быстро завершаются из-за LIMIT-а, имеет смысл выставить max\_threads поменьше. Например, если нужное количество записей находится в каждом блоке, то при max\_threads = 8 будет считано 8 блоков, хотя достаточно было прочитать один. Чем меньше `max_threads`, тем меньше будет использоваться оперативки. -## max_insert_threads {#settings-max_insert_threads} +## max\_insert\_threads {#settings-max-insert-threads} Максимальное количество потоков для выполнения запроса `INSERT SELECT`. 
Возможные значения: -- 0 (или 1) — `INSERT SELECT` не выполняется параллельно. -- Положительное целое число, больше 1. +- 0 (или 1) — `INSERT SELECT` не выполняется параллельно. +- Положительное целое число, больше 1. Значение по умолчанию: 0. -Параллельный `INSERT SELECT` действует только в том случае, если часть SELECT выполняется параллельно, см. настройку [max_threads](#settings-max_threads). +Параллельный `INSERT SELECT` действует только в том случае, если часть SELECT выполняется параллельно, см. настройку [max\_threads](#settings-max_threads). Чем больше значение `max_insert_threads`, тем больше потребление оперативной памяти. -## max_compress_block_size +## max\_compress\_block\_size {#max-compress-block-size} Максимальный размер блоков не сжатых данных перед сжатием при записи в таблицу. По умолчанию - 1 048 576 (1 MiB). При уменьшении размера, незначительно уменьшается коэффициент сжатия, незначительно возрастает скорость сжатия и разжатия за счёт кэш-локальности, и уменьшается потребление оперативки. Как правило, не имеет смысла менять эту настройку. Не путайте блоки для сжатия (кусок памяти, состоящий из байт) и блоки для обработки запроса (пачка строк из таблицы). -## min_compress_block_size +## min\_compress\_block\_size {#min-compress-block-size} -Для таблиц типа [MergeTree](../table_engines/mergetree.md). В целях уменьшения задержек при обработке запросов, блок сжимается при записи следующей засечки, если его размер не меньше min_compress_block_size. По умолчанию - 65 536. +Для таблиц типа [MergeTree](../table_engines/mergetree.md). В целях уменьшения задержек при обработке запросов, блок сжимается при записи следующей засечки, если его размер не меньше min\_compress\_block\_size. По умолчанию - 65 536. -Реальный размер блока, если несжатых данных меньше max_compress_block_size, будет не меньше этого значения и не меньше объёма данных на одну засечку. +Реальный размер блока, если несжатых данных меньше max\_compress\_block\_size, будет не меньше этого значения и не меньше объёма данных на одну засечку. -Рассмотрим пример. Пусть index_granularity, указанная при создании таблицы - 8192. +Рассмотрим пример. Пусть index\_granularity, указанная при создании таблицы - 8192. -Пусть мы записываем столбец типа UInt32 (4 байта на значение). При записи 8192 строк, будет всего 32 КБ данных. Так как min_compress_block_size = 65 536, сжатый блок будет сформирован на каждые две засечки. +Пусть мы записываем столбец типа UInt32 (4 байта на значение). При записи 8192 строк, будет всего 32 КБ данных. Так как min\_compress\_block\_size = 65 536, сжатый блок будет сформирован на каждые две засечки. Пусть мы записываем столбец URL типа String (средний размер - 60 байт на значение). При записи 8192 строк, будет, в среднем, чуть меньше 500 КБ данных. Так как это больше 65 536 строк, то сжатый блок будет сформирован на каждую засечку. В этом случае, при чтении с диска данных из диапазона в одну засечку, не будет разжато лишних данных. Как правило, не имеет смысла менять эту настройку. -## max_query_size {#settings-max_query_size} +## max\_query\_size {#settings-max-query-size} Максимальный кусок запроса, который будет считан в оперативку для разбора парсером языка SQL. Запрос INSERT также содержит данные для INSERT-а, которые обрабатываются отдельным, потоковым парсером (расходующим O(1) оперативки), и не учитываются в этом ограничении. Значение по умолчанию: 256 Кб. 
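Because `max_query_size` limits only the query text handled by the SQL parser, while `INSERT` payloads go through the streaming parser, it is the setting to raise when a client has to send an unusually long query text. A minimal sketch with an arbitrary example value:

``` sql
-- raise the parser limit for the current session (example value: 1 MiB instead of the default 256 KiB)
SET max_query_size = 1048576;

-- confirm that the session now uses the overridden value
SELECT name, value, changed
FROM system.settings
WHERE name = 'max_query_size';
```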
-## interactive_delay +## interactive\_delay {#interactive-delay} Интервал в микросекундах для проверки, не запрошена ли остановка выполнения запроса, и отправки прогресса. Значение по умолчанию: 100,000 (проверять остановку запроса и отправлять прогресс десять раз в секунду). -## connect_timeout, receive_timeout, send_timeout +## connect\_timeout, receive\_timeout, send\_timeout {#connect-timeout-receive-timeout-send-timeout} Таймауты в секундах на сокет, по которому идёт общение с клиентом. Значение по умолчанию: 10, 300, 300. -## cancel_http_readonly_queries_on_client_close +## cancel\_http\_readonly\_queries\_on\_client\_close {#cancel-http-readonly-queries-on-client-close} Отменяет HTTP readonly запросы (например, SELECT), когда клиент обрывает соединение до завершения получения данных. Значение по умолчанию: 0 -## poll_interval +## poll\_interval {#poll-interval} Блокироваться в цикле ожидания запроса в сервере на указанное количество секунд. Значение по умолчанию: 10. -## max_distributed_connections +## max\_distributed\_connections {#max-distributed-connections} Максимальное количество одновременных соединений с удалёнными серверами при распределённой обработке одного запроса к одной таблице типа Distributed. Рекомендуется выставлять не меньше, чем количество серверов в кластере. @@ -603,80 +606,80 @@ log_query_threads=1 Следующие параметры имеют значение только на момент создания таблицы типа Distributed (и при запуске сервера), поэтому их не имеет смысла менять в рантайме. -## distributed_connections_pool_size +## distributed\_connections\_pool\_size {#distributed-connections-pool-size} Максимальное количество одновременных соединений с удалёнными серверами при распределённой обработке всех запросов к одной таблице типа Distributed. Рекомендуется выставлять не меньше, чем количество серверов в кластере. Значение по умолчанию: 1024. -## connect_timeout_with_failover_ms +## connect\_timeout\_with\_failover\_ms {#connect-timeout-with-failover-ms} Таймаут в миллисекундах на соединение с удалённым сервером, для движка таблиц Distributed, если используются секции shard и replica в описании кластера. В случае неуспеха, делается несколько попыток соединений с разными репликами. Значение по умолчанию: 50. -## connections_with_failover_max_tries +## connections\_with\_failover\_max\_tries {#connections-with-failover-max-tries} Максимальное количество попыток соединения с каждой репликой, для движка таблиц Distributed. Значение по умолчанию: 3. -## extremes +## extremes {#extremes} Считать ли экстремальные значения (минимумы и максимумы по столбцам результата запроса). Принимает 0 или 1. По умолчанию - 0 (выключено). -Подробнее смотрите раздел "Экстремальные значения". +Подробнее смотрите раздел «Экстремальные значения». -## use_uncompressed_cache {#setting-use_uncompressed_cache} +## use\_uncompressed\_cache {#setting-use-uncompressed-cache} Использовать ли кэш разжатых блоков. Принимает 0 или 1. По умолчанию - 0 (выключено). -Использование кэша несжатых блоков (только для таблиц семейства MergeTree) может существенно сократить задержку и увеличить пропускную способность при работе с большим количеством коротких запросов. Включите эту настройку для пользователей, от которых идут частые короткие запросы. Также обратите внимание на конфигурационный параметр [uncompressed_cache_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) (настраивается только в конфигурационном файле) – размер кэша разжатых блоков. По умолчанию - 8 GiB. 
Кэш разжатых блоков заполняется по мере надобности, а наиболее невостребованные данные автоматически удаляются. +Использование кэша несжатых блоков (только для таблиц семейства MergeTree) может существенно сократить задержку и увеличить пропускную способность при работе с большим количеством коротких запросов. Включите эту настройку для пользователей, от которых идут частые короткие запросы. Также обратите внимание на конфигурационный параметр [uncompressed\_cache\_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) (настраивается только в конфигурационном файле) – размер кэша разжатых блоков. По умолчанию - 8 GiB. Кэш разжатых блоков заполняется по мере надобности, а наиболее невостребованные данные автоматически удаляются. Для запросов, читающих хоть немного приличный объём данных (миллион строк и больше), кэш разжатых блоков автоматически выключается, чтобы оставить место для действительно мелких запросов. Поэтому, можно держать настройку `use_uncompressed_cache` всегда выставленной в 1. -## replace_running_query +## replace\_running\_query {#replace-running-query} -При использовании интерфейса HTTP может быть передан параметр query_id. Это любая строка, которая служит идентификатором запроса. -Если в этот момент, уже существует запрос от того же пользователя с тем же query_id, то поведение определяется параметром replace_running_query. +При использовании интерфейса HTTP может быть передан параметр query\_id. Это любая строка, которая служит идентификатором запроса. +Если в этот момент, уже существует запрос от того же пользователя с тем же query\_id, то поведение определяется параметром replace\_running\_query. -`0` - (по умолчанию) кинуть исключение (не давать выполнить запрос, если запрос с таким же query_id уже выполняется); +`0` - (по умолчанию) кинуть исключение (не давать выполнить запрос, если запрос с таким же query\_id уже выполняется); `1` - отменить старый запрос и начать выполнять новый. Эта настройка, выставленная в 1, используется в Яндекс.Метрике для реализации suggest-а значений для условий сегментации. После ввода очередного символа, если старый запрос ещё не выполнился, его следует отменить. -## stream_flush_interval_ms +## stream\_flush\_interval\_ms {#stream-flush-interval-ms} -Работает для таблиц со стриммингом в случае тайм-аута, или когда поток генерирует [max_insert_block_size](#settings-max_insert_block_size) строк. +Работает для таблиц со стриммингом в случае тайм-аута, или когда поток генерирует [max\_insert\_block\_size](#settings-max_insert_block_size) строк. Значение по умолчанию: 7500. Чем меньше значение, тем чаще данные сбрасываются в таблицу. Установка слишком низкого значения приводит к снижению производительности. -## load_balancing {#settings-load_balancing} +## load\_balancing {#settings-load-balancing} Задает алгоритм выбора реплик, используемый при обработке распределенных запросов. ClickHouse поддерживает следующие алгоритмы выбора реплик: -- [Random](#load_balancing-random) (by default) -- [Nearest hostname](#load_balancing-nearest_hostname) -- [In order](#load_balancing-in_order) -- [First or random](#load_balancing-first_or_random) +- [Random](#load_balancing-random) (by default) +- [Nearest hostname](#load_balancing-nearest_hostname) +- [In order](#load_balancing-in_order) +- [First or random](#load_balancing-first_or_random) -### Random (by default) {#load_balancing-random} +### Random (by default) {#load-balancing-random} -```sql +``` sql load_balancing = random ``` Для каждой реплики считается количество ошибок. 
Запрос отправляется на реплику с минимальным числом ошибок, а если таких несколько, то на случайную из них. Недостатки: не учитывается близость серверов; если на репликах оказались разные данные, то вы будете получать так же разные данные. -### Nearest Hostname {#load_balancing-nearest_hostname} +### Nearest Hostname {#load-balancing-nearest-hostname} -```sql +``` sql load_balancing = nearest_hostname ``` @@ -688,18 +691,18 @@ load_balancing = nearest_hostname Таким образом, если есть равнозначные реплики, предпочитается ближайшая по имени. Также можно сделать предположение, что при отправке запроса на один и тот же сервер, в случае отсутствия сбоев, распределённый запрос будет идти тоже на одни и те же серверы. То есть, даже если на репликах расположены разные данные, запрос будет возвращать в основном одинаковые результаты. -### In Order {#load_balancing-in_order} +### In Order {#load-balancing-in-order} -```sql +``` sql load_balancing = in_order ``` Реплики с одинаковым количеством ошибок опрашиваются в порядке, определённом конфигурацией. Этот способ подходит для тех случаев, когда вы точно знаете, какая реплика предпочтительнее. -### First or Random {#load_balancing-first_or_random} +### First or Random {#load-balancing-first-or-random} -```sql +``` sql load_balancing = first_or_random ``` @@ -707,44 +710,44 @@ load_balancing = first_or_random Алгоритм `first or random` решает проблему алгоритма `in order`. При использовании `in order`, если одна реплика перестаёт отвечать, то следующая за ней принимает двойную нагрузку, в то время как все остальные обрабатываю свой обычный трафик. Алгоритм `first or random` равномерно распределяет нагрузку между репликами. -## prefer_localhost_replica {#settings-prefer_localhost_replica} +## prefer\_localhost\_replica {#settings-prefer-localhost-replica} Включает или выключает предпочтительное использование localhost реплики при обработке распределенных запросов. Возможные значения: -- 1 — ClickHouse всегда отправляет запрос на localhost реплику, если она существует. -- 0 — ClickHouse использует балансировку, заданную настройкой [load_balancing](#settings-load_balancing). +- 1 — ClickHouse всегда отправляет запрос на localhost реплику, если она существует. +- 0 — ClickHouse использует балансировку, заданную настройкой [load\_balancing](#settings-load_balancing). Значение по умолчанию: 1. !!! warning "Warning" - Отключайте эту настройку при использовании [max_parallel_replicas](#settings-max_parallel_replicas). + Отключайте эту настройку при использовании [max\_parallel\_replicas](#settings-max_parallel_replicas). -## totals_mode +## totals\_mode {#totals-mode} -Каким образом вычислять TOTALS при наличии HAVING, а также при наличии max_rows_to_group_by и group_by_overflow_mode = 'any'. -Смотрите раздел "Модификатор WITH TOTALS". +Каким образом вычислять TOTALS при наличии HAVING, а также при наличии max\_rows\_to\_group\_by и group\_by\_overflow\_mode = ‘any’. +Смотрите раздел «Модификатор WITH TOTALS». -## totals_auto_threshold +## totals\_auto\_threshold {#totals-auto-threshold} Порог для `totals_mode = 'auto'`. -Смотрите раздел "Модификатор WITH TOTALS". +Смотрите раздел «Модификатор WITH TOTALS». -## max_parallel_replicas {#settings-max_parallel_replicas} +## max\_parallel\_replicas {#settings-max-parallel-replicas} Максимальное количество используемых реплик каждого шарда при выполнении запроса. Для консистентности (чтобы получить разные части одного и того же разбиения), эта опция работает только при заданном ключе сэмплирования. 
Отставание реплик не контролируется. -## compile +## compile {#compile} Включить компиляцию запросов. По умолчанию - 0 (выключено). Компиляция предусмотрена только для части конвейера обработки запроса - для первой стадии агрегации (GROUP BY). В случае, если эта часть конвейера была скомпилирована, запрос может работать быстрее, за счёт разворачивания коротких циклов и инлайнинга вызовов агрегатных функций. Максимальный прирост производительности (до четырёх раз в редких случаях) достигается на запросах с несколькими простыми агрегатными функциями. Как правило, прирост производительности незначителен. В очень редких случаях возможно замедление выполнения запроса. -## min_count_to_compile +## min\_count\_to\_compile {#min-count-to-compile} После скольких раз, когда скомпилированный кусок кода мог пригодиться, выполнить его компиляцию. По умолчанию - 3. Для тестирования можно установить значение 0: компиляция выполняется синхронно, и запрос ожидает окончания процесса компиляции перед продолжением выполнения. Во всех остальных случаях используйте значения, начинающиеся с 1. Как правило, компиляция занимает по времени около 5-10 секунд. @@ -753,37 +756,37 @@ load_balancing = first_or_random Скомпилированный код требуется для каждого разного сочетания используемых в запросе агрегатных функций и вида ключей в GROUP BY. Результаты компиляции сохраняются в директории build в виде .so файлов. Количество результатов компиляции не ограничено, так как они не занимают много места. При перезапуске сервера, старые результаты будут использованы, за исключением случая обновления сервера - тогда старые результаты удаляются. -## input_format_skip_unknown_fields +## input\_format\_skip\_unknown\_fields {#input-format-skip-unknown-fields} Если значение равно true, то при выполнении INSERT входные данные из столбцов с неизвестными именами будут пропущены. В противном случае эта ситуация создаст исключение. Работает для форматов JSONEachRow и TSKV. -## output_format_json_quote_64bit_integers {#session_settings-output_format_json_quote_64bit_integers} +## output\_format\_json\_quote\_64bit\_integers {#session-settings-output-format-json-quote-64bit-integers} Если значение истинно, то при использовании JSON\* форматов UInt64 и Int64 числа выводятся в кавычках (из соображений совместимости с большинством реализаций JavaScript), иначе - без кавычек. -## format_csv_delimiter {#settings-format_csv_delimiter} +## format\_csv\_delimiter {#settings-format-csv-delimiter} Символ, интерпретируемый как разделитель в данных формата CSV. По умолчанию — `,`. -## input_format_csv_unquoted_null_literal_as_null {#settings-input_format_csv_unquoted_null_literal_as_null} +## input\_format\_csv\_unquoted\_null\_literal\_as\_null {#settings-input-format-csv-unquoted-null-literal-as-null} Для формата CSV включает или выключает парсинг неэкранированной строки `NULL` как литерала (синоним для `\N`) -## output_format_csv_crlf_end_of_line {#settings-output_format_csv_crlf_end_of_line} +## output\_format\_csv\_crlf\_end\_of\_line {#settings-output-format-csv-crlf-end-of-line} Использовать в качестве разделителя строк для CSV формата CRLF (DOS/Windows стиль) вместо LF (Unix стиль). -## output_format_tsv_crlf_end_of_line {#settings-output_format_tsv_crlf_end_of_line} +## output\_format\_tsv\_crlf\_end\_of\_line {#settings-output-format-tsv-crlf-end-of-line} Использовать в качестве разделителя строк для TSV формата CRLF (DOC/Windows стиль) вместо LF (Unix стиль). 
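To make the CSV settings above concrete, here is a minimal sketch; the `csv_demo` table is invented purely for the example:

``` sql
CREATE TABLE csv_demo (id UInt32, note Nullable(String)) ENGINE = Memory;

SET format_csv_delimiter = ';';                           -- parse ';' instead of the default ','
SET input_format_csv_unquoted_null_literal_as_null = 1;   -- treat an unquoted NULL as \N

INSERT INTO csv_demo FORMAT CSV
1;NULL
2;some text
```

With the second setting left at its default of 0, the unquoted `NULL` token would instead be stored as the literal text `NULL` in this `Nullable(String)` column.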
-## insert_quorum {#settings-insert_quorum} +## insert\_quorum {#settings-insert-quorum} Включает кворумную запись. -- Если `insert_quorum < 2`, то кворумная запись выключена. -- Если `insert_quorum >= 2`, то кворумная запись включена. +- Если `insert_quorum < 2`, то кворумная запись выключена. +- Если `insert_quorum >= 2`, то кворумная запись включена. Значение по умолчанию: 0. @@ -793,19 +796,19 @@ load_balancing = first_or_random Все реплики в кворуме консистентны, т.е. содержат данные всех более ранних запросов `INSERT`. Последовательность `INSERT` линеаризуется. -При чтении данных, записанных с `insert_quorum` можно использовать настройку [select_sequential_consistency](#settings-select_sequential_consistency). +При чтении данных, записанных с `insert_quorum` можно использовать настройку [select\_sequential\_consistency](#settings-select_sequential_consistency). ClickHouse генерирует исключение -- Если количество доступных реплик на момент запроса меньше `insert_quorum`. -- При попытке записать данные в момент, когда предыдущий блок ещё не вставлен в `insert_quorum` реплик. Эта ситуация может возникнуть, если пользователь вызвал `INSERT` прежде, чем завершился предыдущий с `insert_quorum`. +- Если количество доступных реплик на момент запроса меньше `insert_quorum`. +- При попытке записать данные в момент, когда предыдущий блок ещё не вставлен в `insert_quorum` реплик. Эта ситуация может возникнуть, если пользователь вызвал `INSERT` прежде, чем завершился предыдущий с `insert_quorum`. См. также: -- [insert_quorum_timeout](#settings-insert_quorum_timeout) -- [select_sequential_consistency](#settings-select_sequential_consistency) +- [insert\_quorum\_timeout](#settings-insert_quorum_timeout) +- [select\_sequential\_consistency](#settings-select_sequential_consistency) -## insert_quorum_timeout {#settings-insert_quorum_timeout} +## insert\_quorum\_timeout {#settings-insert-quorum-timeout} Время ожидания кворумной записи в секундах. Если время прошло, а запись так не состоялась, то ClickHouse сгенерирует исключение и клиент должен повторить запрос на запись того же блока на эту же или любую другую реплику. @@ -813,17 +816,17 @@ ClickHouse генерирует исключение См. также: -- [insert_quorum](#settings-insert_quorum) -- [select_sequential_consistency](#settings-select_sequential_consistency) +- [insert\_quorum](#settings-insert_quorum) +- [select\_sequential\_consistency](#settings-select_sequential_consistency) -## select_sequential_consistency {#settings-select_sequential_consistency} +## select\_sequential\_consistency {#settings-select-sequential-consistency} Включает или выключает последовательную консистентность для запросов `SELECT`. Возможные значения: -- 0 — выключена. -- 1 — включена. +- 0 — выключена. +- 1 — включена. Значение по умолчанию: 0. @@ -833,127 +836,126 @@ ClickHouse генерирует исключение См. также: -- [insert_quorum](#settings-insert_quorum) -- [insert_quorum_timeout](#settings-insert_quorum_timeout) +- [insert\_quorum](#settings-insert_quorum) +- [insert\_quorum\_timeout](#settings-insert_quorum_timeout) -## insert_deduplicate {#settings-insert_deduplicate} +## insert\_deduplicate {#settings-insert-deduplicate} -Включает и выключает дедупликацию для запросов `INSERT` (для Replicated* таблиц). +Включает и выключает дедупликацию для запросов `INSERT` (для Replicated\* таблиц). Возможные значения: -- 0 — выключена. -- 1 — включена. +- 0 — выключена. +- 1 — включена. Значение по умолчанию: 1. 
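A sketch of how the quorum settings described above are usually combined: quorum on the write side and sequential consistency on the read side. It assumes a hypothetical `Replicated*` table named `replicated_demo`:

``` sql
SET insert_quorum = 2;                         -- the INSERT succeeds only after 2 replicas confirm the block
INSERT INTO replicated_demo VALUES (1, 'a');   -- hypothetical ReplicatedMergeTree table

SET select_sequential_consistency = 1;         -- read only data that has already reached the quorum
SELECT count() FROM replicated_demo;
```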
По умолчанию блоки, вставляемые в реплицируемые таблицы оператором `INSERT`, дедуплицируются (см. [Репликация данных](../table_engines/replication.md)). -## deduplicate_blocks_in_dependent_materialized_views {#settings-deduplicate_blocks_in_dependent_materialized_views} +## deduplicate\_blocks\_in\_dependent\_materialized\_views {#settings-deduplicate-blocks-in-dependent-materialized-views} -Включает и выключает проверку дедупликации для материализованных представлений, которые получают данные из Replicated* таблиц. +Включает и выключает проверку дедупликации для материализованных представлений, которые получают данные из Replicated\* таблиц. Возможные значения: -- 0 — выключена. -- 1 — включена. +- 0 — выключена. +- 1 — включена. Значение по умолчанию: 0. -По умолчанию проверка дедупликации у материализованных представлений не производится, а наследуется от Replicated* (основной) таблицы, за которой "следит" материализованное представление. +По умолчанию проверка дедупликации у материализованных представлений не производится, а наследуется от Replicated\* (основной) таблицы, за которой «следит» материализованное представление. Т.е. если `INSERT` в основную таблицу д.б. пропущен (сдедуплицирован), то автоматически не будет вставки и в материализованные представления. Это имплементировано для того, чтобы работали материализованные представления, которые сильно группируют данные основных `INSERT`, до такой степени что блоки вставляемые в материализованные представления получаются одинаковыми для разных `INSERT` в основную таблицу. -Одновременно это "ломает" идемпотентность вставки в материализованные представления. Т.е. если `INSERT` был успешен в основную таблицу и неуспешен в таблицу материализованного представления (напр. из-за сетевого сбоя при коммуникации с Zookeeper), клиент получит ошибку и попытается повторить `INSERT`. Но вставки в материализованные представления произведено не будет, потому что дедупликация сработает на основной таблице. Настройка `deduplicate_blocks_in_dependent_materialized_views` позволяет это изменить. Т.е. при повторном `INSERT` будет произведена дедупликация на таблице материализованного представления, и повторный инсерт вставит данные в таблицу материализованного представления, которые не удалось вставить из-за сбоя первого `INSERT`. +Одновременно это «ломает» идемпотентность вставки в материализованные представления. Т.е. если `INSERT` был успешен в основную таблицу и неуспешен в таблицу материализованного представления (напр. из-за сетевого сбоя при коммуникации с Zookeeper), клиент получит ошибку и попытается повторить `INSERT`. Но вставки в материализованные представления произведено не будет, потому что дедупликация сработает на основной таблице. Настройка `deduplicate_blocks_in_dependent_materialized_views` позволяет это изменить. Т.е. при повторном `INSERT` будет произведена дедупликация на таблице материализованного представления, и повторный инсерт вставит данные в таблицу материализованного представления, которые не удалось вставить из-за сбоя первого `INSERT`. -## count_distinct_implementation {#settings-count_distinct_implementation} +## count\_distinct\_implementation {#settings-count-distinct-implementation} -Задаёт, какая из функций `uniq*` используется при выполнении конструкции [COUNT(DISTINCT ...)](../../query_language/agg_functions/reference.md#agg_function-count). +Задаёт, какая из функций `uniq*` используется при выполнении конструкции [COUNT(DISTINCT …)](../../query_language/agg_functions/reference.md#agg_function-count). 
Возможные значения: -- [uniq](../../query_language/agg_functions/reference.md#agg_function-uniq) -- [uniqCombined](../../query_language/agg_functions/reference.md#agg_function-uniqcombined) -- [uniqCombined64](../../query_language/agg_functions/reference.md#agg_function-uniqcombined64) -- [uniqHLL12](../../query_language/agg_functions/reference.md#agg_function-uniqhll12) -- [uniqExact](../../query_language/agg_functions/reference.md#agg_function-uniqexact) +- [uniq](../../query_language/agg_functions/reference.md#agg_function-uniq) +- [uniqCombined](../../query_language/agg_functions/reference.md#agg_function-uniqcombined) +- [uniqCombined64](../../query_language/agg_functions/reference.md#agg_function-uniqcombined64) +- [uniqHLL12](../../query_language/agg_functions/reference.md#agg_function-uniqhll12) +- [uniqExact](../../query_language/agg_functions/reference.md#agg_function-uniqexact) Значение по умолчанию: `uniqExact`. -## max_network_bytes {#settings-max_network_bytes} +## max\_network\_bytes {#settings-max-network-bytes} Ограничивает объём данных (в байтах), который принимается или передается по сети при выполнении запроса. Параметр применяется к каждому отдельному запросу. Возможные значения: -- Положительное целое число. -- 0 — контроль объёма данных отключен. +- Положительное целое число. +- 0 — контроль объёма данных отключен. Значение по умолчанию: 0. -## max_network_bandwidth {#settings-max_network_bandwidth} +## max\_network\_bandwidth {#settings-max-network-bandwidth} Ограничивает скорость обмена данными по сети в байтах в секунду. Параметр применяется к каждому отдельному запросу. Возможные значения: -- Положительное целое число. -- 0 — контроль скорости передачи данных отключен. +- Положительное целое число. +- 0 — контроль скорости передачи данных отключен. Значение по умолчанию: 0. -## max_network_bandwidth_for_user {#settings-max_network_bandwidth_for_user} +## max\_network\_bandwidth\_for\_user {#settings-max-network-bandwidth-for-user} Ограничивает скорость обмена данными по сети в байтах в секунду. Этот параметр применяется ко всем одновременно выполняемым запросам, запущенным одним пользователем. Возможные значения: -- Положительное целое число. -- 0 — управление скоростью передачи данных отключено. +- Положительное целое число. +- 0 — управление скоростью передачи данных отключено. Значение по умолчанию: 0. -## max_network_bandwidth_for_all_users {#settings-max_network_bandwidth_for_all_users} +## max\_network\_bandwidth\_for\_all\_users {#settings-max-network-bandwidth-for-all-users} Ограничивает скорость обмена данными по сети в байтах в секунду. Этот параметр применяется ко всем одновременно выполняемым запросам на сервере. Возможные значения: -- Положительное целое число. -- 0 — управление скоростью передачи данных отключено. +- Положительное целое число. +- 0 — управление скоростью передачи данных отключено. Значение по умолчанию: 0. - -## skip_unavailable_shards {#settings-skip_unavailable_shards} +## skip\_unavailable\_shards {#settings-skip-unavailable-shards} Включает или отключает тихий пропуск недоступных шардов. Шард считается недоступным, если все его реплики недоступны. Реплика недоступна в следующих случаях: -- ClickHouse не может установить соединение с репликой по любой причине. +- ClickHouse не может установить соединение с репликой по любой причине. - ClickHouse предпринимает несколько попыток подключиться к реплике. Если все попытки оказались неудачными, реплика считается недоступной. 
+ ClickHouse предпринимает несколько попыток подключиться к реплике. Если все попытки оказались неудачными, реплика считается недоступной. -- Реплика не может быть разрешена с помощью DNS. +- Реплика не может быть разрешена с помощью DNS. - Если имя хоста реплики не может быть разрешено с помощью DNS, это может указывать на следующие ситуации: + Если имя хоста реплики не может быть разрешено с помощью DNS, это может указывать на следующие ситуации: - - Нет записи DNS для хоста. Это может происходить в системах с динамическим DNS, например, [Kubernetes](https://kubernetes.io), где отключенные ноды не разрешаться с помощью DNS и это не ошибка. + - Нет записи DNS для хоста. Это может происходить в системах с динамическим DNS, например, [Kubernetes](https://kubernetes.io), где отключенные ноды не разрешаться с помощью DNS и это не ошибка. - - Ошибка конфигурации. Конфигурационный файл ClickHouse может содержать неправильное имя хоста. + - Ошибка конфигурации. Конфигурационный файл ClickHouse может содержать неправильное имя хоста. Возможные значения: -- 1 — пропуск включен. +- 1 — пропуск включен. - Если шард недоступен, то ClickHouse возвращает результат, основанный на неполных данных и не оповещает о проблемах с доступностью хостов. + Если шард недоступен, то ClickHouse возвращает результат, основанный на неполных данных и не оповещает о проблемах с доступностью хостов. -- 0 — пропуск выключен. +- 0 — пропуск выключен. - Если шард недоступен, то ClickHouse генерирует исключение. + Если шард недоступен, то ClickHouse генерирует исключение. Значение по умолчанию: 0. -## optimize_throw_if_noop {#setting-optimize_throw_if_noop} +## optimize\_throw\_if\_noop {#setting-optimize-throw-if-noop} Включает или отключает генерирование исключения в в случаях, когда запрос [OPTIMIZE](../../query_language/misc.md#misc_operations-optimize) не выполняет мёрж. @@ -961,33 +963,32 @@ ClickHouse генерирует исключение Возможные значения: -- 1 — генерирование исключения включено. -- 0 — генерирование исключения выключено. +- 1 — генерирование исключения включено. +- 0 — генерирование исключения выключено. Значение по умолчанию: 0. -## distributed_directory_monitor_sleep_time_ms {#distributed_directory_monitor_sleep_time_ms} +## distributed\_directory\_monitor\_sleep\_time\_ms {#distributed-directory-monitor-sleep-time-ms} Основной интервал отправки данных движком таблиц [Distributed](../table_engines/distributed.md). Фактический интервал растёт экспоненциально при возникновении ошибок. Возможные значения: -- Положительное целое количество миллисекунд. +- Положительное целое количество миллисекунд. Значение по умолчанию: 100 миллисекунд. +## distributed\_directory\_monitor\_max\_sleep\_time\_ms {#distributed-directory-monitor-max-sleep-time-ms} -## distributed_directory_monitor_max_sleep_time_ms {#distributed_directory_monitor_max_sleep_time_ms} - -Максимальный интервал отправки данных движком таблиц [Distributed](../table_engines/distributed.md). Ограничивает экпоненциальный рост интервала, установленого настройкой [distributed_directory_monitor_sleep_time_ms](#distributed_directory_monitor_sleep_time_ms). +Максимальный интервал отправки данных движком таблиц [Distributed](../table_engines/distributed.md). Ограничивает экпоненциальный рост интервала, установленого настройкой [distributed\_directory\_monitor\_sleep\_time\_ms](#distributed_directory_monitor_sleep_time_ms). Возможные значения: -- Положительное целое количество миллисекунд. +- Положительное целое количество миллисекунд. 
Значение по умолчанию: 30000 миллисекунд (30 секунд). -## distributed_directory_monitor_batch_inserts {#distributed_directory_monitor_batch_inserts} +## distributed\_directory\_monitor\_batch\_inserts {#distributed-directory-monitor-batch-inserts} Включает/выключает пакетную отправку вставленных данных. @@ -995,12 +996,12 @@ ClickHouse генерирует исключение Возможные значения: -- 1 — включено. -- 0 — выключено. +- 1 — включено. +- 0 — выключено. Значение по умолчанию: 0. -## os_thread_priority {#setting-os_thread_priority} +## os\_thread\_priority {#setting-os-thread-priority} Устанавливает приоритет ([nice](https://en.wikipedia.org/wiki/Nice_(Unix))) для потоков, исполняющих запросы. Планировщик ОС учитывает эти приоритеты при выборе следующего потока для исполнения на доступном ядре CPU. @@ -1009,26 +1010,26 @@ ClickHouse генерирует исключение Допустимые значения: -- Любое значение из диапазона `[-20, 19]`. +- Любое значение из диапазона `[-20, 19]`. Более низкие значения означают более высокий приоритет. Потоки с низкими значениями приоритета `nice` выполняются чаще, чем потоки с более высокими значениями. Высокие значения предпочтительно использовать для долгих неинтерактивных запросов, поскольку это позволяет бысто выделить ресурс в пользу коротких интерактивных запросов. Значение по умолчанию: 0. -## query_profiler_real_time_period_ns {#query_profiler_real_time_period_ns} +## query\_profiler\_real\_time\_period\_ns {#query-profiler-real-time-period-ns} Sets the period for a real clock timer of the [query profiler](../../operations/performance/sampling_query_profiler.md). Real clock timer counts wall-clock time. Possible values: -- Positive integer number, in nanoseconds. +- Positive integer number, in nanoseconds. - Recommended values: + Recommended values: - - 10000000 (100 times a second) nanoseconds and less for single queries. - - 1000000000 (once a second) for cluster-wide profiling. + - 10000000 (100 times a second) nanoseconds and less for single queries. + - 1000000000 (once a second) for cluster-wide profiling. -- 0 for turning off the timer. +- 0 for turning off the timer. Type: [UInt64](../../data_types/int_uint.md). @@ -1036,22 +1037,22 @@ Default value: 1000000000 nanoseconds (once a second). See also: -- System table [trace_log](../../operations/system_tables.md#system_tables-trace_log) +- System table [trace\_log](../../operations/system_tables.md#system_tables-trace_log) -## query_profiler_cpu_time_period_ns {#query_profiler_cpu_time_period_ns} +## query\_profiler\_cpu\_time\_period\_ns {#query-profiler-cpu-time-period-ns} Sets the period for a CPU clock timer of the [query profiler](../../operations/performance/sampling_query_profiler.md). This timer counts only CPU time. Possible values: -- Positive integer number of nanoseconds. +- Positive integer number of nanoseconds. - Recommended values: + Recommended values: - - 10000000 (100 times a second) nanosecods and more for for single queries. - - 1000000000 (once a second) for cluster-wide profiling. + - 10000000 (100 times a second) nanosecods and more for for single queries. + - 1000000000 (once a second) for cluster-wide profiling. -- 0 for turning off the timer. +- 0 for turning off the timer. Type: [UInt64](../../data_types/int_uint.md). @@ -1059,22 +1060,22 @@ Default value: 1000000000 nanoseconds. 
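A minimal sketch of enabling the CPU-time profiler for a session and then checking which queries produced the most samples; it assumes the `trace_log` server configuration section is enabled, so that `system.trace_log` is populated:

``` sql
SET query_profiler_cpu_time_period_ns = 10000000;   -- sample CPU time roughly 100 times a second

SELECT sum(number) FROM numbers(1000000000);        -- any query that runs long enough to be sampled

SELECT query_id, count() AS samples
FROM system.trace_log
GROUP BY query_id
ORDER BY samples DESC
LIMIT 10;
```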
See also: -- System table [trace_log](../../operations/system_tables.md#system_tables-trace_log) +- System table [trace\_log](../../operations/system_tables.md#system_tables-trace_log) -## allow_introspection_functions {#settings-allow_introspection_functions} +## allow\_introspection\_functions {#settings-allow-introspection-functions} Enables of disables [introspections functions](../../query_language/functions/introspection.md) for query profiling. Possible values: -- 1 — Introspection functions enabled. -- 0 — Introspection functions disabled. +- 1 — Introspection functions enabled. +- 0 — Introspection functions disabled. Default value: 0. **See Also** -- [Sampling Query Profiler](../performance/sampling_query_profiler.md) -- System table [trace_log](../../operations/system_tables.md#system_tables-trace_log) +- [Sampling Query Profiler](../performance/sampling_query_profiler.md) +- System table [trace\_log](../../operations/system_tables.md#system_tables-trace_log) [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/settings/settings/) diff --git a/docs/ru/operations/settings/settings_profiles.md b/docs/ru/operations/settings/settings_profiles.md index 91cf60b5b2b..a0a8b4ba0ba 100644 --- a/docs/ru/operations/settings/settings_profiles.md +++ b/docs/ru/operations/settings/settings_profiles.md @@ -1,5 +1,4 @@ - -# Профили настроек +# Профили настроек {#profili-nastroek} Профили настроек - это множество настроек, сгруппированных под одним именем. Для каждого пользователя ClickHouse указывается некоторый профиль. Все настройки профиля можно применить, установив настройку `profile`. @@ -8,7 +7,7 @@ Установить профиль `web`. -```sql +``` sql SET profile = 'web' ``` @@ -16,7 +15,7 @@ SET profile = 'web' Пример: -```xml +``` xml diff --git a/docs/ru/operations/settings/settings_users.md b/docs/ru/operations/settings/settings_users.md index 55a696e79eb..b010358e547 100644 --- a/docs/ru/operations/settings/settings_users.md +++ b/docs/ru/operations/settings/settings_users.md @@ -1,10 +1,10 @@ -# Настройки пользователей +# Настройки пользователей {#nastroiki-polzovatelei} Раздел `users` конфигурационного файла `user.xml` содержит настройки для пользователей. Структура раздела `users`: -```xml +``` xml @@ -31,63 +31,63 @@ ``` -### user_name/password +### user\_name/password {#user-namepassword} Пароль можно указать в текстовом виде или в виде SHA256 (шестнадцатеричный формат). -- Чтобы назначить пароль в текстовом виде (**не рекомендуем**), поместите его в элемент `password`. +- Чтобы назначить пароль в текстовом виде (**не рекомендуем**), поместите его в элемент `password`. - Например, `qwerty`. Пароль можно оставить пустым. + Например, `qwerty`. Пароль можно оставить пустым. -- Чтобы назначить пароль в виде SHA256, поместите хэш в элемент `password_sha256_hex`. +- Чтобы назначить пароль в виде SHA256, поместите хэш в элемент `password_sha256_hex`. - Например, `65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5`. + Например, `65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5`. - Пример создания пароля в командной строке: + Пример создания пароля в командной строке: - ``` - PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-' - ``` + ``` + PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-' + ``` - Первая строка результата — пароль. Вторая строка — соответствующий ему хэш SHA256. + Первая строка результата — пароль. 
Вторая строка — соответствующий ему хэш SHA256. -- Для совместимости с клиентами MySQL, пароль можно задать с помощью двойного хэша SHA1, поместив его в элемент `password_double_sha1_hex`. +- Для совместимости с клиентами MySQL, пароль можно задать с помощью двойного хэша SHA1, поместив его в элемент `password_double_sha1_hex`. - Например, `08b4a0f1de6ad37da17359e592c8d74788a83eb0`. + Например, `08b4a0f1de6ad37da17359e592c8d74788a83eb0`. - Пример создания пароля в командной строке: + Пример создания пароля в командной строке: - ``` - PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-' - ``` + ``` + PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-' + ``` - Первая строка результата — пароль. Вторая строка — соответствующий ему двойной хэш SHA1. + Первая строка результата — пароль. Вторая строка — соответствующий ему двойной хэш SHA1. -### user_name/networks +### user\_name/networks {#user-namenetworks} Список сетей, из которых пользователь может подключиться к серверу ClickHouse. Каждый элемент списка имеет одну из следующих форм: -- `` — IP-адрес или маска подсети. +- `` — IP-адрес или маска подсети. - Примеры: `213.180.204.3`, `10.0.0.1/8`, `10.0.0.1/255.255.255.0`, `2a02:6b8::3`, `2a02:6b8::3/64`, `2a02:6b8::3/ffff:ffff:ffff:ffff::`. + Примеры: `213.180.204.3`, `10.0.0.1/8`, `10.0.0.1/255.255.255.0`, `2a02:6b8::3`, `2a02:6b8::3/64`, `2a02:6b8::3/ffff:ffff:ffff:ffff::`. -- `` — Имя хоста. +- `` — Имя хоста. - Пример: `example01.host.ru`. + Пример: `example01.host.ru`. - Для проверки доступа выполняется DNS-запрос, и все возвращенные IP-адреса сравниваются с адресом клиента. + Для проверки доступа выполняется DNS-запрос, и все возвращенные IP-адреса сравниваются с адресом клиента. -- `` — Регулярное выражение для имен хостов. +- `` — Регулярное выражение для имен хостов. - Пример, `^example\d\d-\d\d-\d\.host\.ru$` + Пример, `^example\d\d-\d\d-\d\.host\.ru$` - Для проверки доступа выполняется [DNS запрос PTR](https://en.wikipedia.org/wiki/Reverse_DNS_lookup) для адреса клиента, а затем применяется заданное регулярное выражение. Затем, для результатов запроса PTR выполняется другой DNS-запрос и все полученные адреса сравниваются с адресом клиента. Рекомендуем завершать регулярное выражение символом $. + Для проверки доступа выполняется [DNS запрос PTR](https://en.wikipedia.org/wiki/Reverse_DNS_lookup) для адреса клиента, а затем применяется заданное регулярное выражение. Затем, для результатов запроса PTR выполняется другой DNS-запрос и все полученные адреса сравниваются с адресом клиента. Рекомендуем завершать регулярное выражение символом $. Все результаты DNS-запросов кэшируются до перезапуска сервера. @@ -95,7 +95,7 @@ Чтобы открыть доступ пользователю из любой сети, укажите: -```xml +``` xml ::/0 ``` @@ -104,22 +104,22 @@ Чтобы открыть только локальный доступ, укажите: -```xml +``` xml ::1 127.0.0.1 ``` -### user_name/profile +### user\_name/profile {#user-nameprofile} Пользователю можно назначить профиль настроек. Профили настроек конфигурируются в отдельной секции файла `users.xml`. Подробнее читайте в разделе [Профили настроек](settings_profiles.md). -### user_name/quota +### user\_name/quota {#user-namequota} Квотирование позволяет отслеживать или ограничивать использование ресурсов в течение определённого периода времени. Квоты настраиваются в разделе `quotas` конфигурационного файла `users.xml`. 
Пользователю можно назначить квоты. Подробное описание настройки квот смотрите в разделе [Квоты](../quotas.md#quotas). -### user_name/databases +### user\_name/databases {#user-namedatabases} В этом разделе вы можете ограничить выдачу ClickHouse запросами `SELECT` для конкретного пользователя, таким образом реализуя базовую защиту на уровне строк. @@ -127,7 +127,7 @@ Следующая конфигурация задаёт, что пользователь `user1` в результате запросов `SELECT` может получать только те строки `table1`, в которых значение поля `id` равно 1000. -```xml +``` xml diff --git a/docs/ru/operations/system_tables.md b/docs/ru/operations/system_tables.md index dad397462d9..7dd4e5b6648 100644 --- a/docs/ru/operations/system_tables.md +++ b/docs/ru/operations/system_tables.md @@ -1,4 +1,4 @@ -# Системные таблицы +# Системные таблицы {#sistemnye-tablitsy} Системные таблицы используются для реализации части функциональности системы, а также предоставляют доступ к информации о работе системы. Вы не можете удалить системную таблицу (хотя можете сделать DETACH). @@ -6,22 +6,22 @@ В системные таблицы нельзя записывать данные - можно только читать. Системные таблицы расположены в базе данных system. -## system.asynchronous_metrics {#system_tables-asynchronous_metrics} +## system.asynchronous\_metrics {#system-tables-asynchronous-metrics} Содержит метрики, которые периодически вычисляются в фоновом режиме. Например, объём используемой оперативной памяти. Столбцы: -- `metric` ([String](../data_types/string.md)) — название метрики. -- `value` ([Float64](../data_types/float.md)) — значение метрики. +- `metric` ([String](../data_types/string.md)) — название метрики. +- `value` ([Float64](../data_types/float.md)) — значение метрики. **Пример** -```sql +``` sql SELECT * FROM system.asynchronous_metrics LIMIT 10 ``` -```text +``` text ┌─metric──────────────────────────────────┬──────value─┐ │ jemalloc.background_thread.run_interval │ 0 │ │ jemalloc.background_thread.num_runs │ 0 │ @@ -38,17 +38,17 @@ SELECT * FROM system.asynchronous_metrics LIMIT 10 **Смотрите также** -- [Мониторинг](monitoring.md) — основы мониторинга в ClickHouse. -- [system.metrics](#system_tables-metrics) — таблица с мгновенно вычисляемыми метриками. -- [system.events](#system_tables-events) — таблица с количеством произошедших событий. -- [system.metric_log](#system_tables-metric_log) — таблица фиксирующая историю значений метрик из `system.metrics` и `system.events`. +- [Мониторинг](monitoring.md) — основы мониторинга в ClickHouse. +- [system.metrics](#system_tables-metrics) — таблица с мгновенно вычисляемыми метриками. +- [system.events](#system_tables-events) — таблица с количеством произошедших событий. +- [system.metric\_log](#system_tables-metric_log) — таблица фиксирующая историю значений метрик из `system.metrics` и `system.events`. -## system.clusters +## system.clusters {#system-clusters} Содержит информацию о доступных в конфигурационном файле кластерах и серверах, которые в них входят. Столбцы: -```text +``` text cluster String — имя кластера. shard_num UInt32 — номер шарда в кластере, начиная с 1. shard_weight UInt32 — относительный вес шарда при записи данных @@ -59,7 +59,7 @@ port UInt16 — порт, на который обращаться для сое user String — имя пользователя, которого использовать для соединения с сервером. ``` -## system.columns +## system.columns {#system-columns} Содержит информацию о столбцах всех таблиц. 
@@ -67,35 +67,36 @@ user String — имя пользователя, которого использ Таблица `system.columns` содержит столбцы (тип столбца указан в скобках): -- `database` (String) — имя базы данных. -- `table` (String) — имя таблицы. -- `name` (String) — имя столбца. -- `type` (String) — тип столбца. -- `default_kind` (String) — тип выражения (`DEFAULT`, `MATERIALIZED`, `ALIAS`) значения по умолчанию, или пустая строка. -- `default_expression` (String) — выражение для значения по умолчанию или пустая строка. -- `data_compressed_bytes` (UInt64) — размер сжатых данных в байтах. -- `data_uncompressed_bytes` (UInt64) — размер распакованных данных в байтах. -- `marks_bytes` (UInt64) — размер засечек в байтах. -- `comment` (String) — комментарий к столбцу или пустая строка. -- `is_in_partition_key` (UInt8) — флаг, показывающий включение столбца в ключ партиционирования. -- `is_in_sorting_key` (UInt8) — флаг, показывающий включение столбца в ключ сортировки. -- `is_in_primary_key` (UInt8) — флаг, показывающий включение столбца в первичный ключ. -- `is_in_sampling_key` (UInt8) — флаг, показывающий включение столбца в ключ выборки. +- `database` (String) — имя базы данных. +- `table` (String) — имя таблицы. +- `name` (String) — имя столбца. +- `type` (String) — тип столбца. +- `default_kind` (String) — тип выражения (`DEFAULT`, `MATERIALIZED`, `ALIAS`) значения по умолчанию, или пустая строка. +- `default_expression` (String) — выражение для значения по умолчанию или пустая строка. +- `data_compressed_bytes` (UInt64) — размер сжатых данных в байтах. +- `data_uncompressed_bytes` (UInt64) — размер распакованных данных в байтах. +- `marks_bytes` (UInt64) — размер засечек в байтах. +- `comment` (String) — комментарий к столбцу или пустая строка. +- `is_in_partition_key` (UInt8) — флаг, показывающий включение столбца в ключ партиционирования. +- `is_in_sorting_key` (UInt8) — флаг, показывающий включение столбца в ключ сортировки. +- `is_in_primary_key` (UInt8) — флаг, показывающий включение столбца в первичный ключ. +- `is_in_sampling_key` (UInt8) — флаг, показывающий включение столбца в ключ выборки. -## system.contributors {#system_contributors} +## system.contributors {#system-contributors} Содержит информацию о контрибьютерах. Контрибьютеры расположены в таблице в случайном порядке. Порядок определяется заново при каждом запросе. Столбцы: -- `name` (String) — Имя контрибьютера (автора коммита) из git log. +- `name` (String) — Имя контрибьютера (автора коммита) из git log. **Пример** -```sql +``` sql SELECT * FROM system.contributors LIMIT 10 ``` -```text + +``` text ┌─name─────────────┐ │ Olga Khvostikova │ │ Max Vetrov │ @@ -112,67 +113,67 @@ SELECT * FROM system.contributors LIMIT 10 Чтобы найти себя в таблице, выполните запрос: -```sql +``` sql SELECT * FROM system.contributors WHERE name='Olga Khvostikova' ``` -```text + +``` text ┌─name─────────────┐ │ Olga Khvostikova │ └──────────────────┘ ``` -## system.databases +## system.databases {#system-databases} Таблица содержит один столбец name типа String - имя базы данных. Для каждой базы данных, о которой знает сервер, будет присутствовать соответствующая запись в таблице. Эта системная таблица используется для реализации запроса `SHOW DATABASES`. -## system.detached_parts {#system_tables-detached_parts} +## system.detached\_parts {#system-tables-detached-parts} Содержит информацию об отсоединённых кусках таблиц семейства [MergeTree](table_engines/mergetree.md). Столбец `reason` содержит причину, по которой кусок был отсоединён. 
Для кусов, отсоединённых пользователем, `reason` содержит пустую строку. -Такие куски могут быть присоединены с помощью [ALTER TABLE ATTACH PARTITION|PART](../query_language/query_language/alter/#alter_attach-partition). Остальные столбцы описаны в [system.parts](#system_tables-parts). +Такие куски могут быть присоединены с помощью [ALTER TABLE ATTACH PARTITION\|PART](../query_language/query_language/alter/#alter_attach-partition). Остальные столбцы описаны в [system.parts](#system_tables-parts). Если имя куска некорректно, значения некоторых столбцов могут быть `NULL`. Такие куски могут быть удалены с помощью [ALTER TABLE DROP DETACHED PART](../query_language/query_language/alter/#alter_drop-detached). -## system.dictionaries +## system.dictionaries {#system-dictionaries} Содержит информацию о внешних словарях. Столбцы: -- `name String` — Имя словаря. -- `type String` — Тип словаря: Flat, Hashed, Cache. -- `origin String` — Путь к конфигурационному файлу, в котором описан словарь. -- `attribute.names Array(String)` — Массив имён атрибутов, предоставляемых словарём. -- `attribute.types Array(String)` — Соответствующий массив типов атрибутов, предоставляемых словарём. -- `has_hierarchy UInt8` — Является ли словарь иерархическим. -- `bytes_allocated UInt64` — Количество оперативной памяти, которое использует словарь. -- `hit_rate Float64` — Для cache-словарей - доля использований, для которых значение было в кэше. -- `element_count UInt64` — Количество хранящихся в словаре элементов. -- `load_factor Float64` — Доля заполненности словаря (для hashed словаря - доля заполнения хэш-таблицы). -- `creation_time DateTime` — Время создания или последней успешной перезагрузки словаря. -- `last_exception String` — Текст ошибки, возникшей при создании или перезагрузке словаря, если словарь не удалось создать. -- `source String` - Текст, описывающий источник данных для словаря. - +- `name String` — Имя словаря. +- `type String` — Тип словаря: Flat, Hashed, Cache. +- `origin String` — Путь к конфигурационному файлу, в котором описан словарь. +- `attribute.names Array(String)` — Массив имён атрибутов, предоставляемых словарём. +- `attribute.types Array(String)` — Соответствующий массив типов атрибутов, предоставляемых словарём. +- `has_hierarchy UInt8` — Является ли словарь иерархическим. +- `bytes_allocated UInt64` — Количество оперативной памяти, которое использует словарь. +- `hit_rate Float64` — Для cache-словарей - доля использований, для которых значение было в кэше. +- `element_count UInt64` — Количество хранящихся в словаре элементов. +- `load_factor Float64` — Доля заполненности словаря (для hashed словаря - доля заполнения хэш-таблицы). +- `creation_time DateTime` — Время создания или последней успешной перезагрузки словаря. +- `last_exception String` — Текст ошибки, возникшей при создании или перезагрузке словаря, если словарь не удалось создать. +- `source String` - Текст, описывающий источник данных для словаря. Заметим, что количество оперативной памяти, которое использует словарь, не является пропорциональным количеству элементов, хранящихся в словаре. Так, для flat и cached словарей, все ячейки памяти выделяются заранее, независимо от реальной заполненности словаря. -## system.events {#system_tables-events} +## system.events {#system-tables-events} Содержит информацию о количестве событий, произошедших в системе. Например, в таблице можно найти, сколько запросов `SELECT` обработано с момента запуска сервера ClickHouse. Столбцы: -- `event` ([String](../data_types/string.md)) — имя события. 
-- `value` ([UInt64](../data_types/int_uint.md)) — количество произошедших событий. -- `description` ([String](../data_types/string.md)) — описание события. +- `event` ([String](../data_types/string.md)) — имя события. +- `value` ([UInt64](../data_types/int_uint.md)) — количество произошедших событий. +- `description` ([String](../data_types/string.md)) — описание события. **Пример** -```sql +``` sql SELECT * FROM system.events LIMIT 5 ``` -```text +``` text ┌─event─────────────────────────────────┬─value─┬─description────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ Query │ 12 │ Number of queries to be interpreted and potentially executed. Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries. │ │ SelectQuery │ 8 │ Same as Query, but only for SELECT queries. │ @@ -184,76 +185,75 @@ SELECT * FROM system.events LIMIT 5 **Смотрите также** -- [system.asynchronous_metrics](#system_tables-asynchronous_metrics) — таблица с периодически вычисляемыми метриками. -- [system.metrics](#system_tables-metrics) — таблица с мгновенно вычисляемыми метриками. -- [system.metric_log](#system_tables-metric_log) — таблица фиксирующая историю значений метрик из `system.metrics` и `system.events`. -- [Мониторинг](monitoring.md) — основы мониторинга в ClickHouse. +- [system.asynchronous\_metrics](#system_tables-asynchronous_metrics) — таблица с периодически вычисляемыми метриками. +- [system.metrics](#system_tables-metrics) — таблица с мгновенно вычисляемыми метриками. +- [system.metric\_log](#system_tables-metric_log) — таблица фиксирующая историю значений метрик из `system.metrics` и `system.events`. +- [Мониторинг](monitoring.md) — основы мониторинга в ClickHouse. -## system.functions +## system.functions {#system-functions} Содержит информацию об обычных и агрегатных функциях. Столбцы: -- `name` (`String`) – Имя функции. -- `is_aggregate` (`UInt8`) – Признак, является ли функция агрегатной. +- `name` (`String`) – Имя функции. +- `is_aggregate` (`UInt8`) – Признак, является ли функция агрегатной. -## system.graphite_retentions +## system.graphite\_retentions {#system-graphite-retentions} -Содержит информацию о том, какие параметры [graphite_rollup](server_settings/settings.md#server_settings-graphite_rollup) используются в таблицах с движками [\*GraphiteMergeTree](table_engines/graphitemergetree.md). +Содержит информацию о том, какие параметры [graphite\_rollup](server_settings/settings.md#server_settings-graphite_rollup) используются в таблицах с движками [\*GraphiteMergeTree](table_engines/graphitemergetree.md). Столбцы: -- `config_name` (String) - Имя параметра, используемого для `graphite_rollup`. -- `regexp` (String) - Шаблон имени метрики. -- `function` (String) - Имя агрегирующей функции. -- `age` (UInt64) - Минимальный возраст данных в секундах. -- `precision` (UInt64) - Точность определения возраста данных в секундах. -- `priority` (UInt16) - Приоритет раздела pattern. -- `is_default` (UInt8) - Является ли раздел pattern дефолтным. -- `Tables.database` (Array(String)) - Массив имён баз данных таблиц, использующих параметр `config_name`. 
-- `Tables.table` (Array(String)) - Массив имён таблиц, использующих параметр `config_name`. +- `config_name` (String) - Имя параметра, используемого для `graphite_rollup`. +- `regexp` (String) - Шаблон имени метрики. +- `function` (String) - Имя агрегирующей функции. +- `age` (UInt64) - Минимальный возраст данных в секундах. +- `precision` (UInt64) - Точность определения возраста данных в секундах. +- `priority` (UInt16) - Приоритет раздела pattern. +- `is_default` (UInt8) - Является ли раздел pattern дефолтным. +- `Tables.database` (Array(String)) - Массив имён баз данных таблиц, использующих параметр `config_name`. +- `Tables.table` (Array(String)) - Массив имён таблиц, использующих параметр `config_name`. - -## system.merges +## system.merges {#system-merges} Содержит информацию о производящихся прямо сейчас слияниях и мутациях кусков для таблиц семейства MergeTree. Столбцы: -- `database String` — Имя базы данных, в которой находится таблица. -- `table String` — Имя таблицы. -- `elapsed Float64` — Время в секундах, прошедшее от начала выполнения слияния. -- `progress Float64` — Доля выполненной работы от 0 до 1. -- `num_parts UInt64` — Количество сливаемых кусков. -- `result_part_name String` — Имя куска, который будет образован в результате слияния. -- `is_mutation UInt8` - Является ли данный процесс мутацией куска. -- `total_size_bytes_compressed UInt64` — Суммарный размер сжатых данных сливаемых кусков. -- `total_size_marks UInt64` — Суммарное количество засечек в сливаемых кусках. -- `bytes_read_uncompressed UInt64` — Количество прочитанных байт, разжатых. -- `rows_read UInt64` — Количество прочитанных строк. -- `bytes_written_uncompressed UInt64` — Количество записанных байт, несжатых. -- `rows_written UInt64` — Количество записанных строк. +- `database String` — Имя базы данных, в которой находится таблица. +- `table String` — Имя таблицы. +- `elapsed Float64` — Время в секундах, прошедшее от начала выполнения слияния. +- `progress Float64` — Доля выполненной работы от 0 до 1. +- `num_parts UInt64` — Количество сливаемых кусков. +- `result_part_name String` — Имя куска, который будет образован в результате слияния. +- `is_mutation UInt8` - Является ли данный процесс мутацией куска. +- `total_size_bytes_compressed UInt64` — Суммарный размер сжатых данных сливаемых кусков. +- `total_size_marks UInt64` — Суммарное количество засечек в сливаемых кусках. +- `bytes_read_uncompressed UInt64` — Количество прочитанных байт, разжатых. +- `rows_read UInt64` — Количество прочитанных строк. +- `bytes_written_uncompressed UInt64` — Количество записанных байт, несжатых. +- `rows_written UInt64` — Количество записанных строк. -## system.metrics {#system_tables-metrics} +## system.metrics {#system-tables-metrics} Содержит метрики, которые могут быть рассчитаны мгновенно или имеют текущее значение. Например, число одновременно обрабатываемых запросов или текущее значение задержки реплики. Эта таблица всегда актуальна. Столбцы: -- `metric` ([String](../data_types/string.md)) — название метрики. -- `value` ([Int64](../data_types/int_uint.md)) — значение метрики. -- `description` ([String](../data_types/string.md)) — описание метрики. +- `metric` ([String](../data_types/string.md)) — название метрики. +- `value` ([Int64](../data_types/int_uint.md)) — значение метрики. +- `description` ([String](../data_types/string.md)) — описание метрики. 
Список поддержанных метрик смотрите в файле [dbms/src/Common/CurrentMetrics.cpp](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Common/CurrentMetrics.cpp). **Пример** -```sql +``` sql SELECT * FROM system.metrics LIMIT 10 ``` -```text +``` text ┌─metric─────────────────────┬─value─┬─description──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ Query │ 1 │ Number of executing queries │ │ Merge │ 0 │ Number of executing background merges │ @@ -270,16 +270,17 @@ SELECT * FROM system.metrics LIMIT 10 **Смотрите также** -- [system.asynchronous_metrics](#system_tables-asynchronous_metrics) — таблица с периодически вычисляемыми метриками. -- [system.events](#system_tables-events) — таблица с количеством произошедших событий. -- [system.metric_log](#system_tables-metric_log) — таблица фиксирующая историю значений метрик из `system.metrics` и `system.events`. -- [Мониторинг](monitoring.md) — основы мониторинга в ClickHouse. +- [system.asynchronous\_metrics](#system_tables-asynchronous_metrics) — таблица с периодически вычисляемыми метриками. +- [system.events](#system_tables-events) — таблица с количеством произошедших событий. +- [system.metric\_log](#system_tables-metric_log) — таблица фиксирующая историю значений метрик из `system.metrics` и `system.events`. +- [Мониторинг](monitoring.md) — основы мониторинга в ClickHouse. -## system.metric_log {#system_tables-metric_log} +## system.metric\_log {#system-tables-metric-log} Содержит историю значений метрик из таблиц `system.metrics` и `system.events`, периодически сбрасываемую на диск. -Для включения сбора истории метрик в таблице `system.metric_log` создайте `/etc/clickhouse-server/config.d/metric_log.xml` следующего содержания: -```xml +Для включения сбора истории метрик в таблице `system.metric_log` создайте `/etc/clickhouse-server/config.d/metric_log.xml` следующего содержания: + +``` xml system @@ -292,12 +293,11 @@ SELECT * FROM system.metrics LIMIT 10 **Пример** -```sql +``` sql SELECT * FROM system.metric_log LIMIT 1 FORMAT Vertical; ``` - -```text +``` text Row 1: ────── event_date: 2020-02-18 @@ -321,34 +321,34 @@ CurrentMetric_PartMutation: 0 CurrentMetric_ReplicatedFetch: 0 CurrentMetric_ReplicatedSend: 0 CurrentMetric_ReplicatedChecks: 0 -... +... ``` **Смотрите также** -- [system.asynchronous_metrics](#system_tables-asynchronous_metrics) — таблица с периодически вычисляемыми метриками. -- [system.events](#system_tables-events) — таблица с количеством произошедших событий. -- [system.metrics](#system_tables-metrics) — таблица с мгновенно вычисляемыми метриками. -- [Мониторинг](monitoring.md) — основы мониторинга в ClickHouse. +- [system.asynchronous\_metrics](#system_tables-asynchronous_metrics) — таблица с периодически вычисляемыми метриками. +- [system.events](#system_tables-events) — таблица с количеством произошедших событий. +- [system.metrics](#system_tables-metrics) — таблица с мгновенно вычисляемыми метриками. +- [Мониторинг](monitoring.md) — основы мониторинга в ClickHouse. -## system.numbers +## system.numbers {#system-numbers} Таблица содержит один столбец с именем number типа UInt64, содержащим почти все натуральные числа, начиная с нуля. Эту таблицу можно использовать для тестов, а также если вам нужно сделать перебор. Чтения из этой таблицы не распараллеливаются. 
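As an illustration of the enumeration use case described above (a minimal sketch; the limit value is arbitrary), `system.numbers` is normally combined with a `LIMIT`, since without one the scan would never stop:

``` sql
-- Generate the first three natural numbers starting from zero.
SELECT number FROM system.numbers LIMIT 3
```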
-## system.numbers_mt +## system.numbers\_mt {#system-numbers-mt} То же самое, что и system.numbers, но чтение распараллеливается. Числа могут возвращаться в произвольном порядке. Используется для тестов. -## system.one +## system.one {#system-one} Таблица содержит одну строку с одним столбцом dummy типа UInt8, содержащим значение 0. Эта таблица используется, если в SELECT запросе не указана секция FROM. То есть, это - аналог таблицы DUAL, которую можно найти в других СУБД. -## system.parts {#system_tables-parts} +## system.parts {#system-tables-parts} Содержит информацию о кусках данных таблиц семейства [MergeTree](table_engines/mergetree.md). @@ -356,364 +356,399 @@ CurrentMetric_ReplicatedChecks: 0 Столбцы: -- `partition` (`String`) – Имя партиции. Что такое партиция можно узнать из описания запроса [ALTER](../query_language/alter.md#query_language_queries_alter). +- `partition` (`String`) – Имя партиции. Что такое партиция можно узнать из описания запроса [ALTER](../query_language/alter.md#query_language_queries_alter). - Форматы: + Форматы: - - `YYYYMM` для автоматической схемы партиционирования по месяцам. - - `any_string` при партиционировании вручную. + - `YYYYMM` для автоматической схемы партиционирования по месяцам. + - `any_string` при партиционировании вручную. -- `name` (`String`) – имя куска. -- `active` (`UInt8`) – признак активности. Если кусок активен, то он используется таблицей, в противном случает он будет удален. Неактивные куски остаются после слияний. -- `marks` (`UInt64`) – количество засечек. Чтобы получить примерное количество строк в куске, умножьте `marks` на гранулированность индекса (обычно 8192). -- `rows` (`UInt64`) – количество строк. -- `bytes_on_disk` (`UInt64`) – общий размер всех файлов кусков данных в байтах. -- `data_compressed_bytes` (`UInt64`) – общий размер сжатой информации в куске данных. Размер всех дополнительных файлов (например, файлов с засечками) не учитывается. -- `data_uncompressed_bytes` (`UInt64`) – общий размер распакованной информации куска данных. Размер всех дополнительных файлов (например, файлов с засечками) не учитывается. -- `marks_bytes` (`UInt64`) – размер файла с засечками. -- `modification_time` (`DateTime`) – время модификации директории с куском данных. Обычно соответствует времени создания куска. -- `remove_time` (`DateTime`) – время, когда кусок стал неактивным. -- `refcount` (`UInt32`) – количество мест, в котором кусок используется. Значение больше 2 говорит о том, что кусок участвует в запросах или в слияниях. -- `min_date` (`Date`) – минимальное значение ключа даты в куске данных. -- `max_date` (`Date`) – максимальное значение ключа даты в куске данных. -- `min_time` (`DateTime`) – минимальное значение даты и времени в куске данных. -- `max_time`(`DateTime`) – максимальное значение даты и времени в куске данных. -- `partition_id` (`String`) – ID партиции. -- `min_block_number` (`UInt64`) – минимальное число кусков, из которых состоит текущий после слияния. -- `max_block_number` (`UInt64`) – максимальное число кусков, из которых состоит текущий после слияния. -- `level` (`UInt32`) - глубина дерева слияний. Если слияний не было, то `level=0`. -- `data_version` (`UInt64`) – число, которое используется для определения того, какие мутации необходимо применить к куску данных (мутации с версией большей, чем `data_version`). -- `primary_key_bytes_in_memory` (`UInt64`) – объём памяти (в байтах), занимаемой значениями первичных ключей. 
-- `primary_key_bytes_in_memory_allocated` (`UInt64`) – объём памяти (в байтах) выделенный для размещения первичных ключей. -- `is_frozen` (`UInt8`) – Признак, показывающий существование бэкапа партиции. 1, бэкап есть. 0, бэкапа нет. Смотрите раздел [FREEZE PARTITION](../query_language/alter.md#alter_freeze-partition). -- `database` (`String`) – имя базы данных. -- `table` (`String`) – имя таблицы. -- `engine` (`String`) – имя движка таблицы, без параметров. -- `path` (`String`) – абсолютный путь к папке с файлами кусков данных. -- `disk` (`String`) – имя диска, на котором находится кусок данных. -- `hash_of_all_files` (`String`) – значение [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) для сжатых файлов. -- `hash_of_uncompressed_files` (`String`) – значение [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) несжатых файлов (файлы с засечками, первичным ключом и пр.) -- `uncompressed_hash_of_compressed_files` (`String`) – значение [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) данных в сжатых файлах как если бы они были разжатыми. -- `bytes` (`UInt64`) – алиас для `bytes_on_disk`. -- `marks_size` (`UInt64`) – алиас для `marks_bytes`. +- `name` (`String`) – имя куска. +- `active` (`UInt8`) – признак активности. Если кусок активен, то он используется таблицей, в противном случает он будет удален. Неактивные куски остаются после слияний. -## system.part_log {#system_tables-part-log} +- `marks` (`UInt64`) – количество засечек. Чтобы получить примерное количество строк в куске, умножьте `marks` на гранулированность индекса (обычно 8192). -Системная таблица `system.part_log` создается только в том случае, если задана серверная настройка [part_log](server_settings/settings.md#server_settings-part-log). +- `rows` (`UInt64`) – количество строк. + +- `bytes_on_disk` (`UInt64`) – общий размер всех файлов кусков данных в байтах. + +- `data_compressed_bytes` (`UInt64`) – общий размер сжатой информации в куске данных. Размер всех дополнительных файлов (например, файлов с засечками) не учитывается. + +- `data_uncompressed_bytes` (`UInt64`) – общий размер распакованной информации куска данных. Размер всех дополнительных файлов (например, файлов с засечками) не учитывается. + +- `marks_bytes` (`UInt64`) – размер файла с засечками. + +- `modification_time` (`DateTime`) – время модификации директории с куском данных. Обычно соответствует времени создания куска. + +- `remove_time` (`DateTime`) – время, когда кусок стал неактивным. + +- `refcount` (`UInt32`) – количество мест, в котором кусок используется. Значение больше 2 говорит о том, что кусок участвует в запросах или в слияниях. + +- `min_date` (`Date`) – минимальное значение ключа даты в куске данных. + +- `max_date` (`Date`) – максимальное значение ключа даты в куске данных. + +- `min_time` (`DateTime`) – минимальное значение даты и времени в куске данных. + +- `max_time`(`DateTime`) – максимальное значение даты и времени в куске данных. + +- `partition_id` (`String`) – ID партиции. + +- `min_block_number` (`UInt64`) – минимальное число кусков, из которых состоит текущий после слияния. + +- `max_block_number` (`UInt64`) – максимальное число кусков, из которых состоит текущий после слияния. + +- `level` (`UInt32`) - глубина дерева слияний. Если слияний не было, то `level=0`. 
+ +- `data_version` (`UInt64`) – число, которое используется для определения того, какие мутации необходимо применить к куску данных (мутации с версией большей, чем `data_version`). + +- `primary_key_bytes_in_memory` (`UInt64`) – объём памяти (в байтах), занимаемой значениями первичных ключей. + +- `primary_key_bytes_in_memory_allocated` (`UInt64`) – объём памяти (в байтах) выделенный для размещения первичных ключей. + +- `is_frozen` (`UInt8`) – Признак, показывающий существование бэкапа партиции. 1, бэкап есть. 0, бэкапа нет. Смотрите раздел [FREEZE PARTITION](../query_language/alter.md#alter_freeze-partition). + +- `database` (`String`) – имя базы данных. + +- `table` (`String`) – имя таблицы. + +- `engine` (`String`) – имя движка таблицы, без параметров. + +- `path` (`String`) – абсолютный путь к папке с файлами кусков данных. + +- `disk` (`String`) – имя диска, на котором находится кусок данных. + +- `hash_of_all_files` (`String`) – значение [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) для сжатых файлов. + +- `hash_of_uncompressed_files` (`String`) – значение [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) несжатых файлов (файлы с засечками, первичным ключом и пр.) + +- `uncompressed_hash_of_compressed_files` (`String`) – значение [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) данных в сжатых файлах как если бы они были разжатыми. + +- `bytes` (`UInt64`) – алиас для `bytes_on_disk`. + +- `marks_size` (`UInt64`) – алиас для `marks_bytes`. + +## system.part\_log {#system-tables-part-log} + +Системная таблица `system.part_log` создается только в том случае, если задана серверная настройка [part\_log](server_settings/settings.md#server_settings-part-log). Содержит информацию о всех событиях, произошедших с [кусками данных](table_engines/custom_partitioning_key.md) таблиц семейства [MergeTree](table_engines/mergetree.md) (например, события добавления, удаления или слияния данных). Столбцы: -- `event_type` (Enum) — тип события. Столбец может содержать одно из следующих значений: - - `NEW_PART` — вставка нового куска. - - `MERGE_PARTS` — слияние кусков. - - `DOWNLOAD_PART` — загрузка с реплики. - - `REMOVE_PART` — удаление или отсоединение из таблицы с помощью [DETACH PARTITION](../query_language/alter.md#alter_detach-partition). - - `MUTATE_PART` — изменение куска. - - `MOVE_PART` — перемещение куска между дисками. -- `event_date` (Date) — дата события. -- `event_time` (DateTime) — время события. -- `duration_ms` (UInt64) — длительность. -- `database` (String) — имя базы данных, в которой находится кусок. -- `table` (String) — имя таблицы, в которой находится кусок. -- `part_name` (String) — имя куска. -- `partition_id` (String) — идентификатор партиции, в которую был добавлен кусок. В столбце будет значение 'all', если таблица партициируется по выражению `tuple()`. -- `rows` (UInt64) — число строк в куске. -- `size_in_bytes` (UInt64) — размер куска данных в байтах. -- `merged_from` (Array(String)) — массив имён кусков, из которых образован текущий кусок в результате слияния (также столбец заполняется в случае скачивания уже смерженного куска). -- `bytes_uncompressed` (UInt64) — количество прочитанных разжатых байт. -- `read_rows` (UInt64) — сколько было прочитано строк при слиянии кусков. -- `read_bytes` (UInt64) — сколько было прочитано байт при слиянии кусков. -- `error` (UInt16) — код ошибки, возникшей при текущем событии. -- `exception` (String) — текст ошибки. 
+- `event_type` (Enum) — тип события. Столбец может содержать одно из следующих значений: + - `NEW_PART` — вставка нового куска. + - `MERGE_PARTS` — слияние кусков. + - `DOWNLOAD_PART` — загрузка с реплики. + - `REMOVE_PART` — удаление или отсоединение из таблицы с помощью [DETACH PARTITION](../query_language/alter.md#alter_detach-partition). + - `MUTATE_PART` — изменение куска. + - `MOVE_PART` — перемещение куска между дисками. +- `event_date` (Date) — дата события. +- `event_time` (DateTime) — время события. +- `duration_ms` (UInt64) — длительность. +- `database` (String) — имя базы данных, в которой находится кусок. +- `table` (String) — имя таблицы, в которой находится кусок. +- `part_name` (String) — имя куска. +- `partition_id` (String) — идентификатор партиции, в которую был добавлен кусок. В столбце будет значение ‘all’, если таблица партициируется по выражению `tuple()`. +- `rows` (UInt64) — число строк в куске. +- `size_in_bytes` (UInt64) — размер куска данных в байтах. +- `merged_from` (Array(String)) — массив имён кусков, из которых образован текущий кусок в результате слияния (также столбец заполняется в случае скачивания уже смерженного куска). +- `bytes_uncompressed` (UInt64) — количество прочитанных разжатых байт. +- `read_rows` (UInt64) — сколько было прочитано строк при слиянии кусков. +- `read_bytes` (UInt64) — сколько было прочитано байт при слиянии кусков. +- `error` (UInt16) — код ошибки, возникшей при текущем событии. +- `exception` (String) — текст ошибки. Системная таблица `system.part_log` будет создана после первой вставки данных в таблицу `MergeTree`. -## system.processes {#system_tables-processes} +## system.processes {#system-tables-processes} Используется для реализации запроса `SHOW PROCESSLIST`. Столбцы: -- `user` (String) – пользователь, инициировавший запрос. При распределённом выполнении запросы отправляются на удалённые серверы от имени пользователя `default`. Поле содержит имя пользователя для конкретного запроса, а не для запроса, который иницировал этот запрос. -- `address` (String) – IP-адрес, с которого пришёл запрос. При распределённой обработке запроса аналогично. Чтобы определить откуда запрос пришел изначально, необходимо смотреть таблицу `system.processes` на сервере-источнике запроса. -- `elapsed` (Float64) – время в секундах с начала обработки запроса. -- `rows_read` (UInt64) – количество прочитанных строк. При распределённой обработке запроса на сервере-инициаторе запроса представляет собой сумму по всем удалённым серверам. -- `bytes_read` (UInt64) – количество прочитанных из таблиц байт, в несжатом виде. При распределённой обработке запроса на сервере-инициаторе запроса представляет собой сумму по всем удалённым серверам. -- `total_rows_approx` (UInt64) – приблизительная оценка общего количества строк, которые должны быть прочитаны. При распределённой обработке запроса, на сервере-инициаторе запроса, представляет собой сумму по всем удалённым серверам. Может обновляться в процессе выполнения запроса, когда становятся известны новые источники для обработки. -- `memory_usage` (UInt64) – потребление памяти запросом. Может не учитывать некоторые виды выделенной памяти. Смотрите описание настройки [max_memory_usage](../operations/settings/query_complexity.md#settings_max_memory_usage). -- `query` (String) – текст запроса. Для запросов `INSERT` не содержит встаявляемые данные. -- `query_id` (String) – идентификатор запроса, если был задан. +- `user` (String) – пользователь, инициировавший запрос. 
При распределённом выполнении запросы отправляются на удалённые серверы от имени пользователя `default`. Поле содержит имя пользователя для конкретного запроса, а не для запроса, который иницировал этот запрос. +- `address` (String) – IP-адрес, с которого пришёл запрос. При распределённой обработке запроса аналогично. Чтобы определить откуда запрос пришел изначально, необходимо смотреть таблицу `system.processes` на сервере-источнике запроса. +- `elapsed` (Float64) – время в секундах с начала обработки запроса. +- `rows_read` (UInt64) – количество прочитанных строк. При распределённой обработке запроса на сервере-инициаторе запроса представляет собой сумму по всем удалённым серверам. +- `bytes_read` (UInt64) – количество прочитанных из таблиц байт, в несжатом виде. При распределённой обработке запроса на сервере-инициаторе запроса представляет собой сумму по всем удалённым серверам. +- `total_rows_approx` (UInt64) – приблизительная оценка общего количества строк, которые должны быть прочитаны. При распределённой обработке запроса, на сервере-инициаторе запроса, представляет собой сумму по всем удалённым серверам. Может обновляться в процессе выполнения запроса, когда становятся известны новые источники для обработки. +- `memory_usage` (UInt64) – потребление памяти запросом. Может не учитывать некоторые виды выделенной памяти. Смотрите описание настройки [max\_memory\_usage](../operations/settings/query_complexity.md#settings_max_memory_usage). +- `query` (String) – текст запроса. Для запросов `INSERT` не содержит встаявляемые данные. +- `query_id` (String) – идентификатор запроса, если был задан. -## system.query_log {#system_tables-query-log} +## system.query\_log {#system-tables-query-log} Содержит информацию о выполнении запросов. Для каждого запроса вы можете увидеть время начала обработки, продолжительность обработки, сообщения об ошибках и другую информацию. !!! note "Внимание" Таблица не содержит входных данных для запросов `INSERT`. -ClickHouse создаёт таблицу только в том случае, когда установлен конфигурационный параметр сервера [query_log](server_settings/settings.md#server_settings-query-log). Параметр задаёт правила ведения лога, такие как интервал логирования или имя таблицы, в которую будут логгироваться запросы. +ClickHouse создаёт таблицу только в том случае, когда установлен конфигурационный параметр сервера [query\_log](server_settings/settings.md#server_settings-query-log). Параметр задаёт правила ведения лога, такие как интервал логирования или имя таблицы, в которую будут логгироваться запросы. -Чтобы включить логирование, задайте значение параметра [log_queries](settings/settings.md#settings-log-queries) равным 1. Подробности смотрите в разделе [Настройки](settings/settings.md). +Чтобы включить логирование, задайте значение параметра [log\_queries](settings/settings.md#settings-log-queries) равным 1. Подробности смотрите в разделе [Настройки](settings/settings.md). Таблица `system.query_log` содержит информацию о двух видах запросов: -1. Первоначальные запросы, которые были выполнены непосредственно клиентом. -2. Дочерние запросы, инициированные другими запросами (для выполнения распределенных запросов). Для дочерних запросов информация о первоначальном запросе содержится в столбцах `initial_*`. +1. Первоначальные запросы, которые были выполнены непосредственно клиентом. +2. Дочерние запросы, инициированные другими запросами (для выполнения распределенных запросов). Для дочерних запросов информация о первоначальном запросе содержится в столбцах `initial_*`. 
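To make the distinction concrete, here is a minimal sketch of a query over `system.query_log` (assuming query logging is enabled via `log_queries` as described above; the column names used are taken from the reference list that follows):

``` sql
-- Last 10 queries sent directly by clients; child queries of distributed execution are filtered out.
SELECT event_time, query_duration_ms, query
FROM system.query_log
WHERE is_initial_query = 1 AND type = 'QueryFinish'
ORDER BY event_time DESC
LIMIT 10
```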
Столбцы: -- `type` (`Enum8`) — тип события, произошедшего при выполнении запроса. Значения: - - `'QueryStart' = 1` — успешное начало выполнения запроса. - - `'QueryFinish' = 2` — успешное завершение выполнения запроса. - - `'ExceptionBeforeStart' = 3` — исключение перед началом обработки запроса. - - `'ExceptionWhileProcessing' = 4` — исключение во время обработки запроса. -- `event_date` (Date) — дата начала запроса. -- `event_time` (DateTime) — время начала запроса. -- `query_start_time` (DateTime) — время начала обработки запроса. -- `query_duration_ms` (UInt64) — длительность обработки запроса. -- `read_rows` (UInt64) — количество прочитанных строк. -- `read_bytes` (UInt64) — количество прочитанных байтов. -- `written_rows` (UInt64) — количество записанных строк для запросов `INSERT`. Для других запросов, значение столбца 0. -- `written_bytes` (UInt64) — объём записанных данных в байтах для запросов `INSERT`. Для других запросов, значение столбца 0. -- `result_rows` (UInt64) — количество строк в результате. -- `result_bytes` (UInt64) — объём результата в байтах. -- `memory_usage` (UInt64) — потребление RAM запросом. -- `query` (String) — текст запроса. -- `exception` (String) — сообщение исключения, если запрос завершился по исключению. -- `stack_trace` (String) — трассировка (список функций, последовательно вызванных перед ошибкой). Пустая строка, если запрос успешно завершен. -- `is_initial_query` (UInt8) — вид запроса. Возможные значения: - - 1 — запрос был инициирован клиентом. - - 0 — запрос был инициирован другим запросом при распределенном запросе. -- `user` (String) — пользователь, запустивший текущий запрос. -- `query_id` (String) — ID запроса. -- `address` (IPv6) — IP адрес, с которого пришел запрос. -- `port` (UInt16) — порт, с которого клиент сделал запрос -- `initial_user` (String) — пользователь, запустивший первоначальный запрос (для распределенных запросов). -- `initial_query_id` (String) — ID родительского запроса. -- `initial_address` (IPv6) — IP адрес, с которого пришел родительский запрос. -- `initial_port` (UInt16) — порт, с которого клиент сделал родительский запрос. -- `interface` (UInt8) — интерфейс, с которого ушёл запрос. Возможные значения: - - 1 — TCP. - - 2 — HTTP. -- `os_user` (String) — имя пользователя в OS, который запустил [clickhouse-client](../interfaces/cli.md). -- `client_hostname` (String) — имя сервера, с которого присоединился [clickhouse-client](../interfaces/cli.md) или другой TCP клиент. -- `client_name` (String) — [clickhouse-client](../interfaces/cli.md) или другой TCP клиент. -- `client_revision` (UInt32) — ревизия [clickhouse-client](../interfaces/cli.md) или другого TCP клиента. -- `client_version_major` (UInt32) — старшая версия [clickhouse-client](../interfaces/cli.md) или другого TCP клиента. -- `client_version_minor` (UInt32) — младшая версия [clickhouse-client](../interfaces/cli.md) или другого TCP клиента. -- `client_version_patch` (UInt32) — патч [clickhouse-client](../interfaces/cli.md) или другого TCP клиента. -- `http_method` (UInt8) — HTTP метод, инициировавший запрос. Возможные значения: - - 0 — запрос запущен с интерфейса TCP. - - 1 — `GET`. - - 2 — `POST`. -- `http_user_agent` (String) — HTTP заголовок `UserAgent`. -- `quota_key` (String) — "ключ квоты" из настроек [квот](quotas.md) (см. `keyed`). -- `revision` (UInt32) — ревизия ClickHouse. -- `thread_numbers` (Array(UInt32)) — количество потоков, участвующих в обработке запросов. -- `ProfileEvents.Names` (Array(String)) — Счетчики для изменения различных метрик. 
Описание метрик можно получить из таблицы [system.events](#system_tables-events -- `ProfileEvents.Values` (Array(UInt64)) — метрики, перечисленные в столбце `ProfileEvents.Names`. -- `Settings.Names` (Array(String)) — имена настроек, которые меняются, когда клиент выполняет запрос. Чтобы разрешить логирование изменений настроек, установите параметр `log_query_settings` равным 1. -- `Settings.Values` (Array(String)) — Значения настроек, которые перечислены в столбце `Settings.Names`. +- `type` (`Enum8`) — тип события, произошедшего при выполнении запроса. Значения: + - `'QueryStart' = 1` — успешное начало выполнения запроса. + - `'QueryFinish' = 2` — успешное завершение выполнения запроса. + - `'ExceptionBeforeStart' = 3` — исключение перед началом обработки запроса. + - `'ExceptionWhileProcessing' = 4` — исключение во время обработки запроса. +- `event_date` (Date) — дата начала запроса. +- `event_time` (DateTime) — время начала запроса. +- `query_start_time` (DateTime) — время начала обработки запроса. +- `query_duration_ms` (UInt64) — длительность обработки запроса. +- `read_rows` (UInt64) — количество прочитанных строк. +- `read_bytes` (UInt64) — количество прочитанных байтов. +- `written_rows` (UInt64) — количество записанных строк для запросов `INSERT`. Для других запросов, значение столбца 0. +- `written_bytes` (UInt64) — объём записанных данных в байтах для запросов `INSERT`. Для других запросов, значение столбца 0. +- `result_rows` (UInt64) — количество строк в результате. +- `result_bytes` (UInt64) — объём результата в байтах. +- `memory_usage` (UInt64) — потребление RAM запросом. +- `query` (String) — текст запроса. +- `exception` (String) — сообщение исключения, если запрос завершился по исключению. +- `stack_trace` (String) — трассировка (список функций, последовательно вызванных перед ошибкой). Пустая строка, если запрос успешно завершен. +- `is_initial_query` (UInt8) — вид запроса. Возможные значения: + - 1 — запрос был инициирован клиентом. + - 0 — запрос был инициирован другим запросом при распределенном запросе. +- `user` (String) — пользователь, запустивший текущий запрос. +- `query_id` (String) — ID запроса. +- `address` (IPv6) — IP адрес, с которого пришел запрос. +- `port` (UInt16) — порт, с которого клиент сделал запрос +- `initial_user` (String) — пользователь, запустивший первоначальный запрос (для распределенных запросов). +- `initial_query_id` (String) — ID родительского запроса. +- `initial_address` (IPv6) — IP адрес, с которого пришел родительский запрос. +- `initial_port` (UInt16) — порт, с которого клиент сделал родительский запрос. +- `interface` (UInt8) — интерфейс, с которого ушёл запрос. Возможные значения: + - 1 — TCP. + - 2 — HTTP. +- `os_user` (String) — имя пользователя в OS, который запустил [clickhouse-client](../interfaces/cli.md). +- `client_hostname` (String) — имя сервера, с которого присоединился [clickhouse-client](../interfaces/cli.md) или другой TCP клиент. +- `client_name` (String) — [clickhouse-client](../interfaces/cli.md) или другой TCP клиент. +- `client_revision` (UInt32) — ревизия [clickhouse-client](../interfaces/cli.md) или другого TCP клиента. +- `client_version_major` (UInt32) — старшая версия [clickhouse-client](../interfaces/cli.md) или другого TCP клиента. +- `client_version_minor` (UInt32) — младшая версия [clickhouse-client](../interfaces/cli.md) или другого TCP клиента. +- `client_version_patch` (UInt32) — патч [clickhouse-client](../interfaces/cli.md) или другого TCP клиента. 
+- `http_method` (UInt8) — HTTP метод, инициировавший запрос. Возможные значения:
+    - 0 — запрос запущен с интерфейса TCP.
+    - 1 — `GET`.
+    - 2 — `POST`.
+- `http_user_agent` (String) — HTTP заголовок `UserAgent`.
+- `quota_key` (String) — «ключ квоты» из настроек [квот](quotas.md) (см. `keyed`).
+- `revision` (UInt32) — ревизия ClickHouse.
+- `thread_numbers` (Array(UInt32)) — количество потоков, участвующих в обработке запросов.
+- `ProfileEvents.Names` (Array(String)) — Счетчики для изменения различных метрик. Описание метрик можно получить из таблицы [system.events](#system_tables-events).
+- `ProfileEvents.Values` (Array(UInt64)) — метрики, перечисленные в столбце `ProfileEvents.Names`.
+- `Settings.Names` (Array(String)) — имена настроек, которые меняются, когда клиент выполняет запрос. Чтобы разрешить логирование изменений настроек, установите параметр `log_query_settings` равным 1.
+- `Settings.Values` (Array(String)) — Значения настроек, которые перечислены в столбце `Settings.Names`.

Каждый запрос создаёт одну или две строки в таблице `query_log`, в зависимости от статуса запроса:

-1. Если запрос выполнен успешно, создаются два события типа 1 и 2 (смотрите столбец `type`).
-2. Если во время обработки запроса произошла ошибка, создаются два события с типами 1 и 4.
-3. Если ошибка произошла до запуска запроса, создается одно событие с типом 3.
+1. Если запрос выполнен успешно, создаются два события типа 1 и 2 (смотрите столбец `type`).
+2. Если во время обработки запроса произошла ошибка, создаются два события с типами 1 и 4.
+3. Если ошибка произошла до запуска запроса, создается одно событие с типом 3.

-По умолчанию, строки добавляются в таблицу логирования с интервалом в 7,5 секунд. Можно задать интервал в конфигурационном параметре сервера [query_log](server_settings/settings.md#server_settings-query-log) (смотрите параметр `flush_interval_milliseconds`). Чтобы принудительно записать логи из буффера памяти в таблицу, используйте запрос `SYSTEM FLUSH LOGS`.
+По умолчанию, строки добавляются в таблицу логирования с интервалом в 7,5 секунд. Можно задать интервал в конфигурационном параметре сервера [query\_log](server_settings/settings.md#server_settings-query-log) (смотрите параметр `flush_interval_milliseconds`). Чтобы принудительно записать логи из буфера памяти в таблицу, используйте запрос `SYSTEM FLUSH LOGS`.

-Если таблицу удалить вручную, она пересоздастся автоматически "на лету". При этом все логи на момент удаления таблицы будут удалены.
+Если таблицу удалить вручную, она пересоздастся автоматически «на лету». При этом все логи на момент удаления таблицы будут удалены.

!!! note "Примечание"
    Срок хранения логов не ограничен. Логи не удаляются из таблицы автоматически. Вам необходимо самостоятельно организовать удаление устаревших логов.

-Можно указать произвольный ключ партиционирования для таблицы `system.query_log` в конфигурации [query_log](server_settings/settings.md#server_settings-query-log) (параметр `partition_by`).
+Можно указать произвольный ключ партиционирования для таблицы `system.query_log` в конфигурации [query\_log](server_settings/settings.md#server_settings-query-log) (параметр `partition_by`).

-## system.query_log {#system_tables-query_log}
+## system.query\_log {#system-tables-query-log}

Contains information about execution of queries. For each query, you can see processing start time, duration of processing, error messages and other information.

!!! note "Note"
-    The table doesn't contain input data for `INSERT` queries.
+ The table doesn’t contain input data for `INSERT` queries. -ClickHouse creates this table only if the [query_log](server_settings/settings.md#server_settings-query-log) server parameter is specified. This parameter sets the logging rules, such as the logging interval or the name of the table the queries will be logged in. +ClickHouse creates this table only if the [query\_log](server_settings/settings.md#server_settings-query-log) server parameter is specified. This parameter sets the logging rules, such as the logging interval or the name of the table the queries will be logged in. -To enable query logging, set the [log_queries](settings/settings.md#settings-log-queries) parameter to 1. For details, see the [Settings](settings/settings.md) section. +To enable query logging, set the [log\_queries](settings/settings.md#settings-log-queries) parameter to 1. For details, see the [Settings](settings/settings.md) section. The `system.query_log` table registers two kinds of queries: -1. Initial queries that were run directly by the client. -2. Child queries that were initiated by other queries (for distributed query execution). For these types of queries, information about the parent queries is shown in the `initial_*` columns. +1. Initial queries that were run directly by the client. +2. Child queries that were initiated by other queries (for distributed query execution). For these types of queries, information about the parent queries is shown in the `initial_*` columns. Columns: -- `type` (`Enum8`) — Type of event that occurred when executing the query. Values: - - `'QueryStart' = 1` — Successful start of query execution. - - `'QueryFinish' = 2` — Successful end of query execution. - - `'ExceptionBeforeStart' = 3` — Exception before the start of query execution. - - `'ExceptionWhileProcessing' = 4` — Exception during the query execution. -- `event_date` (Date) — Query starting date. -- `event_time` (DateTime) — Query starting time. -- `query_start_time` (DateTime) — Start time of query execution. -- `query_duration_ms` (UInt64) — Duration of query execution. -- `read_rows` (UInt64) — Number of read rows. -- `read_bytes` (UInt64) — Number of read bytes. -- `written_rows` (UInt64) — For `INSERT` queries, the number of written rows. For other queries, the column value is 0. -- `written_bytes` (UInt64) — For `INSERT` queries, the number of written bytes. For other queries, the column value is 0. -- `result_rows` (UInt64) — Number of rows in the result. -- `result_bytes` (UInt64) — Number of bytes in the result. -- `memory_usage` (UInt64) — Memory consumption by the query. -- `query` (String) — Query string. -- `exception` (String) — Exception message. -- `stack_trace` (String) — Stack trace (a list of methods called before the error occurred). An empty string, if the query is completed successfully. -- `is_initial_query` (UInt8) — Query type. Possible values: - - 1 — Query was initiated by the client. - - 0 — Query was initiated by another query for distributed query execution. -- `user` (String) — Name of the user who initiated the current query. -- `query_id` (String) — ID of the query. -- `address` (IPv6) — IP address that was used to make the query. -- `port` (UInt16) — The client port that was used to make the query. -- `initial_user` (String) — Name of the user who ran the initial query (for distributed query execution). -- `initial_query_id` (String) — ID of the initial query (for distributed query execution). 
-- `initial_address` (IPv6) — IP address that the parent query was launched from. -- `initial_port` (UInt16) — The client port that was used to make the parent query. -- `interface` (UInt8) — Interface that the query was initiated from. Possible values: - - 1 — TCP. - - 2 — HTTP. -- `os_user` (String) — OS's username who runs [clickhouse-client](../interfaces/cli.md). -- `client_hostname` (String) — Hostname of the client machine where the [clickhouse-client](../interfaces/cli.md) or another TCP client is run. -- `client_name` (String) — The [clickhouse-client](../interfaces/cli.md) or another TCP client name. -- `client_revision` (UInt32) — Revision of the [clickhouse-client](../interfaces/cli.md) or another TCP client. -- `client_version_major` (UInt32) — Major version of the [clickhouse-client](../interfaces/cli.md) or another TCP client. -- `client_version_minor` (UInt32) — Minor version of the [clickhouse-client](../interfaces/cli.md) or another TCP client. -- `client_version_patch` (UInt32) — Patch component of the [clickhouse-client](../interfaces/cli.md) or another TCP client version. -- `http_method` (UInt8) — HTTP method that initiated the query. Possible values: - - 0 — The query was launched from the TCP interface. - - 1 — `GET` method was used. - - 2 — `POST` method was used. -- `http_user_agent` (String) — The `UserAgent` header passed in the HTTP request. -- `quota_key` (String) — The "quota key" specified in the [quotas](quotas.md) setting (see `keyed`). -- `revision` (UInt32) — ClickHouse revision. -- `thread_numbers` (Array(UInt32)) — Number of threads that are participating in query execution. -- `ProfileEvents.Names` (Array(String)) — Counters that measure different metrics. The description of them could be found in the table [system.events](#system_tables-events) -- `ProfileEvents.Values` (Array(UInt64)) — Values of metrics that are listed in the `ProfileEvents.Names` column. -- `Settings.Names` (Array(String)) — Names of settings that were changed when the client ran the query. To enable logging changes to settings, set the `log_query_settings` parameter to 1. -- `Settings.Values` (Array(String)) — Values of settings that are listed in the `Settings.Names` column. +- `type` (`Enum8`) — Type of event that occurred when executing the query. Values: + - `'QueryStart' = 1` — Successful start of query execution. + - `'QueryFinish' = 2` — Successful end of query execution. + - `'ExceptionBeforeStart' = 3` — Exception before the start of query execution. + - `'ExceptionWhileProcessing' = 4` — Exception during the query execution. +- `event_date` (Date) — Query starting date. +- `event_time` (DateTime) — Query starting time. +- `query_start_time` (DateTime) — Start time of query execution. +- `query_duration_ms` (UInt64) — Duration of query execution. +- `read_rows` (UInt64) — Number of read rows. +- `read_bytes` (UInt64) — Number of read bytes. +- `written_rows` (UInt64) — For `INSERT` queries, the number of written rows. For other queries, the column value is 0. +- `written_bytes` (UInt64) — For `INSERT` queries, the number of written bytes. For other queries, the column value is 0. +- `result_rows` (UInt64) — Number of rows in the result. +- `result_bytes` (UInt64) — Number of bytes in the result. +- `memory_usage` (UInt64) — Memory consumption by the query. +- `query` (String) — Query string. +- `exception` (String) — Exception message. +- `stack_trace` (String) — Stack trace (a list of methods called before the error occurred). 
An empty string, if the query is completed successfully. +- `is_initial_query` (UInt8) — Query type. Possible values: + - 1 — Query was initiated by the client. + - 0 — Query was initiated by another query for distributed query execution. +- `user` (String) — Name of the user who initiated the current query. +- `query_id` (String) — ID of the query. +- `address` (IPv6) — IP address that was used to make the query. +- `port` (UInt16) — The client port that was used to make the query. +- `initial_user` (String) — Name of the user who ran the initial query (for distributed query execution). +- `initial_query_id` (String) — ID of the initial query (for distributed query execution). +- `initial_address` (IPv6) — IP address that the parent query was launched from. +- `initial_port` (UInt16) — The client port that was used to make the parent query. +- `interface` (UInt8) — Interface that the query was initiated from. Possible values: + - 1 — TCP. + - 2 — HTTP. +- `os_user` (String) — OS’s username who runs [clickhouse-client](../interfaces/cli.md). +- `client_hostname` (String) — Hostname of the client machine where the [clickhouse-client](../interfaces/cli.md) or another TCP client is run. +- `client_name` (String) — The [clickhouse-client](../interfaces/cli.md) or another TCP client name. +- `client_revision` (UInt32) — Revision of the [clickhouse-client](../interfaces/cli.md) or another TCP client. +- `client_version_major` (UInt32) — Major version of the [clickhouse-client](../interfaces/cli.md) or another TCP client. +- `client_version_minor` (UInt32) — Minor version of the [clickhouse-client](../interfaces/cli.md) or another TCP client. +- `client_version_patch` (UInt32) — Patch component of the [clickhouse-client](../interfaces/cli.md) or another TCP client version. +- `http_method` (UInt8) — HTTP method that initiated the query. Possible values: + - 0 — The query was launched from the TCP interface. + - 1 — `GET` method was used. + - 2 — `POST` method was used. +- `http_user_agent` (String) — The `UserAgent` header passed in the HTTP request. +- `quota_key` (String) — The «quota key» specified in the [quotas](quotas.md) setting (see `keyed`). +- `revision` (UInt32) — ClickHouse revision. +- `thread_numbers` (Array(UInt32)) — Number of threads that are participating in query execution. +- `ProfileEvents.Names` (Array(String)) — Counters that measure different metrics. The description of them could be found in the table [system.events](#system_tables-events) +- `ProfileEvents.Values` (Array(UInt64)) — Values of metrics that are listed in the `ProfileEvents.Names` column. +- `Settings.Names` (Array(String)) — Names of settings that were changed when the client ran the query. To enable logging changes to settings, set the `log_query_settings` parameter to 1. +- `Settings.Values` (Array(String)) — Values of settings that are listed in the `Settings.Names` column. Each query creates one or two rows in the `query_log` table, depending on the status of the query: -1. If the query execution is successful, two events with types 1 and 2 are created (see the `type` column). -2. If an error occurred during query processing, two events with types 1 and 4 are created. -3. If an error occurred before launching the query, a single event with type 3 is created. +1. If the query execution is successful, two events with types 1 and 2 are created (see the `type` column). +2. If an error occurred during query processing, two events with types 1 and 4 are created. +3. 
If an error occurred before launching the query, a single event with type 3 is created.

-By default, logs are added to the table at intervals of 7.5 seconds. You can set this interval in the [query_log](server_settings/settings.md#server_settings-query-log) server setting (see the `flush_interval_milliseconds` parameter). To flush the logs forcibly from the memory buffer into the table, use the `SYSTEM FLUSH LOGS` query.
+By default, logs are added to the table at intervals of 7.5 seconds. You can set this interval in the [query\_log](server_settings/settings.md#server_settings-query-log) server setting (see the `flush_interval_milliseconds` parameter). To flush the logs forcibly from the memory buffer into the table, use the `SYSTEM FLUSH LOGS` query.

When the table is deleted manually, it will be automatically created on the fly. Note that all the previous logs will be deleted.

-!!! note
-    The storage period for logs is unlimited. Logs aren't automatically deleted from the table. You need to organize the removal of outdated logs yourself.
+!!! note "Note"
+    The storage period for logs is unlimited. Logs aren’t automatically deleted from the table. You need to organize the removal of outdated logs yourself.

-You can specify an arbitrary partitioning key for the `system.query_log` table in the [query_log](server_settings/settings.md#server_settings-query-log) server setting (see the `partition_by` parameter).
-## system.query_thread_log {#system_tables-query-thread-log}
+You can specify an arbitrary partitioning key for the `system.query_log` table in the [query\_log](server_settings/settings.md#server_settings-query-log) server setting (see the `partition_by` parameter).
+
+## system.query\_thread\_log {#system_tables-query-thread-log}

Содержит информацию о каждом потоке выполняемых запросов.

-ClickHouse создаёт таблицу только в том случае, когда установлен конфигурационный параметр сервера [query_thread_log](server_settings/settings.md#server_settings-query-thread-log). Параметр задаёт правила ведения лога, такие как интервал логирования или имя таблицы, в которую будут логгироваться запросы.
+ClickHouse создаёт таблицу только в том случае, когда установлен конфигурационный параметр сервера [query\_thread\_log](server_settings/settings.md#server_settings-query-thread-log). Параметр задаёт правила ведения лога, такие как интервал логирования или имя таблицы, в которую будут логгироваться запросы.

-Чтобы включить логирование, задайте значение параметра [log_query_threads](settings/settings.md#settings-log-query-threads) равным 1. Подробности смотрите в разделе [Настройки](settings/settings.md).
+Чтобы включить логирование, задайте значение параметра [log\_query\_threads](settings/settings.md#settings-log-query-threads) равным 1. Подробности смотрите в разделе [Настройки](settings/settings.md).

Столбцы:

-- `event_date` (Date) — дата завершения выполнения запроса потоком.
-- `event_time` (DateTime) — дата и время завершения выполнения запроса потоком.
-- `query_start_time` (DateTime) — время начала обработки запроса.
-- `query_duration_ms` (UInt64) — длительность обработки запроса в миллисекундах.
-- `read_rows` (UInt64) — количество прочитанных строк.
-- `read_bytes` (UInt64) — количество прочитанных байтов.
-- `written_rows` (UInt64) — количество записанных строк для запросов `INSERT`. Для других запросов, значение столбца 0.
-- `written_bytes` (UInt64) — объём записанных данных в байтах для запросов `INSERT`. Для других запросов, значение столбца 0.
-- `memory_usage` (Int64) — разница между выделенной и освобождённой памятью в контексте потока. -- `peak_memory_usage` (Int64) — максимальная разница между выделенной и освобождённой памятью в контексте потока. -- `thread_name` (String) — Имя потока. -- `thread_id` (UInt64) — tid (ID потока операционной системы). -- `master_thread_id` (UInt64) — tid (ID потока операционной системы) главного потока. -- `query` (String) — текст запроса. -- `is_initial_query` (UInt8) — вид запроса. Возможные значения: - - 1 — запрос был инициирован клиентом. - - 0 — запрос был инициирован другим запросом при распределенном запросе. -- `user` (String) — пользователь, запустивший текущий запрос. -- `query_id` (String) — ID запроса. -- `address` (IPv6) — IP адрес, с которого пришел запрос. -- `port` (UInt16) — порт, с которого пришел запрос. -- `initial_user` (String) — пользователь, запустивший первоначальный запрос (для распределенных запросов). -- `initial_query_id` (String) — ID родительского запроса. -- `initial_address` (IPv6) — IP адрес, с которого пришел родительский запрос. -- `initial_port` (UInt16) — порт, пришел родительский запрос. -- `interface` (UInt8) — интерфейс, с которого ушёл запрос. Возможные значения: - - 1 — TCP. - - 2 — HTTP. -- `os_user` (String) — имя пользователя в OS, который запустил [clickhouse-client](../interfaces/cli.md). -- `client_hostname` (String) — hostname клиентской машины, с которой присоединился [clickhouse-client](../interfaces/cli.md) или другой TCP клиент. -- `client_name` (String) — [clickhouse-client](../interfaces/cli.md) или другой TCP клиент. -- `client_revision` (UInt32) — ревизия [clickhouse-client](../interfaces/cli.md) или другого TCP клиента. -- `client_version_major` (UInt32) — старшая версия [clickhouse-client](../interfaces/cli.md) или другого TCP клиента. -- `client_version_minor` (UInt32) — младшая версия [clickhouse-client](../interfaces/cli.md) или другого TCP клиента. -- `client_version_patch` (UInt32) — патч [clickhouse-client](../interfaces/cli.md) или другого TCP клиента. -- `http_method` (UInt8) — HTTP метод, инициировавший запрос. Возможные значения: - - 0 — запрос запущен с интерфейса TCP. - - 1 — `GET`. - - 2 — `POST`. -- `http_user_agent` (String) — HTTP заголовок `UserAgent`. -- `quota_key` (String) — "ключ квоты" из настроек [квот](quotas.md) (см. `keyed`). -- `revision` (UInt32) — ревизия ClickHouse. -- `ProfileEvents.Names` (Array(String)) — Счетчики для изменения различных метрик для данного потока. Описание метрик можно получить из таблицы [system.events](#system_tables-events -- `ProfileEvents.Values` (Array(UInt64)) — метрики для данного потока, перечисленные в столбце `ProfileEvents.Names`. +- `event_date` (Date) — дата завершения выполнения запроса потоком. +- `event_time` (DateTime) — дата и время завершения выполнения запроса потоком. +- `query_start_time` (DateTime) — время начала обработки запроса. +- `query_duration_ms` (UInt64) — длительность обработки запроса в миллисекундах. +- `read_rows` (UInt64) — количество прочитанных строк. +- `read_bytes` (UInt64) — количество прочитанных байтов. +- `written_rows` (UInt64) — количество записанных строк для запросов `INSERT`. Для других запросов, значение столбца 0. +- `written_bytes` (UInt64) — объём записанных данных в байтах для запросов `INSERT`. Для других запросов, значение столбца 0. +- `memory_usage` (Int64) — разница между выделенной и освобождённой памятью в контексте потока. 
+- `peak_memory_usage` (Int64) — максимальная разница между выделенной и освобождённой памятью в контексте потока. +- `thread_name` (String) — Имя потока. +- `thread_id` (UInt64) — tid (ID потока операционной системы). +- `master_thread_id` (UInt64) — tid (ID потока операционной системы) главного потока. +- `query` (String) — текст запроса. +- `is_initial_query` (UInt8) — вид запроса. Возможные значения: + - 1 — запрос был инициирован клиентом. + - 0 — запрос был инициирован другим запросом при распределенном запросе. +- `user` (String) — пользователь, запустивший текущий запрос. +- `query_id` (String) — ID запроса. +- `address` (IPv6) — IP адрес, с которого пришел запрос. +- `port` (UInt16) — порт, с которого пришел запрос. +- `initial_user` (String) — пользователь, запустивший первоначальный запрос (для распределенных запросов). +- `initial_query_id` (String) — ID родительского запроса. +- `initial_address` (IPv6) — IP адрес, с которого пришел родительский запрос. +- `initial_port` (UInt16) — порт, с которого пришел родительский запрос. +- `interface` (UInt8) — интерфейс, с которого ушёл запрос. Возможные значения: + - 1 — TCP. + - 2 — HTTP. +- `os_user` (String) — имя пользователя в OS, который запустил [clickhouse-client](../interfaces/cli.md). +- `client_hostname` (String) — hostname клиентской машины, с которой присоединился [clickhouse-client](../interfaces/cli.md) или другой TCP клиент. +- `client_name` (String) — [clickhouse-client](../interfaces/cli.md) или другой TCP клиент. +- `client_revision` (UInt32) — ревизия [clickhouse-client](../interfaces/cli.md) или другого TCP клиента. +- `client_version_major` (UInt32) — старшая версия [clickhouse-client](../interfaces/cli.md) или другого TCP клиента. +- `client_version_minor` (UInt32) — младшая версия [clickhouse-client](../interfaces/cli.md) или другого TCP клиента. +- `client_version_patch` (UInt32) — патч [clickhouse-client](../interfaces/cli.md) или другого TCP клиента. +- `http_method` (UInt8) — HTTP метод, инициировавший запрос. Возможные значения: + - 0 — запрос запущен с интерфейса TCP. + - 1 — `GET`. + - 2 — `POST`. +- `http_user_agent` (String) — HTTP заголовок `UserAgent`. +- `quota_key` (String) — «ключ квоты» из настроек [квот](quotas.md) (см. `keyed`). +- `revision` (UInt32) — ревизия ClickHouse. +- `ProfileEvents.Names` (Array(String)) — Счётчики, которые измеряют различные метрики для данного потока. Описание метрик можно получить из таблицы [system.events](#system_tables-events). +- `ProfileEvents.Values` (Array(UInt64)) — метрики для данного потока, перечисленные в столбце `ProfileEvents.Names`. -По умолчанию, строки добавляются в таблицу логирования с интервалом в 7,5 секунд. Можно задать интервал в конфигурационном параметре сервера [query_thread_log](server_settings/settings.md#server_settings-query-thread-log) (смотрите параметр `flush_interval_milliseconds`). Чтобы принудительно записать логи из буффера памяти в таблицу, используйте запрос `SYSTEM FLUSH LOGS`. +По умолчанию, строки добавляются в таблицу логирования с интервалом в 7,5 секунд. Можно задать интервал в конфигурационном параметре сервера [query\_thread\_log](server_settings/settings.md#server_settings-query-thread-log) (смотрите параметр `flush_interval_milliseconds`). Чтобы принудительно записать логи из буфера памяти в таблицу, используйте запрос `SYSTEM FLUSH LOGS`. -Если таблицу удалить вручную, она пересоздастся автоматически "на лету". При этом все логи на момент удаления таблицы будут удалены.
+Если таблицу удалить вручную, она пересоздастся автоматически «на лету». При этом все логи на момент удаления таблицы будут удалены. !!! note "Примечание" Срок хранения логов не ограничен. Логи не удаляются из таблицы автоматически. Вам необходимо самостоятельно организовать удаление устаревших логов. -Можно указать произвольный ключ партиционирования для таблицы `system.query_log` в конфигурации [query_thread_log](server_settings/settings.md#server_settings-query-thread-log) (параметр `partition_by`). +Можно указать произвольный ключ партиционирования для таблицы `system.query_log` в конфигурации [query\_thread\_log](server_settings/settings.md#server_settings-query-thread-log) (параметр `partition_by`). -## system.trace_log {#system_tables-trace_log} +## system.trace\_log {#system-tables-trace-log} Contains stack traces collected by the sampling query profiler. -ClickHouse creates this table when the [trace_log](server_settings/settings.md#server_settings-trace_log) server configuration section is set. Also the [query_profiler_real_time_period_ns](settings/settings.md#query_profiler_real_time_period_ns) and [query_profiler_cpu_time_period_ns](settings/settings.md#query_profiler_cpu_time_period_ns) settings should be set. +ClickHouse creates this table when the [trace\_log](server_settings/settings.md#server_settings-trace_log) server configuration section is set. Also the [query\_profiler\_real\_time\_period\_ns](settings/settings.md#query_profiler_real_time_period_ns) and [query\_profiler\_cpu\_time\_period\_ns](settings/settings.md#query_profiler_cpu_time_period_ns) settings should be set. To analyze logs, use the `addressToLine`, `addressToSymbol` and `demangle` introspection functions. Columns: -- `event_date`([Date](../data_types/date.md)) — Date of sampling moment. -- `event_time`([DateTime](../data_types/datetime.md)) — Timestamp of sampling moment. -- `revision`([UInt32](../data_types/int_uint.md)) — ClickHouse server build revision. +- `event_date`([Date](../data_types/date.md)) — Date of sampling moment. - When connecting to server by `clickhouse-client`, you see the string similar to `Connected to ClickHouse server version 19.18.1 revision 54429.`. This field contains the `revision`, but not the `version` of a server. +- `event_time`([DateTime](../data_types/datetime.md)) — Timestamp of sampling moment. -- `timer_type`([Enum8](../data_types/enum.md)) — Timer type: +- `revision`([UInt32](../data_types/int_uint.md)) — ClickHouse server build revision. - - `Real` represents wall-clock time. - - `CPU` represents CPU time. + When connecting to server by `clickhouse-client`, you see the string similar to `Connected to ClickHouse server version 19.18.1 revision 54429.`. This field contains the `revision`, but not the `version` of a server. -- `thread_number`([UInt32](../data_types/int_uint.md)) — Thread identifier. -- `query_id`([String](../data_types/string.md)) — Query identifier that can be used to get details about a query that was running from the [query_log](#system_tables-query_log) system table. -- `trace`([Array(UInt64)](../data_types/array.md)) — Stack trace at the moment of sampling. Each element is a virtual memory address inside ClickHouse server process. +- `timer_type`([Enum8](../data_types/enum.md)) — Timer type: + + - `Real` represents wall-clock time. + - `CPU` represents CPU time. + +- `thread_number`([UInt32](../data_types/int_uint.md)) — Thread identifier. 
+ +- `query_id`([String](../data_types/string.md)) — Query identifier that can be used to get details about a query that was running from the [query\_log](#system_tables-query_log) system table. + +- `trace`([Array(UInt64)](../data_types/array.md)) — Stack trace at the moment of sampling. Each element is a virtual memory address inside ClickHouse server process. **Example** -```sql +``` sql SELECT * FROM system.trace_log LIMIT 1 \G ``` -```text +``` text Row 1: ────── event_date: 2019-11-15 @@ -724,21 +759,22 @@ thread_number: 48 query_id: acc4d61f-5bd1-4a3e-bc91-2180be37c915 trace: [94222141367858,94222152240175,94222152325351,94222152329944,94222152330796,94222151449980,94222144088167,94222151682763,94222144088167,94222151682763,94222144088167,94222144058283,94222144059248,94222091840750,94222091842302,94222091831228,94222189631488,140509950166747,140509942945935] ``` -## system.replicas {#system_tables-replicas} + +## system.replicas {#system-tables-replicas} Содержит информацию и статус для реплицируемых таблиц, расположенных на локальном сервере. Эту таблицу можно использовать для мониторинга. Таблица содержит по строчке для каждой Replicated\*-таблицы. Пример: -```sql +``` sql SELECT * FROM system.replicas WHERE table = 'visits' FORMAT Vertical ``` -```text +``` text Row 1: ────── database: merge @@ -775,46 +811,46 @@ active_replicas: 2 Столбцы: -- `database` (`String`) - имя БД. -- `table` (`String`) - имя таблицы. -- `engine` (`String`) - имя движка таблицы. -- `is_leader` (`UInt8`) - является ли реплика лидером. -В один момент времени, не более одной из реплик является лидером. Лидер отвечает за выбор фоновых слияний, которые следует произвести. -Замечу, что запись можно осуществлять на любую реплику (доступную и имеющую сессию в ZK), независимо от лидерства. -- `can_become_leader` (`UInt8`) - может ли реплика быть выбрана лидером. -- `is_readonly` (`UInt8`) - находится ли реплика в режиме "только для чтения" -Этот режим включается, если в конфиге нет секции с ZK; если при переинициализации сессии в ZK произошла неизвестная ошибка; во время переинициализации сессии с ZK. -- `is_session_expired` (`UInt8`) - истекла ли сессия с ZK. В основном, то же самое, что и `is_readonly`. -- `future_parts` (`UInt32`) - количество кусков с данными, которые появятся в результате INSERT-ов или слияний, которых ещё предстоит сделать -- `parts_to_check` (`UInt32`) - количество кусков с данными в очереди на проверку. Кусок помещается в очередь на проверку, если есть подозрение, что он может быть битым. -- `zookeeper_path` (`String`) - путь к данным таблицы в ZK. -- `replica_name` (`String`) - имя реплики в ZK; разные реплики одной таблицы имеют разное имя. -- `replica_path` (`String`) - путь к данным реплики в ZK. То же самое, что конкатенация zookeeper_path/replicas/replica_path. -- `columns_version` (`Int32`) - номер версии структуры таблицы. Обозначает, сколько раз был сделан ALTER. Если на репликах разные версии, значит некоторые реплики сделали ещё не все ALTER-ы. -- `queue_size` (`UInt32`) - размер очереди действий, которые предстоит сделать. К действиям относятся вставки блоков данных, слияния, и некоторые другие действия. Как правило, совпадает с future_parts. -- `inserts_in_queue` (`UInt32`) - количество вставок блоков данных, которые предстоит сделать. Обычно вставки должны быстро реплицироваться. Если величина большая - значит что-то не так. -- `merges_in_queue` (`UInt32`) - количество слияний, которые предстоит сделать. 
Бывают длинные слияния - то есть, это значение может быть больше нуля продолжительное время. -- `part_mutations_in_queue` (`UInt32`) - количество мутаций, которые предстоит сделать. -- `queue_oldest_time` (`DateTime`) - если `queue_size` больше 0, показывает, когда была добавлена в очередь самая старая операция. -- `inserts_oldest_time` (`DateTime`) - см. `queue_oldest_time`. -- `merges_oldest_time` (`DateTime`) - см. `queue_oldest_time`. -- `part_mutations_oldest_time` (`DateTime`) - см. `queue_oldest_time`. +- `database` (`String`) - имя БД. +- `table` (`String`) - имя таблицы. +- `engine` (`String`) - имя движка таблицы. +- `is_leader` (`UInt8`) - является ли реплика лидером. + В один момент времени, не более одной из реплик является лидером. Лидер отвечает за выбор фоновых слияний, которые следует произвести. + Замечу, что запись можно осуществлять на любую реплику (доступную и имеющую сессию в ZK), независимо от лидерства. +- `can_become_leader` (`UInt8`) - может ли реплика быть выбрана лидером. +- `is_readonly` (`UInt8`) - находится ли реплика в режиме «только для чтения» + Этот режим включается, если в конфиге нет секции с ZK; если при переинициализации сессии в ZK произошла неизвестная ошибка; во время переинициализации сессии с ZK. +- `is_session_expired` (`UInt8`) - истекла ли сессия с ZK. В основном, то же самое, что и `is_readonly`. +- `future_parts` (`UInt32`) - количество кусков с данными, которые появятся в результате INSERT-ов или слияний, которых ещё предстоит сделать +- `parts_to_check` (`UInt32`) - количество кусков с данными в очереди на проверку. Кусок помещается в очередь на проверку, если есть подозрение, что он может быть битым. +- `zookeeper_path` (`String`) - путь к данным таблицы в ZK. +- `replica_name` (`String`) - имя реплики в ZK; разные реплики одной таблицы имеют разное имя. +- `replica_path` (`String`) - путь к данным реплики в ZK. То же самое, что конкатенация zookeeper\_path/replicas/replica\_path. +- `columns_version` (`Int32`) - номер версии структуры таблицы. Обозначает, сколько раз был сделан ALTER. Если на репликах разные версии, значит некоторые реплики сделали ещё не все ALTER-ы. +- `queue_size` (`UInt32`) - размер очереди действий, которые предстоит сделать. К действиям относятся вставки блоков данных, слияния, и некоторые другие действия. Как правило, совпадает с future\_parts. +- `inserts_in_queue` (`UInt32`) - количество вставок блоков данных, которые предстоит сделать. Обычно вставки должны быстро реплицироваться. Если величина большая - значит что-то не так. +- `merges_in_queue` (`UInt32`) - количество слияний, которые предстоит сделать. Бывают длинные слияния - то есть, это значение может быть больше нуля продолжительное время. +- `part_mutations_in_queue` (`UInt32`) - количество мутаций, которые предстоит сделать. +- `queue_oldest_time` (`DateTime`) - если `queue_size` больше 0, показывает, когда была добавлена в очередь самая старая операция. +- `inserts_oldest_time` (`DateTime`) - см. `queue_oldest_time`. +- `merges_oldest_time` (`DateTime`) - см. `queue_oldest_time`. +- `part_mutations_oldest_time` (`DateTime`) - см. `queue_oldest_time`. Следующие 4 столбца имеют ненулевое значение только если активна сессия с ZK. -- `log_max_index` (`UInt64`) - максимальный номер записи в общем логе действий. -- `log_pointer` (`UInt64`) - максимальный номер записи из общего лога действий, которую реплика скопировала в свою очередь для выполнения, плюс единица. Если log_pointer сильно меньше log_max_index, значит что-то не так. 
-- `last_queue_update` (`DateTime`) - When the queue was updated last time. -- `absolute_delay` (`UInt64`) - How big lag in seconds the current replica has. -- `total_replicas` (`UInt8`) - общее число известных реплик этой таблицы. -- `active_replicas` (`UInt8`) - число реплик этой таблицы, имеющих сессию в ZK; то есть, число работающих реплик. +- `log_max_index` (`UInt64`) - максимальный номер записи в общем логе действий. +- `log_pointer` (`UInt64`) - максимальный номер записи из общего лога действий, которую реплика скопировала в свою очередь для выполнения, плюс единица. Если log\_pointer сильно меньше log\_max\_index, значит что-то не так. +- `last_queue_update` (`DateTime`) - When the queue was updated last time. +- `absolute_delay` (`UInt64`) - How big lag in seconds the current replica has. +- `total_replicas` (`UInt8`) - общее число известных реплик этой таблицы. +- `active_replicas` (`UInt8`) - число реплик этой таблицы, имеющих сессию в ZK; то есть, число работающих реплик. Если запрашивать все столбцы, то таблица может работать слегка медленно, так как на каждую строчку делается несколько чтений из ZK. -Если не запрашивать последние 4 столбца (log_max_index, log_pointer, total_replicas, active_replicas), то таблица работает быстро. +Если не запрашивать последние 4 столбца (log\_max\_index, log\_pointer, total\_replicas, active\_replicas), то таблица работает быстро. Например, так можно проверить, что всё хорошо: -```sql +``` sql SELECT database, table, @@ -846,14 +882,14 @@ WHERE Если этот запрос ничего не возвращает - значит всё хорошо. -## system.settings +## system.settings {#system-settings} Содержит информацию о настройках, используемых в данный момент. То есть, используемых для выполнения запроса, с помощью которого вы читаете из таблицы system.settings. Столбцы: -```text +``` text name String - имя настройки value String - значение настройки changed UInt8 - была ли настройка явно задана в конфиге или изменена явным образом @@ -861,13 +897,13 @@ changed UInt8 - была ли настройка явно задана в кон Пример: -```sql +``` sql SELECT * FROM system.settings WHERE changed ``` -```text +``` text ┌─name───────────────────┬─value───────┬─changed─┐ │ max_threads │ 8 │ 1 │ │ use_uncompressed_cache │ 0 │ 1 │ @@ -876,29 +912,29 @@ WHERE changed └────────────────────────┴─────────────┴─────────┘ ``` -## system.table_engines +## system.table\_engines {#system-table-engines} Содержит информацию про движки таблиц, поддерживаемые сервером, а также об их возможностях. Эта таблица содержит следующие столбцы (тип столбца показан в скобках): -- `name` (String) — имя движка. -- `supports_settings` (UInt8) — флаг, показывающий поддержку секции `SETTINGS`. -- `supports_skipping_indices` (UInt8) — флаг, показывающий поддержку [индексов пропуска данных](table_engines/mergetree/#table_engine-mergetree-data_skipping-indexes). -- `supports_ttl` (UInt8) — флаг, показывающий поддержку [TTL](table_engines/mergetree/#table_engine-mergetree-ttl). -- `supports_sort_order` (UInt8) — флаг, показывающий поддержку секций `PARTITION_BY`, `PRIMARY_KEY`, `ORDER_BY` и `SAMPLE_BY`. -- `supports_replication` (UInt8) — флаг, показвыающий поддержку [репликации](table_engines/replication/). -- `supports_duduplication` (UInt8) — флаг, показывающий наличие в движке дедупликации данных. +- `name` (String) — имя движка. +- `supports_settings` (UInt8) — флаг, показывающий поддержку секции `SETTINGS`. 
+- `supports_skipping_indices` (UInt8) — флаг, показывающий поддержку [индексов пропуска данных](table_engines/mergetree/#table_engine-mergetree-data_skipping-indexes). +- `supports_ttl` (UInt8) — флаг, показывающий поддержку [TTL](table_engines/mergetree/#table_engine-mergetree-ttl). +- `supports_sort_order` (UInt8) — флаг, показывающий поддержку секций `PARTITION_BY`, `PRIMARY_KEY`, `ORDER_BY` и `SAMPLE_BY`. +- `supports_replication` (UInt8) — флаг, показвыающий поддержку [репликации](table_engines/replication/). +- `supports_duduplication` (UInt8) — флаг, показывающий наличие в движке дедупликации данных. Пример: -```sql +``` sql SELECT * FROM system.table_engines WHERE name in ('Kafka', 'MergeTree', 'ReplicatedCollapsingMergeTree') ``` -```text +``` text ┌─name──────────────────────────┬─supports_settings─┬─supports_skipping_indices─┬─supports_sort_order─┬─supports_ttl─┬─supports_replication─┬─supports_deduplication─┐ │ Kafka │ 1 │ 0 │ 0 │ 0 │ 0 │ 0 │ │ MergeTree │ 1 │ 1 │ 1 │ 1 │ 0 │ 0 │ @@ -908,72 +944,70 @@ WHERE name in ('Kafka', 'MergeTree', 'ReplicatedCollapsingMergeTree') **Смотрите также** -- [Секции движка](table_engines/mergetree/#mergetree-query-clauses) семейства MergeTree -- [Настройки](table_engines/kafka.md#table_engine-kafka-creating-a-table) Kafka -- [Настройки](table_engines/join/#join-limitations-and-settings) Join +- [Секции движка](table_engines/mergetree/#mergetree-query-clauses) семейства MergeTree +- [Настройки](table_engines/kafka.md#table_engine-kafka-creating-a-table) Kafka +- [Настройки](table_engines/join/#join-limitations-and-settings) Join - -## system.tables +## system.tables {#system-tables} Содержит метаданные каждой таблицы, о которой знает сервер. Отсоединённые таблицы не отображаются в `system.tables`. Эта таблица содержит следующие столбцы (тип столбца показан в скобках): -- `database String` — имя базы данных, в которой находится таблица. -- `name` (String) — имя таблицы. -- `engine` (String) — движок таблицы (без параметров). -- `is_temporary` (UInt8) — флаг, указывающий на то, временная это таблица или нет. -- `data_path` (String) — путь к данным таблицы в файловой системе. -- `metadata_path` (String) — путь к табличным метаданным в файловой системе. -- `metadata_modification_time` (DateTime) — время последней модификации табличных метаданных. -- `dependencies_database` (Array(String)) — зависимости базы данных. -- `dependencies_table` (Array(String)) — табличные зависимости (таблицы [MaterializedView](table_engines/materializedview.md), созданные на базе текущей таблицы). -- `create_table_query` (String) — запрос, которым создавалась таблица. -- `engine_full` (String) — параметры табличного движка. -- `partition_key` (String) — ключ партиционирования таблицы. -- `sorting_key` (String) — ключ сортировки таблицы. -- `primary_key` (String) - первичный ключ таблицы. -- `sampling_key` (String) — ключ сэмплирования таблицы. +- `database String` — имя базы данных, в которой находится таблица. +- `name` (String) — имя таблицы. +- `engine` (String) — движок таблицы (без параметров). +- `is_temporary` (UInt8) — флаг, указывающий на то, временная это таблица или нет. +- `data_path` (String) — путь к данным таблицы в файловой системе. +- `metadata_path` (String) — путь к табличным метаданным в файловой системе. +- `metadata_modification_time` (DateTime) — время последней модификации табличных метаданных. +- `dependencies_database` (Array(String)) — зависимости базы данных. 
+- `dependencies_table` (Array(String)) — табличные зависимости (таблицы [MaterializedView](table_engines/materializedview.md), созданные на базе текущей таблицы). +- `create_table_query` (String) — запрос, которым создавалась таблица. +- `engine_full` (String) — параметры табличного движка. +- `partition_key` (String) — ключ партиционирования таблицы. +- `sorting_key` (String) — ключ сортировки таблицы. +- `primary_key` (String) - первичный ключ таблицы. +- `sampling_key` (String) — ключ сэмплирования таблицы. Таблица `system.tables` используется при выполнении запроса `SHOW TABLES`. -## system.zookeeper +## system.zookeeper {#system-zookeeper} Таблицы не существует, если ZooKeeper не сконфигурирован. Позволяет читать данные из ZooKeeper кластера, описанного в конфигурации. В запросе обязательно в секции WHERE должно присутствовать условие на равенство path - путь в ZooKeeper, для детей которого вы хотите получить данные. Запрос `SELECT * FROM system.zookeeper WHERE path = '/clickhouse'` выведет данные по всем детям узла `/clickhouse`. -Чтобы вывести данные по всем узлам в корне, напишите path = '/'. +Чтобы вывести данные по всем узлам в корне, напишите path = ‘/’. Если узла, указанного в path не существует, то будет брошено исключение. Столбцы: -- `name String` — Имя узла. -- `path String` — Путь к узлу. -- `value String` — Значение узла. -- `dataLength Int32` — Размер значения. -- `numChildren Int32` — Количество детей. -- `czxid Int64` — Идентификатор транзакции, в которой узел был создан. -- `mzxid Int64` — Идентификатор транзакции, в которой узел был последний раз изменён. -- `pzxid Int64` — Идентификатор транзакции, последний раз удаливший или добавивший детей. -- `ctime DateTime` — Время создания узла. -- `mtime DateTime` — Время последней модификации узла. -- `version Int32` — Версия узла - количество раз, когда узел был изменён. -- `cversion Int32` — Количество добавлений или удалений детей. -- `aversion Int32` — Количество изменений ACL. -- `ephemeralOwner Int64` — Для эфемерных узлов - идентификатор сессии, которая владеет этим узлом. - +- `name String` — Имя узла. +- `path String` — Путь к узлу. +- `value String` — Значение узла. +- `dataLength Int32` — Размер значения. +- `numChildren Int32` — Количество детей. +- `czxid Int64` — Идентификатор транзакции, в которой узел был создан. +- `mzxid Int64` — Идентификатор транзакции, в которой узел был последний раз изменён. +- `pzxid Int64` — Идентификатор транзакции, последний раз удаливший или добавивший детей. +- `ctime DateTime` — Время создания узла. +- `mtime DateTime` — Время последней модификации узла. +- `version Int32` — Версия узла - количество раз, когда узел был изменён. +- `cversion Int32` — Количество добавлений или удалений детей. +- `aversion Int32` — Количество изменений ACL. +- `ephemeralOwner Int64` — Для эфемерных узлов - идентификатор сессии, которая владеет этим узлом. Пример: -```sql +``` sql SELECT * FROM system.zookeeper WHERE path = '/clickhouse/tables/01-08/visits/replicas' FORMAT Vertical ``` -```text +``` text Row 1: ────── name: example01-08-1.yandex.ru @@ -1009,57 +1043,56 @@ pzxid: 987021252247 path: /clickhouse/tables/01-08/visits/replicas ``` -## system.mutations {#system_tables-mutations} +## system.mutations {#system-tables-mutations} Таблица содержит информацию о ходе выполнения [мутаций](../query_language/alter.md#alter-mutations) MergeTree-таблиц. Каждой команде мутации соответствует одна строка. 
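+Например, незавершённые мутации можно посмотреть запросом вида (набросок; используются только некоторые из столбцов, описанных ниже):
+
+``` sql
+SELECT database, table, mutation_id, command, parts_to_do, is_done, latest_fail_reason
+FROM system.mutations
+WHERE is_done = 0
+```
+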
В таблице есть следующие столбцы: **database**, **table** - имя БД и таблицы, к которой была применена мутация. -**mutation_id** - ID запроса. Для реплицированных таблиц эти ID соответствуют именам записей в директории `/mutations/` в ZooKeeper, для нереплицированных - именам файлов в директории с данными таблицы. +**mutation\_id** - ID запроса. Для реплицированных таблиц эти ID соответствуют именам записей в директории `/mutations/` в ZooKeeper, для нереплицированных - именам файлов в директории с данными таблицы. **command** - Команда мутации (часть запроса после `ALTER TABLE [db.]table`). -**create_time** - Время создания мутации. +**create\_time** - Время создания мутации. -**block_numbers.partition_id**, **block_numbers.number** - Nested-столбец. Для мутаций реплицированных таблиц для каждой партиции содержит номер блока, полученный этой мутацией (в каждой партиции будут изменены только куски, содержащие блоки с номерами, меньшими номера, полученного мутацией в этой партиции). Для нереплицированных таблиц нумерация блоков сквозная по партициям, поэтому столбец содержит одну запись с единственным номером блока, полученным мутацией. +**block\_numbers.partition\_id**, **block\_numbers.number** - Nested-столбец. Для мутаций реплицированных таблиц для каждой партиции содержит номер блока, полученный этой мутацией (в каждой партиции будут изменены только куски, содержащие блоки с номерами, меньшими номера, полученного мутацией в этой партиции). Для нереплицированных таблиц нумерация блоков сквозная по партициям, поэтому столбец содержит одну запись с единственным номером блока, полученным мутацией. -**parts_to_do** - Количество кусков таблицы, которые ещё предстоит изменить. +**parts\_to\_do** - Количество кусков таблицы, которые ещё предстоит изменить. -**is_done** - Завершена ли мутация. Замечание: даже если `parts_to_do = 0`, для реплицированной таблицы возможна ситуация, когда мутация ещё не завершена из-за долго выполняющейся вставки, которая добавляет данные, которые нужно будет мутировать. +**is\_done** - Завершена ли мутация. Замечание: даже если `parts_to_do = 0`, для реплицированной таблицы возможна ситуация, когда мутация ещё не завершена из-за долго выполняющейся вставки, которая добавляет данные, которые нужно будет мутировать. Если во время мутации какого-либо куска возникли проблемы, заполняются следующие столбцы: -**latest_failed_part** - Имя последнего куска, мутация которого не удалась. +**latest\_failed\_part** - Имя последнего куска, мутация которого не удалась. -**latest_fail_time** — время последней ошибки мутации. +**latest\_fail\_time** — время последней ошибки мутации. -**latest_fail_reason** — причина последней ошибки мутации. +**latest\_fail\_reason** — причина последней ошибки мутации. -## system.disks {#system_tables-disks} +## system.disks {#system-tables-disks} Cодержит информацию о дисках, заданных в [конфигурации сервера](table_engines/mergetree.md#table_engine-mergetree-multiple-volumes_configure). Столбцы: -- `name` ([String](../data_types/string.md)) — имя диска в конфигурации сервера. -- `path` ([String](../data_types/string.md)) — путь к точке монтирования в файловой системе. -- `free_space` ([UInt64](../data_types/int_uint.md)) — свободное место на диске в байтах. -- `total_space` ([UInt64](../data_types/int_uint.md)) — объём диска в байтах. -- `keep_free_space` ([UInt64](../data_types/int_uint.md)) — место, которое должно остаться свободным на диске в байтах. Задаётся значением параметра `keep_free_space_bytes` конфигурации дисков. 
+- `name` ([String](../data_types/string.md)) — имя диска в конфигурации сервера. +- `path` ([String](../data_types/string.md)) — путь к точке монтирования в файловой системе. +- `free_space` ([UInt64](../data_types/int_uint.md)) — свободное место на диске в байтах. +- `total_space` ([UInt64](../data_types/int_uint.md)) — объём диска в байтах. +- `keep_free_space` ([UInt64](../data_types/int_uint.md)) — место, которое должно остаться свободным на диске в байтах. Задаётся значением параметра `keep_free_space_bytes` конфигурации дисков. - -## system.storage_policies {#system_tables-storage_policies} +## system.storage\_policies {#system-tables-storage-policies} Содержит информацию о политиках хранения и томах, заданных в [конфигурации сервера](table_engines/mergetree.md#table_engine-mergetree-multiple-volumes_configure). Столбцы: -- `policy_name` ([String](../data_types/string.md)) — имя политики хранения. -- `volume_name` ([String](../data_types/string.md)) — имя тома, который содержится в политике хранения. -- `volume_priority` ([UInt64](../data_types/int_uint.md)) — порядковый номер тома согласно конфигурации. -- `disks` ([Array(String)](../data_types/array.md)) — имена дисков, содержащихся в политике хранения. -- `max_data_part_size` ([UInt64](../data_types/int_uint.md)) — максимальный размер куска данных, который может храниться на дисках тома (0 — без ограничений). -- `move_factor` ([Float64](../data_types/float.md))` — доля свободного места, при превышении которой данные начинают перемещаться на следующий том. +- `policy_name` ([String](../data_types/string.md)) — имя политики хранения. +- `volume_name` ([String](../data_types/string.md)) — имя тома, который содержится в политике хранения. +- `volume_priority` ([UInt64](../data_types/int_uint.md)) — порядковый номер тома согласно конфигурации. +- `disks` ([Array(String)](../data_types/array.md)) — имена дисков, содержащихся в политике хранения. +- `max_data_part_size` ([UInt64](../data_types/int_uint.md)) — максимальный размер куска данных, который может храниться на дисках тома (0 — без ограничений). +- `move_factor` ([Float64](../data_types/float.md)) — доля свободного места, при превышении которой данные начинают перемещаться на следующий том. Если политика хранения содержит несколько томов, то каждому тому соответствует отдельная запись в таблице. diff --git a/docs/ru/operations/table_engines/aggregatingmergetree.md b/docs/ru/operations/table_engines/aggregatingmergetree.md index 74e1d9b9e2a..64ae3aa037c 100644 --- a/docs/ru/operations/table_engines/aggregatingmergetree.md +++ b/docs/ru/operations/table_engines/aggregatingmergetree.md @@ -1,5 +1,4 @@ - -# AggregatingMergeTree +# AggregatingMergeTree {#aggregatingmergetree} Движок наследует функциональность [MergeTree](mergetree.md#table_engines-mergetree), изменяя логику слияния кусков данных. Все строки с одинаковым первичным ключом (точнее, с одинаковым [ключом сортировки](mergetree.md)) ClickHouse заменяет на одну (в пределах одного куска данных), которая хранит объединение состояний агрегатных функций. @@ -9,9 +8,9 @@ Использование `AggregatingMergeTree` оправдано только в том случае, когда это уменьшает количество строк на порядки.
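+Ниже — минимальный иллюстративный набросок работы с движком (имена таблиц и столбцов условные; синтаксис создания таблиц подробно описан в следующем разделе). Состояния агрегатных функций записываются функциями с комбинатором `-State`, а дочитываются функциями с комбинатором `-Merge`:
+
+``` sql
+CREATE TABLE agg_example
+(
+    CounterID UInt64,
+    StartDate Date,
+    Visits AggregateFunction(sum, UInt64),
+    Users AggregateFunction(uniq, UInt64)
+)
+ENGINE = AggregatingMergeTree()
+PARTITION BY toYYYYMM(StartDate)
+ORDER BY (CounterID, StartDate);
+
+-- вставка: состояния формируются комбинатором -State
+INSERT INTO agg_example
+SELECT CounterID, StartDate, sumState(toUInt64(1)) AS Visits, uniqState(UserID) AS Users
+FROM raw_events -- условная исходная таблица со столбцами CounterID, StartDate, UserID
+GROUP BY CounterID, StartDate;
+
+-- чтение: состояния доагрегируются комбинатором -Merge
+SELECT CounterID, sumMerge(Visits) AS Visits, uniqMerge(Users) AS Users
+FROM agg_example
+GROUP BY CounterID;
+```
+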
-## Создание таблицы +## Создание таблицы {#sozdanie-tablitsy} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -30,13 +29,14 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] При создании таблицы `AggregatingMergeTree` используются те же [секции](mergetree.md), что и при создании таблицы `MergeTree`. +
-
Устаревший способ создания таблицы +Устаревший способ создания таблицы -!!! attention +!!! attention "Attention" Не используйте этот способ в новых проектах и по возможности переведите старые проекты на способ описанный выше. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -48,7 +48,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Все параметры имеют то же значение, что в и `MergeTree`.
-## SELECT/INSERT данных +## SELECT/INSERT данных {#selectinsert-dannykh} Для вставки данных используйте `INSERT SELECT` с агрегатными `-State`-функциями. @@ -56,11 +56,11 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] В запросах `SELECT` значения типа `AggregateFunction` выводятся во всех форматах, которые поддерживает ClickHouse, в виде implementation-specific бинарных данных. Если с помощью `SELECT` выполнить дамп данных, например, в формат `TabSeparated`, то потом этот дамп можно загрузить обратно с помощью запроса `INSERT`. -## Пример агрегирущего материализованного представления +## Пример агрегирущего материализованного представления {#primer-agregirushchego-materializovannogo-predstavleniia} Создаём материализованное представление типа `AggregatingMergeTree`, следящее за таблицей `test.visits`: -```sql +``` sql CREATE MATERIALIZED VIEW test.basic ENGINE = AggregatingMergeTree() PARTITION BY toYYYYMM(StartDate) ORDER BY (CounterID, StartDate) AS SELECT @@ -74,7 +74,7 @@ GROUP BY CounterID, StartDate; Вставляем данные в таблицу `test.visits`: -```sql +``` sql INSERT INTO test.visits ... ``` @@ -82,7 +82,7 @@ INSERT INTO test.visits ... Чтобы получить агрегированные данные, выполним запрос вида `SELECT ... GROUP BY ...` из представления `test.basic`: -```sql +``` sql SELECT StartDate, sumMerge(Visits) AS Visits, diff --git a/docs/ru/operations/table_engines/buffer.md b/docs/ru/operations/table_engines/buffer.md index b162df2b8b1..5866b36164f 100644 --- a/docs/ru/operations/table_engines/buffer.md +++ b/docs/ru/operations/table_engines/buffer.md @@ -1,23 +1,23 @@ -# Buffer +# Buffer {#buffer} Буферизует записываемые данные в оперативке, периодически сбрасывая их в другую таблицу. При чтении, производится чтение данных одновременно из буфера и из другой таблицы. -```sql +``` sql Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes) ``` Параметры движка: -`database` — имя базы данных. Вместо имени базы данных может использоваться константное выражение, возвращающее строку. +`database` — имя базы данных. Вместо имени базы данных может использоваться константное выражение, возвращающее строку. `table` — таблица, в которую сбрасывать данные. `num_layers` — уровень параллелизма. Физически таблица будет представлена в виде `num_layers` независимых буферов. Рекомендуемое значение — 16. `min_time`, `max_time`, `min_rows`, `max_rows`, `min_bytes`, `max_bytes` — условия для сброса данных из буфера. Данные сбрасываются из буфера и записываются в таблицу назначения, если выполнены все `min`-условия или хотя бы одно `max`-условие. -- `min_time`, `max_time` — условие на время в секундах от момента первой записи в буфер. -- `min_rows`, `max_rows` — условие на количество строк в буфере. -- `min_bytes`, `max_bytes` — условие на количество байт в буфере. +- `min_time`, `max_time` — условие на время в секундах от момента первой записи в буфер. +- `min_rows`, `max_rows` — условие на количество строк в буфере. +- `min_bytes`, `max_bytes` — условие на количество байт в буфере. При записи, данные вставляются в случайный из `num_layers` буферов. Или, если размер куска вставляемых данных достаточно большой (больше `max_rows` или `max_bytes`), то он записывается в таблицу назначения минуя буфер. 
@@ -25,11 +25,11 @@ Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_ Пример: -```sql +``` sql CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10, 100, 10000, 1000000, 10000000, 100000000) ``` -Создаём таблицу merge.hits_buffer такой же структуры как merge.hits и движком Buffer. При записи в эту таблицу, данные буферизуются в оперативке и, в дальнейшем, записываются в таблицу merge.hits. Создаётся 16 буферов. Данные, имеющиеся в каждом из них будут сбрасываться, если прошло сто секунд, или записан миллион строк, или записано сто мегабайт данных; или если одновременно прошло десять секунд и записано десять тысяч строк и записано десять мегабайт данных. Для примера, если записана всего лишь одна строка, то через сто секунд она будет сброшена в любом случае. А если записано много строк, то они будут сброшены раньше. +Создаём таблицу merge.hits\_buffer такой же структуры как merge.hits и движком Buffer. При записи в эту таблицу, данные буферизуются в оперативке и, в дальнейшем, записываются в таблицу merge.hits. Создаётся 16 буферов. Данные, имеющиеся в каждом из них будут сбрасываться, если прошло сто секунд, или записан миллион строк, или записано сто мегабайт данных; или если одновременно прошло десять секунд и записано десять тысяч строк и записано десять мегабайт данных. Для примера, если записана всего лишь одна строка, то через сто секунд она будет сброшена в любом случае. А если записано много строк, то они будут сброшены раньше. При остановке сервера, при DROP TABLE или DETACH TABLE, данные из буфера тоже сбрасываются в таблицу назначения. @@ -51,7 +51,7 @@ CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10 При добавлении данных в Buffer, один из буферов блокируется. Это приводит к задержкам, если одновременно делается чтение из таблицы. -Данные, вставляемые в таблицу Buffer, попадают в подчинённую таблицу в порядке, возможно отличающимся от порядка вставки, и блоками, возможно отличающимися от вставленных блоков. В связи с этим, трудно корректно использовать таблицу типа Buffer для записи в CollapsingMergeTree. Чтобы избежать проблемы, можно выставить num_layers в 1. +Данные, вставляемые в таблицу Buffer, попадают в подчинённую таблицу в порядке, возможно отличающимся от порядка вставки, и блоками, возможно отличающимися от вставленных блоков. В связи с этим, трудно корректно использовать таблицу типа Buffer для записи в CollapsingMergeTree. Чтобы избежать проблемы, можно выставить num\_layers в 1. Если таблица назначения является реплицируемой, то при записи в таблицу Buffer будут потеряны некоторые ожидаемые свойства реплицируемых таблиц. Из-за произвольного изменения порядка строк и размеров блоков данных, перестаёт работать дедупликация данных, в результате чего исчезает возможность надёжной exactly once записи в реплицируемые таблицы. @@ -59,6 +59,6 @@ CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10 Таблицы типа Buffer используются в тех случаях, когда от большого количества серверов поступает слишком много INSERT-ов в единицу времени, и нет возможности заранее самостоятельно буферизовать данные перед вставкой, в результате чего, INSERT-ы не успевают выполняться. 
-Заметим, что даже для таблиц типа Buffer не имеет смысла вставлять данные по одной строке, так как таким образом будет достигнута скорость всего лишь в несколько тысяч строк в секунду, тогда как при вставке более крупными блоками, достижимо более миллиона строк в секунду (смотрите раздел "Производительность"). +Заметим, что даже для таблиц типа Buffer не имеет смысла вставлять данные по одной строке, так как таким образом будет достигнута скорость всего лишь в несколько тысяч строк в секунду, тогда как при вставке более крупными блоками, достижимо более миллиона строк в секунду (смотрите раздел «Производительность»). [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/buffer/) diff --git a/docs/ru/operations/table_engines/collapsingmergetree.md b/docs/ru/operations/table_engines/collapsingmergetree.md index d5513c80022..4275c1130a6 100644 --- a/docs/ru/operations/table_engines/collapsingmergetree.md +++ b/docs/ru/operations/table_engines/collapsingmergetree.md @@ -1,4 +1,4 @@ -# CollapsingMergeTree {#table_engine-collapsingmergetree} +# CollapsingMergeTree {#table-engine-collapsingmergetree} Движок наследует функциональность от [MergeTree](mergetree.md) и добавляет в алгоритм слияния кусков данных логику сворачивания (удаления) строк. @@ -6,9 +6,9 @@ Движок может значительно уменьшить объём хранения и, как следствие, повысить эффективность запросов `SELECT`. -## Создание таблицы +## Создание таблицы {#sozdanie-tablitsy} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -25,20 +25,22 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **Параметры CollapsingMergeTree** -- `sign` — Имя столбца с типом строки: `1` — строка состояния, `-1` — строка отмены состояния. +- `sign` — Имя столбца с типом строки: `1` — строка состояния, `-1` — строка отмены состояния. - Тип данных столбца — `Int8`. + Тип данных столбца — `Int8`. **Секции запроса** При создании таблицы с движком `CollapsingMergeTree` используются те же [секции запроса](mergetree.md#table_engine-mergetree-creating-a-table) что и при создании таблицы с движком `MergeTree`. -
Устаревший способ создания таблицы +
-!!! attention +Устаревший способ создания таблицы + +!!! attention "Attention" Не используйте этот способ в новых проектах и по возможности переведите старые проекты на способ описанный выше. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -49,15 +51,15 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Все параметры, кроме `ver` имеют то же значение, что и в `MergeTree`. -- `sign` — Имя столбца с типом строки: `1` — строка состояния, `-1` — строка отмены состояния. +- `sign` — Имя столбца с типом строки: `1` — строка состояния, `-1` — строка отмены состояния. - Тип данных столбца — `Int8`. + Тип данных столбца — `Int8`.
-## Сворачивание (удаление) строк {#table_engine-collapsingmergetree-collapsing} +## Сворачивание (удаление) строк {#table-engine-collapsingmergetree-collapsing} -### Данные +### Данные {#dannye} Рассмотрим ситуацию, когда необходимо сохранять постоянно изменяющиеся данные для какого-либо объекта. Кажется логичным иметь одну строку для объекта и обновлять её при любом изменении, однако операция обновления является дорогостоящей и медленной для СУБД, поскольку требует перезаписи данных в хранилище. Если необходимо быстро записать данные, обновление не допустимо, но можно записать изменения объекта последовательно как описано ниже. @@ -65,7 +67,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Например, мы хотим рассчитать, сколько страниц проверили пользователи на каком-то сайте и как долго они там находились. В какой-то момент времени мы пишем следующую строку с состоянием действий пользователя: -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┘ @@ -73,7 +75,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Через некоторое время мы регистрируем изменение активности пользователя и записываем его следующими двумя строками. -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ -1 │ │ 4324182021466249494 │ 6 │ 185 │ 1 │ @@ -86,55 +88,55 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Поскольку нам нужно только последнее состояние активности пользователя, строки -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ │ 4324182021466249494 │ 5 │ 146 │ -1 │ └─────────────────────┴───────────┴──────────┴──────┘ ``` -можно удалить, сворачивая (удаляя) устаревшее состояние объекта. `CollapsingMergeTree` выполняет это при слиянии кусков данных. +можно удалить, сворачивая (удаляя) устаревшее состояние объекта. `CollapsingMergeTree` выполняет это при слиянии кусков данных. Зачем нужны две строки для каждого изменения описано в разделе [Алгоритм](#table_engine-collapsingmergetree-collapsing-algorithm). **Особенности подхода** -1. Программа, которая записывает данные, должна помнить состояние объекта, чтобы иметь возможность отменить его. Строка отмены состояния должна содержать копию полей сортировочного ключа предыдущей строки состояния с противоположным значением `Sign`. Это увеличивает начальный размер хранилища, но позволяет быстро записывать данные. -2. Длинные растущие массивы в Столбцах снижают эффективность работы движка за счёт нагрузки на запись. Чем проще данные, тем выше эффективность. -3. Результаты запроса `SELECT` сильно зависят от согласованности истории изменений объекта. Будьте точны при подготовке данных для вставки. Можно получить непредсказуемые результаты для несогласованных данных, например отрицательные значения для неотрицательных метрик, таких как глубина сеанса. +1. Программа, которая записывает данные, должна помнить состояние объекта, чтобы иметь возможность отменить его. Строка отмены состояния должна содержать копию полей сортировочного ключа предыдущей строки состояния с противоположным значением `Sign`. Это увеличивает начальный размер хранилища, но позволяет быстро записывать данные. +2. Длинные растущие массивы в Столбцах снижают эффективность работы движка за счёт нагрузки на запись. Чем проще данные, тем выше эффективность. +3. 
Результаты запроса `SELECT` сильно зависят от согласованности истории изменений объекта. Будьте точны при подготовке данных для вставки. Можно получить непредсказуемые результаты для несогласованных данных, например отрицательные значения для неотрицательных метрик, таких как глубина сеанса. -### Алгоритм {#table_engine-collapsingmergetree-collapsing-algorithm} +### Алгоритм {#table-engine-collapsingmergetree-collapsing-algorithm} Во время объединения кусков данных, каждая группа последовательных строк с одинаковым сортировочным ключом (`ORDER BY`) уменьшается до не более чем двух строк, одна из которых имеет `Sign = 1` (строка состояния), а другая строка с `Sign = -1` (строка отмены состояния). Другими словами, записи сворачиваются. Для каждого результирующего куска данных ClickHouse сохраняет: - 1. Первую строку отмены состояния и последнюю строку состояния, если количество строк обоих видов совпадает. + 1. Первую строку отмены состояния и последнюю строку состояния, если количество строк обоих видов совпадает. - 2. Последнюю строку состояния, если строк состояния на одну больше, чем строк отмены состояния. + 2. Последнюю строку состояния, если строк состояния на одну больше, чем строк отмены состояния. - 3. Первую строку отмены состояния, если их на одну больше, чем строк состояния. + 3. Первую строку отмены состояния, если их на одну больше, чем строк состояния. - 4. Ни в одну из строк во всех остальных случаях. + 4. Ни в одну из строк во всех остальных случаях. - Слияние продолжается, но ClickHouse рассматривает эту ситуацию как логическую ошибку и записывает её в журнал сервера. Эта ошибка может возникать, если одни и те же данные вставлялись несколько раз. + Слияние продолжается, но ClickHouse рассматривает эту ситуацию как логическую ошибку и записывает её в журнал сервера. Эта ошибка может возникать, если одни и те же данные вставлялись несколько раз. Как видно, от сворачивания не должны меняться результаты расчётов статистик. Изменения постепенно сворачиваются так, что остаются лишь последнее состояние почти каждого объекта. Столбец `Sign` необходим, поскольку алгоритм слияния не гарантирует, что все строки с одинаковым ключом сортировки будут находиться в одном результирующем куске данных и даже на одном физическом сервере. ClickHouse выполняет запросы `SELECT` несколькими потоками, и он не может предсказать порядок строк в результате. Если необходимо получить полностью свёрнутые данные из таблицы `CollapsingMergeTree`, то необходимо агрегирование. -Для завершения свертывания добавьте в запрос секцию`GROUP BY` и агрегатные функции, которые учитывают знак. Например, для расчета количества используйте `sum(Sign)` вместо`count()`. Чтобы вычислить сумму чего-либо, используйте `sum(Sign * x)` вместо`sum(х)`, и так далее, а также добавьте `HAVING sum(Sign) > 0` . +Для завершения свертывания добавьте в запрос секцию`GROUP BY` и агрегатные функции, которые учитывают знак. Например, для расчета количества используйте `sum(Sign)` вместо`count()`. Чтобы вычислить сумму чего-либо, используйте `sum(Sign * x)` вместо`sum(х)`, и так далее, а также добавьте `HAVING sum(Sign) > 0` . Таким образом можно вычислять агрегации `count`, `sum` и `avg`. Если объект имеет хотя бы одно не свёрнутое состояние, то может быть вычислена агрегация `uniq`. Агрегации `min` и `max` невозможно вычислить, поскольку `CollapsingMergeTree` не сохраняет историю значений свернутых состояний. 
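+Схематично запрос с учётом знака выглядит так (набросок; используются имена из примера в разделе «Пример использования» ниже):
+
+``` sql
+SELECT
+    UserID,
+    sum(PageViews * Sign) AS PageViews, -- вместо sum(PageViews)
+    sum(Duration * Sign) AS Duration,   -- вместо sum(Duration)
+    sum(Sign) AS VisitsCount            -- вместо count()
+FROM UAct
+GROUP BY UserID
+HAVING sum(Sign) > 0
+```
+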
-Если необходимо выбирать данные без агрегации (например, проверить наличие строк, последние значения которых удовлетворяют некоторым условиям), можно использовать модификатор `FINAL` для секции `FROM`. Это вариант существенно менее эффективен. +Если необходимо выбирать данные без агрегации (например, проверить наличие строк, последние значения которых удовлетворяют некоторым условиям), можно использовать модификатор `FINAL` для секции `FROM`. Это вариант существенно менее эффективен. -## Пример использования +## Пример использования {#primer-ispolzovaniia} Исходные данные: -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ │ 4324182021466249494 │ 5 │ 146 │ -1 │ @@ -144,7 +146,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Создание таблицы: -```sql +``` sql CREATE TABLE UAct ( UserID UInt64, @@ -158,11 +160,11 @@ ORDER BY UserID Insertion of the data: -```sql +``` sql INSERT INTO UAct VALUES (4324182021466249494, 5, 146, 1) ``` -```sql +``` sql INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1),(4324182021466249494, 6, 185, 1) ``` @@ -170,11 +172,11 @@ INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1),(4324182021466249494, Получение данных: -```sql +``` sql SELECT * FROM UAct ``` -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ -1 │ │ 4324182021466249494 │ 6 │ 185 │ 1 │ @@ -186,11 +188,11 @@ SELECT * FROM UAct Что мы видим и где сворачивание? -Двумя запросами `INSERT`, мы создали два куска данных. Запрос `SELECT` был выполнен в 2 потока, и мы получили случайный порядок строк. Сворачивание не произошло, так как слияние кусков данных еще не произошло. ClickHouse объединяет куски данных в неизвестный момент времени, который мы не можем предсказать. +Двумя запросами `INSERT`, мы создали два куска данных. Запрос `SELECT` был выполнен в 2 потока, и мы получили случайный порядок строк. Сворачивание не произошло, так как слияние кусков данных еще не произошло. ClickHouse объединяет куски данных в неизвестный момент времени, который мы не можем предсказать. Таким образом, нам нужна агрегация: -```sql +``` sql SELECT UserID, sum(PageViews * Sign) AS PageViews, @@ -200,7 +202,7 @@ GROUP BY UserID HAVING sum(Sign) > 0 ``` -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┐ │ 4324182021466249494 │ 6 │ 185 │ └─────────────────────┴───────────┴──────────┘ @@ -208,11 +210,11 @@ HAVING sum(Sign) > 0 Если нам не нужна агрегация, но мы хотим принудительно выполнить свёртку данных, можно использовать модификатор `FINAL` для секции `FROM`. -```sql +``` sql SELECT * FROM UAct FINAL ``` -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 6 │ 185 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┘ @@ -220,21 +222,22 @@ SELECT * FROM UAct FINAL Такой способ выбора данных очень неэффективен. Не используйте его для больших таблиц. -## Пример другого подхода +## Пример другого подхода {#primer-drugogo-podkhoda} Исходные данные: -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ │ 4324182021466249494 │ -5 │ -146 │ -1 │ │ 4324182021466249494 │ 6 │ 185 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┘ ``` -Идея состоит в том, что слияния при сворачивании учитывают только ключевые поля, поэтому в отменяющей строке можно указать отрицательные значения, которые нивелируют предыдущую версию записи при суммировании без учета поля Sign. 
-Для этого подхода необходимо изменить тип данных `PageViews`, `Duration` для хранения отрицательных значений UInt8 -> Int16. -```sql +Идея состоит в том, что слияния при сворачивании учитывают только ключевые поля, поэтому в отменяющей строке можно указать отрицательные значения, которые нивелируют предыдущую версию записи при суммировании без учета поля Sign. +Для этого подхода необходимо изменить тип данных `PageViews`, `Duration` для хранения отрицательных значений UInt8 -\> Int16. + +``` sql CREATE TABLE UAct ( UserID UInt64, @@ -247,19 +250,22 @@ ORDER BY UserID ``` Тестируем подход: -```sql + +``` sql insert into UAct values(4324182021466249494, 5, 146, 1); insert into UAct values(4324182021466249494, -5, -146, -1); insert into UAct values(4324182021466249494, 6, 185, 1); select * from UAct final; // старайтесь не использовать final (он подходит только для тестов и маленьких таблиц) ``` -```text + +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 6 │ 185 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┘ ``` -```sql + +``` sql SELECT UserID, sum(PageViews) AS PageViews, @@ -267,29 +273,33 @@ SELECT FROM UAct GROUP BY UserID ``` -```text + +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┐ │ 4324182021466249494 │ 6 │ 185 │ └─────────────────────┴───────────┴──────────┘ ``` -```sql + +``` sql select count() FROM UAct ``` -```text + +``` text ┌─count()─┐ │ 3 │ └─────────┘ ``` -```sql + +``` sql optimize table UAct final; select * FROM UAct ``` -```text + +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 6 │ 185 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┘ ``` [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/collapsingmergetree/) - diff --git a/docs/ru/operations/table_engines/custom_partitioning_key.md b/docs/ru/operations/table_engines/custom_partitioning_key.md index 9376000a3b5..c2b846ef3c1 100644 --- a/docs/ru/operations/table_engines/custom_partitioning_key.md +++ b/docs/ru/operations/table_engines/custom_partitioning_key.md @@ -1,4 +1,4 @@ -# Произвольный ключ партиционирования +# Произвольный ключ партиционирования {#proizvolnyi-kliuch-partitsionirovaniia} Партиционирование данных доступно для таблиц семейства [MergeTree](mergetree.md) (включая [реплицированные таблицы](replication.md)). Таблицы [MaterializedView](materializedview.md), созданные на основе таблиц MergeTree, также поддерживают партиционирование. @@ -6,11 +6,11 @@ Ключ партиционирования задается при [создании таблицы](mergetree.md#table_engine-mergetree-creating-a-table), в секции `PARTITION BY expr`. Ключ может представлять собой произвольное выражение из столбцов таблицы. Например, чтобы задать партиционирования по месяцам, можно использовать выражение `toYYYYMM(date_column)`: -```sql +``` sql CREATE TABLE visits ( - VisitDate Date, - Hour UInt8, + VisitDate Date, + Hour UInt8, ClientID UUID ) ENGINE = MergeTree() @@ -20,7 +20,7 @@ ORDER BY Hour Ключом партиционирования также может быть кортеж из выражений (аналогично [первичному ключу](mergetree.md#primary-keys-and-indexes-in-queries)). Например: -```sql +``` sql ENGINE = ReplicatedCollapsingMergeTree('/clickhouse/tables/name', 'replica1', Sign) PARTITION BY (toMonday(StartDate), EventType) ORDER BY (CounterID, StartDate, intHash32(UserID)); @@ -28,23 +28,23 @@ ORDER BY (CounterID, StartDate, intHash32(UserID)); В этом примере задано партиционирование по типам событий, произошедших в течение текущей недели. 
-Каждая партиция состоит из отдельных фрагментов или так называемых _кусков данных_. Каждый кусок отсортирован по первичному ключу. При вставке данных в таблицу каждая отдельная запись сохраняется в виде отдельного куска. Через некоторое время после вставки (обычно до 10 минут), ClickHouse выполняет в фоновом режиме слияние данных — в результате куски для одной и той же партиции будут объединены в более крупный кусок. +Каждая партиция состоит из отдельных фрагментов или так называемых *кусков данных*. Каждый кусок отсортирован по первичному ключу. При вставке данных в таблицу каждая отдельная запись сохраняется в виде отдельного куска. Через некоторое время после вставки (обычно до 10 минут), ClickHouse выполняет в фоновом режиме слияние данных — в результате куски для одной и той же партиции будут объединены в более крупный кусок. -!!! info +!!! info "Info" Не рекомендуется делать слишком гранулированное партиционирование – то есть задавать партиции по столбцу, в котором будет слишком большой разброс значений (речь идет о порядке более тысячи партиций). Это приведет к скоплению большого числа файлов и файловых дескрипторов в системе, что может значительно снизить производительность запросов `SELECT`. Чтобы получить набор кусков и партиций таблицы, можно воспользоваться системной таблицей [system.parts](../system_tables.md#system_tables-parts). В качестве примера рассмотрим таблицу `visits`, в которой задано партиционирование по месяцам. Выполним `SELECT` для таблицы `system.parts`: -```sql -SELECT +``` sql +SELECT partition, - name, + name, active -FROM system.parts +FROM system.parts WHERE table = 'visits' ``` -```text +``` text ┌─partition─┬─name───────────┬─active─┐ │ 201901 │ 201901_1_3_1 │ 0 │ │ 201901 │ 201901_1_9_2 │ 1 │ @@ -56,7 +56,7 @@ WHERE table = 'visits' └───────────┴────────────────┴────────┘ ``` -Столбец `partition` содержит имена всех партиций таблицы. Таблица `visits` из нашего примера содержит две партиции: `201901` и `201902`. Используйте значения из этого столбца в запросах [ALTER ... PARTITION](#alter_manipulations-with-partitions). +Столбец `partition` содержит имена всех партиций таблицы. Таблица `visits` из нашего примера содержит две партиции: `201901` и `201902`. Используйте значения из этого столбца в запросах [ALTER … PARTITION](#alter_manipulations-with-partitions). Столбец `name` содержит названия кусков партиций. Значения из этого столбца можно использовать в запросах [ALTER ATTACH PART](#alter_attach-partition). @@ -64,23 +64,23 @@ WHERE table = 'visits' Рассмотрим детальнее имя первого куска `201901_1_3_1`: -- `201901` имя партиции; -- `1` – минимальный номер блока данных; -- `3` – максимальный номер блока данных; -- `1` – уровень куска (глубина дерева слияний, которыми этот кусок образован). +- `201901` имя партиции; +- `1` – минимальный номер блока данных; +- `3` – максимальный номер блока данных; +- `1` – уровень куска (глубина дерева слияний, которыми этот кусок образован). -!!! info - Названия кусков для таблиц старого типа образуются следующим образом: `20190117_20190123_2_2_0` (минимальная дата _ максимальная дата _ номер минимального блока _ номер максимального блока _ уровень). +!!! info "Info" + Названия кусков для таблиц старого типа образуются следующим образом: `20190117_20190123_2_2_0` (минимальная дата \_ максимальная дата \_ номер минимального блока \_ номер максимального блока \_ уровень). 
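Составные части имени куска можно посмотреть и отдельными столбцами в `system.parts` (условный набросок запроса; предполагается таблица `visits` из примера выше):

``` sql
SELECT
    name,
    partition,
    min_block_number,
    max_block_number,
    level
FROM system.parts
WHERE table = 'visits' AND active
```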
-Как видно из примера выше, таблица содержит несколько отдельных кусков для одной и той же партиции (например, куски `201901_1_3_1` и `201901_1_9_2` принадлежат партиции `201901`). Это означает, что эти куски еще не были объединены – в файловой системе они хранятся отдельно. После того как будет выполнено автоматическое слияние данных (выполняется примерно спустя 10 минут после вставки данных), исходные куски будут объединены в один более крупный кусок и помечены как неактивные. +Как видно из примера выше, таблица содержит несколько отдельных кусков для одной и той же партиции (например, куски `201901_1_3_1` и `201901_1_9_2` принадлежат партиции `201901`). Это означает, что эти куски еще не были объединены – в файловой системе они хранятся отдельно. После того как будет выполнено автоматическое слияние данных (выполняется примерно спустя 10 минут после вставки данных), исходные куски будут объединены в один более крупный кусок и помечены как неактивные. Вы можете запустить внеочередное слияние данных с помощью запроса [OPTIMIZE](../../query_language/misc.md#misc_operations-optimize). Пример: -```sql +``` sql OPTIMIZE TABLE visits PARTITION 201902; ``` -```text +``` text ┌─partition─┬─name───────────┬─active─┐ │ 201901 │ 201901_1_3_1 │ 0 │ │ 201901 │ 201901_1_9_2 │ 1 │ @@ -95,9 +95,9 @@ OPTIMIZE TABLE visits PARTITION 201902; Неактивные куски будут удалены примерно через 10 минут после слияния. -Другой способ посмотреть набор кусков и партиций – зайти в директорию с данными таблицы: `/var/lib/clickhouse/data///`. Например: +Другой способ посмотреть набор кусков и партиций – зайти в директорию с данными таблицы: `/var/lib/clickhouse/data//
/`. Например: -```bash +``` bash /var/lib/clickhouse/data/default/visits$ ls -l total 40 drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 201901_1_3_1 @@ -111,9 +111,9 @@ drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 12:09 201902_4_6_1 drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 detached ``` -'201901_1_1_0', '201901_1_7_1' и т. д. – это директории кусков партиции. Каждый кусок содержит данные только для соответствующего месяца (таблица в данном примере содержит партиционирование по месяцам). +‘201901\_1\_1\_0’, ‘201901\_1\_7\_1’ и т. д. – это директории кусков партиции. Каждый кусок содержит данные только для соответствующего месяца (таблица в данном примере содержит партиционирование по месяцам). -Директория `detached` содержит куски, отсоединенные от таблицы с помощью запроса [DETACH](#alter_detach-partition). Поврежденные куски также попадают в эту директорию – они не удаляются с сервера. +Директория `detached` содержит куски, отсоединенные от таблицы с помощью запроса [DETACH](#alter_detach-partition). Поврежденные куски также попадают в эту директорию – они не удаляются с сервера. Сервер не использует куски из директории `detached`. Вы можете в любое время добавлять, удалять, модифицировать данные в директории detached - сервер не будет об этом знать, пока вы не сделаете запрос [ATTACH](../../query_language/alter.md#alter_attach-partition). @@ -123,5 +123,4 @@ drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 detached ClickHouse позволяет производить различные манипуляции с кусками: удалять, копировать из одной таблицы в другую или создавать их резервные копии. Подробнее см. в разделе [Манипуляции с партициями и кусками](../../query_language/alter.md#alter_manipulations-with-partitions). -[Оригинальная статья: ](https://clickhouse.tech/docs/ru/operations/table_engines/custom_partitioning_key/) - +[Оригинальная статья:](https://clickhouse.tech/docs/ru/operations/table_engines/custom_partitioning_key/) diff --git a/docs/ru/operations/table_engines/dictionary.md b/docs/ru/operations/table_engines/dictionary.md index 0bf34c16e93..9f764a65ffa 100644 --- a/docs/ru/operations/table_engines/dictionary.md +++ b/docs/ru/operations/table_engines/dictionary.md @@ -1,11 +1,10 @@ - -# Dictionary +# Dictionary {#dictionary} Движок `Dictionary` отображает данные [словаря](../../query_language/dicts/external_dicts.md) как таблицу ClickHouse. Рассмотрим для примера словарь `products` со следующей конфигурацией: -```xml +``` xml products @@ -38,7 +37,7 @@ Запрос данных словаря: -```sql +``` sql SELECT name, type, @@ -51,36 +50,36 @@ SELECT FROM system.dictionaries WHERE name = 'products' ``` -```text + +``` text ┌─name─────┬─type─┬─key────┬─attribute.names─┬─attribute.types─┬─bytes_allocated─┬─element_count─┬─source──────────┐ │ products │ Flat │ UInt64 │ ['title'] │ ['String'] │ 23065376 │ 175032 │ ODBC: .products │ └──────────┴──────┴────────┴─────────────────┴─────────────────┴─────────────────┴───────────────┴─────────────────┘ ``` -В таком виде данные из словаря можно получить при помощи функций [dictGet*](../../query_language/functions/ext_dict_functions.md#ext_dict_functions). +В таком виде данные из словаря можно получить при помощи функций [dictGet\*](../../query_language/functions/ext_dict_functions.md#ext_dict_functions). Такое представление неудобно, когда нам необходимо получить данные в чистом виде, а также при выполнении операции `JOIN`. Для этих случаев можно использовать движок `Dictionary`, который отобразит данные словаря в таблицу. 
Синтаксис: -```sql +``` sql CREATE TABLE %table_name% (%fields%) engine = Dictionary(%dictionary_name%)` ``` - Пример использования: -```sql +``` sql create table products (product_id UInt64, title String) Engine = Dictionary(products); ``` Проверим что у нас в таблице? -```sql +``` sql select * from products limit 1; ``` -```text +``` text ┌────product_id─┬─title───────────┐ │ 152689 │ Some item │ └───────────────┴─────────────────┘ diff --git a/docs/ru/operations/table_engines/distributed.md b/docs/ru/operations/table_engines/distributed.md index 259841aebf1..91d41528518 100644 --- a/docs/ru/operations/table_engines/distributed.md +++ b/docs/ru/operations/table_engines/distributed.md @@ -1,12 +1,11 @@ - -# Distributed +# Distributed {#distributed} **Движок Distributed не хранит данные самостоятельно**, а позволяет обрабатывать запросы распределённо, на нескольких серверах. Чтение автоматически распараллеливается. При чтении будут использованы индексы таблиц на удалённых серверах, если есть. Движок Distributed принимает параметры: имя кластера в конфигурационном файле сервера, имя удалённой базы данных, имя удалённой таблицы, а также (не обязательно) ключ шардирования. Пример: -```sql +``` sql Distributed(logs, default, hits[, sharding_key]) ``` @@ -20,7 +19,7 @@ logs - имя кластера в конфигурационном файле с Кластеры задаются следующим образом: -```xml +``` xml @@ -60,16 +59,14 @@ logs - имя кластера в конфигурационном файле с Имя кластера не должно содержать точки. В качестве параметров для каждого сервера указываются `host`, `port` и, не обязательно, `user`, `password`, `secure`, `compression`: +- `host` - адрес удалённого сервера. Может быть указан домен, или IPv4 или IPv6 адрес. В случае указания домена, при старте сервера делается DNS запрос, и результат запоминается на всё время работы сервера. Если DNS запрос неуспешен, то сервер не запускается. Если вы изменяете DNS-запись, перезапустите сервер. +- `port` - TCP-порт для межсерверного взаимодействия (в конфиге - tcp\_port, обычно 9000). Не перепутайте с http\_port. +- `user` - имя пользователя для соединения с удалённым сервером. по умолчанию - default. Этот пользователь должен иметь доступ для соединения с указанным сервером. Доступы настраиваются в файле users.xml, подробнее смотрите в разделе [Права доступа](../../operations/access_rights.md). +- `password` - пароль для соединения с удалённым сервером, в открытом виде. по умолчанию - пустая строка. +- `secure` - Использовать шифрованное соединение ssl, Обычно используется с портом `port` = 9440. Сервер должен слушать порт 9440 с корректными настройками сертификатов. +- `compression` - Использовать сжатие данных. По умолчанию: true. -: - `host` - адрес удалённого сервера. Может быть указан домен, или IPv4 или IPv6 адрес. В случае указания домена, при старте сервера делается DNS запрос, и результат запоминается на всё время работы сервера. Если DNS запрос неуспешен, то сервер не запускается. Если вы изменяете DNS-запись, перезапустите сервер. - - `port` - TCP-порт для межсерверного взаимодействия (в конфиге - tcp_port, обычно 9000). Не перепутайте с http_port. - - `user` - имя пользователя для соединения с удалённым сервером. по умолчанию - default. Этот пользователь должен иметь доступ для соединения с указанным сервером. Доступы настраиваются в файле users.xml, подробнее смотрите в разделе [Права доступа](../../operations/access_rights.md). - - `password` - пароль для соединения с удалённым сервером, в открытом виде. по умолчанию - пустая строка. 
- - `secure` - Использовать шифрованное соединение ssl, Обычно используется с портом `port` = 9440. Сервер должен слушать порт 9440 с корректными настройками сертификатов. - - `compression` - Использовать сжатие данных. По умолчанию: true. - - -При указании реплик, для каждого из шардов, при чтении, будет выбрана одна из доступных реплик. Можно настроить алгоритм балансировки нагрузки (то есть, предпочтения, на какую из реплик идти) - см. настройку [load_balancing](../settings/settings.md#settings-load_balancing). +При указании реплик, для каждого из шардов, при чтении, будет выбрана одна из доступных реплик. Можно настроить алгоритм балансировки нагрузки (то есть, предпочтения, на какую из реплик идти) - см. настройку [load\_balancing](../settings/settings.md#settings-load_balancing). Если соединение с сервером не установлено, то будет произведена попытка соединения с небольшим таймаутом. Если соединиться не удалось, то будет выбрана следующая реплика, и так для всех реплик. Если попытка соединения для всех реплик не удалась, то будут снова произведены попытки соединения по кругу, и так несколько раз. Это работает в пользу отказоустойчивости, хотя и не обеспечивает полную отказоустойчивость: удалённый сервер может принять соединение, но не работать, или плохо работать. @@ -81,26 +78,26 @@ logs - имя кластера в конфигурационном файле с Движок Distributed позволяет работать с кластером, как с локальным сервером. При этом, кластер является неэластичным: вы должны прописать его конфигурацию в конфигурационный файл сервера (лучше всех серверов кластера). -Не поддерживаются Distributed таблицы, смотрящие на другие Distributed таблицы (за исключением случаев, когда у Distributed таблицы всего один шард). Вместо этого, сделайте так, чтобы Distributed таблица смотрела на "конечные" таблицы. +Не поддерживаются Distributed таблицы, смотрящие на другие Distributed таблицы (за исключением случаев, когда у Distributed таблицы всего один шард). Вместо этого, сделайте так, чтобы Distributed таблица смотрела на «конечные» таблицы. Как видно, движок Distributed требует прописывания кластера в конфигурационный файл; кластера из конфигурационного файла обновляются налету, без перезапуска сервера. Если вам необходимо каждый раз отправлять запрос на неизвестный набор шардов и реплик, вы можете не создавать Distributed таблицу, а воспользоваться табличной функцией remote. Смотрите раздел [Табличные функции](../../query_language/table_functions/index.md). Есть два способа записывать данные на кластер: -Во-первых, вы можете самостоятельно определять, на какие серверы какие данные записывать, и выполнять запись непосредственно на каждый шард. То есть, делать INSERT в те таблицы, на которые "смотрит" распределённая таблица. Это наиболее гибкое решение поскольку вы можете использовать любую схему шардирования, которая может быть нетривиальной из-за требований предметной области. +Во-первых, вы можете самостоятельно определять, на какие серверы какие данные записывать, и выполнять запись непосредственно на каждый шард. То есть, делать INSERT в те таблицы, на которые «смотрит» распределённая таблица. Это наиболее гибкое решение поскольку вы можете использовать любую схему шардирования, которая может быть нетривиальной из-за требований предметной области. Также это является наиболее оптимальным решением, так как данные могут записываться на разные шарды полностью независимо. Во-вторых, вы можете делать INSERT в Distributed таблицу. В этом случае, таблица будет сама распределять вставляемые данные по серверам. 
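Условный набросок второго способа (кластер `logs` взят из примера конфигурации выше; имена таблиц `hits_local` и `hits_all`, как и их структура, выбраны только для иллюстрации):

``` sql
-- локальная таблица; в реальности её нужно создать на каждом шарде кластера
CREATE TABLE hits_local (EventDate Date, UserID UInt64)
ENGINE = MergeTree ORDER BY (EventDate, UserID);

-- Distributed-таблица поверх локальной; ключ шардирования — последний параметр
CREATE TABLE hits_all AS hits_local
ENGINE = Distributed(logs, default, hits_local, intHash64(UserID));

-- запись идёт через Distributed-таблицу, строки сами распределяются по шардам
INSERT INTO hits_all VALUES ('2020-01-01', 1), ('2020-01-01', 2);
```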
Для того, чтобы писать в Distributed таблицу, у неё должен быть задан ключ шардирования (последний параметр). Также, если шард всего-лишь один, то запись работает и без указания ключа шардирования (так как в этом случае он не имеет смысла). -У каждого шарда в конфигурационном файле может быть задан "вес" (weight). По умолчанию, вес равен единице. Данные будут распределяться по шардам в количестве, пропорциональном весу шарда. Например, если есть два шарда, и у первого выставлен вес 9, а у второго 10, то на первый будет отправляться 9 / 19 доля строк, а на второй - 10 / 19. +У каждого шарда в конфигурационном файле может быть задан «вес» (weight). По умолчанию, вес равен единице. Данные будут распределяться по шардам в количестве, пропорциональном весу шарда. Например, если есть два шарда, и у первого выставлен вес 9, а у второго 10, то на первый будет отправляться 9 / 19 доля строк, а на второй - 10 / 19. -У каждого шарда в конфигурационном файле может быть указан параметр internal_replication. +У каждого шарда в конфигурационном файле может быть указан параметр internal\_replication. -Если он выставлен в true, то для записи будет выбираться первая живая реплика и данные будут писаться на неё. Этот вариант следует использовать, если Distributed таблица "смотрит" на реплицируемые таблицы. То есть, если таблица, в которую будут записаны данные, будет сама заниматься их репликацией. +Если он выставлен в true, то для записи будет выбираться первая живая реплика и данные будут писаться на неё. Этот вариант следует использовать, если Distributed таблица «смотрит» на реплицируемые таблицы. То есть, если таблица, в которую будут записаны данные, будет сама заниматься их репликацией. Если он выставлен в false (по умолчанию), то данные будут записываться на все реплики. По сути, это означает, что Distributed таблица занимается репликацией данных самостоятельно. Это хуже, чем использование реплицируемых таблиц, так как не контролируется консистентность реплик, и они со временем будут содержать немного разные данные. -Для выбора шарда, на который отправляется строка данных, вычисляется выражение шардирования, и берётся его остаток от деления на суммарный вес шардов. Строка отправляется на шард, соответствующий полуинтервалу остатков от prev_weights до prev_weights + weight, где prev_weights - сумма весов шардов с меньшим номером, а weight - вес этого шарда. Например, если есть два шарда, и у первого выставлен вес 9, а у второго 10, то строка будет отправляться на первый шард для остатков из диапазона \[0, 9), а на второй - для остатков из диапазона \[9, 19). +Для выбора шарда, на который отправляется строка данных, вычисляется выражение шардирования, и берётся его остаток от деления на суммарный вес шардов. Строка отправляется на шард, соответствующий полуинтервалу остатков от prev\_weights до prev\_weights + weight, где prev\_weights - сумма весов шардов с меньшим номером, а weight - вес этого шарда. Например, если есть два шарда, и у первого выставлен вес 9, а у второго 10, то строка будет отправляться на первый шард для остатков из диапазона \[0, 9), а на второй - для остатков из диапазона \[9, 19). Выражением шардирование может быть произвольное выражение от констант и столбцов таблицы, возвращающее целое число. Например, вы можете использовать выражение rand() для случайного распределения данных, или UserID - для распределения по остатку от деления идентификатора посетителя (тогда данные одного посетителя будут расположены на одном шарде, что упростит выполнение IN и JOIN по посетителям). 
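Небольшая иллюстрация для примера с весами 9 и 10 (запрос условный и служит только для наглядности; в качестве выражения шардирования взят UserID):

``` sql
SELECT
    UserID,
    UserID % 19 AS remainder,                  -- остаток от деления на суммарный вес (9 + 10)
    if(UserID % 19 < 9, 1, 2) AS shard_num     -- [0, 9) — первый шард, [9, 19) — второй
FROM
(
    SELECT arrayJoin([8, 9, 18, 19, 20]) AS UserID
)
```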
Если распределение какого-либо столбца недостаточно равномерное, вы можете обернуть его в хэш функцию: intHash64(UserID). @@ -110,12 +107,12 @@ logs - имя кластера в конфигурационном файле с Беспокоиться о схеме шардирования имеет смысл в следующих случаях: - используются запросы, требующие соединение данных (IN, JOIN) по определённому ключу - тогда если данные шардированы по этому ключу, то можно использовать локальные IN, JOIN вместо GLOBAL IN, GLOBAL JOIN, что кардинально более эффективно. -- используется большое количество серверов (сотни и больше) и большое количество маленьких запросов (запросы отдельных клиентов - сайтов, рекламодателей, партнёров) - тогда, для того, чтобы маленькие запросы не затрагивали весь кластер, имеет смысл располагать данные одного клиента на одном шарде, или (вариант, который используется в Яндекс.Метрике) сделать двухуровневое шардирование: разбить весь кластер на "слои", где слой может состоять из нескольких шардов; данные для одного клиента располагаются на одном слое, но в один слой можно по мере необходимости добавлять шарды, в рамках которых данные распределены произвольным образом; создаются распределённые таблицы на каждый слой и одна общая распределённая таблица для глобальных запросов. +- используется большое количество серверов (сотни и больше) и большое количество маленьких запросов (запросы отдельных клиентов - сайтов, рекламодателей, партнёров) - тогда, для того, чтобы маленькие запросы не затрагивали весь кластер, имеет смысл располагать данные одного клиента на одном шарде, или (вариант, который используется в Яндекс.Метрике) сделать двухуровневое шардирование: разбить весь кластер на «слои», где слой может состоять из нескольких шардов; данные для одного клиента располагаются на одном слое, но в один слой можно по мере необходимости добавлять шарды, в рамках которых данные распределены произвольным образом; создаются распределённые таблицы на каждый слой и одна общая распределённая таблица для глобальных запросов. -Запись данных осуществляется полностью асинхронно. При вставке в таблицу, блок данных сначала записывается в файловую систему. Затем, в фоновом режиме отправляются на удалённые серверы при первой возможности. Период отправки регулируется настройками [distributed_directory_monitor_sleep_time_ms](../settings/settings.md#distributed_directory_monitor_sleep_time_ms) и [distributed_directory_monitor_max_sleep_time_ms](../settings/settings.md#distributed_directory_monitor_max_sleep_time_ms). Движок таблиц `Distributed` отправляет каждый файл со вставленными данными отдельно, но можно включить пакетную отправку данных настройкой [distributed_directory_monitor_batch_inserts](../settings/settings.md#distributed_directory_monitor_batch_inserts). Эта настройка улучшает производительность кластера за счет более оптимального использования ресурсов сервера-отправителя и сети. Необходимо проверять, что данные отправлены успешно, для этого проверьте список файлов (данных, ожидающих отправки) в каталоге таблицы `/var/lib/clickhouse/data/database/table/`. +Запись данных осуществляется полностью асинхронно. При вставке в таблицу, блок данных сначала записывается в файловую систему. Затем, в фоновом режиме отправляются на удалённые серверы при первой возможности. 
Период отправки регулируется настройками [distributed\_directory\_monitor\_sleep\_time\_ms](../settings/settings.md#distributed_directory_monitor_sleep_time_ms) и [distributed\_directory\_monitor\_max\_sleep\_time\_ms](../settings/settings.md#distributed_directory_monitor_max_sleep_time_ms). Движок таблиц `Distributed` отправляет каждый файл со вставленными данными отдельно, но можно включить пакетную отправку данных настройкой [distributed\_directory\_monitor\_batch\_inserts](../settings/settings.md#distributed_directory_monitor_batch_inserts). Эта настройка улучшает производительность кластера за счет более оптимального использования ресурсов сервера-отправителя и сети. Необходимо проверять, что данные отправлены успешно, для этого проверьте список файлов (данных, ожидающих отправки) в каталоге таблицы `/var/lib/clickhouse/data/database/table/`. Если после INSERT-а в Distributed таблицу, сервер перестал существовать или был грубо перезапущен (например, в следствие аппаратного сбоя), то записанные данные могут быть потеряны. Если в директории таблицы обнаружен повреждённый кусок данных, то он переносится в поддиректорию broken и больше не используется. -При выставлении опции max_parallel_replicas выполнение запроса распараллеливается по всем репликам внутри одного шарда. Подробнее смотрите раздел [max_parallel_replicas](../settings/settings.md#settings-max_parallel_replicas). +При выставлении опции max\_parallel\_replicas выполнение запроса распараллеливается по всем репликам внутри одного шарда. Подробнее смотрите раздел [max\_parallel\_replicas](../settings/settings.md#settings-max_parallel_replicas). [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/distributed/) diff --git a/docs/ru/operations/table_engines/external_data.md b/docs/ru/operations/table_engines/external_data.md index f5ce57529ca..700a718ed0c 100644 --- a/docs/ru/operations/table_engines/external_data.md +++ b/docs/ru/operations/table_engines/external_data.md @@ -1,6 +1,6 @@ -# Внешние данные для обработки запроса +# Внешние данные для обработки запроса {#vneshnie-dannye-dlia-obrabotki-zaprosa} -ClickHouse позволяет отправить на сервер данные, необходимые для обработки одного запроса, вместе с запросом SELECT. Такие данные будут положены во временную таблицу (см. раздел "Временные таблицы") и смогут использоваться в запросе (например, в операторах IN). +ClickHouse позволяет отправить на сервер данные, необходимые для обработки одного запроса, вместе с запросом SELECT. Такие данные будут положены во временную таблицу (см. раздел «Временные таблицы») и смогут использоваться в запросе (например, в операторах IN). Для примера, если у вас есть текстовый файл с важными идентификаторами посетителей, вы можете загрузить его на сервер вместе с запросом, в котором используется фильтрация по этому списку. @@ -10,29 +10,29 @@ ClickHouse позволяет отправить на сервер данные, В клиенте командной строки, может быть указана секция параметров вида -```bash +``` bash --external --file=... [--name=...] [--format=...] [--types=...|--structure=...] ``` Таких секций может быть несколько - по числу передаваемых таблиц. -**--external** - маркер начала секции. -**--file** - путь к файлу с дампом таблицы, или -, что обозначает stdin. +**–external** - маркер начала секции. +**–file** - путь к файлу с дампом таблицы, или -, что обозначает stdin. Из stdin может быть считана только одна таблица. Следующие параметры не обязательные: -**--name** - имя таблицы. Если не указано - используется _data. 
-**--format** - формат данных в файле. Если не указано - используется TabSeparated. +**–name** - имя таблицы. Если не указано - используется \_data. +**–format** - формат данных в файле. Если не указано - используется TabSeparated. Должен быть указан один из следующих параметров: -**--types** - список типов столбцов через запятую. Например, `UInt64,String`. Столбцы будут названы _1, _2, ... -**--structure** - структура таблицы, в форме `UserID UInt64`, `URL String`. Определяет имена и типы столбцов. +**–types** - список типов столбцов через запятую. Например, `UInt64,String`. Столбцы будут названы \_1, \_2, … +**–structure** - структура таблицы, в форме `UserID UInt64`, `URL String`. Определяет имена и типы столбцов. Файлы, указанные в file, будут разобраны форматом, указанным в format, с использованием типов данных, указанных в types или structure. Таблица будет загружена на сервер, и доступна там в качестве временной таблицы с именем name. Примеры: -```bash +``` bash $ echo -ne "1\n2\n3\n" | clickhouse-client --query="SELECT count() FROM test.visits WHERE TraficSourceID IN _data" --external --file=- --types=Int8 849897 $ cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, count() AS c FROM passwd GROUP BY shell ORDER BY c DESC" --external --file=- --name=passwd --structure='login String, unused String, uid UInt16, gid UInt16, comment String, home String, shell String' @@ -43,11 +43,11 @@ $ cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, co /bin/sync 1 ``` -При использовании HTTP интерфейса, внешние данные передаются в формате multipart/form-data. Каждая таблица передаётся отдельным файлом. Имя таблицы берётся из имени файла. В query_string передаются параметры name_format, name_types, name_structure, где name - имя таблицы, которой соответствуют эти параметры. Смысл параметров такой же, как при использовании клиента командной строки. +При использовании HTTP интерфейса, внешние данные передаются в формате multipart/form-data. Каждая таблица передаётся отдельным файлом. Имя таблицы берётся из имени файла. В query\_string передаются параметры name\_format, name\_types, name\_structure, где name - имя таблицы, которой соответствуют эти параметры. Смысл параметров такой же, как при использовании клиента командной строки. Пример: -```bash +``` bash $ cat /etc/passwd | sed 's/:/\t/g' > passwd.tsv $ curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+count()+AS+c+FROM+passwd+GROUP+BY+shell+ORDER+BY+c+DESC&passwd_structure=login+String,+unused+String,+uid+UInt16,+gid+UInt16,+comment+String,+home+String,+shell+String' diff --git a/docs/ru/operations/table_engines/file.md b/docs/ru/operations/table_engines/file.md index 7391ff297dc..bfc2f0937ac 100644 --- a/docs/ru/operations/table_engines/file.md +++ b/docs/ru/operations/table_engines/file.md @@ -1,16 +1,16 @@ -# File(Format) {#table_engines-file} +# File(Format) {#table-engines-file} Управляет данными в одном файле на диске в указанном формате. Примеры применения: -- Выгрузка данных из ClickHouse в файл. -- Преобразование данных из одного формата в другой. -- Обновление данных в ClickHouse редактированием файла на диске. +- Выгрузка данных из ClickHouse в файл. +- Преобразование данных из одного формата в другой. +- Обновление данных в ClickHouse редактированием файла на диске. 
-## Использование движка в сервере ClickHouse +## Использование движка в сервере ClickHouse {#ispolzovanie-dvizhka-v-servere-clickhouse} -```sql +``` sql File(Format) ``` @@ -22,14 +22,14 @@ File(Format) Можно вручную создать в хранилище каталог таблицы, поместить туда файл, затем на сервере ClickHouse добавить ([ATTACH](../../query_language/misc.md)) информацию о таблице, соответствующей имени каталога и прочитать из файла данные. -!!! warning +!!! warning "Warning" Будьте аккуратны с этой функциональностью, поскольку сервер ClickHouse не отслеживает внешние изменения данных. Если в файл будет производиться запись одновременно со стороны сервера ClickHouse и с внешней стороны, то результат непредсказуем. **Пример:** **1.** Создадим на сервере таблицу `file_engine_table`: -```sql +``` sql CREATE TABLE file_engine_table (name String, value UInt32) ENGINE=File(TabSeparated) ``` @@ -37,43 +37,43 @@ CREATE TABLE file_engine_table (name String, value UInt32) ENGINE=File(TabSepara **2.** Вручную создадим файл `/var/lib/clickhouse/data/default/file_engine_table/data.TabSeparated` с содержимым: -```bash +``` bash $cat data.TabSeparated -one 1 -two 2 +one 1 +two 2 ``` **3.** Запросим данные: -```sql +``` sql SELECT * FROM file_engine_table ``` -```text +``` text ┌─name─┬─value─┐ │ one │ 1 │ │ two │ 2 │ └──────┴───────┘ ``` -## Использование движка в clickhouse-local +## Использование движка в clickhouse-local {#ispolzovanie-dvizhka-v-clickhouse-local} В [clickhouse-local](../utils/clickhouse-local.md) движок в качестве параметра принимает не только формат, но и путь к файлу. В том числе можно указать стандартные потоки ввода/вывода цифровым или буквенным обозначением `0` или `stdin`, `1` или `stdout`. **Пример:** -```bash +``` bash $ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin); SELECT a, b FROM table; DROP TABLE table" ``` -## Детали реализации +## Детали реализации {#detali-realizatsii} -- Поддерживается одновременное выполнение множества запросов `SELECT`, запросы `INSERT` могут выполняться только последовательно. -- Поддерживается создание ещё не существующего файла при запросе `INSERT`. -- Для существующих файлов `INSERT` записывает в конец файла. -- Не поддерживается: - - использование операций `ALTER` и `SELECT...SAMPLE`; - - индексы; - - репликация. +- Поддерживается одновременное выполнение множества запросов `SELECT`, запросы `INSERT` могут выполняться только последовательно. +- Поддерживается создание ещё не существующего файла при запросе `INSERT`. +- Для существующих файлов `INSERT` записывает в конец файла. +- Не поддерживается: + - использование операций `ALTER` и `SELECT...SAMPLE`; + - индексы; + - репликация. [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/file/) diff --git a/docs/ru/operations/table_engines/graphitemergetree.md b/docs/ru/operations/table_engines/graphitemergetree.md index e6ad1dcde10..c128da7ac02 100644 --- a/docs/ru/operations/table_engines/graphitemergetree.md +++ b/docs/ru/operations/table_engines/graphitemergetree.md @@ -8,7 +8,7 @@ ## Создание таблицы {#creating-table} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( Path String, @@ -27,32 +27,34 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] В таблице должны быть столбцы для следующих данных: -- Название метрики (сенсора Graphite). Тип данных: `String`. +- Название метрики (сенсора Graphite). Тип данных: `String`. -- Время измерения метрики. 
Тип данных `DateTime`. +- Время измерения метрики. Тип данных `DateTime`. -- Значение метрики. Тип данных: любой числовой. +- Значение метрики. Тип данных: любой числовой. -- Версия метрики. Тип данных: любой числовой. +- Версия метрики. Тип данных: любой числовой. - ClickHouse сохраняет строки с последней версией или последнюю записанную строку, если версии совпадают. Другие строки удаляются при слиянии кусков данных. + ClickHouse сохраняет строки с последней версией или последнюю записанную строку, если версии совпадают. Другие строки удаляются при слиянии кусков данных. Имена этих столбцов должны быть заданы в конфигурации rollup. **Параметры GraphiteMergeTree** -- `config_section` — имя раздела в конфигурационном файле, в котором находятся правила rollup. +- `config_section` — имя раздела в конфигурационном файле, в котором находятся правила rollup. **Секции запроса** При создании таблицы `GraphiteMergeTree` используются те же [секции](mergetree.md#table_engine-mergetree-creating-a-table) запроса, что и при создании таблицы `MergeTree`. -
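Условный пример таблицы с именами столбцов по умолчанию (имя таблицы и имя раздела конфигурации `graphite_rollup` выбраны только для иллюстрации):

``` sql
CREATE TABLE graphite_data
(
    Path String,        -- название метрики
    Time DateTime,      -- время измерения
    Value Float64,      -- значение метрики
    Timestamp UInt32    -- версия метрики
)
ENGINE = GraphiteMergeTree('graphite_rollup')
PARTITION BY toYYYYMM(Time)
ORDER BY (Path, Time)
```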
Устаревший способ создания таблицы
-!!! attention +Устаревший способ создания таблицы + +!!! attention "Attention" Не используйте этот способ в новых проектах и по возможности переведите старые проекты на способ описанный выше. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( EventDate Date, @@ -66,33 +68,33 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Все параметры, кроме `config_section` имеют то же значение, что в `MergeTree`. -- `config_section` — имя раздела в конфигурационном файле, в котором находятся правила rollup. +- `config_section` — имя раздела в конфигурационном файле, в котором находятся правила rollup.
## Конфигурация rollup {#rollup-configuration} -Настройки прореживания данных задаются параметром [graphite_rollup](../server_settings/settings.md#server_settings-graphite_rollup) в конфигурации сервера . Имя параметра может быть любым. Можно создать несколько конфигураций и использовать их для разных таблиц. +Настройки прореживания данных задаются параметром [graphite\_rollup](../server_settings/settings.md#server_settings-graphite_rollup) в конфигурации сервера . Имя параметра может быть любым. Можно создать несколько конфигураций и использовать их для разных таблиц. Структура конфигурации rollup: -```text +``` text required-columns patterns ``` ### Требуемые столбцы (required-columns) {#required-columns} -- `path_column_name` — столбец, в котором хранится название метрики (сенсор Graphite). Значение по умолчанию: `Path`. -- `time_column_name` — столбец, в котором хранится время измерения метрики. Значение по умолчанию: `Time`. -- `value_column_name` — столбец со значением метрики в момент времени, установленный в `time_column_name`. Значение по умолчанию: `Value`. -- `version_column_name` — столбец, в котором хранится версия метрики. Значение по умолчанию: `Timestamp`. +- `path_column_name` — столбец, в котором хранится название метрики (сенсор Graphite). Значение по умолчанию: `Path`. +- `time_column_name` — столбец, в котором хранится время измерения метрики. Значение по умолчанию: `Time`. +- `value_column_name` — столбец со значением метрики в момент времени, установленный в `time_column_name`. Значение по умолчанию: `Value`. +- `version_column_name` — столбец, в котором хранится версия метрики. Значение по умолчанию: `Timestamp`. ### Правила (patterns) {#patterns} Структура раздела `patterns`: -```text +``` text pattern regexp function @@ -116,22 +118,22 @@ default !!! warning "Внимание" Правила должны быть строго упорядочены: - 1. Правила без `function` или `retention`. - 1. Правила одновремено содержащие `function` и `retention`. - 1. Правило `default`. + 1. Правила без `function` или `retention`. + 1. Правила одновремено содержащие `function` и `retention`. + 1. Правило `default`. При обработке строки ClickHouse проверяет правила в разделе `pattern`. Каждый `pattern` (включая `default`) может содержать параметр агрегации `function`, параметр `retention`, или оба параметра одновременно. Если имя метрики соответствует шаблону `regexp`, то применяются правила `pattern`, в противном случае правило `default`. Поля для разделов `pattern` и `default`: -- `regexp` – шаблон имени метрики. -- `age` – минимальный возраст данных в секундах. -- `precision` – точность определения возраста данных в секундах. Должен быть делителем для 86400 (количество секунд в сутках). -- `function` – имя агрегирующей функции, которую следует применить к данным, чей возраст оказался в интервале `[age, age + precision]`. +- `regexp` – шаблон имени метрики. +- `age` – минимальный возраст данных в секундах. +- `precision` – точность определения возраста данных в секундах. Должен быть делителем для 86400 (количество секунд в сутках). +- `function` – имя агрегирующей функции, которую следует применить к данным, чей возраст оказался в интервале `[age, age + precision]`. 
### Пример конфигурации {#configuration-example} -```xml +``` xml Version diff --git a/docs/ru/operations/table_engines/hdfs.md b/docs/ru/operations/table_engines/hdfs.md index 2aad0902eb7..6d40e416a81 100644 --- a/docs/ru/operations/table_engines/hdfs.md +++ b/docs/ru/operations/table_engines/hdfs.md @@ -1,10 +1,10 @@ -# HDFS {#table_engines-hdfs} +# HDFS {#table-engines-hdfs} Управляет данными в HDFS. Данный движок похож на движки [File](file.md) и [URL](url.md). -## Использование движка +## Использование движка {#ispolzovanie-dvizhka} -```sql +``` sql ENGINE = HDFS(URI, format) ``` @@ -16,94 +16,97 @@ ENGINE = HDFS(URI, format) **1.** Создадим на сервере таблицу `hdfs_engine_table`: -```sql +``` sql CREATE TABLE hdfs_engine_table (name String, value UInt32) ENGINE=HDFS('hdfs://hdfs1:9000/other_storage', 'TSV') ``` **2.** Заполним файл: -```sql + +``` sql INSERT INTO hdfs_engine_table VALUES ('one', 1), ('two', 2), ('three', 3) ``` **3.** Запросим данные: -```sql +``` sql SELECT * FROM hdfs_engine_table LIMIT 2 ``` -```text +``` text ┌─name─┬─value─┐ │ one │ 1 │ │ two │ 2 │ └──────┴───────┘ ``` -## Детали реализации +## Детали реализации {#detali-realizatsii} -- Поддерживается многопоточное чтение и запись. -- Не поддерживается: - - использование операций `ALTER` и `SELECT...SAMPLE`; - - индексы; - - репликация. +- Поддерживается многопоточное чтение и запись. +- Не поддерживается: + - использование операций `ALTER` и `SELECT...SAMPLE`; + - индексы; + - репликация. **Шаблоны в пути** Шаблоны могут содержаться в нескольких компонентах пути. Обрабатываются только существующие файлы, название которых целиком удовлетворяет шаблону (не только суффиксом или префиксом). -- `*` — Заменяет любое количество любых символов кроме `/`, включая отсутствие символов. -- `?` — Заменяет ровно один любой символ. -- `{some_string,another_string,yet_another_one}` — Заменяет любую из строк `'some_string', 'another_string', 'yet_another_one'`. -- `{N..M}` — Заменяет любое число в интервале от `N` до `M` включительно. +- `*` — Заменяет любое количество любых символов кроме `/`, включая отсутствие символов. +- `?` — Заменяет ровно один любой символ. +- `{some_string,another_string,yet_another_one}` — Заменяет любую из строк `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Заменяет любое число в интервале от `N` до `M` включительно. Конструкция с `{}` аналогична табличной функции [remote](../../query_language/table_functions/remote.md). **Пример** -1. Предположим, у нас есть несколько файлов со следующими URI в HDFS: +1. Предположим, у нас есть несколько файлов со следующими URI в HDFS: -- 'hdfs://hdfs1:9000/some_dir/some_file_1' -- 'hdfs://hdfs1:9000/some_dir/some_file_2' -- 'hdfs://hdfs1:9000/some_dir/some_file_3' -- 'hdfs://hdfs1:9000/another_dir/some_file_1' -- 'hdfs://hdfs1:9000/another_dir/some_file_2' -- 'hdfs://hdfs1:9000/another_dir/some_file_3' +- ‘hdfs://hdfs1:9000/some\_dir/some\_file\_1’ +- ‘hdfs://hdfs1:9000/some\_dir/some\_file\_2’ +- ‘hdfs://hdfs1:9000/some\_dir/some\_file\_3’ +- ‘hdfs://hdfs1:9000/another\_dir/some\_file\_1’ +- ‘hdfs://hdfs1:9000/another\_dir/some\_file\_2’ +- ‘hdfs://hdfs1:9000/another\_dir/some\_file\_3’ -2. Есть несколько возможностей создать таблицу, состояющую из этих шести файлов: +1. 
Есть несколько возможностей создать таблицу, состояющую из этих шести файлов: -```sql + + +``` sql CREATE TABLE table_with_range (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV') ``` Другой способ: -```sql +``` sql CREATE TABLE table_with_question_mark (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_?', 'TSV') ``` Таблица, состоящая из всех файлов в обеих директориях (все файлы должны удовлетворять формату и схеме, указанной в запросе): -```sql +``` sql CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV') ``` -!!! warning +!!! warning "Warning" Если список файлов содержит числовые интервалы с ведущими нулями, используйте конструкцию с фигурными скобочками для каждой цифры или используйте `?`. **Example** -Создадим таблицу с именами `file000`, `file001`, ... , `file999`: +Создадим таблицу с именами `file000`, `file001`, … , `file999`: -```sql +``` sql CREARE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV') ``` -## Виртуальные столбцы +## Виртуальные столбцы {#virtualnye-stolbtsy} -- `_path` — Путь к файлу. -- `_file` — Имя файла. +- `_path` — Путь к файлу. +- `_file` — Имя файла. **Смотрите также** -- [Виртуальные столбцы](index.md#table_engines-virtual_columns) +- [Виртуальные столбцы](index.md#table_engines-virtual_columns) [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/hdfs/) diff --git a/docs/ru/operations/table_engines/index.md b/docs/ru/operations/table_engines/index.md index d22c88a1b04..1d5799700b9 100644 --- a/docs/ru/operations/table_engines/index.md +++ b/docs/ru/operations/table_engines/index.md @@ -1,69 +1,69 @@ -# Движки таблиц {#table_engines} +# Движки таблиц {#table-engines} Движок таблицы (тип таблицы) определяет: -- Как и где хранятся данные, куда их писать и откуда читать. -- Какие запросы поддерживаются и каким образом. -- Конкурентный доступ к данным. -- Использование индексов, если есть. -- Возможно ли многопоточное выполнение запроса. -- Параметры репликации данных. +- Как и где хранятся данные, куда их писать и откуда читать. +- Какие запросы поддерживаются и каким образом. +- Конкурентный доступ к данным. +- Использование индексов, если есть. +- Возможно ли многопоточное выполнение запроса. +- Параметры репликации данных. -## Семейства движков +## Семейства движков {#semeistva-dvizhkov} -### MergeTree +### MergeTree {#mergetree} -Наиболее универсальные и функциональные движки таблиц для задач с высокой загрузкой. Общим свойством этих движков является быстрая вставка данных с последующей фоновой обработкой данных. Движки `*MergeTree` поддерживают репликацию данных (в [Replicated*](replication.md) версиях движков), партиционирование, и другие возможности не поддержанные для других движков. +Наиболее универсальные и функциональные движки таблиц для задач с высокой загрузкой. Общим свойством этих движков является быстрая вставка данных с последующей фоновой обработкой данных. Движки `*MergeTree` поддерживают репликацию данных (в [Replicated\*](replication.md) версиях движков), партиционирование, и другие возможности не поддержанные для других движков. 
Движки семейства: -- [MergeTree](mergetree.md) -- [ReplacingMergeTree](replacingmergetree.md) -- [SummingMergeTree](summingmergetree.md) -- [AggregatingMergeTree](aggregatingmergetree.md) -- [CollapsingMergeTree](collapsingmergetree.md) -- [VersionedCollapsingMergeTree](versionedcollapsingmergetree.md) -- [GraphiteMergeTree](graphitemergetree.md) +- [MergeTree](mergetree.md) +- [ReplacingMergeTree](replacingmergetree.md) +- [SummingMergeTree](summingmergetree.md) +- [AggregatingMergeTree](aggregatingmergetree.md) +- [CollapsingMergeTree](collapsingmergetree.md) +- [VersionedCollapsingMergeTree](versionedcollapsingmergetree.md) +- [GraphiteMergeTree](graphitemergetree.md) -### Log +### Log {#log} Простые [движки](log_family.md) с минимальной функциональностью. Они наиболее эффективны, когда вам нужно быстро записать много небольших таблиц (до примерно 1 миллиона строк) и прочитать их позже целиком. Движки семейства: -- [TinyLog](tinylog.md) -- [StripeLog](stripelog.md) -- [Log](log.md) +- [TinyLog](tinylog.md) +- [StripeLog](stripelog.md) +- [Log](log.md) -### Движки для интергации +### Движки для интергации {#dvizhki-dlia-intergatsii} Движки для связи с другими системами хранения и обработки данных. Движки семейства: -- [Kafka](kafka.md) -- [MySQL](mysql.md) -- [ODBC](odbc.md) -- [JDBC](jdbc.md) +- [Kafka](kafka.md) +- [MySQL](mysql.md) +- [ODBC](odbc.md) +- [JDBC](jdbc.md) -### Специальные движки +### Специальные движки {#spetsialnye-dvizhki} Движки семейства: -- [Distributed](distributed.md) -- [MaterializedView](materializedview.md) -- [Dictionary](dictionary.md) -- [Merge](merge.md) -- [File](file.md) -- [Null](null.md) -- [Set](set.md) -- [Join](join.md) -- [URL](url.md) -- [View](view.md) -- [Memory](memory.md) -- [Buffer](buffer.md) +- [Distributed](distributed.md) +- [MaterializedView](materializedview.md) +- [Dictionary](dictionary.md) +- [Merge](merge.md) +- [File](file.md) +- [Null](null.md) +- [Set](set.md) +- [Join](join.md) +- [URL](url.md) +- [View](view.md) +- [Memory](memory.md) +- [Buffer](buffer.md) -## Виртуальные столбцы {#table_engines-virtual_columns} +## Виртуальные столбцы {#table-engines-virtual-columns} Виртуальный столбец — это неотъемлемый атрибут движка таблиц, определенный в исходном коде движка. @@ -74,4 +74,3 @@ При создании таблицы со столбцом, имя которого совпадает с именем одного из виртуальных столбцов таблицы, виртуальный столбец становится недоступным. Не делайте так. Чтобы помочь избежать конфликтов, имена виртуальных столбцов обычно предваряются подчеркиванием. [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/) - diff --git a/docs/ru/operations/table_engines/jdbc.md b/docs/ru/operations/table_engines/jdbc.md index 0bdf7c765ed..d9a66244849 100644 --- a/docs/ru/operations/table_engines/jdbc.md +++ b/docs/ru/operations/table_engines/jdbc.md @@ -1,4 +1,4 @@ -# JDBC +# JDBC {#jdbc} Позволяет ClickHouse подключаться к внешним базам данных с помощью [JDBC](https://en.wikipedia.org/wiki/Java_Database_Connectivity). @@ -6,28 +6,30 @@ Движок поддерживает тип данных [Nullable](../../data_types/nullable.md). -## Создание таблицы +## Создание таблицы {#sozdanie-tablitsy} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = JDBC(dbms_uri, external_database, external_table) ``` **Параметры движка** -- `dbms_uri` — URI внешней СУБД. +- `dbms_uri` — URI внешней СУБД. - Формат: `jdbc:://:/?user=&password=`. -Пример для MySQL: `jdbc:mysql://localhost:3306/?user=root&password=root`. 
+ Формат: `jdbc:://:/?user=&password=`. -- `external_database` — база данных во внешней СУБД. -- `external_table` — таблица в `external_database`. + Пример для MySQL: `jdbc:mysql://localhost:3306/?user=root&password=root`. -## Пример использования +- `external_database` — база данных во внешней СУБД. + +- `external_table` — таблица в `external_database`. + +## Пример использования {#primer-ispolzovaniia} Создадим таблицу в на сервере MySQL с помощью консольного клиента MySQL: -```text +``` text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, @@ -50,13 +52,15 @@ mysql> select * from test; Создадим таблицу на сервере ClickHouse и получим из неё данные: -```sql +``` sql CREATE TABLE jdbc_table ENGINE JDBC('jdbc:mysql://localhost:3306/?user=root&password=root', 'test', 'test') ``` -```sql + +``` sql DESCRIBE TABLE jdbc_table ``` -```text + +``` text ┌─name───────────────┬─type───────────────┬─default_type─┬─default_expression─┐ │ int_id │ Int32 │ │ │ │ int_nullable │ Nullable(Int32) │ │ │ @@ -64,18 +68,20 @@ DESCRIBE TABLE jdbc_table │ float_nullable │ Nullable(Float32) │ │ │ └────────────────────┴────────────────────┴──────────────┴────────────────────┘ ``` -```sql + +``` sql SELECT * FROM jdbc_table ``` -```text + +``` text ┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ │ 1 │ ᴺᵁᴸᴸ │ 2 │ ᴺᵁᴸᴸ │ └────────┴──────────────┴───────┴────────────────┘ ``` -## Смотрите также +## Смотрите также {#smotrite-takzhe} -- [Табличная функция JDBC](../../query_language/table_functions/jdbc.md). +- [Табличная функция JDBC](../../query_language/table_functions/jdbc.md). [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/jdbc/) diff --git a/docs/ru/operations/table_engines/join.md b/docs/ru/operations/table_engines/join.md index d5d81d89511..a9c06d05ebf 100644 --- a/docs/ru/operations/table_engines/join.md +++ b/docs/ru/operations/table_engines/join.md @@ -1,10 +1,10 @@ -# Join +# Join {#join} Подготовленная структура данных для использования в операциях [JOIN](../../query_language/select.md#select-join). ## Создание таблицы {#creating-a-table} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], @@ -16,41 +16,43 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **Параметры движка** -- `join_strictness` – [строгость JOIN](../../query_language/select.md#select-join-strictness). -- `join_type` – [тип JOIN](../../query_language/select.md#select-join-types). -- `k1[, k2, ...]` – ключевые столбцы секции `USING` с которыми выполняется операция `JOIN`. +- `join_strictness` – [строгость JOIN](../../query_language/select.md#select-join-strictness). +- `join_type` – [тип JOIN](../../query_language/select.md#select-join-types). +- `k1[, k2, ...]` – ключевые столбцы секции `USING` с которыми выполняется операция `JOIN`. Вводите параметры `join_strictness` и `join_type` без кавычек, например, `Join(ANY, LEFT, col1)`. Они должны быть такими же как и в той операции `JOIN`, в которой таблица будет использоваться. Если параметры не совпадают, ClickHouse не генерирует исключение и может возвращать неверные данные. 
-## Использование таблицы +## Использование таблицы {#ispolzovanie-tablitsy} -### Пример +### Пример {#primer} Создание левой таблицы: -```sql +``` sql CREATE TABLE id_val(`id` UInt32, `val` UInt32) ENGINE = TinyLog ``` -```sql + +``` sql INSERT INTO id_val VALUES (1,11)(2,12)(3,13) ``` Создание правой таблицы с движком `Join`: -```sql +``` sql CREATE TABLE id_val_join(`id` UInt32, `val` UInt8) ENGINE = Join(ANY, LEFT, id) ``` -```sql + +``` sql INSERT INTO id_val_join VALUES (1,21)(1,22)(3,23) ``` Объединение таблиц: -```sql +``` sql SELECT * FROM id_val ANY LEFT JOIN id_val_join USING (id) SETTINGS join_use_nulls = 1 ``` -```text +``` text ┌─id─┬─val─┬─id_val_join.val─┐ │ 1 │ 11 │ 21 │ │ 2 │ 12 │ ᴺᵁᴸᴸ │ @@ -60,40 +62,40 @@ SELECT * FROM id_val ANY LEFT JOIN id_val_join USING (id) SETTINGS join_use_null В качестве альтернативы, можно извлечь данные из таблицы `Join`, указав значение ключа объединения: -```sql +``` sql SELECT joinGet('id_val_join', 'val', toUInt32(1)) ``` -```text +``` text ┌─joinGet('id_val_join', 'val', toUInt32(1))─┐ │ 21 │ └────────────────────────────────────────────┘ ``` -### Выборка и вставка данных +### Выборка и вставка данных {#vyborka-i-vstavka-dannykh} Для добавления данных в таблицы с движком `Join` используйте запрос `INSERT`. Если таблица создавалась со строгостью `ANY`, то данные с повторяющимися ключами игнорируются. Если задавалась строгость `ALL`, то добавляются все строки. Из таблиц нельзя выбрать данные с помощью запроса `SELECT`. Вместо этого, используйте один из следующих методов: -- Используйте таблицу как правую в секции `JOIN`. -- Используйте функцию [joinGet](../../query_language/functions/other_functions.md#joinget), которая позволяет извлекать данные из таблицы таким же образом как из словаря. +- Используйте таблицу как правую в секции `JOIN`. +- Используйте функцию [joinGet](../../query_language/functions/other_functions.md#joinget), которая позволяет извлекать данные из таблицы таким же образом как из словаря. ### Ограничения и настройки {#join-limitations-and-settings} При создании таблицы, применяются следующие параметры : -- [join_use_nulls](../settings/settings.md#join_use_nulls) -- [max_rows_in_join](../settings/query_complexity.md#settings-max_rows_in_join) -- [max_bytes_in_join](../settings/query_complexity.md#settings-max_bytes_in_join) -- [join_overflow_mode](../settings/query_complexity.md#settings-join_overflow_mode) -- [join_any_take_last_row](../settings/settings.md#settings-join_any_take_last_row) +- [join\_use\_nulls](../settings/settings.md#join_use_nulls) +- [max\_rows\_in\_join](../settings/query_complexity.md#settings-max_rows_in_join) +- [max\_bytes\_in\_join](../settings/query_complexity.md#settings-max_bytes_in_join) +- [join\_overflow\_mode](../settings/query_complexity.md#settings-join_overflow_mode) +- [join\_any\_take\_last\_row](../settings/settings.md#settings-join_any_take_last_row) Таблицы с движком `Join` нельзя использовать в операциях `GLOBAL JOIN`. -Движок `Join` позволяет использовать параметр [join_use_nulls](../settings/settings.md#join_use_nulls) в запросе `CREATE TABLE`, который также можно использовать в запросе [SELECT](../../query_language/select.md). Если у вас разные настройки `join_use_nulls`, вы можете получить сообщение об ошибке при объединении таблиц. Это зависит от типа соединения. Когда вы используете функцию [joinGet](../../query_language/functions/other_functions.md#joinget), вам необходимо использовать один и тот же параметр `join_use_nulls` в запросах `CRATE TABLE` и `SELECT`. 
+Движок `Join` позволяет использовать параметр [join\_use\_nulls](../settings/settings.md#join_use_nulls) в запросе `CREATE TABLE`, который также можно использовать в запросе [SELECT](../../query_language/select.md). Если у вас разные настройки `join_use_nulls`, вы можете получить сообщение об ошибке при объединении таблиц. Это зависит от типа соединения. Когда вы используете функцию [joinGet](../../query_language/functions/other_functions.md#joinget), вам необходимо использовать один и тот же параметр `join_use_nulls` в запросах `CRATE TABLE` и `SELECT`. -## Хранение данных +## Хранение данных {#khranenie-dannykh} Данные таблиц `Join` всегда находятся в RAM. При вставке строк в таблицу ClickHouse записывает блоки данных в каталог на диске, чтобы их можно было восстановить при перезапуске сервера. diff --git a/docs/ru/operations/table_engines/kafka.md b/docs/ru/operations/table_engines/kafka.md index 90549f3afa1..6137b96dc43 100644 --- a/docs/ru/operations/table_engines/kafka.md +++ b/docs/ru/operations/table_engines/kafka.md @@ -1,16 +1,16 @@ -# Kafka +# Kafka {#kafka} Движок работает с [Apache Kafka](http://kafka.apache.org/). Kafka позволяет: -- Публиковать/подписываться на потоки данных. -- Организовать отказоустойчивое хранилище. -- Обрабатывать потоки по мере их появления. +- Публиковать/подписываться на потоки данных. +- Организовать отказоустойчивое хранилище. +- Обрабатывать потоки по мере их появления. -## Создание таблицы {#table_engine-kafka-creating-a-table} +## Создание таблицы {#table-engine-kafka-creating-a-table} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -30,21 +30,21 @@ SETTINGS Обязательные параметры: -- `kafka_broker_list` – перечень брокеров, разделенный запятыми (`localhost:9092`). -- `kafka_topic_list` – перечень необходимых топиков Kafka. -- `kafka_group_name` – группа потребителя Kafka. Отступы для чтения отслеживаются для каждой группы отдельно. Если необходимо, чтобы сообщения не повторялись на кластере, используйте везде одно имя группы. -- `kafka_format` – формат сообщений. Названия форматов должны быть теми же, что можно использовать в секции `FORMAT`, например, `JSONEachRow`. Подробнее читайте в разделе [Форматы](../../interfaces/formats.md). +- `kafka_broker_list` – перечень брокеров, разделенный запятыми (`localhost:9092`). +- `kafka_topic_list` – перечень необходимых топиков Kafka. +- `kafka_group_name` – группа потребителя Kafka. Отступы для чтения отслеживаются для каждой группы отдельно. Если необходимо, чтобы сообщения не повторялись на кластере, используйте везде одно имя группы. +- `kafka_format` – формат сообщений. Названия форматов должны быть теми же, что можно использовать в секции `FORMAT`, например, `JSONEachRow`. Подробнее читайте в разделе [Форматы](../../interfaces/formats.md). Опциональные параметры: -- `kafka_row_delimiter` – символ-разделитель записей (строк), которым завершается сообщение. -- `kafka_schema` – опциональный параметр, необходимый, если используется формат, требующий определения схемы. Например, [Cap'n Proto](https://capnproto.org/) требует путь к файлу со схемой и название корневого объекта `schema.capnp:Message`. -- `kafka_num_consumers` – количество потребителей (consumer) на таблицу. По умолчанию: `1`. Укажите больше потребителей, если пропускная способность одного потребителя недостаточна. 
Общее число потребителей не должно превышать количество партиций в топике, так как на одну партицию может быть назначено не более одного потребителя. -- `kafka_skip_broken_messages` – максимальное количество некорректных сообщений в блоке. Если `kafka_skip_broken_messages = N`, то движок отбрасывает `N` сообщений Кафки, которые не получилось обработать. Одно сообщение в точности соответствует одной записи (строке). Значение по умолчанию – 0. +- `kafka_row_delimiter` – символ-разделитель записей (строк), которым завершается сообщение. +- `kafka_schema` – опциональный параметр, необходимый, если используется формат, требующий определения схемы. Например, [Cap’n Proto](https://capnproto.org/) требует путь к файлу со схемой и название корневого объекта `schema.capnp:Message`. +- `kafka_num_consumers` – количество потребителей (consumer) на таблицу. По умолчанию: `1`. Укажите больше потребителей, если пропускная способность одного потребителя недостаточна. Общее число потребителей не должно превышать количество партиций в топике, так как на одну партицию может быть назначено не более одного потребителя. +- `kafka_skip_broken_messages` – максимальное количество некорректных сообщений в блоке. Если `kafka_skip_broken_messages = N`, то движок отбрасывает `N` сообщений Кафки, которые не получилось обработать. Одно сообщение в точности соответствует одной записи (строке). Значение по умолчанию – 0. Примеры -```sql +``` sql CREATE TABLE queue ( timestamp UInt64, level String, @@ -72,36 +72,38 @@ SETTINGS kafka_num_consumers = 4; ``` -
Устаревший способ создания таблицы +
-!!! attention +Устаревший способ создания таблицы + +!!! attention "Attention" Не используйте этот метод в новых проектах. По возможности переключите старые проекты на метод, описанный выше. -```sql +``` sql Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format [, kafka_row_delimiter, kafka_schema, kafka_num_consumers, kafka_skip_broken_messages]) ```
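Для наглядности — условный набросок (его нет в оригинальной статье): имя таблицы `queue_old` вымышленное, значения аргументов взяты из примеров выше, а комментарии показывают, каким настройкам нового синтаксиса соответствуют позиционные аргументы устаревшего.

``` sql
CREATE TABLE queue_old (timestamp UInt64, level String, message String)
    ENGINE = Kafka('localhost:9092', -- kafka_broker_list
                   'topic',          -- kafka_topic_list
                   'group1',         -- kafka_group_name
                   'JSONEachRow');   -- kafka_format
```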
-## Описание +## Описание {#opisanie} Полученные сообщения отслеживаются автоматически, поэтому из одной группы каждое сообщение считывается только один раз. Если необходимо получить данные дважды, то создайте копию таблицы с другим именем группы. -Группы пластичны и синхронизированы на кластере. Например, если есть 10 топиков и 5 копий таблицы в кластере, то в каждую копию попадет по 2 топика. Если количество копий изменится, то распределение топиков по копиям изменится автоматически. Подробно читайте об этом на [http://kafka.apache.org/intro](http://kafka.apache.org/intro). +Группы пластичны и синхронизированы на кластере. Например, если есть 10 топиков и 5 копий таблицы в кластере, то в каждую копию попадет по 2 топика. Если количество копий изменится, то распределение топиков по копиям изменится автоматически. Подробно читайте об этом на http://kafka.apache.org/intro. Чтение сообщения с помощью `SELECT` не слишком полезно (разве что для отладки), поскольку каждое сообщения может быть прочитано только один раз. Практичнее создавать потоки реального времени с помощью материализованных преставлений. Для этого: -1. Создайте потребителя Kafka с помощью движка и рассматривайте его как поток данных. -2. Создайте таблицу с необходимой структурой. -3. Создайте материализованное представление, которое преобразует данные от движка и помещает их в ранее созданную таблицу. +1. Создайте потребителя Kafka с помощью движка и рассматривайте его как поток данных. +2. Создайте таблицу с необходимой структурой. +3. Создайте материализованное представление, которое преобразует данные от движка и помещает их в ранее созданную таблицу. Когда к движку присоединяется материализованное представление (`MATERIALIZED VIEW`), оно начинает в фоновом режиме собирать данные. Это позволяет непрерывно получать сообщения от Kafka и преобразовывать их в необходимый формат с помощью `SELECT`. Материализованных представлений у одной kafka таблицы может быть сколько угодно, они не считывают данные из таблицы kafka непосредственно, а получают новые записи (блоками), таким образом можно писать в несколько таблиц с разным уровнем детализации (с группировкой - агрегацией и без). Пример: -```sql +``` sql CREATE TABLE queue ( timestamp UInt64, level String, @@ -121,22 +123,22 @@ Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format SELECT level, sum(total) FROM daily GROUP BY level; ``` -Для улучшения производительности полученные сообщения группируются в блоки размера [max_insert_block_size](../settings/settings.md#settings-max_insert_block_size). Если блок не удалось сформировать за [stream_flush_interval_ms](../settings/settings.md) миллисекунд, то данные будут сброшены в таблицу независимо от полноты блока. +Для улучшения производительности полученные сообщения группируются в блоки размера [max\_insert\_block\_size](../settings/settings.md#settings-max_insert_block_size). Если блок не удалось сформировать за [stream\_flush\_interval\_ms](../settings/settings.md) миллисекунд, то данные будут сброшены в таблицу независимо от полноты блока. Чтобы остановить получение данных топика или изменить логику преобразования, отсоедините материализованное представление: -```sql +``` sql DETACH TABLE consumer; ATTACH MATERIALIZED VIEW consumer; ``` Если необходимо изменить целевую таблицу с помощью `ALTER`, то материализованное представление рекомендуется отключить, чтобы избежать несостыковки между целевой таблицей и данными от представления. 
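Примерная последовательность действий для такого изменения (набросок, не из оригинальной статьи; имена `consumer` и `daily` взяты из примера выше, добавляемый столбец `browser` условный):

``` sql
-- Отключаем материализованное представление, чтобы остановить поток данных из Kafka.
DETACH TABLE consumer;

-- Меняем целевую таблицу (столбец browser — условный пример).
ALTER TABLE daily ADD COLUMN browser String;

-- Подключаем представление обратно; сбор данных продолжится в фоновом режиме.
ATTACH MATERIALIZED VIEW consumer;
```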
-## Конфигурация +## Конфигурация {#konfiguratsiia} Аналогично GraphiteMergeTree, движок Kafka поддерживает расширенную конфигурацию с помощью конфигурационного файла ClickHouse. Существует два конфигурационных ключа, которые можно использовать: глобальный (`kafka`) и по топикам (`kafka_topic_*`). Сначала применяется глобальная конфигурация, затем конфигурация по топикам (если она существует). -```xml +``` xml cgrp @@ -152,17 +154,16 @@ Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format В документе [librdkafka configuration reference](https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md) можно увидеть список возможных опций конфигурации. Используйте подчеркивание (`_`) вместо точки в конфигурации ClickHouse. Например, `check.crcs=true` будет соответствовать `true`. -## Виртуальные столбцы +## Виртуальные столбцы {#virtualnye-stolbtsy} -- `_topic` — топик Kafka. -- `_key` — ключ сообщения. -- `_offset` — оффсет сообщения. -- `_timestamp` — временная метка сообщения. -- `_partition` — секция топика Kafka. +- `_topic` — топик Kafka. +- `_key` — ключ сообщения. +- `_offset` — оффсет сообщения. +- `_timestamp` — временная метка сообщения. +- `_partition` — секция топика Kafka. **Смотрите также** -- [Виртуальные столбцы](index.md#table_engines-virtual_columns) +- [Виртуальные столбцы](index.md#table_engines-virtual_columns) [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/kafka/) - diff --git a/docs/ru/operations/table_engines/log.md b/docs/ru/operations/table_engines/log.md index c0e40f3ba38..9f112f3f6b9 100644 --- a/docs/ru/operations/table_engines/log.md +++ b/docs/ru/operations/table_engines/log.md @@ -1,8 +1,8 @@ -# Log +# Log {#log} Движок относится к семейству движков Log. Смотрите общие свойства и различия движков в статье [Семейство Log](log_family.md). -Отличается от [TinyLog](tinylog.md) тем, что вместе с файлами столбцов лежит небольшой файл "засечек". Засечки пишутся на каждый блок данных и содержат смещение - с какого места нужно читать файл, чтобы пропустить заданное количество строк. Это позволяет читать данные из таблицы в несколько потоков. +Отличается от [TinyLog](tinylog.md) тем, что вместе с файлами столбцов лежит небольшой файл «засечек». Засечки пишутся на каждый блок данных и содержат смещение - с какого места нужно читать файл, чтобы пропустить заданное количество строк. Это позволяет читать данные из таблицы в несколько потоков. При конкурентном доступе к данным, чтения могут выполняться одновременно, а записи блокируют чтения и друг друга. Движок Log не поддерживает индексы. Также, если при записи в таблицу произошёл сбой, то таблица станет битой, и чтения из неё будут возвращать ошибку. Движок Log подходит для временных данных, write-once таблиц, а также для тестовых и демонстрационных целей. diff --git a/docs/ru/operations/table_engines/log_family.md b/docs/ru/operations/table_engines/log_family.md index 01c8e6b8579..597d331981c 100644 --- a/docs/ru/operations/table_engines/log_family.md +++ b/docs/ru/operations/table_engines/log_family.md @@ -1,33 +1,36 @@ -#Семейство Log +# Семейство Log {#semeistvo-log} Движки разработаны для сценариев, когда необходимо быстро записывать много таблиц с небольшим объёмом данных (менее 1 миллиона строк), а затем читать их целиком. 
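Минимальный набросок такого сценария (пример не из оригинальной статьи, имена таблицы и столбцов условные):

``` sql
-- Небольшая таблица с движком Log: один раз записываем, затем читаем целиком.
CREATE TABLE log_example (ts DateTime, message String) ENGINE = Log;

INSERT INTO log_example VALUES ('2020-01-01 00:00:00', 'first'), ('2020-01-01 00:00:01', 'second');

SELECT * FROM log_example;
```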
Движки семейства: -- [StripeLog](stripelog.md) -- [Log](log.md) -- [TinyLog](tinylog.md) +- [StripeLog](stripelog.md) +- [Log](log.md) +- [TinyLog](tinylog.md) -## Общие свойства +## Общие свойства {#obshchie-svoistva} Движки: -- Хранят данные на диске. -- Добавляют данные в конец файла при записи. -- Поддерживают блокировки для конкурентного доступа к данным. +- Хранят данные на диске. - Во время запросов `INSERT` таблица блокируется, а другие запросы на чтение и запись ожидают разблокировки таблицы. Если запросов на запись данных нет, то можно выполнять любое количество конкуретных запросов на чтение. +- Добавляют данные в конец файла при записи. -- Не поддерживают операции [мутации](../../query_language/alter.md#alter-mutations). -- Не поддерживают индексы. +- Поддерживают блокировки для конкурентного доступа к данным. - Это означает, что запросы `SELECT` не эффективны для выборки диапазонов данных. + Во время запросов `INSERT` таблица блокируется, а другие запросы на чтение и запись ожидают разблокировки таблицы. Если запросов на запись данных нет, то можно выполнять любое количество конкуретных запросов на чтение. -- Записывают данные не атомарно. +- Не поддерживают операции [мутации](../../query_language/alter.md#alter-mutations). - Вы можете получить таблицу с повреждёнными данными, если что-то прервёт операцию записи (например, аварийное завершение работы сервера). +- Не поддерживают индексы. -## Отличия + Это означает, что запросы `SELECT` не эффективны для выборки диапазонов данных. + +- Записывают данные не атомарно. + + Вы можете получить таблицу с повреждёнными данными, если что-то прервёт операцию записи (например, аварийное завершение работы сервера). + +## Отличия {#otlichiia} Движок `TinyLog` самый простой в семье и обеспечивает самые низкие функциональность и эффективность. Движок `TinyLog` не поддерживает параллельного чтения данных в несколько потоков. Движок читает данные медленнее, чем оба других движка с параллельным чтением, и использует почти столько же дескрипторов, сколько и движок `Log`, поскольку хранит каждый столбец в отдельном файле. Его можно использовать в простых сценариях с низкой нагрузкой. diff --git a/docs/ru/operations/table_engines/materializedview.md b/docs/ru/operations/table_engines/materializedview.md index beee3adf48f..6bcc528782a 100644 --- a/docs/ru/operations/table_engines/materializedview.md +++ b/docs/ru/operations/table_engines/materializedview.md @@ -1,4 +1,4 @@ -# MaterializedView +# MaterializedView {#materializedview} Используется для реализации материализованных представлений (подробнее см. запрос [CREATE TABLE](../../query_language/create.md)). Для хранения данных, использует другой движок, который был указан при создании представления. При чтении из таблицы, просто использует этот движок. diff --git a/docs/ru/operations/table_engines/memory.md b/docs/ru/operations/table_engines/memory.md index 1fa92e636ab..6de7978d134 100644 --- a/docs/ru/operations/table_engines/memory.md +++ b/docs/ru/operations/table_engines/memory.md @@ -1,4 +1,4 @@ -# Memory +# Memory {#memory} Хранит данные в оперативке, в несжатом виде. Данные хранятся именно в таком виде, в каком они получаются при чтении. То есть, само чтение из этой таблицы полностью бесплатно. Конкурентный доступ к данным синхронизируется. Блокировки короткие: чтения и записи не блокируют друг друга. @@ -7,6 +7,6 @@ При перезапуске сервера данные из таблицы исчезают и таблица становится пустой. Обычно, использование этого движка таблиц является неоправданным. 
Тем не менее, он может использоваться для тестов, а также в задачах, где важно достичь максимальной скорости на не очень большом количестве строк (примерно до 100 000 000). -Движок Memory используется системой для временных таблиц - внешних данных запроса (смотрите раздел "Внешние данные для обработки запроса"), для реализации `GLOBAL IN` (смотрите раздел "Операторы IN"). +Движок Memory используется системой для временных таблиц - внешних данных запроса (смотрите раздел «Внешние данные для обработки запроса»), для реализации `GLOBAL IN` (смотрите раздел «Операторы IN»). [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/memory/) diff --git a/docs/ru/operations/table_engines/merge.md b/docs/ru/operations/table_engines/merge.md index 712a6379a74..4f11cf77352 100644 --- a/docs/ru/operations/table_engines/merge.md +++ b/docs/ru/operations/table_engines/merge.md @@ -1,5 +1,4 @@ - -# Merge +# Merge {#merge} Движок `Merge` (не путайте с движком `MergeTree`) не хранит данные самостоятельно, а позволяет читать одновременно из произвольного количества других таблиц. Чтение автоматически распараллеливается. Запись в таблицу не поддерживается. При чтении будут использованы индексы тех таблиц, из которых реально идёт чтение, если они существуют. @@ -7,16 +6,16 @@ Пример: -```sql +``` sql Merge(hits, '^WatchLog') ``` -Данные будут читаться из таблиц в базе `hits`, имена которых соответствуют регулярному выражению '`^WatchLog`'. +Данные будут читаться из таблиц в базе `hits`, имена которых соответствуют регулярному выражению ‘`^WatchLog`’. Вместо имени базы данных может использоваться константное выражение, возвращающее строку. Например, `currentDatabase()`. Регулярные выражения — [re2](https://github.com/google/re2) (поддерживает подмножество PCRE), регистрозависимые. -Смотрите замечание об экранировании в регулярных выражениях в разделе "match". +Смотрите замечание об экранировании в регулярных выражениях в разделе «match». При выборе таблиц для чтения, сама `Merge`-таблица не будет выбрана, даже если попадает под регулярное выражение, чтобы не возникло циклов. Впрочем, вы можете создать две `Merge`-таблицы, которые будут пытаться бесконечно читать данные друг друга, но делать этого не нужно. @@ -27,8 +26,8 @@ Merge(hits, '^WatchLog') Пусть есть старая таблица `WatchLog_old`. Необходимо изменить партиционирование без перемещения данных в новую таблицу `WatchLog_new`. При этом в выборке должны участвовать данные обеих таблиц. -```sql -CREATE TABLE WatchLog_old(date Date, UserId Int64, EventType String, Cnt UInt64) +``` sql +CREATE TABLE WatchLog_old(date Date, UserId Int64, EventType String, Cnt UInt64) ENGINE=MergeTree(date, (UserId, EventType), 8192); INSERT INTO WatchLog_old VALUES ('2018-01-01', 1, 'hit', 3); @@ -41,7 +40,8 @@ CREATE TABLE WatchLog as WatchLog_old ENGINE=Merge(currentDatabase(), '^WatchLog SELECT * FROM WatchLog ``` -```text + +``` text ┌───────date─┬─UserId─┬─EventType─┬─Cnt─┐ │ 2018-01-01 │ 1 │ hit │ 3 │ └────────────┴────────┴───────────┴─────┘ @@ -50,14 +50,14 @@ FROM WatchLog └────────────┴────────┴───────────┴─────┘ ``` -## Виртуальные столбцы +## Виртуальные столбцы {#virtualnye-stolbtsy} -- `_table` — содержит имя таблицы, из которой данные были прочитаны. Тип — [String](../../data_types/string.md). +- `_table` — содержит имя таблицы, из которой данные были прочитаны. Тип — [String](../../data_types/string.md). - В секции `WHERE/PREWHERE` можно установить константное условие на столбец `_table` (например, `WHERE _table='xyz'`). 
В этом случае операции чтения выполняются только для тех таблиц, для которых выполняется условие на значение `_table`, таким образом, столбец `_table` работает как индекс. + В секции `WHERE/PREWHERE` можно установить константное условие на столбец `_table` (например, `WHERE _table='xyz'`). В этом случае операции чтения выполняются только для тех таблиц, для которых выполняется условие на значение `_table`, таким образом, столбец `_table` работает как индекс. **Смотрите также** -- [Виртуальные столбцы](index.md#table_engines-virtual_columns) +- [Виртуальные столбцы](index.md#table_engines-virtual_columns) [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/merge/) diff --git a/docs/ru/operations/table_engines/mergetree.md b/docs/ru/operations/table_engines/mergetree.md index 2753156f8a8..65e0500b391 100644 --- a/docs/ru/operations/table_engines/mergetree.md +++ b/docs/ru/operations/table_engines/mergetree.md @@ -1,4 +1,4 @@ -# MergeTree {#table_engines-mergetree} +# MergeTree {#table-engines-mergetree} Движок `MergeTree`, а также другие движки этого семейства (`*MergeTree`) — это наиболее функциональные движки таблиц ClickHouse. @@ -6,28 +6,28 @@ Основные возможности: -- Хранит данные, отсортированные по первичному ключу. +- Хранит данные, отсортированные по первичному ключу. - Это позволяет создавать разреженный индекс небольшого объёма, который позволяет быстрее находить данные. + Это позволяет создавать разреженный индекс небольшого объёма, который позволяет быстрее находить данные. -- Позволяет оперировать партициями, если задан [ключ партиционирования](custom_partitioning_key.md). +- Позволяет оперировать партициями, если задан [ключ партиционирования](custom_partitioning_key.md). - ClickHouse поддерживает отдельные операции с партициями, которые работают эффективнее, чем общие операции с этим же результатом над этими же данными. Также, ClickHouse автоматически отсекает данные по партициям там, где ключ партиционирования указан в запросе. Это также увеличивает эффективность выполнения запросов. + ClickHouse поддерживает отдельные операции с партициями, которые работают эффективнее, чем общие операции с этим же результатом над этими же данными. Также, ClickHouse автоматически отсекает данные по партициям там, где ключ партиционирования указан в запросе. Это также увеличивает эффективность выполнения запросов. -- Поддерживает репликацию данных. +- Поддерживает репликацию данных. - Для этого используется семейство таблиц `ReplicatedMergeTree`. Подробнее читайте в разделе [Репликация данных](replication.md). + Для этого используется семейство таблиц `ReplicatedMergeTree`. Подробнее читайте в разделе [Репликация данных](replication.md). -- Поддерживает сэмплирование данных. +- Поддерживает сэмплирование данных. - При необходимости можно задать способ сэмплирования данных в таблице. + При необходимости можно задать способ сэмплирования данных в таблице. -!!! info +!!! info "Info" Движок [Merge](merge.md) не относится к семейству `*MergeTree`. -## Создание таблицы {#table_engine-mergetree-creating-a-table} +## Создание таблицы {#table-engine-mergetree-creating-a-table} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], @@ -46,54 +46,55 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Описание параметров смотрите в [описании запроса CREATE](../../query_language/create.md). -!!!note "Note" +!!! 
note "Note" `INDEX` — экспериментальная возможность, смотрите [Индексы пропуска данных](#table_engine-mergetree-data_skipping-indexes). ### Секции запроса {#mergetree-query-clauses} -- `ENGINE` — имя и параметры движка. `ENGINE = MergeTree()`. `MergeTree` не имеет параметров. +- `ENGINE` — имя и параметры движка. `ENGINE = MergeTree()`. `MergeTree` не имеет параметров. -- `PARTITION BY` — [ключ партиционирования](custom_partitioning_key.md). +- `PARTITION BY` — [ключ партиционирования](custom_partitioning_key.md). - Для партиционирования по месяцам используйте выражение `toYYYYMM(date_column)`, где `date_column` — столбец с датой типа [Date](../../data_types/date.md). В этом случае имена партиций имеют формат `"YYYYMM"`. + Для партиционирования по месяцам используйте выражение `toYYYYMM(date_column)`, где `date_column` — столбец с датой типа [Date](../../data_types/date.md). В этом случае имена партиций имеют формат `"YYYYMM"`. -- `ORDER BY` — ключ сортировки. +- `ORDER BY` — ключ сортировки. - Кортеж столбцов или произвольных выражений. Пример: `ORDER BY (CounterID, EventDate)`. + Кортеж столбцов или произвольных выражений. Пример: `ORDER BY (CounterID, EventDate)`. -- `PRIMARY KEY` — первичный ключ, если он [отличается от ключа сортировки](mergetree.md). +- `PRIMARY KEY` — первичный ключ, если он [отличается от ключа сортировки](mergetree.md). - По умолчанию первичный ключ совпадает с ключом сортировки (который задаётся секцией `ORDER BY`.) Поэтому в большинстве случаев секцию `PRIMARY KEY` отдельно указывать не нужно. + По умолчанию первичный ключ совпадает с ключом сортировки (который задаётся секцией `ORDER BY`.) Поэтому в большинстве случаев секцию `PRIMARY KEY` отдельно указывать не нужно. -- `SAMPLE BY` — выражение для сэмплирования. +- `SAMPLE BY` — выражение для сэмплирования. - Если используется выражение для сэмплирования, то первичный ключ должен содержать его. Пример: -`SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID))`. + Если используется выражение для сэмплирования, то первичный ключ должен содержать его. Пример: -- `TTL` — список правил, определяющих длительности хранения строк, а также задающих правила перемещения частей на определённые тома или диски. + `SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID))`. - Выражение должно возвращать столбец `Date` или `DateTime`. Пример: `TTL date + INTERVAL 1 DAY`. +- `TTL` — список правил, определяющих длительности хранения строк, а также задающих правила перемещения частей на определённые тома или диски. - Тип правила `DELETE|TO DISK 'xxx'|TO VOLUME 'xxx'` указывает действие, которое будет выполнено с частью, удаление строк (прореживание), перемещение (при выполнении условия для всех строк части) на определённый диск (`TO DISK 'xxx'`) или том (`TO VOLUME 'xxx'`). Поведение по умолчанию соответствует удалению строк (`DELETE`). В списке правил может быть указано только одно выражение с поведением `DELETE`. + Выражение должно возвращать столбец `Date` или `DateTime`. Пример: `TTL date + INTERVAL 1 DAY`. - Дополнительные сведения смотрите в разделе [TTL для столбцов и таблиц](#table_engine-mergetree-ttl) + Тип правила `DELETE|TO DISK 'xxx'|TO VOLUME 'xxx'` указывает действие, которое будет выполнено с частью, удаление строк (прореживание), перемещение (при выполнении условия для всех строк части) на определённый диск (`TO DISK 'xxx'`) или том (`TO VOLUME 'xxx'`). Поведение по умолчанию соответствует удалению строк (`DELETE`). 
В списке правил может быть указано только одно выражение с поведением `DELETE`. -- `SETTINGS` — дополнительные параметры, регулирующие поведение `MergeTree`: + Дополнительные сведения смотрите в разделе [TTL для столбцов и таблиц](#table_engine-mergetree-ttl) + +- `SETTINGS` — дополнительные параметры, регулирующие поведение `MergeTree`: + + - `index_granularity` — максимальное количество строк данных между засечками индекса. По умолчанию — 8192. Смотрите [Хранение данных](#mergetree-data-storage). + - `index_granularity_bytes` — максимальный размер гранул данных в байтах. По умолчанию — 10Mb. Чтобы ограничить размер гранул только количеством строк, установите значение 0 (не рекомендовано). Смотрите [Хранение данных](#mergetree-data-storage). + - `enable_mixed_granularity_parts` — включает или выключает переход к ограничению размера гранул с помощью настройки `index_granularity_bytes`. До версии 19.11, размер гранул ограничивался только настройкой `index_granularity`. Настройка `index_granularity_bytes` улучшает производительность ClickHouse при выборке данных из таблиц с большими (десятки и сотни мегабайтов) строками. Если у вас есть таблицы с большими строками, можно включить эту настройку, чтобы повысить эффективность запросов `SELECT`. + - `use_minimalistic_part_header_in_zookeeper` — Способ хранения заголовков кусков данных в ZooKeeper. Если `use_minimalistic_part_header_in_zookeeper = 1`, то ZooKeeper хранит меньше данных. Подробнее читайте в [описании настройки](../server_settings/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) в разделе "Конфигурационные параметры сервера". + - `min_merge_bytes_to_use_direct_io` — минимальный объём данных при слиянии, необходимый для прямого (небуферизованного) чтения/записи (direct I/O) на диск. При слиянии частей данных ClickHouse вычисляет общий объём хранения всех данных, подлежащих слиянию. Если общий объём хранения всех данных для чтения превышает `min_bytes_to_use_direct_io` байт, тогда ClickHouse использует флаг `O_DIRECT` при чтении данных с диска. Если `min_merge_bytes_to_use_direct_io = 0`, тогда прямой ввод-вывод отключен. Значение по умолчанию: `10 * 1024 * 1024 * 1024` байтов. + + - `merge_with_ttl_timeout` — минимальное время в секундах перед повторным слиянием с TTL. По умолчанию — 86400 (1 день). + - `write_final_mark` — включает или отключает запись последней засечки индекса в конце куска данных, указывающей за последний байт. По умолчанию — 1. Не отключайте её. + - `merge_max_block_size` — Максимальное количество строк в блоке для операций слияния. Значение по умолчанию: 8192. + - `storage_policy` — политика хранения данных. Смотрите [Хранение данных таблицы на нескольких блочных устройствах](#table_engine-mergetree-multiple-volumes). - - `index_granularity` — максимальное количество строк данных между засечками индекса. По умолчанию — 8192. Смотрите [Хранение данных](#mergetree-data-storage). - - `index_granularity_bytes` — максимальный размер гранул данных в байтах. По умолчанию — 10Mb. Чтобы ограничить размер гранул только количеством строк, установите значение 0 (не рекомендовано). Смотрите [Хранение данных](#mergetree-data-storage). - - `enable_mixed_granularity_parts` — включает или выключает переход к ограничению размера гранул с помощью настройки `index_granularity_bytes`. До версии 19.11, размер гранул ограничивался только настройкой `index_granularity`. 
Настройка `index_granularity_bytes` улучшает производительность ClickHouse при выборке данных из таблиц с большими (десятки и сотни мегабайтов) строками. Если у вас есть таблицы с большими строками, можно включить эту настройку, чтобы повысить эффективность запросов `SELECT`. - - `use_minimalistic_part_header_in_zookeeper` — Способ хранения заголовков кусков данных в ZooKeeper. Если `use_minimalistic_part_header_in_zookeeper = 1`, то ZooKeeper хранит меньше данных. Подробнее читайте в [описании настройки](../server_settings/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) в разделе "Конфигурационные параметры сервера". - - `min_merge_bytes_to_use_direct_io` — минимальный объём данных при слиянии, необходимый для прямого (небуферизованного) чтения/записи (direct I/O) на диск. При слиянии частей данных ClickHouse вычисляет общий объём хранения всех данных, подлежащих слиянию. Если общий объём хранения всех данных для чтения превышает `min_bytes_to_use_direct_io` байт, тогда ClickHouse использует флаг `O_DIRECT` при чтении данных с диска. Если `min_merge_bytes_to_use_direct_io = 0`, тогда прямой ввод-вывод отключен. Значение по умолчанию: `10 * 1024 * 1024 * 1024` байтов. - - - `merge_with_ttl_timeout` — минимальное время в секундах перед повторным слиянием с TTL. По умолчанию — 86400 (1 день). - - `write_final_mark` — включает или отключает запись последней засечки индекса в конце куска данных, указывающей за последний байт. По умолчанию — 1. Не отключайте её. - - `merge_max_block_size` — Максимальное количество строк в блоке для операций слияния. Значение по умолчанию: 8192. - - `storage_policy` — политика хранения данных. Смотрите [Хранение данных таблицы на нескольких блочных устройствах](#table_engine-mergetree-multiple-volumes). - **Пример задания секций** -```sql +``` sql ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity=8192 ``` @@ -103,12 +104,14 @@ ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDa `index_granularity` можно было не указывать, поскольку 8192 — это значение по умолчанию. -
Устаревший способ создания таблицы +
-!!! attention +Устаревший способ создания таблицы + +!!! attention "Attention" Не используйте этот способ в новых проектах и по возможности переведите старые проекты на способ, описанный выше. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -119,14 +122,14 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **Параметры MergeTree()** -- `date-column` — имя столбца с типом [Date](../../data_types/date.md). На основе этого столбца ClickHouse автоматически создаёт партиции по месяцам. Имена партиций имеют формат `"YYYYMM"`. -- `sampling_expression` — выражение для сэмплирования. -- `(primary, key)` — первичный ключ. Тип — [Tuple()](../../data_types/tuple.md) -- `index_granularity` — гранулярность индекса. Число строк данных между «засечками» индекса. Для большинства задач подходит значение 8192. +- `date-column` — имя столбца с типом [Date](../../data_types/date.md). На основе этого столбца ClickHouse автоматически создаёт партиции по месяцам. Имена партиций имеют формат `"YYYYMM"`. +- `sampling_expression` — выражение для сэмплирования. +- `(primary, key)` — первичный ключ. Тип — [Tuple()](../../data_types/tuple.md) +- `index_granularity` — гранулярность индекса. Число строк данных между «засечками» индекса. Для большинства задач подходит значение 8192. **Пример** -```sql +``` sql MergeTree(EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID)), 8192) ``` @@ -146,12 +149,11 @@ MergeTree(EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID) Размер гранул оганичен настройками движка `index_granularity` и `index_granularity_bytes`. Количество строк в грануле лежит в диапазоне `[1, index_granularity]`, в зависимости от размера строк. Размер гранулы может превышать `index_granularity_bytes` в том случае, когда размер единственной строки в грануле превышает значение настройки. В этом случае, размер гранулы равен размеру строки. - ## Первичные ключи и индексы в запросах {#primary-keys-and-indexes-in-queries} Рассмотрим первичный ключ — `(CounterID, Date)`. В этом случае сортировку и индекс можно проиллюстрировать следующим образом: -```text +``` text Whole data: [-------------------------------------------------------------------------] CounterID: [aaaaaaaaaaaaaaaaaabbbbcdeeeeeeeeeeeeefgggggggghhhhhhhhhiiiiiiiiikllllllll] Date: [1111111222222233331233211111222222333211111112122222223111112223311122333] @@ -162,9 +164,9 @@ Marks numbers: 0 1 2 3 4 5 6 7 8 Если в запросе к данным указать: -- `CounterID IN ('a', 'h')`, то сервер читает данные в диапазонах засечек `[0, 3)` и `[6, 8)`. -- `CounterID IN ('a', 'h') AND Date = 3`, то сервер читает данные в диапазонах засечек `[1, 3)` и `[7, 8)`. -- `Date = 3`, то сервер читает данные в диапазоне засечек `[1, 10]`. +- `CounterID IN ('a', 'h')`, то сервер читает данные в диапазонах засечек `[0, 3)` и `[6, 8)`. +- `CounterID IN ('a', 'h') AND Date = 3`, то сервер читает данные в диапазонах засечек `[1, 3)` и `[7, 8)`. +- `Date = 3`, то сервер читает данные в диапазоне засечек `[1, 10]`. Примеры выше показывают, что использование индекса всегда эффективнее, чем full scan. @@ -174,28 +176,28 @@ Marks numbers: 0 1 2 3 4 5 6 7 8 ClickHouse не требует уникального первичного ключа. Можно вставить много строк с одинаковым первичным ключом. -### Выбор первичного ключа +### Выбор первичного ключа {#vybor-pervichnogo-kliucha} Количество столбцов в первичном ключе не ограничено явным образом. 
В зависимости от структуры данных в первичный ключ можно включать больше или меньше столбцов. Это может: -- Увеличить эффективность индекса. +- Увеличить эффективность индекса. - Пусть первичный ключ — `(a, b)`, тогда добавление ещё одного столбца `c` повысит эффективность, если выполнены условия: + Пусть первичный ключ — `(a, b)`, тогда добавление ещё одного столбца `c` повысит эффективность, если выполнены условия: - - Есть запросы с условием на столбец `c`. - - Часто встречаются достаточно длинные (в несколько раз больше `index_granularity`) диапазоны данных с одинаковыми значениями `(a, b)`. Иначе говоря, когда добавление ещё одного столбца позволит пропускать достаточно длинные диапазоны данных. + - Есть запросы с условием на столбец `c`. + - Часто встречаются достаточно длинные (в несколько раз больше `index_granularity`) диапазоны данных с одинаковыми значениями `(a, b)`. Иначе говоря, когда добавление ещё одного столбца позволит пропускать достаточно длинные диапазоны данных. -- Улучшить сжатие данных. +- Улучшить сжатие данных. - ClickHouse сортирует данные по первичному ключу, поэтому чем выше однородность, тем лучше сжатие. + ClickHouse сортирует данные по первичному ключу, поэтому чем выше однородность, тем лучше сжатие. -- Обеспечить дополнительную логику при слиянии кусков данных в движках [CollapsingMergeTree](collapsingmergetree.md#table_engine-collapsingmergetree) и [SummingMergeTree](summingmergetree.md). +- Обеспечить дополнительную логику при слиянии кусков данных в движках [CollapsingMergeTree](collapsingmergetree.md#table_engine-collapsingmergetree) и [SummingMergeTree](summingmergetree.md). - В этом случае имеет смысл указать отдельный *ключ сортировки*, отличающийся от первичного ключа. + В этом случае имеет смысл указать отдельный *ключ сортировки*, отличающийся от первичного ключа. Длинный первичный ключ будет негативно влиять на производительность вставки и потребление памяти, однако на производительность ClickHouse при запросах `SELECT` лишние столбцы в первичном ключе не влияют. -### Первичный ключ, отличный от ключа сортировки +### Первичный ключ, отличный от ключа сортировки {#pervichnyi-kliuch-otlichnyi-ot-kliucha-sortirovki} Существует возможность задать первичный ключ (выражение, значения которого будут записаны в индексный файл для каждой засечки), отличный от ключа сортировки (выражение, по которому будут упорядочены строки в кусках @@ -214,7 +216,7 @@ ClickHouse не требует уникального первичного кл [ALTER ключа сортировки](../../query_language/alter.md) — лёгкая операция, так как при одновременном добавлении нового столбца в таблицу и ключ сортировки не нужно изменять данные кусков (они остаются упорядоченными и по новому выражению ключа). -### Использование индексов и партиций в запросах +### Использование индексов и партиций в запросах {#ispolzovanie-indeksov-i-partitsii-v-zaprosakh} Для запросов `SELECT` ClickHouse анализирует возможность использования индекса. Индекс может использоваться, если в секции `WHERE/PREWHERE`, в качестве одного из элементов конъюнкции, или целиком, есть выражение, представляющее операции сравнения на равенства, неравенства, а также `IN` или `LIKE` с фиксированным префиксом, над столбцами или выражениями, входящими в первичный ключ или ключ партиционирования, либо над некоторыми частично монотонными функциями от этих столбцов, а также логические связки над такими выражениями. 
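К подразделу выше о первичном ключе, отличном от ключа сортировки, — условный набросок (не из оригинальной статьи, имена таблицы и столбцов выбраны для примера): первичный ключ задаётся как префикс более длинного ключа сортировки.

``` sql
CREATE TABLE events
(
    CounterID UInt32,
    EventDate Date,
    UserID UInt64,
    Value UInt64
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(EventDate)
ORDER BY (CounterID, EventDate, UserID)
-- В индекс попадут только (CounterID, EventDate); строки при этом упорядочены по всем трём столбцам.
PRIMARY KEY (CounterID, EventDate);
```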
@@ -222,13 +224,13 @@ ClickHouse не требует уникального первичного кл Рассмотрим движок сконфигурированный следующим образом: -```sql +``` sql ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate) SETTINGS index_granularity=8192 ``` В этом случае в запросах: -```sql +``` sql SELECT count() FROM table WHERE EventDate = toDate(now()) AND CounterID = 34 SELECT count() FROM table WHERE EventDate = toDate(now()) AND (CounterID = 34 OR CounterID = 42) SELECT count() FROM table WHERE ((EventDate >= toDate('2014-01-01') AND EventDate <= toDate('2014-01-31')) OR EventDate = toDate('2014-05-01')) AND CounterID IN (101500, 731962, 160656) AND (CounterID = 101500 OR EventDate != toDate('2014-05-01')) @@ -240,15 +242,15 @@ ClickHouse будет использовать индекс по первичн В примере ниже индекс не может использоваться. -```sql +``` sql SELECT count() FROM table WHERE CounterID = 34 OR URL LIKE '%upyachka%' ``` -Чтобы проверить, сможет ли ClickHouse использовать индекс при выполнении запроса, используйте настройки [force_index_by_date](../settings/settings.md#settings-force_index_by_date) и [force_primary_key](../settings/settings.md#settings-force_primary_key). +Чтобы проверить, сможет ли ClickHouse использовать индекс при выполнении запроса, используйте настройки [force\_index\_by\_date](../settings/settings.md#settings-force_index_by_date) и [force\_primary\_key](../settings/settings.md#settings-force_primary_key). Ключ партиционирования по месяцам обеспечивает чтение только тех блоков данных, которые содержат даты из нужного диапазона. При этом блок данных может содержать данные за многие даты (до целого месяца). В пределах одного блока данные упорядочены по первичному ключу, который может не содержать дату в качестве первого столбца. В связи с этим, при использовании запроса с указанием условия только на дату, но не на префикс первичного ключа, будет читаться данных больше, чем за одну дату. -### Использование индекса для частично-монотонных первичных ключей +### Использование индекса для частично-монотонных первичных ключей {#ispolzovanie-indeksa-dlia-chastichno-monotonnykh-pervichnykh-kliuchei} Рассмотрим, например, дни месяца. Они образуют последовательность [монотонную](https://ru.wikipedia.org/wiki/Монотонная_последовательность) в течение одного месяца, но не монотонную на более длительных периодах. Это частично-монотонная последовательность. Если пользователь создаёт таблицу с частично-монотонным первичным ключом, ClickHouse как обычно создаёт разреженный индекс. Когда пользователь выбирает данные из такого рода таблиц, ClickHouse анализирует условия запроса. Если пользователь хочет получить данные между двумя метками индекса, и обе эти метки находятся внутри одного месяца, ClickHouse может использовать индекс в данном конкретном случае, поскольку он может рассчитать расстояние между параметрами запроса и индексными метками. @@ -256,11 +258,11 @@ ClickHouse не может использовать индекс, если зн ClickHouse использует эту логику не только для последовательностей дней месяца, но и для любого частично-монотонного первичного ключа. -### Индексы пропуска данных (экспериментальная функциональность) {#table_engine-mergetree-data_skipping-indexes} +### Индексы пропуска данных (экспериментальная функциональность) {#table-engine-mergetree-data-skipping-indexes} Объявление индексов при определении столбцов в запросе `CREATE`. -```sql +``` sql INDEX index_name expr TYPE type(...) GRANULARITY granularity_value ``` @@ -270,7 +272,7 @@ INDEX index_name expr TYPE type(...) 
GRANULARITY granularity_value **Пример** -```sql +``` sql CREATE TABLE table_name ( u64 UInt64, @@ -285,82 +287,81 @@ CREATE TABLE table_name Эти индексы смогут использоваться для оптимизации следующих запросов -```sql +``` sql SELECT count() FROM table WHERE s < 'z' SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 ``` -#### Доступные индексы +#### Доступные индексы {#dostupnye-indeksy} -- `minmax` — Хранит минимум и максимум выражения (если выражение - `tuple`, то для каждого элемента `tuple`), используя их для пропуска блоков аналогично первичному ключу. +- `minmax` — Хранит минимум и максимум выражения (если выражение - `tuple`, то для каждого элемента `tuple`), используя их для пропуска блоков аналогично первичному ключу. -- `set(max_rows)` — Хранит уникальные значения выражения на блоке в количестве не более `max_rows` (если `max_rows = 0`, то ограничений нет), используя их для пропуска блоков, оценивая выполнимость `WHERE` выражения на хранимых данных. +- `set(max_rows)` — Хранит уникальные значения выражения на блоке в количестве не более `max_rows` (если `max_rows = 0`, то ограничений нет), используя их для пропуска блоков, оценивая выполнимость `WHERE` выражения на хранимых данных. -- `bloom_filter([false_positive])` — [фильтр Блума](https://en.wikipedia.org/wiki/Bloom_filter) для указанных стоблцов. +- `bloom_filter([false_positive])` — [фильтр Блума](https://en.wikipedia.org/wiki/Bloom_filter) для указанных стоблцов. - Необязательный параметр `false_positive` — это вероятность получения ложноположительного срабатывания. Возможные значения: (0, 1). Значение по умолчанию: 0.025. + Необязательный параметр `false_positive` — это вероятность получения ложноположительного срабатывания. Возможные значения: (0, 1). Значение по умолчанию: 0.025. - Поддержанные типы данных: `Int*`, `UInt*`, `Float*`, `Enum`, `Date`, `DateTime`, `String`, `FixedString`. - - Фильтром могут пользоваться функции: [equals](../../query_language/functions/comparison_functions.md), [notEquals](../../query_language/functions/comparison_functions.md), [in](../../query_language/functions/in_functions.md), [notIn](../../query_language/functions/in_functions.md). + Поддержанные типы данных: `Int*`, `UInt*`, `Float*`, `Enum`, `Date`, `DateTime`, `String`, `FixedString`. + Фильтром могут пользоваться функции: [equals](../../query_language/functions/comparison_functions.md), [notEquals](../../query_language/functions/comparison_functions.md), [in](../../query_language/functions/in_functions.md), [notIn](../../query_language/functions/in_functions.md). **Примеры** -```sql +``` sql INDEX b (u64 * length(str), i32 + f64 * 100, date, str) TYPE minmax GRANULARITY 4 INDEX b (u64 * length(str), i32 + f64 * 100, date, str) TYPE set(100) GRANULARITY 4 ``` -#### Поддержка для функций +#### Поддержка для функций {#podderzhka-dlia-funktsii} Условия в секции `WHERE` содержат вызовы функций, оперирующих со столбцами. Если столбец - часть индекса, ClickHouse пытается использовать индекс при выполнении функции. Для разных видов индексов, ClickHouse поддерживает различные наборы функций, которые могут использоваться индексами. Индекс `set` используется со всеми функциями. Наборы функций для остальных индексов представлены в таблице ниже. 
-Function (operator) / Index | primary key | minmax | ngrambf_v1 | tokenbf_v1 | bloom_filter -----------------------------|-------------|--------|------------|------------|--------------- -[equals (=, ==)](../../query_language/functions/comparison_functions.md#function-equals) | ✔ | ✔ | ✔ | ✔ | ✔ -[notEquals(!=, <>)](../../query_language/functions/comparison_functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ -[like](../../query_language/functions/string_search_functions.md#function-like) | ✔ | ✔ | ✔ | ✗ | ✗ -[notLike](../../query_language/functions/string_search_functions.md#function-notlike) | ✔ | ✔ | ✔ | ✔ | ✗ -[startsWith](../../query_language/functions/string_functions.md#function-startswith) | ✔ | ✔ | ✔ | ✔ | ✗ -[endsWith](../../query_language/functions/string_functions.md#function-endswith) | ✗ | ✗ | ✔ | ✔ | ✗ -[multiSearchAny](../../query_language/functions/string_search_functions.md#function-multisearchany) | ✗ | ✗ | ✔ | ✔ | ✗ -[in](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ -[notIn](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ -[less (<)](../../query_language/functions/comparison_functions.md#function-less) | ✔ | ✔ | ✗ | ✗ | ✗ -[greater (>)](../../query_language/functions/comparison_functions.md#function-greater) | ✔ | ✔ | ✗ | ✗ | ✗ -[lessOrEquals (<=)](../../query_language/functions/comparison_functions.md#function-lessorequals) | ✔ | ✔ | ✗ | ✗ | ✗ -[greaterOrEquals (>=)](../../query_language/functions/comparison_functions.md#function-greaterorequals) | ✔ | ✔ | ✗ | ✗ | ✗ -[empty](../../query_language/functions/array_functions.md#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ -[notEmpty](../../query_language/functions/array_functions.md#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ -hasToken | ✗ | ✗ | ✗ | ✔ | ✗ +| Function (operator) / Index | primary key | minmax | ngrambf\_v1 | tokenbf\_v1 | bloom\_filter | +|----------------------------------------------------------------------------------------------------------|-------------|--------|-------------|-------------|---------------| +| [equals (=, ==)](../../query_language/functions/comparison_functions.md#function-equals) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [notEquals(!=, \<\>)](../../query_language/functions/comparison_functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [like](../../query_language/functions/string_search_functions.md#function-like) | ✔ | ✔ | ✔ | ✗ | ✗ | +| [notLike](../../query_language/functions/string_search_functions.md#function-notlike) | ✔ | ✔ | ✔ | ✔ | ✗ | +| [startsWith](../../query_language/functions/string_functions.md#startswith) | ✔ | ✔ | ✔ | ✔ | ✗ | +| [endsWith](../../query_language/functions/string_functions.md#endswith) | ✗ | ✗ | ✔ | ✔ | ✗ | +| [multiSearchAny](../../query_language/functions/string_search_functions.md#function-multisearchany) | ✗ | ✗ | ✔ | ✔ | ✗ | +| [in](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [notIn](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [less (\<)](../../query_language/functions/comparison_functions.md#function-less) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [greater (\>)](../../query_language/functions/comparison_functions.md#function-greater) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [lessOrEquals (\<=)](../../query_language/functions/comparison_functions.md#function-lessorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [greaterOrEquals (\>=)](../../query_language/functions/comparison_functions.md#function-greaterorequals) | ✔ | ✔ | ✗ | ✗ | ✗ | +| 
[empty](../../query_language/functions/array_functions.md#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ | +| [notEmpty](../../query_language/functions/array_functions.md#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ | +| hasToken | ✗ | ✗ | ✗ | ✔ | ✗ | Функции с постоянным агрументом, который меньше, чем размер ngram не могут использовать индекс `ngrambf_v1` для оптимизации запроса. Фильтры Блума могут иметь ложнопозитивные срабатывания, следовательно индексы `ngrambf_v1`, `tokenbf_v1` и `bloom_filter` невозможно использовать для оптимизации запросов, в которых результат функции предполается false, например: -- Можно оптимизировать: - - `s LIKE '%test%'` - - `NOT s NOT LIKE '%test%'` - - `s = 1` - - `NOT s != 1` - - `startsWith(s, 'test')` -- Нельзя оптимизировать: - - `NOT s LIKE '%test%'` - - `s NOT LIKE '%test%'` - - `NOT s = 1` - - `s != 1` - - `NOT startsWith(s, 'test')` +- Можно оптимизировать: + - `s LIKE '%test%'` + - `NOT s NOT LIKE '%test%'` + - `s = 1` + - `NOT s != 1` + - `startsWith(s, 'test')` +- Нельзя оптимизировать: + - `NOT s LIKE '%test%'` + - `s NOT LIKE '%test%'` + - `NOT s = 1` + - `s != 1` + - `NOT startsWith(s, 'test')` -## Конкурентный доступ к данным +## Конкурентный доступ к данным {#konkurentnyi-dostup-k-dannym} Для конкурентного доступа к таблице используется мультиверсионность. То есть, при одновременном чтении и обновлении таблицы, данные будут читаться из набора кусочков, актуального на момент запроса. Длинных блокировок нет. Вставки никак не мешают чтениям. Чтения из таблицы автоматически распараллеливаются. -## TTL для столбцов и таблиц {#table_engine-mergetree-ttl} +## TTL для столбцов и таблиц {#table-engine-mergetree-ttl} Определяет время жизни значений, а также правила перемещения данных на другой диск или том. @@ -370,14 +371,14 @@ hasToken | ✗ | ✗ | ✗ | ✔ | ✗ Для задания времени жизни столбца, например: -```sql +``` sql TTL time_column TTL time_column + interval ``` Чтобы задать `interval`, используйте операторы [интервала времени](../../query_language/operators.md#operators-datetime). -```sql +``` sql TTL date_time + INTERVAL 1 MONTH TTL date_time + INTERVAL 15 HOUR ``` @@ -392,7 +393,7 @@ TTL date_time + INTERVAL 15 HOUR Создание таблицы с TTL -```sql +``` sql CREATE TABLE example_table ( d DateTime, @@ -407,7 +408,7 @@ ORDER BY d; Добавление TTL на колонку существующей таблицы -```sql +``` sql ALTER TABLE example_table MODIFY COLUMN c String TTL d + INTERVAL 1 DAY; @@ -415,7 +416,7 @@ ALTER TABLE example_table Изменение TTL у колонки -```sql +``` sql ALTER TABLE example_table MODIFY COLUMN c String TTL d + INTERVAL 1 MONTH; @@ -425,19 +426,19 @@ ALTER TABLE example_table Для таблицы можно задать одно выражение для устаревания данных, а также несколько выражений, по срабатывании которых данные переместятся на [некоторый диск или том](#table_engine-mergetree-multiple-volumes). Когда некоторые данные в таблице устаревают, ClickHouse удаляет все соответствующие строки. -```sql +``` sql TTL expr [DELETE|TO DISK 'aaa'|TO VOLUME 'bbb'], ... ``` За каждым TTL выражением может следовать тип действия, которое выполняется после достижения времени, соответствующего результату TTL выражения: -- `DELETE` - удалить данные (действие по умолчанию); -- `TO DISK 'aaa'` - переместить данные на диск `aaa`; -- `TO VOLUME 'bbb'` - переместить данные на том `bbb`. +- `DELETE` - удалить данные (действие по умолчанию); +- `TO DISK 'aaa'` - переместить данные на диск `aaa`; +- `TO VOLUME 'bbb'` - переместить данные на том `bbb`. 
Примеры: -```sql +``` sql CREATE TABLE example_table ( d DateTime, @@ -453,7 +454,7 @@ TTL d + INTERVAL 1 MONTH [DELETE], Изменение TTL -```sql +``` sql ALTER TABLE example_table MODIFY TTL d + INTERVAL 1 DAY; ``` @@ -462,34 +463,34 @@ ALTER TABLE example_table Данные с истекшим TTL удаляются, когда ClickHouse мёржит куски данных. -Когда ClickHouse видит, что некоторые данные устарели, он выполняет внеплановые мёржи. Для управление частотой подобных мёржей, можно задать настройку [merge_with_ttl_timeout](#mergetree_setting-merge_with_ttl_timeout). Если её значение слишком низкое, придется выполнять много внеплановых мёржей, которые могут начать потреблять значительную долю ресурсов сервера. +Когда ClickHouse видит, что некоторые данные устарели, он выполняет внеплановые мёржи. Для управление частотой подобных мёржей, можно задать настройку [merge\_with\_ttl\_timeout](#mergetree_setting-merge_with_ttl_timeout). Если её значение слишком низкое, придется выполнять много внеплановых мёржей, которые могут начать потреблять значительную долю ресурсов сервера. Если вы выполните запрос `SELECT` между слияниями вы можете получить устаревшие данные. Чтобы избежать этого используйте запрос [OPTIMIZE](../../query_language/misc.md#misc_operations-optimize) перед `SELECT`. -## Хранение данных таблицы на нескольких блочных устройствах {#table_engine-mergetree-multiple-volumes} +## Хранение данных таблицы на нескольких блочных устройствах {#table-engine-mergetree-multiple-volumes} -### Введение +### Введение {#vvedenie} -Движки таблиц семейства `MergeTree` могут хранить данные на нескольких блочных устройствах. Это может оказаться полезным, например, при неявном разделении данных одной таблицы на "горячие" и "холодные". Наиболее свежая часть занимает малый объём и запрашивается регулярно, а большой хвост исторических данных запрашивается редко. При наличии в системе нескольких дисков, "горячая" часть данных может быть размещена на быстрых дисках (например, на NVMe SSD или в памяти), а холодная на более медленных (например, HDD). +Движки таблиц семейства `MergeTree` могут хранить данные на нескольких блочных устройствах. Это может оказаться полезным, например, при неявном разделении данных одной таблицы на «горячие» и «холодные». Наиболее свежая часть занимает малый объём и запрашивается регулярно, а большой хвост исторических данных запрашивается редко. При наличии в системе нескольких дисков, «горячая» часть данных может быть размещена на быстрых дисках (например, на NVMe SSD или в памяти), а холодная на более медленных (например, HDD). Минимальной перемещаемой единицей для `MergeTree` является кусок данных (data part). Данные одного куска могут находится только на одном диске. Куски могут перемещаться между дисками в фоне, согласно пользовательским настройкам, а также с помощью запросов [ALTER](../../query_language/alter.md#alter_move-partition). -### Термины +### Термины {#terminy} -- Диск — примонтированное в файловой системе блочное устройство. -- Диск по умолчанию — диск, на котором находится путь, указанный в конфигурационной настройке сервера [path](../server_settings/settings.md#server_settings-path). -- Том (Volume) — упорядоченный набор равноценных дисков (схоже с [JBOD](https://ru.wikipedia.org/wiki/JBOD)) -- Политика хранения (StoragePolicy) — множество томов с правилами перемещения данных между ними. +- Диск — примонтированное в файловой системе блочное устройство. 
+- Диск по умолчанию — диск, на котором находится путь, указанный в конфигурационной настройке сервера [path](../server_settings/settings.md#server_settings-path). +- Том (Volume) — упорядоченный набор равноценных дисков (схоже с [JBOD](https://ru.wikipedia.org/wiki/JBOD)) +- Политика хранения (StoragePolicy) — множество томов с правилами перемещения данных между ними. -У всех описанных сущностей при создании указываются имена, можно найти в системных таблицах [system.storage_policies](../system_tables.md#system_tables-storage_policies) и [system.disks](../system_tables.md#system_tables-disks). Имя политики хранения можно указать в настройке `storage_policy` движков таблиц семейства `MergeTree`. +У всех описанных сущностей при создании указываются имена, можно найти в системных таблицах [system.storage\_policies](../system_tables.md#system_tables-storage_policies) и [system.disks](../system_tables.md#system_tables-disks). Имя политики хранения можно указать в настройке `storage_policy` движков таблиц семейства `MergeTree`. -### Конфигурация {#table_engine-mergetree-multiple-volumes_configure} +### Конфигурация {#table-engine-mergetree-multiple-volumes-configure} -Диски, тома и политики хранения задаются внутри тега `` в основном файле `config.xml` или в отдельном файле в директории `config.d`. +Диски, тома и политики хранения задаются внутри тега `` в основном файле `config.xml` или в отдельном файле в директории `config.d`. Структура конфигурации: -```xml +``` xml @@ -506,22 +507,22 @@ ALTER TABLE example_table ... - + ... ``` Теги: -- `` — имя диска. Имена должны быть разными для всех дисков. -- `path` — путь по которому будут храниться данные сервера (каталоги `data` и `shadow`), должен быть терминирован `/`. -- `keep_free_space_bytes` — размер зарезервированного свободного места на диске. +- `` — имя диска. Имена должны быть разными для всех дисков. +- `path` — путь по которому будут храниться данные сервера (каталоги `data` и `shadow`), должен быть терминирован `/`. +- `keep_free_space_bytes` — размер зарезервированного свободного места на диске. Порядок задания дисков не имеет значения. Общий вид конфигурации политик хранения: -```xml +``` xml ... @@ -550,15 +551,15 @@ ALTER TABLE example_table Тэги: -- `policy_name_N` — название политики. Названия политик должны быть уникальны. -- `volume_name_N` — название тома. Названия томов должны быть уникальны. -- `disk` — диск, находящийся внутри тома. -- `max_data_part_size_bytes` — максимальный размер куска данных, который может находится на любом из дисков этого тома. -- `move_factor` — доля свободного места, при превышении которого данные начинают перемещаться на следующий том, если он есть (по умолчанию 0.1). +- `policy_name_N` — название политики. Названия политик должны быть уникальны. +- `volume_name_N` — название тома. Названия томов должны быть уникальны. +- `disk` — диск, находящийся внутри тома. +- `max_data_part_size_bytes` — максимальный размер куска данных, который может находится на любом из дисков этого тома. +- `move_factor` — доля свободного места, при превышении которого данные начинают перемещаться на следующий том, если он есть (по умолчанию 0.1). Примеры конфигураций: -```xml +``` xml ... @@ -579,7 +580,7 @@ ALTER TABLE example_table disk1 - + 0.2 @@ -590,13 +591,13 @@ ALTER TABLE example_table В приведенном примере, политика `hdd_in_order` реализует прицип [round-robin](https://ru.wikipedia.org/wiki/Round-robin_(%D0%B0%D0%BB%D0%B3%D0%BE%D1%80%D0%B8%D1%82%D0%BC)). 
Так как в политике есть всего один том (`single`), то все записи производятся на его диски по круговому циклу. Такая политика может быть полезна при наличии в системе нескольких похожих дисков, но при этом не сконфигурирован RAID. Учтите, что каждый отдельный диск ненадёжен и чтобы не потерять важные данные это необходимо скомпенсировать за счет хранения данных в трёх копиях. -Если система содержит диски различных типов, то может пригодиться политика `moving_from_ssd_to_hdd`. В томе `hot` находится один SSD-диск (`fast_ssd`), а также задается ограничение на максимальный размер куска, который может храниться на этом томе (1GB). Все куски такой таблицы больше 1GB будут записываться сразу на том `cold`, в котором содержится один HDD-диск `disk1`. Также, при заполнении диска `fast_ssd` более чем на 80% данные будут переносится на диск `disk1` фоновым процессом. +Если система содержит диски различных типов, то может пригодиться политика `moving_from_ssd_to_hdd`. В томе `hot` находится один SSD-диск (`fast_ssd`), а также задается ограничение на максимальный размер куска, который может храниться на этом томе (1GB). Все куски такой таблицы больше 1GB будут записываться сразу на том `cold`, в котором содержится один HDD-диск `disk1`. Также, при заполнении диска `fast_ssd` более чем на 80% данные будут переносится на диск `disk1` фоновым процессом. Порядок томов в политиках хранения важен, при достижении условий на переполнение тома данные переносятся на следующий. Порядок дисков в томах так же важен, данные пишутся по очереди на каждый из них. После задания конфигурации политик хранения их можно использовать, как настройку при создании таблиц: -```sql +``` sql CREATE TABLE table_with_non_default_policy ( EventDate Date, OrderID UInt64, @@ -610,25 +611,25 @@ SETTINGS storage_policy = 'moving_from_ssd_to_hdd' По умолчанию используется политика хранения `default` в которой есть один том и один диск, указанный в ``. В данный момент менять политику хранения после создания таблицы нельзя. -### Особенности работы +### Особенности работы {#osobennosti-raboty} В таблицах `MergeTree` данные попадают на диск несколькими способами: -- В результате вставки (запрос `INSERT`). -- В фоновых операциях слияний и [мутаций](../../query_language/alter.md#alter-mutations). -- При скачивании данных с другой реплики. -- В результате заморозки партиций [ALTER TABLE ... FREEZE PARTITION](../../query_language/alter.md#alter_freeze-partition). +- В результате вставки (запрос `INSERT`). +- В фоновых операциях слияний и [мутаций](../../query_language/alter.md#alter-mutations). +- При скачивании данных с другой реплики. +- В результате заморозки партиций [ALTER TABLE … FREEZE PARTITION](../../query_language/alter.md#alter_freeze-partition). Во всех случаях, кроме мутаций и заморозки партиций, при записи куска выбирается том и диск в соответствии с указанной конфигурацией хранилища: -1. Выбирается первый по порядку том, на котором есть свободное место для записи куска (`unreserved_space > current_part_size`) и который позволяет записывать куски требуемого размера `max_data_part_size_bytes > current_part_size`. -2. Внутри тома выбирается следующий диск после того, на который была предыдущая запись и на котором свободного места больше чем размер куска (`unreserved_space - keep_free_space_bytes > current_part_size`) +1. 
Выбирается первый по порядку том, на котором есть свободное место для записи куска (`unreserved_space > current_part_size`) и который позволяет записывать куски требуемого размера `max_data_part_size_bytes > current_part_size`. +2. Внутри тома выбирается следующий диск после того, на который была предыдущая запись и на котором свободного места больше чем размер куска (`unreserved_space - keep_free_space_bytes > current_part_size`) Мутации и запросы заморозки партиций в реализации используют [жесткие ссылки](https://ru.wikipedia.org/wiki/%D0%96%D1%91%D1%81%D1%82%D0%BA%D0%B0%D1%8F_%D1%81%D1%81%D1%8B%D0%BB%D0%BA%D0%B0). Жесткие ссылки между различными дисками не поддерживаются, поэтому в случае таких операций куски размещаются на тех же дисках, что и исходные. -В фоне куски перемещаются между томами на основе информации о занятом месте (настройка `move_factor`) по порядку, в котором указаны тома в конфигурации. Данные никогда не перемещаются с последнего тома и на первый том. Следить за фоновыми перемещениями можно с помощью системных таблиц [system.part_log](../system_tables.md#system_tables-part-log) (поле `type = MOVE_PART`) и [system.parts](../system_tables.md#system_tables-parts) (поля `path` и `disk`). Также подробная информация о перемещениях доступна в логах сервера. +В фоне куски перемещаются между томами на основе информации о занятом месте (настройка `move_factor`) по порядку, в котором указаны тома в конфигурации. Данные никогда не перемещаются с последнего тома и на первый том. Следить за фоновыми перемещениями можно с помощью системных таблиц [system.part\_log](../system_tables.md#system_tables-part-log) (поле `type = MOVE_PART`) и [system.parts](../system_tables.md#system_tables-parts) (поля `path` и `disk`). Также подробная информация о перемещениях доступна в логах сервера. -С помощью запроса [ALTER TABLE ... MOVE PART|PARTITION ... TO VOLUME|DISK ...](../../query_language/alter.md#alter_move-partition) пользователь может принудительно перенести кусок или партицию с одного раздела на другой. При этом учитываются все ограничения, указанные для фоновых операций. Запрос самостоятельно инициирует процесс перемещения не дожидаясь фоновых операций. В случае недостатка места или неудовлетворения ограничениям пользователь получит сообщение об ошибке. +С помощью запроса [ALTER TABLE … MOVE PART\|PARTITION … TO VOLUME\|DISK …](../../query_language/alter.md#alter_move-partition) пользователь может принудительно перенести кусок или партицию с одного раздела на другой. При этом учитываются все ограничения, указанные для фоновых операций. Запрос самостоятельно инициирует процесс перемещения не дожидаясь фоновых операций. В случае недостатка места или неудовлетворения ограничениям пользователь получит сообщение об ошибке. Перемещения данных не взаимодействуют с репликацией данных, поэтому на разных репликах одной и той же таблицы могут быть указаны разные политики хранения. diff --git a/docs/ru/operations/table_engines/mysql.md b/docs/ru/operations/table_engines/mysql.md index 79b000c0e12..09ca9077c2c 100644 --- a/docs/ru/operations/table_engines/mysql.md +++ b/docs/ru/operations/table_engines/mysql.md @@ -1,10 +1,10 @@ -# MySQL +# MySQL {#mysql} Движок MySQL позволяет выполнять запросы `SELECT` над данными, хранящимися на удалённом MySQL сервере. 
-## Создание таблицы +## Создание таблицы {#sozdanie-tablitsy} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], @@ -17,32 +17,38 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Структура таблицы может отличаться от исходной структуры таблицы MySQL: -- Имена столбцов должны быть такими же, как в исходной таблице MySQL, но вы можете использовать только некоторые из этих столбцов и в любом порядке. -- Типы столбцов могут отличаться от типов в исходной таблице MySQL. ClickHouse пытается [приводить](../../query_language/functions/type_conversion_functions.md#type_conversion_function-cast) значения к типам данных ClickHouse. +- Имена столбцов должны быть такими же, как в исходной таблице MySQL, но вы можете использовать только некоторые из этих столбцов и в любом порядке. +- Типы столбцов могут отличаться от типов в исходной таблице MySQL. ClickHouse пытается [приводить](../../query_language/functions/type_conversion_functions.md#type_conversion_function-cast) значения к типам данных ClickHouse. **Параметры движка** -- `host:port` — адрес сервера MySQL. -- `database` — имя базы данных на удалённом сервере. -- `table` — имя таблицы на удалённом сервере. -- `user` — пользователь MySQL. -- `password` — пароль пользователя. -- `replace_query` — флаг, отвечающий за преобразование запросов `INSERT INTO` в `REPLACE INTO`. Если `replace_query=1`, то запрос заменяется. -- `on_duplicate_clause` — выражение `ON DUPLICATE KEY on_duplicate_clause`, добавляемое к запросу `INSERT`. +- `host:port` — адрес сервера MySQL. - Пример: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, где `on_duplicate_clause` это `UPDATE c2 = c2 + 1`. Чтобы узнать какие `on_duplicate_clause` можно использовать с секцией `ON DUPLICATE KEY` обратитесь к [документации MySQL](https://dev.mysql.com/doc/refman/8.0/en/insert-on-duplicate.html). +- `database` — имя базы данных на удалённом сервере. - Чтобы указать `on_duplicate_clause` необходимо передать `0` в параметр `replace_query`. Если одновременно передать `replace_query = 1` и `on_duplicate_clause`, то ClickHouse сгенерирует исключение. +- `table` — имя таблицы на удалённом сервере. -Простые условия `WHERE` такие как ` =, !=, >, >=, <, =` выполняются на стороне сервера MySQL. +- `user` — пользователь MySQL. + +- `password` — пароль пользователя. + +- `replace_query` — флаг, отвечающий за преобразование запросов `INSERT INTO` в `REPLACE INTO`. Если `replace_query=1`, то запрос заменяется. + +- `on_duplicate_clause` — выражение `ON DUPLICATE KEY on_duplicate_clause`, добавляемое к запросу `INSERT`. + + Пример: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, где `on_duplicate_clause` это `UPDATE c2 = c2 + 1`. Чтобы узнать какие `on_duplicate_clause` можно использовать с секцией `ON DUPLICATE KEY` обратитесь к [документации MySQL](https://dev.mysql.com/doc/refman/8.0/en/insert-on-duplicate.html). + + Чтобы указать `on_duplicate_clause` необходимо передать `0` в параметр `replace_query`. Если одновременно передать `replace_query = 1` и `on_duplicate_clause`, то ClickHouse сгенерирует исключение. + +Простые условия `WHERE` такие как `=, !=, >, >=, <, =` выполняются на стороне сервера MySQL. Остальные условия и ограничение выборки `LIMIT` будут выполнены в ClickHouse только после выполнения запроса к MySQL. 
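Для иллюстрации параметров `replace_query` и `on_duplicate_clause` ниже приведён небольшой набросок объявления таблицы (адрес сервера, имена базы данных, таблицы, пользователя и столбцов здесь условные):

``` sql
-- Набросок: вставки через такую таблицу будут выполняться на стороне MySQL
-- с добавлением ON DUPLICATE KEY UPDATE c2 = c2 + 1.
-- replace_query = 0, так как одновременно с on_duplicate_clause
-- передавать replace_query = 1 нельзя: ClickHouse сгенерирует исключение.
CREATE TABLE mysql_upsert_table
(
    c1 String,
    c2 UInt32
)
ENGINE = MySQL('mysql-host:3306', 'db', 'table', 'user', 'password', 0, 'UPDATE c2 = c2 + 1')
```

Простое условие в запросе вида `SELECT * FROM mysql_upsert_table WHERE c2 > 10` будет выполнено на стороне MySQL, а, например, `LIMIT` выполнится уже в ClickHouse после получения данных.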
-## Пример использования +## Пример использования {#primer-ispolzovaniia} Таблица в MySQL: -```text +``` text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, @@ -65,7 +71,7 @@ mysql> select * from test; Таблица в ClickHouse, которая получает данные из созданной ранее таблицы MySQL: -```sql +``` sql CREATE TABLE mysql_table ( `float_nullable` Nullable(Float32), @@ -74,19 +80,19 @@ CREATE TABLE mysql_table ENGINE = MySQL('localhost:3306', 'test', 'test', 'bayonet', '123') ``` -```sql +``` sql SELECT * FROM mysql_table ``` -```text +``` text ┌─float_nullable─┬─int_id─┐ │ ᴺᵁᴸᴸ │ 1 │ └────────────────┴────────┘ ``` -## Смотрите также +## Смотрите также {#smotrite-takzhe} -- [Табличная функция 'mysql'](../../query_language/table_functions/mysql.md) -- [Использование MySQL в качестве источника для внешнего словаря](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-mysql) +- [Табличная функция ‘mysql’](../../query_language/table_functions/mysql.md) +- [Использование MySQL в качестве источника для внешнего словаря](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-mysql) [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/mysql/) diff --git a/docs/ru/operations/table_engines/null.md b/docs/ru/operations/table_engines/null.md index 949f101e68c..ac8c7942f79 100644 --- a/docs/ru/operations/table_engines/null.md +++ b/docs/ru/operations/table_engines/null.md @@ -1,4 +1,4 @@ -# Null +# Null {#null} При записи в таблицу типа Null, данные игнорируются. При чтении из таблицы типа Null, возвращается пустота. diff --git a/docs/ru/operations/table_engines/odbc.md b/docs/ru/operations/table_engines/odbc.md index 91ac7e72876..b3dde77890c 100644 --- a/docs/ru/operations/table_engines/odbc.md +++ b/docs/ru/operations/table_engines/odbc.md @@ -1,4 +1,4 @@ -# ODBC {#table_engine-odbc} +# ODBC {#table-engine-odbc} Позволяет ClickHouse подключаться к внешним базам данных с помощью [ODBC](https://en.wikipedia.org/wiki/Open_Database_Connectivity). @@ -6,9 +6,9 @@ Движок поддерживает тип данных [Nullable](../../data_types/nullable.md). -## Создание таблицы +## Создание таблицы {#sozdanie-tablitsy} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1], @@ -22,16 +22,16 @@ ENGINE = ODBC(connection_settings, external_database, external_table) Структура таблицы может отличаться от структуры исходной таблицы в удалённой СУБД: -- Имена столбцов должны быть такими же, как в исходной таблице, но вы можете использовать только некоторые из этих столбцов и в любом порядке. -- Типы столбцов могут отличаться от типов аналогичных столбцов в исходной таблице. ClickHouse пытается [приводить](../../query_language/functions/type_conversion_functions.md#type_conversion_function-cast) значения к типам данных ClickHouse. +- Имена столбцов должны быть такими же, как в исходной таблице, но вы можете использовать только некоторые из этих столбцов и в любом порядке. +- Типы столбцов могут отличаться от типов аналогичных столбцов в исходной таблице. ClickHouse пытается [приводить](../../query_language/functions/type_conversion_functions.md#type_conversion_function-cast) значения к типам данных ClickHouse. **Параметры движка** -- `connection_settings` — название секции с настройками соединения в файле `odbc.ini`. -- `external_database` — имя базы данных во внешней СУБД. 
-- `external_table` — имя таблицы в `external_database`. +- `connection_settings` — название секции с настройками соединения в файле `odbc.ini`. +- `external_database` — имя базы данных во внешней СУБД. +- `external_table` — имя таблицы в `external_database`. -## Пример использования +## Пример использования {#primer-ispolzovaniia} **Извлечение данных из локальной установки MySQL через ODBC** @@ -41,17 +41,18 @@ ENGINE = ODBC(connection_settings, external_database, external_table) По умолчанию (если установлен из пакетов) ClickHouse запускается от имени пользователя `clickhouse`. Таким образом, вам нужно создать и настроить этого пользователя на сервере MySQL. -```bash +``` bash $ sudo mysql ``` -```sql + +``` sql mysql> CREATE USER 'clickhouse'@'localhost' IDENTIFIED BY 'clickhouse'; mysql> GRANT ALL PRIVILEGES ON *.* TO 'clickhouse'@'clickhouse' WITH GRANT OPTION; ``` Теперь настроим соединение в `/etc/odbc.ini`. -```bash +``` bash $ cat /etc/odbc.ini [mysqlconn] DRIVER = /usr/local/lib/libmyodbc5w.so @@ -64,7 +65,7 @@ PASSWORD = clickhouse Вы можете проверить соединение с помощью утилиты `isql` из установки unixODBC. -```bash +``` bash $ isql -v mysqlconn +---------------------------------------+ | Connected! | @@ -74,7 +75,7 @@ $ isql -v mysqlconn Таблица в MySQL: -```text +``` text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, @@ -97,7 +98,7 @@ mysql> select * from test; Таблица в ClickHouse, которая получает данные из таблицы MySQL: -```sql +``` sql CREATE TABLE odbc_t ( `int_id` Int32, @@ -106,19 +107,19 @@ CREATE TABLE odbc_t ENGINE = ODBC('DSN=mysqlconn', 'test', 'test') ``` -```sql +``` sql SELECT * FROM odbc_t ``` -```text +``` text ┌─int_id─┬─float_nullable─┐ │ 1 │ ᴺᵁᴸᴸ │ └────────┴────────────────┘ ``` -## Смотрите также +## Смотрите также {#smotrite-takzhe} -- [Внешние словари ODBC](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-odbc) -- [Табличная функция odbc](../../query_language/table_functions/odbc.md) +- [Внешние словари ODBC](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-odbc) +- [Табличная функция odbc](../../query_language/table_functions/odbc.md) [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/odbc/) diff --git a/docs/ru/operations/table_engines/replacingmergetree.md b/docs/ru/operations/table_engines/replacingmergetree.md index b63d6582017..b403e485741 100644 --- a/docs/ru/operations/table_engines/replacingmergetree.md +++ b/docs/ru/operations/table_engines/replacingmergetree.md @@ -1,4 +1,4 @@ -# ReplacingMergeTree +# ReplacingMergeTree {#replacingmergetree} Движок отличается от [MergeTree](mergetree.md#table_engines-mergetree) тем, что выполняет удаление дублирующихся записей с одинаковым значением первичного ключа (точнее, с одинаковым значением [ключа сортировки](mergetree.md)). @@ -6,9 +6,9 @@ Таким образом, `ReplacingMergeTree` подходит для фоновой чистки дублирующихся данных в целях экономии места, но не даёт гарантии отсутствия дубликатов. -## Создание таблицы +## Создание таблицы {#sozdanie-tablitsy} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -25,23 +25,25 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **Параметры ReplacingMergeTree** -- `ver` — столбец с версией, тип `UInt*`, `Date` или `DateTime`. Необязательный параметр. 
+- `ver` — столбец с версией, тип `UInt*`, `Date` или `DateTime`. Необязательный параметр. - При слиянии, из всех строк с одинаковым значением первичного ключа `ReplacingMergeTree` оставляет только одну: + При слиянии, из всех строк с одинаковым значением первичного ключа `ReplacingMergeTree` оставляет только одну: - - Последнюю в выборке, если `ver` не задан. - - С максимальной версией, если `ver` задан. + - Последнюю в выборке, если `ver` не задан. + - С максимальной версией, если `ver` задан. **Секции запроса** При создании таблицы `ReplacingMergeTree` используются те же [секции](mergetree.md), что и при создании таблицы `MergeTree`. -
Устаревший способ создания таблицы +
-!!! attention +Устаревший способ создания таблицы + +!!! attention "Attention" Не используйте этот способ в новых проектах и по возможности переведите старые проекты на способ, описанный выше. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -52,7 +54,8 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Все параметры, кроме `ver`, имеют то же значение, что и в `MergeTree`. -- `ver` — столбец с версией. Необязательный параметр. Описание смотрите выше по тексту. +- `ver` — столбец с версией. Необязательный параметр. Описание смотрите выше по тексту. +
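Для наглядности ниже приведён небольшой набросок в текущем синтаксисе (имена таблицы и столбцов условные). При слиянии из строк с одинаковым значением ключа сортировки останется строка с максимальным `ver`; так как момент фонового слияния заранее неизвестен, в примере слияние инициируется вручную запросом `OPTIMIZE ... FINAL`, на что в рабочих сценариях полагаться не стоит:

``` sql
CREATE TABLE replacing_example
(
    key UInt64,
    value String,
    ver UInt32
)
ENGINE = ReplacingMergeTree(ver)
ORDER BY key;

INSERT INTO replacing_example VALUES (1, 'first', 1);
INSERT INTO replacing_example VALUES (1, 'second', 2);

-- Принудительно инициируем внеочередное слияние, чтобы увидеть результат дедупликации:
-- останется одна строка (1, 'second', 2) с максимальным значением ver.
OPTIMIZE TABLE replacing_example FINAL;

SELECT * FROM replacing_example;
```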
[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/replacingmergetree/) diff --git a/docs/ru/operations/table_engines/replication.md b/docs/ru/operations/table_engines/replication.md index 496050cceb6..01767425afa 100644 --- a/docs/ru/operations/table_engines/replication.md +++ b/docs/ru/operations/table_engines/replication.md @@ -1,14 +1,14 @@ -# Репликация данных {#table_engines-replication} +# Репликация данных {#table-engines-replication} Репликация поддерживается только для таблиц семейства MergeTree: -- ReplicatedMergeTree -- ReplicatedSummingMergeTree -- ReplicatedReplacingMergeTree -- ReplicatedAggregatingMergeTree -- ReplicatedCollapsingMergeTree -- ReplicatedVersionedCollapsingMergeTree -- ReplicatedGraphiteMergeTree +- ReplicatedMergeTree +- ReplicatedSummingMergeTree +- ReplicatedReplacingMergeTree +- ReplicatedAggregatingMergeTree +- ReplicatedCollapsingMergeTree +- ReplicatedVersionedCollapsingMergeTree +- ReplicatedGraphiteMergeTree Репликация работает на уровне отдельных таблиц, а не всего сервера. То есть, на сервере могут быть расположены одновременно реплицируемые и не реплицируемые таблицы. @@ -18,20 +18,20 @@ Запросы `CREATE`, `DROP`, `ATTACH`, `DETACH` и `RENAME` выполняются на одном сервере и не реплицируются: -- Запрос `CREATE TABLE` создаёт новую реплицируемую таблицу на том сервере, где его выполнили. Если таблица уже существует на других серверах, запрос добавляет новую реплику. -- `DROP TABLE` удаляет реплику, расположенную на том сервере, где выполняется запрос. -- Запрос `RENAME` переименовывает таблицу на одной реплик. Другими словами, реплицируемые таблицы на разных репликах могут называться по-разному. +- Запрос `CREATE TABLE` создаёт новую реплицируемую таблицу на том сервере, где его выполнили. Если таблица уже существует на других серверах, запрос добавляет новую реплику. +- `DROP TABLE` удаляет реплику, расположенную на том сервере, где выполняется запрос. +- Запрос `RENAME` переименовывает таблицу на одной реплик. Другими словами, реплицируемые таблицы на разных репликах могут называться по-разному. ClickHouse хранит метаинформацию о репликах в [Apache ZooKeeper](https://zookeeper.apache.org). Используйте ZooKeeper 3.4.5 или новее. Для использовании репликации, установите параметры в секции [zookeeper](../server_settings/settings.md#server-settings_zookeeper) конфигурации сервера. - !!! attention "Внимание" +!!! attention "Внимание" Не пренебрегайте настройками безопасности. ClickHouse поддерживает [ACL схему](https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) `digest` подсистемы безопасности ZooKeeper. Пример указания адресов кластера ZooKeeper: -```xml +``` xml example1 @@ -52,7 +52,7 @@ ClickHouse хранит метаинформацию о репликах в [Apa Если в конфигурационном файле не настроен ZooKeeper, то вы не сможете создать реплицируемые таблицы, а уже имеющиеся реплицируемые таблицы будут доступны в режиме только на чтение. -При запросах `SELECT`, ZooKeeper не используется, т.е. репликация не влияет на производительность `SELECT` и запросы работают так же быстро, как и для нереплицируемых таблиц. При запросах к распределенным реплицированным таблицам поведение ClickHouse регулируется настройками [max_replica_delay_for_distributed_queries](../settings/settings.md#settings-max_replica_delay_for_distributed_queries) and [fallback_to_stale_replicas_for_distributed_queries](../settings/settings.md). +При запросах `SELECT`, ZooKeeper не используется, т.е. 
репликация не влияет на производительность `SELECT` и запросы работают так же быстро, как и для нереплицируемых таблиц. При запросах к распределенным реплицированным таблицам поведение ClickHouse регулируется настройками [max\_replica\_delay\_for\_distributed\_queries](../settings/settings.md#settings-max_replica_delay_for_distributed_queries) and [fallback\_to\_stale\_replicas\_for\_distributed\_queries](../settings/settings.md). При каждом запросе `INSERT`, делается около десятка записей в ZooKeeper в рамках нескольких транзакций. (Чтобы быть более точным, это для каждого вставленного блока данных; запрос INSERT содержит один блок или один блок на `max_insert_block_size = 1048576` строк.) Это приводит к некоторому увеличению задержек при `INSERT`, по сравнению с нереплицируемыми таблицами. Но если придерживаться обычных рекомендаций - вставлять данные пачками не более одного `INSERT` в секунду, то это не составляет проблем. На всём кластере ClickHouse, использующим для координации один кластер ZooKeeper, может быть в совокупности несколько сотен `INSERT` в секунду. Пропускная способность при вставке данных (количество строчек в секунду) такая же высокая, как для нереплицируемых таблиц. @@ -64,7 +64,7 @@ ClickHouse хранит метаинформацию о репликах в [Apa Каждый блок данных записывается атомарно. Запрос INSERT разбивается на блоки данных размером до `max_insert_block_size = 1048576` строк. То есть, если в запросе `INSERT` менее 1048576 строк, то он делается атомарно. -Блоки данных дедуплицируются. При многократной записи одного и того же блока данных (блоков данных одинакового размера, содержащих одни и те же строчки в одном и том же порядке), блок будет записан только один раз. Это сделано для того, чтобы в случае сбоя в сети, когда клиентское приложение не может понять, были ли данные записаны в БД, можно было просто повторить запрос `INSERT`. При этом не имеет значения, на какую реплику будут отправлены INSERT-ы с одинаковыми данными. Запрос `INSERT` идемпотентный. Параметры дедуплицирования регулируются настройками сервера [merge_tree](../server_settings/settings.md#server_settings-merge_tree) +Блоки данных дедуплицируются. При многократной записи одного и того же блока данных (блоков данных одинакового размера, содержащих одни и те же строчки в одном и том же порядке), блок будет записан только один раз. Это сделано для того, чтобы в случае сбоя в сети, когда клиентское приложение не может понять, были ли данные записаны в БД, можно было просто повторить запрос `INSERT`. При этом не имеет значения, на какую реплику будут отправлены INSERT-ы с одинаковыми данными. Запрос `INSERT` идемпотентный. Параметры дедуплицирования регулируются настройками сервера [merge\_tree](../server_settings/settings.md#server_settings-merge_tree) При репликации, по сети передаются только исходные вставляемые данные. Дальнейшие преобразования данных (слияния) координируются и делаются на всех репликах одинаковым образом. За счёт этого минимизируется использование сети, и благодаря этому, репликация хорошо работает при расположении реплик в разных дата-центрах. (Стоит заметить, что дублирование данных в разных дата-центрах, по сути, является основной задачей репликации). @@ -78,12 +78,12 @@ ClickHouse хранит метаинформацию о репликах в [Apa **Параметры Replicated\*MergeTree** -- `zoo_path` — путь к таблице в ZooKeeper. -- `replica_name` — имя реплики в ZooKeeper. +- `zoo_path` — путь к таблице в ZooKeeper. +- `replica_name` — имя реплики в ZooKeeper. 
Пример: -```sql +``` sql CREATE TABLE table_name ( EventDate DateTime, @@ -95,9 +95,11 @@ ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) ``` -
Пример в устаревшем синтаксисе +
-```sql +Пример в устаревшем синтаксисе + +``` sql CREATE TABLE table_name ( EventDate DateTime, @@ -110,7 +112,7 @@ CREATE TABLE table_name Как видно в примере, эти параметры могут содержать подстановки в фигурных скобках. Подставляемые значения достаются из конфигурационного файла, из секции `macros`. Пример: -```xml +``` xml 05 02 @@ -121,14 +123,14 @@ CREATE TABLE table_name Путь к таблице в ZooKeeper должен быть разным для каждой реплицируемой таблицы. В том числе, для таблиц на разных шардах, должны быть разные пути. В данном случае, путь состоит из следующих частей: -`/clickhouse/tables/` — общий префикс. Рекомендуется использовать именно его. +`/clickhouse/tables/` — общий префикс. Рекомендуется использовать именно его. -`{layer}-{shard}` — идентификатор шарда. В данном примере он состоит из двух частей, так как на кластере Яндекс.Метрики используется двухуровневое шардирование. Для большинства задач, оставьте только подстановку {shard}, которая будет раскрываться в идентификатор шарда. +`{layer}-{shard}` — идентификатор шарда. В данном примере он состоит из двух частей, так как на кластере Яндекс.Метрики используется двухуровневое шардирование. Для большинства задач, оставьте только подстановку {shard}, которая будет раскрываться в идентификатор шарда. -`table_name` - имя узла для таблицы в ZooKeeper. Разумно делать его таким же, как имя таблицы. Оно указывается явно, так как, в отличие от имени таблицы, оно не меняется после запроса RENAME. +`table_name` - имя узла для таблицы в ZooKeeper. Разумно делать его таким же, как имя таблицы. Оно указывается явно, так как, в отличие от имени таблицы, оно не меняется после запроса RENAME. *Подсказка*: можно также указать имя базы данных перед `table_name`, например `db_name.table_name` -Имя реплики — то, что идентифицирует разные реплики одной и той же таблицы. Можно использовать для него имя сервера, как показано в примере. Впрочем, достаточно, чтобы имя было уникально лишь в пределах каждого шарда. +Имя реплики — то, что идентифицирует разные реплики одной и той же таблицы. Можно использовать для него имя сервера, как показано в примере. Впрочем, достаточно, чтобы имя было уникально лишь в пределах каждого шарда. Можно не использовать подстановки, а указать соответствующие параметры явно. Это может быть удобным для тестирования и при настройке маленьких кластеров. Однако в этом случае нельзя пользоваться распределенными DDL-запросами (`ON CLUSTER`). @@ -138,9 +140,9 @@ CREATE TABLE table_name Если вы добавляете новую реплику после того, как таблица на других репликах уже содержит некоторые данные, то после выполнения запроса, данные на новую реплику будут скачаны с других реплик. То есть, новая реплика синхронизирует себя с остальными. -Для удаления реплики, выполните запрос `DROP TABLE`. При этом, удаляется только одна реплика — расположенная на том сервере, где вы выполняете запрос. +Для удаления реплики, выполните запрос `DROP TABLE`. При этом, удаляется только одна реплика — расположенная на том сервере, где вы выполняете запрос. -## Восстановление после сбоя +## Восстановление после сбоя {#vosstanovlenie-posle-sboia} Если при старте сервера, недоступен ZooKeeper, реплицируемые таблицы переходят в режим только для чтения. Система будет пытаться периодически установить соединение с ZooKeeper. @@ -154,24 +156,24 @@ CREATE TABLE table_name При старте сервера (или создании новой сессии с ZooKeeper), проверяется только количество и размеры всех файлов. 
Если у файлов совпадают размеры, но изменены байты где-то посередине, то это обнаруживается не сразу, а только при попытке их прочитать при каком-либо запросе `SELECT`. Запрос кинет исключение о несоответствующей чексумме или размере сжатого блока. В этом случае, куски данных добавляются в очередь на проверку, и при необходимости, скачиваются с реплик. -Если обнаруживается, что локальный набор данных слишком сильно отличается от ожидаемого, то срабатывает защитный механизм. Сервер сообщает об этом в лог и отказывается запускаться. Это сделано, так как такой случай может свидетельствовать об ошибке конфигурации - например, если реплика одного шарда была случайно сконфигурирована, как реплика другого шарда. Тем не менее, пороги защитного механизма поставлены довольно низкими, и такая ситуация может возникнуть и при обычном восстановлении после сбоя. В этом случае, восстановление делается полуавтоматически - "по кнопке". +Если обнаруживается, что локальный набор данных слишком сильно отличается от ожидаемого, то срабатывает защитный механизм. Сервер сообщает об этом в лог и отказывается запускаться. Это сделано, так как такой случай может свидетельствовать об ошибке конфигурации - например, если реплика одного шарда была случайно сконфигурирована, как реплика другого шарда. Тем не менее, пороги защитного механизма поставлены довольно низкими, и такая ситуация может возникнуть и при обычном восстановлении после сбоя. В этом случае, восстановление делается полуавтоматически - «по кнопке». Для запуска восстановления, создайте в ZooKeeper узел `/path_to_table/replica_name/flags/force_restore_data` с любым содержимым или выполните команду для восстановления всех реплицируемых таблиц: -```bash +``` bash $ sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data ``` Затем запустите сервер. При старте, сервер удалит эти флаги и запустит восстановление. -## Восстановление в случае потери всех данных +## Восстановление в случае потери всех данных {#vosstanovlenie-v-sluchae-poteri-vsekh-dannykh} Если на одном из серверов исчезли все данные и метаданные, восстановление делается следующим образом: -1. Установите на сервер ClickHouse. Корректно пропишите подстановки в конфигурационном файле, отвечающие за идентификатор шарда и реплики, если вы их используете. -2. Если у вас были нереплицируемые таблицы, которые должны быть вручную продублированы на серверах, скопируйте их данные (в директории `/var/lib/clickhouse/data/db_name/table_name/`) с реплики. -3. Скопируйте с реплики определения таблиц, находящиеся в `/var/lib/clickhouse/metadata/`. Если в определениях таблиц, идентификатор шарда или реплики, прописаны в явном виде - исправьте их, чтобы они соответствовали данной реплике. (Альтернативный вариант - запустить сервер и сделать самостоятельно все запросы `ATTACH TABLE`, которые должны были бы быть в соответствующих .sql файлах в `/var/lib/clickhouse/metadata/`.) -4. Создайте в ZooKeeper узел `/path_to_table/replica_name/flags/force_restore_data` с любым содержимым или выполните команду для восстановления всех реплицируемых таблиц: `sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data` +1. Установите на сервер ClickHouse. Корректно пропишите подстановки в конфигурационном файле, отвечающие за идентификатор шарда и реплики, если вы их используете. +2. Если у вас были нереплицируемые таблицы, которые должны быть вручную продублированы на серверах, скопируйте их данные (в директории `/var/lib/clickhouse/data/db_name/table_name/`) с реплики. +3. 
Скопируйте с реплики определения таблиц, находящиеся в `/var/lib/clickhouse/metadata/`. Если в определениях таблиц, идентификатор шарда или реплики, прописаны в явном виде - исправьте их, чтобы они соответствовали данной реплике. (Альтернативный вариант - запустить сервер и сделать самостоятельно все запросы `ATTACH TABLE`, которые должны были бы быть в соответствующих .sql файлах в `/var/lib/clickhouse/metadata/`.) +4. Создайте в ZooKeeper узел `/path_to_table/replica_name/flags/force_restore_data` с любым содержимым или выполните команду для восстановления всех реплицируемых таблиц: `sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data` Затем запустите сервер (перезапустите, если уже запущен). Данные будут скачаны с реплик. @@ -179,7 +181,7 @@ $ sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data Отсутствует ограничение на использование сетевой полосы при восстановлении. Имейте это ввиду, если восстанавливаете сразу много реплик. -## Преобразование из MergeTree в ReplicatedMergeTree +## Преобразование из MergeTree в ReplicatedMergeTree {#preobrazovanie-iz-mergetree-v-replicatedmergetree} Здесь и далее, под `MergeTree` подразумеваются все движки таблиц семейства `MergeTree`, так же для `ReplicatedMergeTree`. @@ -191,18 +193,18 @@ $ sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data Перенесите данные из старой таблицы в поддиректорию detached в директории с данными новой таблицы (`/var/lib/clickhouse/data/db_name/table_name/`). Затем добавьте эти куски данных в рабочий набор с помощью выполнения запросов `ALTER TABLE ATTACH PARTITION` на одной из реплик. -## Преобразование из ReplicatedMergeTree в MergeTree +## Преобразование из ReplicatedMergeTree в MergeTree {#preobrazovanie-iz-replicatedmergetree-v-mergetree} Создайте таблицу типа MergeTree с другим именем. Перенесите в её директорию с данными все данные из директории с данными таблицы типа `ReplicatedMergeTree`. Затем удалите таблицу типа `ReplicatedMergeTree` и перезапустите сервер. Если вы хотите избавиться от таблицы `ReplicatedMergeTree`, не запуская сервер, то -- удалите соответствующий файл `.sql` в директории с метаданными (`/var/lib/clickhouse/metadata/`); -- удалите соответствующий путь в ZooKeeper (`/path_to_table/replica_name`); +- удалите соответствующий файл `.sql` в директории с метаданными (`/var/lib/clickhouse/metadata/`); +- удалите соответствующий путь в ZooKeeper (`/path_to_table/replica_name`); После этого, вы можете запустить сервер, создать таблицу типа `MergeTree`, перенести данные в её директорию, и перезапустить сервер. -## Восстановление в случае потери или повреждения метаданных на ZooKeeper кластере +## Восстановление в случае потери или повреждения метаданных на ZooKeeper кластере {#vosstanovlenie-v-sluchae-poteri-ili-povrezhdeniia-metadannykh-na-zookeeper-klastere} Если данные в ZooKeeper оказались утеряны или повреждены, то вы можете сохранить данные, переместив их в нереплицируемую таблицу, как описано в пункте выше. diff --git a/docs/ru/operations/table_engines/set.md b/docs/ru/operations/table_engines/set.md index b7e7046efc3..497d52821d1 100644 --- a/docs/ru/operations/table_engines/set.md +++ b/docs/ru/operations/table_engines/set.md @@ -1,6 +1,6 @@ -# Set +# Set {#set} -Представляет собой множество, постоянно находящееся в оперативке. Предназначено для использования в правой части оператора IN (смотрите раздел "Операторы IN"). +Представляет собой множество, постоянно находящееся в оперативке. 
Предназначено для использования в правой части оператора IN (смотрите раздел «Операторы IN»). В таблицу можно вставлять данные INSERT-ом - будут добавлены новые элементы в множество, с игнорированием дубликатов. Но из таблицы нельзя, непосредственно, делать SELECT. Единственная возможность чтения - использование в правой части оператора IN. diff --git a/docs/ru/operations/table_engines/stripelog.md b/docs/ru/operations/table_engines/stripelog.md index 20e8f176a6a..a1bdfbdaf64 100644 --- a/docs/ru/operations/table_engines/stripelog.md +++ b/docs/ru/operations/table_engines/stripelog.md @@ -1,12 +1,12 @@ -# StripeLog +# StripeLog {#stripelog} Движок относится к семейству движков Log. Смотрите общие свойства и различия движков в статье [Семейство Log](log_family.md). Движок разработан для сценариев, когда необходимо записывать много таблиц с небольшим объёмом данных (менее 1 миллиона строк). -## Создание таблицы {#table_engines-stripelog-creating-a-table} +## Создание таблицы {#table-engines-stripelog-creating-a-table} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( column1_name [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -17,26 +17,26 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Смотрите подробное описание запроса [CREATE TABLE](../../query_language/create.md#create-table-query). -## Запись данных {#table_engines-stripelog-writing-the-data} +## Запись данных {#table-engines-stripelog-writing-the-data} Движок `StripeLog` хранит все столбцы в одном файле. При каждом запросе `INSERT`, ClickHouse добавляет блок данных в конец файла таблицы, записывая столбцы один за другим. Для каждой таблицы ClickHouse записывает файлы: -- `data.bin` — файл с данными. -- `index.mrk` — файл с метками. Метки содержат смещения для каждого столбца каждого вставленного блока данных. +- `data.bin` — файл с данными. +- `index.mrk` — файл с метками. Метки содержат смещения для каждого столбца каждого вставленного блока данных. Движок `StripeLog` не поддерживает запросы `ALTER UPDATE` и `ALTER DELETE`. -## Чтение данных {#table_engines-stripelog-reading-the-data} +## Чтение данных {#table-engines-stripelog-reading-the-data} Файл с метками позволяет ClickHouse распараллеливать чтение данных. Это означает, что запрос `SELECT` возвращает строки в непредсказуемом порядке. Используйте секцию `ORDER BY` для сортировки строк. -## Пример использования {#table_engines-stripelog-example-of-use} +## Пример использования {#table-engines-stripelog-example-of-use} Создание таблицы: -```sql +``` sql CREATE TABLE stripe_log_table ( timestamp DateTime, @@ -48,7 +48,7 @@ ENGINE = StripeLog Вставка данных: -```sql +``` sql INSERT INTO stripe_log_table VALUES (now(),'REGULAR','The first regular message') INSERT INTO stripe_log_table VALUES (now(),'REGULAR','The second regular message'),(now(),'WARNING','The first warning message') ``` @@ -57,11 +57,11 @@ INSERT INTO stripe_log_table VALUES (now(),'REGULAR','The second regular message ClickHouse использует несколько потоков при выборе данных. Каждый поток считывает отдельный блок данных и возвращает результирующие строки независимо по мере завершения. В результате порядок блоков строк в выходных данных в большинстве случаев не совпадает с порядком тех же блоков во входных данных. 
Например: -```sql +``` sql SELECT * FROM stripe_log_table ``` -```text +``` text ┌───────────timestamp─┬─message_type─┬─message────────────────────┐ │ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ │ 2019-01-18 14:34:53 │ WARNING │ The first warning message │ @@ -73,11 +73,11 @@ SELECT * FROM stripe_log_table Сортировка результатов (по умолчанию по возрастанию): -```sql +``` sql SELECT * FROM stripe_log_table ORDER BY timestamp ``` -```text +``` text ┌───────────timestamp─┬─message_type─┬─message────────────────────┐ │ 2019-01-18 14:23:43 │ REGULAR │ The first regular message │ │ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ diff --git a/docs/ru/operations/table_engines/summingmergetree.md b/docs/ru/operations/table_engines/summingmergetree.md index 74c2c793eb5..6b1a41384c0 100644 --- a/docs/ru/operations/table_engines/summingmergetree.md +++ b/docs/ru/operations/table_engines/summingmergetree.md @@ -1,13 +1,12 @@ - -# SummingMergeTree +# SummingMergeTree {#summingmergetree} Движок наследует функциональность [MergeTree](mergetree.md#table_engines-mergetree). Отличие заключается в том, что для таблиц `SummingMergeTree` при слиянии кусков данных ClickHouse все строки с одинаковым первичным ключом (точнее, с одинаковым [ключом сортировки](mergetree.md)) заменяет на одну, которая хранит только суммы значений из столбцов с цифровым типом данных. Если ключ сортировки подобран таким образом, что одному значению ключа соответствует много строк, это значительно уменьшает объём хранения и ускоряет последующую выборку данных. Мы рекомендуем использовать движок в паре с `MergeTree`. В `MergeTree` храните полные данные, а `SummingMergeTree` используйте для хранения агрегированных данных, например, при подготовке отчетов. Такой подход позволит не утратить ценные данные из-за неправильно выбранного первичного ключа. -## Создание таблицы +## Создание таблицы {#sozdanie-tablitsy} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -24,21 +23,23 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **Параметры SummingMergeTree** -- `columns` — кортеж с именами столбцов, в которых будут суммироваться данные. Необязательный параметр. +- `columns` — кортеж с именами столбцов, в которых будут суммироваться данные. Необязательный параметр. Столбцы должны иметь числовой тип и не должны входить в первичный ключ. - Если `columns` не задан, то ClickHouse суммирует значения во всех столбцах с числовым типом данных, не входящих в первичный ключ. + Если `columns` не задан, то ClickHouse суммирует значения во всех столбцах с числовым типом данных, не входящих в первичный ключ. **Секции запроса** При создании таблицы `SummingMergeTree` используются те же [секции](mergetree.md) запроса, что и при создании таблицы `MergeTree`. -
Устаревший способ создания таблицы +
-!!! attention +Устаревший способ создания таблицы + +!!! attention "Attention" Не используйте этот способ в новых проектах и по возможности переведите старые проекты на способ, описанный выше. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -49,14 +50,15 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Все параметры, кроме `columns`, имеют то же значение, что и в `MergeTree`. -- `columns` — кортеж с именами столбцов для суммирования данных. Необязательный параметр. Описание смотрите выше по тексту. +- `columns` — кортеж с именами столбцов для суммирования данных. Необязательный параметр. Описание смотрите выше по тексту. +
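Если суммировать нужно не все числовые столбцы, кортеж `columns` можно задать явно. Ниже небольшой набросок (имена условные): при слиянии суммируются только `hits` и `clicks`, а `price` в кортеж не входит и не суммируется. Базовый пример без параметра `columns` приведён в следующем разделе.

``` sql
CREATE TABLE summing_example
(
    key UInt32,
    hits UInt64,
    clicks UInt64,
    price Float64
)
ENGINE = SummingMergeTree((hits, clicks))
ORDER BY key
```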
-## Пример использования +## Пример использования {#primer-ispolzovaniia} Рассмотрим следующую таблицу: -```sql +``` sql CREATE TABLE summtt ( key UInt32, @@ -68,30 +70,30 @@ ORDER BY key Добавим в неё данные: -```sql +``` sql INSERT INTO summtt Values(1,1),(1,2),(2,1) ``` ClickHouse может не полностью просуммировать все строки ([смотрите ниже по тексту](#obrabotka-dannykh)), поэтому при запросе мы используем агрегатную функцию `sum` и секцию `GROUP BY`. -```sql +``` sql SELECT key, sum(value) FROM summtt GROUP BY key ``` -```text + +``` text ┌─key─┬─sum(value)─┐ │ 2 │ 1 │ │ 1 │ 3 │ └─────┴────────────┘ ``` - -## Обработка данных +## Обработка данных {#obrabotka-dannykh} При вставке данных в таблицу они сохраняются как есть. Периодически ClickHouse выполняет слияние вставленных кусков данных и именно в этот момент производится суммирование и замена многих строк с одинаковым первичным ключом на одну для каждого результирующего куска данных. ClickHouse может слить куски данных таким образом, что не все строки с одинаковым первичным ключом окажутся в одном финальном куске, т.е. суммирование будет не полным. Поэтому, при выборке данных (`SELECT`) необходимо использовать агрегатную функцию [sum()](../../query_language/agg_functions/reference.md#agg_function-sum) и секцию `GROUP BY` как описано в примере выше. -### Общие правила суммирования +### Общие правила суммирования {#obshchie-pravila-summirovaniia} Суммируются значения в столбцах с числовым типом данных. Набор столбцов определяется параметром `columns`. @@ -101,24 +103,24 @@ ClickHouse может слить куски данных таким образо Значения для столбцов, входящих в первичный ключ, не суммируются. -### Суммирование в столбцах AggregateFunction +### Суммирование в столбцах AggregateFunction {#summirovanie-v-stolbtsakh-aggregatefunction} Для столбцов типа [AggregateFunction](../../data_types/nested_data_structures/aggregatefunction.md#data_type-aggregatefunction) ClickHouse выполняет агрегацию согласно заданной функции, повторяя поведение движка [AggregatingMergeTree](aggregatingmergetree.md). -### Вложенные структуры +### Вложенные структуры {#vlozhennye-struktury} Таблица может иметь вложенные структуры данных, которые обрабатываются особым образом. Если название вложенной таблицы заканчивается на `Map` и она содержит не менее двух столбцов, удовлетворяющих критериям: -- первый столбец - числовой `(*Int*, Date, DateTime)`, назовем его условно `key`, -- остальные столбцы - арифметические `(*Int*, Float32/64)`, условно `(values...)`, +- первый столбец - числовой `(*Int*, Date, DateTime)`, назовем его условно `key`, +- остальные столбцы - арифметические `(*Int*, Float32/64)`, условно `(values...)`, то вложенная таблица воспринимается как отображение `key => (values...)` и при слиянии её строк выполняется слияние элементов двух множеств по `key` со сложением соответствующих `(values...)`. Примеры: -```text +``` text [(1, 100)] + [(2, 150)] -> [(1, 100), (2, 150)] [(1, 100)] + [(1, 150)] -> [(1, 250)] [(1, 100)] + [(1, 150), (2, 150)] -> [(1, 250), (2, 150)] diff --git a/docs/ru/operations/table_engines/tinylog.md b/docs/ru/operations/table_engines/tinylog.md index 6d3d2441051..d6dafb63504 100644 --- a/docs/ru/operations/table_engines/tinylog.md +++ b/docs/ru/operations/table_engines/tinylog.md @@ -1,4 +1,4 @@ -# TinyLog +# TinyLog {#tinylog} Движок относится к семейству движков Log. Смотрите общие свойства и различия движков в статье [Семейство Log](log_family.md). 
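Минимальный набросок использования (имена таблицы и столбцов условные):

``` sql
CREATE TABLE tinylog_example
(
    ts DateTime,
    message String
)
ENGINE = TinyLog;

INSERT INTO tinylog_example VALUES (now(), 'первое сообщение');

SELECT * FROM tinylog_example;
```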
diff --git a/docs/ru/operations/table_engines/url.md b/docs/ru/operations/table_engines/url.md index 0a4142f2f95..86a903f538a 100644 --- a/docs/ru/operations/table_engines/url.md +++ b/docs/ru/operations/table_engines/url.md @@ -1,9 +1,9 @@ -# URL(URL, Format) {#table_engines-url} +# URL(URL, Format) {#table-engines-url} Управляет данными на удаленном HTTP/HTTPS сервере. Данный движок похож на движок [File](file.md). -## Использование движка в сервере ClickHouse +## Использование движка в сервере ClickHouse {#ispolzovanie-dvizhka-v-servere-clickhouse} `Format` должен быть таким, который ClickHouse может использовать в запросах `SELECT` и, если есть необходимость, `INSERT`. Полный список поддерживаемых форматов смотрите в @@ -17,13 +17,13 @@ соответственно. Для обработки `POST`-запросов удаленный сервер должен поддерживать [Chunked transfer encoding](https://ru.wikipedia.org/wiki/Chunked_transfer_encoding). -Максимальное количество переходов по редиректам при выполнении HTTP-запроса методом GET можно ограничить с помощью настройки [max_http_get_redirects](../settings/settings.md#setting-max_http_get_redirects). +Максимальное количество переходов по редиректам при выполнении HTTP-запроса методом GET можно ограничить с помощью настройки [max\_http\_get\_redirects](../settings/settings.md#setting-max_http_get_redirects). **Пример:** **1.** Создадим на сервере таблицу `url_engine_table`: -```sql +``` sql CREATE TABLE url_engine_table (word String, value UInt64) ENGINE=URL('http://127.0.0.1:12345/', CSV) ``` @@ -31,7 +31,7 @@ ENGINE=URL('http://127.0.0.1:12345/', CSV) **2.** Создадим простейший http-сервер стандартными средствами языка python3 и запустим его: -```python3 +``` python3 from http.server import BaseHTTPRequestHandler, HTTPServer class CSVHTTPServer(BaseHTTPRequestHandler): @@ -47,30 +47,29 @@ if __name__ == "__main__": HTTPServer(server_address, CSVHTTPServer).serve_forever() ``` -```bash +``` bash $ python3 server.py ``` **3.** Запросим данные: -```sql +``` sql SELECT * FROM url_engine_table ``` -```text +``` text ┌─word──┬─value─┐ │ Hello │ 1 │ │ World │ 2 │ └───────┴───────┘ ``` -## Особенности использования - -- Поддерживается многопоточное чтение и запись. -- Не поддерживается: - - использование операций `ALTER` и `SELECT...SAMPLE`; - - индексы; - - репликация. +## Особенности использования {#osobennosti-ispolzovaniia} +- Поддерживается многопоточное чтение и запись. +- Не поддерживается: + - использование операций `ALTER` и `SELECT...SAMPLE`; + - индексы; + - репликация. [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/url/) diff --git a/docs/ru/operations/table_engines/versionedcollapsingmergetree.md b/docs/ru/operations/table_engines/versionedcollapsingmergetree.md index 51de2d5e7b6..2809ac74af8 100644 --- a/docs/ru/operations/table_engines/versionedcollapsingmergetree.md +++ b/docs/ru/operations/table_engines/versionedcollapsingmergetree.md @@ -1,17 +1,17 @@ -# VersionedCollapsingMergeTree +# VersionedCollapsingMergeTree {#versionedcollapsingmergetree} Движок: -- Позволяет быстро записывать постоянно изменяющиеся состояния объектов. -- Удаляет старые состояния объектов в фоновом режиме. Это значительно сокращает объём хранения. +- Позволяет быстро записывать постоянно изменяющиеся состояния объектов. +- Удаляет старые состояния объектов в фоновом режиме. Это значительно сокращает объём хранения. Подробнее читайте в разделе [Collapsing](#table_engines_versionedcollapsingmergetree). 
Движок наследует функциональность от [MergeTree](mergetree.md#table_engines-mergetree) и добавляет в алгоритм слияния кусков данных логику сворачивания (удаления) строк. `VersionedCollapsingMergeTree` предназначен для тех же задач, что и [CollapsingMergeTree](collapsingmergetree.md), но использует другой алгоритм свёртывания, который позволяет вставлять данные в любом порядке в несколько потоков. В частности, столбец `Version` помогает свернуть строки правильно, даже если они вставлены в неправильном порядке. `CollapsingMergeTree` требует строго последовательную вставку данных. -## Создание таблицы +## Создание таблицы {#sozdanie-tablitsy} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -28,28 +28,30 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **Параметры движка** -```sql +``` sql VersionedCollapsingMergeTree(sign, version) ``` -- `sign` — Имя столбца с типом строки: `1` — строка состояния, `-1` — строка отмены состояния. +- `sign` — Имя столбца с типом строки: `1` — строка состояния, `-1` — строка отмены состояния. - Тип данных столбца должен быть `Int8`. + Тип данных столбца должен быть `Int8`. -- `version` — имя столбца с версией состояния объекта. +- `version` — имя столбца с версией состояния объекта. - Тип данных столбца должен быть `UInt*`. + Тип данных столбца должен быть `UInt*`. **Секции запроса** При создании таблицы `VersionedСollapsingMergeTree` используются те же [секции](mergetree.md) запроса, что и при создании таблицы `MergeTree` . -
Устаревший способ создания таблицы +
+ +Устаревший способ создания таблицы !!! warning "Внимание" Не используйте этот метод в новых проектах. По возможности переключите старые проекты на метод, описанный выше. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -60,19 +62,19 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Все параметры, за исключением `sign` и `version` имеют то же значение, что и в `MergeTree`. -- `sign` — Имя столбца с типом строки: `1` — строка состояния, `-1` — строка отмены состояния. +- `sign` — Имя столбца с типом строки: `1` — строка состояния, `-1` — строка отмены состояния. - Тип данных столбца — `Int8`. + Тип данных столбца — `Int8`. -- `version` — имя столбца с версией состояния объекта. +- `version` — имя столбца с версией состояния объекта. - Тип данных столбца должен быть `UInt*`. + Тип данных столбца должен быть `UInt*`.
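Для наглядности ниже приведён набросок объявления таблицы с параметрами `Sign` и `Version` в текущем синтаксисе (имена столбцов условные); подробный разбор сворачивания и полный пример работы с данными приведены ниже:

``` sql
CREATE TABLE activity_versioned
(
    UserID UInt64,
    PageViews UInt8,
    Duration UInt8,
    Sign Int8,
    Version UInt8
)
ENGINE = VersionedCollapsingMergeTree(Sign, Version)
ORDER BY UserID
```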
-## Сворачивание (удаление) строк {#table_engines_versionedcollapsingmergetree} +## Сворачивание (удаление) строк {#table-engines-versionedcollapsingmergetree} -### Данные +### Данные {#dannye} Рассмотрим ситуацию, когда необходимо сохранять постоянно изменяющиеся данные для какого-либо объекта. Разумно иметь одну строку для объекта и обновлять эту строку при каждом изменении. Однако операция обновления является дорогостоящей и медленной для СУБД, поскольку требует перезаписи данных в хранилище. Обновление неприемлемо, если требуется быстро записывать данные, но можно записывать изменения в объект последовательно следующим образом. @@ -80,7 +82,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Например, мы хотим рассчитать, сколько страниц пользователи посетили на каком-либо сайте и как долго они там находились. В какой-то момент времени мы записываем следующую строку состояния пользовательской активности: -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | └─────────────────────┴───────────┴──────────┴──────┴─────────┘ @@ -88,7 +90,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Через некоторое время мы регистрируем изменение активности пользователя и записываем его следующими двумя строками. -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | │ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 | @@ -101,7 +103,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Поскольку нам нужно только последнее состояние активности пользователя, строки -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | │ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | @@ -114,19 +116,19 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **Примечания по использованию** -1. Программа, которая записывает данные, должна помнить состояние объекта, чтобы иметь возможность отменить его. Строка отмены состояния должна быть копией предыдущей строки состояния с противоположным значением `Sign`. Это увеличивает начальный размер хранилища, но позволяет быстро записывать данные. -2. Длинные растущие массивы в столбцах снижают эффективность работы движка за счёт нагрузки на запись. Чем проще данные, тем выше эффективность. -3. `SELECT` результаты сильно зависят от согласованности истории изменений объекта. Будьте точны при подготовке данных для вставки. Вы можете получить непредсказуемые результаты с несогласованными данными, такими как отрицательные значения для неотрицательных метрик, таких как глубина сеанса. +1. Программа, которая записывает данные, должна помнить состояние объекта, чтобы иметь возможность отменить его. Строка отмены состояния должна быть копией предыдущей строки состояния с противоположным значением `Sign`. Это увеличивает начальный размер хранилища, но позволяет быстро записывать данные. +2. Длинные растущие массивы в столбцах снижают эффективность работы движка за счёт нагрузки на запись. Чем проще данные, тем выше эффективность. +3. `SELECT` результаты сильно зависят от согласованности истории изменений объекта. Будьте точны при подготовке данных для вставки. Вы можете получить непредсказуемые результаты с несогласованными данными, такими как отрицательные значения для неотрицательных метрик, таких как глубина сеанса. 
-### Алгоритм {#table_engines-versionedcollapsingmergetree-algorithm} +### Алгоритм {#table-engines-versionedcollapsingmergetree-algorithm} Когда ClickHouse объединяет куски данных, он удаляет каждую пару строк, которые имеют один и тот же первичный ключ и версию и разный `Sign`. Порядок строк не имеет значения. Когда ClickHouse вставляет данные, он упорядочивает строки по первичному ключу. Если столбец `Version` не находится в первичном ключе, ClickHouse добавляет его к первичному ключу неявно как последнее поле и использует для сортировки. -## Выборка данных +## Выборка данных {#vyborka-dannykh} -ClickHouse не гарантирует, что все строки с одинаковым первичным ключом будут находиться в одном результирующем куске данных или даже на одном физическом сервере. Это справедливо как для записи данных, так и для последующего слияния кусков данных. Кроме того, ClickHouse обрабатывает запросы `SELECT` несколькими потоками, и не может предсказать порядок строк в конечной выборке. Это означает, что если необходимо получить полностью "свернутые" данные из таблицы `VersionedCollapsingMergeTree`, то требуется агрегирование. +ClickHouse не гарантирует, что все строки с одинаковым первичным ключом будут находиться в одном результирующем куске данных или даже на одном физическом сервере. Это справедливо как для записи данных, так и для последующего слияния кусков данных. Кроме того, ClickHouse обрабатывает запросы `SELECT` несколькими потоками, и не может предсказать порядок строк в конечной выборке. Это означает, что если необходимо получить полностью «свернутые» данные из таблицы `VersionedCollapsingMergeTree`, то требуется агрегирование. Для завершения свертывания добавьте в запрос секцию `GROUP BY` и агрегатные функции, которые учитывают знак. Например, для расчета количества используйте `sum(Sign)` вместо`count()`. Чтобы вычислить сумму чего-либо, используйте `sum(Sign * x)` вместо`sum(х)`, а также добавьте `HAVING sum(Sign) > 0` . @@ -134,11 +136,11 @@ ClickHouse не гарантирует, что все строки с одина Если необходимо выбирать данные без агрегации (например, проверить наличие строк, последние значения которых удовлетворяют некоторым условиям), можно использовать модификатор `FINAL` для секции `FROM`. Такой подход неэффективен и не должен использоваться с большими таблицами. 
-## Пример использования +## Пример использования {#primer-ispolzovaniia} Данные для примера: -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | │ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | @@ -148,7 +150,7 @@ ClickHouse не гарантирует, что все строки с одина Создание таблицы: -```sql +``` sql CREATE TABLE UAct ( UserID UInt64, @@ -163,11 +165,11 @@ ORDER BY UserID Вставка данных: -```sql +``` sql INSERT INTO UAct VALUES (4324182021466249494, 5, 146, 1, 1) ``` -```sql +``` sql INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1, 1),(4324182021466249494, 6, 185, 1, 2) ``` @@ -175,11 +177,11 @@ INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1, 1),(432418202146624949 Получение данных: -```sql +``` sql SELECT * FROM UAct ``` -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┴─────────┘ @@ -195,7 +197,7 @@ SELECT * FROM UAct Поэтому нам нужна агрегация: -```sql +``` sql SELECT UserID, sum(PageViews * Sign) AS PageViews, @@ -206,7 +208,7 @@ GROUP BY UserID, Version HAVING sum(Sign) > 0 ``` -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Version─┐ │ 4324182021466249494 │ 6 │ 185 │ 2 │ └─────────────────────┴───────────┴──────────┴─────────┘ @@ -214,11 +216,11 @@ HAVING sum(Sign) > 0 Если нам не нужна агрегация, но мы хотим принудительно выполнить свёртку данных, то можно использовать модификатор `FINAL` для секции `FROM`. -```sql +``` sql SELECT * FROM UAct FINAL ``` -```text +``` text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 │ └─────────────────────┴───────────┴──────────┴──────┴─────────┘ diff --git a/docs/ru/operations/table_engines/view.md b/docs/ru/operations/table_engines/view.md index 865b666553d..dc8957326fd 100644 --- a/docs/ru/operations/table_engines/view.md +++ b/docs/ru/operations/table_engines/view.md @@ -1,4 +1,4 @@ -# View +# View {#view} Используется для реализации представлений (подробнее см. запрос `CREATE VIEW`). Не хранит данные, а хранит только указанный запрос `SELECT`. При чтении из таблицы, выполняет его (с удалением из запроса всех ненужных столбцов). diff --git a/docs/ru/operations/tips.md b/docs/ru/operations/tips.md index e984e8bb0b7..4aa4605defb 100644 --- a/docs/ru/operations/tips.md +++ b/docs/ru/operations/tips.md @@ -1,19 +1,19 @@ -# Советы по эксплуатации +# Советы по эксплуатации {#sovety-po-ekspluatatsii} -## CPU scaling governor +## CPU scaling governor {#cpu-scaling-governor} Всегда используйте `performance` scaling governor. `ondemand` scaling governor работает намного хуже при постоянно высоком спросе. -```bash +``` bash $ echo 'performance' | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor ``` -## Ограничение CPU +## Ограничение CPU {#ogranichenie-cpu} Процессоры могут перегреваться. С помощью `dmesg` можно увидеть, если тактовая частота процессора была ограничена из-за перегрева. Также ограничение может устанавливаться снаружи на уровне дата-центра. С помощью `turbostat` можно за этим наблюдать под нагрузкой. -## Оперативная память +## Оперативная память {#operativnaia-pamiat} Для небольших объёмов данных (до ~200 Гб в сжатом виде) лучше всего использовать столько памяти не меньше, чем объём данных. 
Для больших объёмов данных, при выполнении интерактивных (онлайн) запросов, стоит использовать разумный объём оперативной памяти (128 Гб или более) для того, чтобы горячее подмножество данных поместилось в кеше страниц. @@ -21,22 +21,22 @@ $ echo 'performance' | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_gov Не выключайте overcommit. Значение `cat /proc/sys/vm/overcommit_memory` должно быть 0 or 1. Выполните: -```bash +``` bash $ echo 0 | sudo tee /proc/sys/vm/overcommit_memory ``` -## Huge pages +## Huge pages {#huge-pages} Механизм прозрачных huge pages нужно отключить. Он мешает работе аллокаторов памяти, что приводит к значительной деградации производительности. -```bash +``` bash $ echo 'never' | sudo tee /sys/kernel/mm/transparent_hugepage/enabled ``` С помощью `perf top` можно наблюдать за временем, проведенном в ядре операционной системы для управления памятью. Постоянные huge pages так же не нужно аллоцировать. -## Подсистема хранения +## Подсистема хранения {#podsistema-khraneniia} Если ваш бюджет позволяет использовать SSD, используйте SSD. В противном случае используйте HDD. SATA HDDs 7200 RPM подойдут. @@ -44,7 +44,7 @@ $ echo 'never' | sudo tee /sys/kernel/mm/transparent_hugepage/enabled Предпочитайте много серверов с локальными жесткими дисками вместо меньшего числа серверов с подключенными дисковыми полками. Но для хранения архивов с редкими запросами полки всё же подходят. -## RAID +## RAID {#raid} При использовании HDD можно объединить их RAID-10, RAID-5, RAID-6 или RAID-50. Лучше использовать программный RAID в Linux (`mdadm`). Лучше не использовать LVM. @@ -52,9 +52,9 @@ $ echo 'never' | sudo tee /sys/kernel/mm/transparent_hugepage/enabled Если бюджет позволяет, лучше выбрать RAID-10. На более чем 4 дисках вместо RAID-5 нужно использовать RAID-6 (предпочтительнее) или RAID-50. -При использовании RAID-5, RAID-6 или RAID-50, нужно всегда увеличивать stripe_cache_size, так как значение по умолчанию выбрано не самым удачным образом. +При использовании RAID-5, RAID-6 или RAID-50, нужно всегда увеличивать stripe\_cache\_size, так как значение по умолчанию выбрано не самым удачным образом. -```bash +``` bash $ echo 4096 | sudo tee /sys/block/md2/md/stripe_cache_size ``` @@ -69,30 +69,30 @@ $ echo 4096 | sudo tee /sys/block/md2/md/stripe_cache_size Включите NCQ с длинной очередью. Для HDD стоит выбрать планировщик CFQ, а для SSD — noop. Не стоит уменьшать настройку readahead. На HDD стоит включать кеш записи. -## Файловая система +## Файловая система {#failovaia-sistema} Ext4 самый проверенный вариант. Укажите опции монтирования `noatime,nobarrier`. XFS также подходит, но не так тщательно протестирована в сочетании с ClickHouse. Большинство других файловых систем также должны нормально работать. Файловые системы с отложенной аллокацией работают лучше. -## Ядро Linux +## Ядро Linux {#iadro-linux} Не используйте слишком старое ядро Linux. -## Сеть +## Сеть {#set} При использовании IPv6, стоит увеличить размер кеша маршрутов. Ядра Linux до 3.2 имели массу проблем в реализации IPv6. Предпочитайте как минимум 10 Гбит сеть. 1 Гбит также будет работать, но намного хуже для починки реплик с десятками терабайт данных или для обработки распределенных запросов с большим объёмом промежуточных данных. -## ZooKeeper +## ZooKeeper {#zookeeper} Вероятно вы уже используете ZooKeeper для других целей. Можно использовать ту же инсталляцию ZooKeeper, если она не сильно перегружена. Лучше использовать свежую версию ZooKeeper, как минимум 3.4.9. 
Версия в стабильных дистрибутивах Linux может быть устаревшей. -Никогда не используете написанные вручную скрипты для переноса данных между разными ZooKeeper кластерами, потому что результат будет некорректный для sequential нод. Никогда не используйте утилиту "zkcopy", по той же причине: https://github.com/ksprojects/zkcopy/issues/15 +Никогда не используете написанные вручную скрипты для переноса данных между разными ZooKeeper кластерами, потому что результат будет некорректный для sequential нод. Никогда не используйте утилиту «zkcopy», по той же причине: https://github.com/ksprojects/zkcopy/issues/15 Если вы хотите разделить существующий ZooKeeper кластер на два, правильный способ - увеличить количество его реплик, а затем переконфигурировать его как два независимых кластера. @@ -108,7 +108,7 @@ XFS также подходит, но не так тщательно проте zoo.cfg: -```bash +``` bash # http://hadoop.apache.org/zookeeper/docs/current/zookeeperAdmin.html # The number of milliseconds of each tick @@ -124,9 +124,9 @@ maxClientCnxns=2000 maxSessionTimeout=60000000 # the directory where the snapshot is stored. -dataDir=/opt/zookeeper/{{ cluster['name'] }}/data +dataDir=/opt/zookeeper/{{ '{{' }} cluster['name'] {{ '{{' }} '}}' }}/data # Place the dataLogDir to a separate physical disc for better performance -dataLogDir=/opt/zookeeper/{{ cluster['name'] }}/logs +dataLogDir=/opt/zookeeper/{{ '{{' }} cluster['name'] {{ '{{' }} '}}' }}/logs autopurge.snapRetainCount=10 autopurge.purgeInterval=1 @@ -159,20 +159,20 @@ snapCount=3000000 leaderServes=yes standaloneEnabled=false -dynamicConfigFile=/etc/zookeeper-{{ cluster['name'] }}/conf/zoo.cfg.dynamic +dynamicConfigFile=/etc/zookeeper-{{ '{{' }} cluster['name'] {{ '{{' }} '}}' }}/conf/zoo.cfg.dynamic ``` Версия Java: -```text +``` text Java(TM) SE Runtime Environment (build 1.8.0_25-b17) Java HotSpot(TM) 64-Bit Server VM (build 25.25-b02, mixed mode) ``` Параметры JVM: -```bash -NAME=zookeeper-{{ cluster['name'] }} +``` bash +NAME=zookeeper-{{ '{{' }} cluster['name'] {{ '{{' }} '}}' }} ZOOCFGDIR=/etc/$NAME/conf # TODO this is really ugly @@ -191,8 +191,8 @@ JAVA=/usr/bin/java ZOOMAIN="org.apache.zookeeper.server.quorum.QuorumPeerMain" ZOO_LOG4J_PROP="INFO,ROLLINGFILE" JMXLOCALONLY=false -JAVA_OPTS="-Xms{{ cluster.get('xms','128M') }} \ - -Xmx{{ cluster.get('xmx','1G') }} \ +JAVA_OPTS="-Xms{{ '{{' }} cluster.get('xms','128M') {{ '{{' }} '}}' }} \ + -Xmx{{ '{{' }} cluster.get('xmx','1G') {{ '{{' }} '}}' }} \ -Xloggc:/var/log/$NAME/zookeeper-gc.log \ -XX:+UseGCLogFileRotation \ -XX:NumberOfGCLogFiles=16 \ @@ -212,8 +212,8 @@ JAVA_OPTS="-Xms{{ cluster.get('xms','128M') }} \ Salt init: -```text -description "zookeeper-{{ cluster['name'] }} centralized coordination service" +``` text +description "zookeeper-{{ '{{' }} cluster['name'] {{ '{{' }} '}}' }} centralized coordination service" start on runlevel [2345] stop on runlevel [!2345] @@ -223,19 +223,19 @@ respawn limit nofile 8192 8192 pre-start script - [ -r "/etc/zookeeper-{{ cluster['name'] }}/conf/environment" ] || exit 0 - . /etc/zookeeper-{{ cluster['name'] }}/conf/environment + [ -r "/etc/zookeeper-{{ '{{' }} cluster['name'] {{ '{{' }} '}}' }}/conf/environment" ] || exit 0 + . /etc/zookeeper-{{ '{{' }} cluster['name'] {{ '{{' }} '}}' }}/conf/environment [ -d $ZOO_LOG_DIR ] || mkdir -p $ZOO_LOG_DIR chown $USER:$GROUP $ZOO_LOG_DIR end script script - . /etc/zookeeper-{{ cluster['name'] }}/conf/environment + . 
/etc/zookeeper-{{ '{{' }} cluster['name'] {{ '{{' }} '}}' }}/conf/environment [ -r /etc/default/zookeeper ] && . /etc/default/zookeeper if [ -z "$JMXDISABLE" ]; then JAVA_OPTS="$JAVA_OPTS -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.local.only=$JMXLOCALONLY" fi - exec start-stop-daemon --start -c $USER --exec $JAVA --name zookeeper-{{ cluster['name'] }} \ + exec start-stop-daemon --start -c $USER --exec $JAVA --name zookeeper-{{ '{{' }} cluster['name'] {{ '{{' }} '}}' }} \ -- -cp $CLASSPATH $JAVA_OPTS -Dzookeeper.log.dir=${ZOO_LOG_DIR} \ -Dzookeeper.root.logger=${ZOO_LOG4J_PROP} $ZOOMAIN $ZOOCFG end script diff --git a/docs/ru/operations/troubleshooting.md b/docs/ru/operations/troubleshooting.md index bdde1b21d8f..4c9fbf49bd6 100644 --- a/docs/ru/operations/troubleshooting.md +++ b/docs/ru/operations/troubleshooting.md @@ -1,37 +1,37 @@ -# Устранение неисправностей +# Устранение неисправностей {#ustranenie-neispravnostei} -- [Установка дистрибутива](#troubleshooting-installation-errors) -- [Соединение с сервером](#troubleshooting-accepts-no-connections) -- [Обработка запросов](#troubleshooting-does-not-process-queries) -- [Скорость обработки запросов](#troubleshooting-too-slow) +- [Установка дистрибутива](#troubleshooting-installation-errors) +- [Соединение с сервером](#troubleshooting-accepts-no-connections) +- [Обработка запросов](#troubleshooting-does-not-process-queries) +- [Скорость обработки запросов](#troubleshooting-too-slow) ## Установка дистрибутива {#troubleshooting-installation-errors} -### Не получается скачать deb-пакеты из репозитория ClickHouse с помощью apt-get +### Не получается скачать deb-пакеты из репозитория ClickHouse с помощью apt-get {#ne-poluchaetsia-skachat-deb-pakety-iz-repozitoriia-clickhouse-s-pomoshchiu-apt-get} -- Проверьте настройки брандмауэра. -- Если по какой-либо причине вы не можете получить доступ к репозиторию, скачайте пакеты как описано в разделе [Начало работы](../getting_started/index.md) и установите их вручную командой `sudo dpkg -i `. Также, необходим пакет `tzdata`. +- Проверьте настройки брандмауэра. +- Если по какой-либо причине вы не можете получить доступ к репозиторию, скачайте пакеты как описано в разделе [Начало работы](../getting_started/index.md) и установите их вручную командой `sudo dpkg -i `. Также, необходим пакет `tzdata`. ## Соединение с сервером {#troubleshooting-accepts-no-connections} Возможные проблемы: -- Сервер не запущен. -- Неожиданные или неправильные параметры конфигурации. +- Сервер не запущен. +- Неожиданные или неправильные параметры конфигурации. -### Сервер не запущен +### Сервер не запущен {#server-ne-zapushchen} **Проверьте, запущен ли сервер** Команда: -```bash +``` bash $ sudo service clickhouse-server status ``` Если сервер не запущен, запустите его с помощью команды: -```bash +``` bash $ sudo service clickhouse-server start ``` @@ -41,24 +41,24 @@ $ sudo service clickhouse-server start В случае успешного запуска вы должны увидеть строки, содержащие: -- ` Application: starting up.` — сервер запускается. -- ` Application: Ready for connections.` — сервер запущен и готов принимать соединения. +- ` Application: starting up.` — сервер запускается. +- ` Application: Ready for connections.` — сервер запущен и готов принимать соединения. -Если `clickhouse-server` не запустился из-за ошибки конфигурации вы увидите `` строку с описанием ошибки. Например: +Если `clickhouse-server` не запустился из-за ошибки конфигурации вы увидите `` строку с описанием ошибки. 
Например: -```text +``` text 2019.01.11 15:23:25.549505 [ 45 ] {} ExternalDictionaries: Failed reloading 'event2id' external dictionary: Poco::Exception. Code: 1000, e.code() = 111, e.displayText() = Connection refused, e.what() = Connection refused ``` Если вы не видите ошибки в конце файла, просмотрите весь файл начиная со строки: -```text +``` text Application: starting up. ``` При попытке запустить второй экземпляр `clickhouse-server` журнал выглядит следующим образом: -```text +``` text 2019.01.11 15:25:11.151730 [ 1 ] {} : Starting ClickHouse 19.1.0 with revision 54413 2019.01.11 15:25:11.154578 [ 1 ] {} Application: starting up 2019.01.11 15:25:11.156361 [ 1 ] {} StatusFile: Status file ./status already exists - unclean restart. Contents: @@ -76,54 +76,54 @@ Revision: 54413 Если из логов `clickhouse-server` вы не получили необходимой информации или логов нет, то вы можете посмотреть логи `system.d` командой: -```bash +``` bash $ sudo journalctl -u clickhouse-server ``` **Запустите clickhouse-server в интерактивном режиме** -```bash +``` bash $ sudo -u clickhouse /usr/bin/clickhouse-server --config-file /etc/clickhouse-server/config.xml ``` Эта команда запускает сервер как интерактивное приложение со стандартными параметрами скрипта автозапуска. В этом режиме `clickhouse-server` выводит сообщения в консоль. -### Параметры конфигурации +### Параметры конфигурации {#parametry-konfiguratsii} Проверьте: -- Настройки Docker. +- Настройки Docker. - При запуске ClickHouse в Docker в сети IPv6 убедитесь, что установлено `network=host`. + При запуске ClickHouse в Docker в сети IPv6 убедитесь, что установлено `network=host`. -- Параметры endpoint. +- Параметры endpoint. - Проверьте настройки [listen_host](server_settings/settings.md#server_settings-listen_host) и [tcp_port](server_settings/settings.md#server_settings-tcp_port). + Проверьте настройки [listen_host](server_settings/settings.md#server_settings-listen_host) и [tcp_port](server_settings/settings.md#server_settings-tcp_port). - По умолчанию, сервер ClickHouse принимает только локальные подключения. + По умолчанию, сервер ClickHouse принимает только локальные подключения. -- Настройки протокола HTTP. +- Настройки протокола HTTP. - Проверьте настройки протокола для HTTP API. + Проверьте настройки протокола для HTTP API. -- Параметры безопасного подключения. +- Параметры безопасного подключения. - Проверьте: + Проверьте: - - Настройку `tcp_port_secure`. - - Параметры для SSL-сертификатов. + - Настройку `tcp_port_secure`. + - Параметры для SSL-сертификатов. - Используйте правильные параметры при подключении. Например, используйте параметр `port_secure` при использовании `clickhouse_client`. + Используйте правильные параметры при подключении. Например, используйте параметр `port_secure` при использовании `clickhouse_client`. -- Настройки пользователей. +- Настройки пользователей. - Возможно, вы используете неверное имя пользователя или пароль. + Возможно, вы используете неверное имя пользователя или пароль. ## Обработка запросов {#troubleshooting-does-not-process-queries} Если ClickHouse не может обработать запрос, он отправляет клиенту описание ошибки. В `clickhouse-client` вы получаете описание ошибки в консоли. При использовании интерфейса HTTP, ClickHouse отправляет описание ошибки в теле ответа. Например: -```bash +``` bash $ curl 'http://localhost:8123/' --data-binary "SELECT a" Code: 47, e.displayText() = DB::Exception: Unknown identifier: a. 
Note that there are no tables (FROM clause) in your query, context: required_names: 'a' source_tables: table_aliases: private_aliases: column_aliases: public_columns: 'a' masked_columns: array_join_columns: source_columns: , e.what() = DB::Exception ``` diff --git a/docs/ru/operations/update.md b/docs/ru/operations/update.md index eb870c2de9c..f1998864f40 100644 --- a/docs/ru/operations/update.md +++ b/docs/ru/operations/update.md @@ -1,8 +1,8 @@ -# Обновление ClickHouse +# Обновление ClickHouse {#obnovlenie-clickhouse} Если ClickHouse установлен с помощью deb-пакетов, выполните следующие команды на сервере: -```bash +``` bash $ sudo apt-get update $ sudo apt-get install clickhouse-client clickhouse-server $ sudo service clickhouse-server restart diff --git a/docs/ru/operations/utils/clickhouse-copier.md b/docs/ru/operations/utils/clickhouse-copier.md index 1f8ba4bc5e5..291c36993c6 100644 --- a/docs/ru/operations/utils/clickhouse-copier.md +++ b/docs/ru/operations/utils/clickhouse-copier.md @@ -1,5 +1,4 @@ - -# clickhouse-copier +# clickhouse-copier {#clickhouse-copier} Копирует данные из таблиц одного кластера в таблицы другого (или этого же) кластера. @@ -7,37 +6,39 @@ После запуска, `clickhouse-copier`: -- Соединяется с ZooKeeper и получает: - - Задания на копирование. - - Состояние заданий на копирование. -- Выполняет задания. +- Соединяется с ZooKeeper и получает: - Каждый запущенный процесс выбирает "ближайший" шард исходного кластера и копирует данные в кластер назначения, при необходимости перешардируя их. + - Задания на копирование. + - Состояние заданий на копирование. -`clickhouse-copier` отслеживает изменения в ZooKeeper и применяет их "на лету". +- Выполняет задания. + + Каждый запущенный процесс выбирает "ближайший" шард исходного кластера и копирует данные в кластер назначения, при необходимости перешардируя их. + +`clickhouse-copier` отслеживает изменения в ZooKeeper и применяет их «на лету». Для снижения сетевого трафика рекомендуем запускать `clickhouse-copier` на том же сервере, где находятся исходные данные. -## Запуск clickhouse-copier +## Запуск clickhouse-copier {#zapusk-clickhouse-copier} Утилиту следует запускать вручную следующим образом: -```bash +``` bash $ clickhouse-copier copier --daemon --config zookeeper.xml --task-path /task/path --base-dir /path/to/dir ``` Параметры запуска: -- `daemon` - запускает `clickhouse-copier` в режиме демона. -- `config` - путь к файлу `zookeeper.xml` с параметрами соединения с ZooKeeper. -- `task-path` - путь к ноде ZooKeeper. Нода используется для синхронизации между процессами `clickhouse-copier` и для хранения заданий. Задания хранятся в `$task-path/description`. -- `task-file` - необязательный путь к файлу с описанием конфигурация заданий для загрузки в ZooKeeper. -- `task-upload-force` - Загрузить `task-file` в ZooKeeper даже если уже было загружено. -- `base-dir` - путь к логам и вспомогательным файлам. При запуске `clickhouse-copier` создает в `$base-dir` подкаталоги `clickhouse-copier_YYYYMMHHSS_`. Если параметр не указан, то каталоги будут создаваться в каталоге, где `clickhouse-copier` был запущен. +- `daemon` - запускает `clickhouse-copier` в режиме демона. +- `config` - путь к файлу `zookeeper.xml` с параметрами соединения с ZooKeeper. +- `task-path` - путь к ноде ZooKeeper. Нода используется для синхронизации между процессами `clickhouse-copier` и для хранения заданий. Задания хранятся в `$task-path/description`. +- `task-file` - необязательный путь к файлу с описанием конфигурация заданий для загрузки в ZooKeeper. 
+- `task-upload-force` - Загрузить `task-file` в ZooKeeper даже если уже было загружено. +- `base-dir` - путь к логам и вспомогательным файлам. При запуске `clickhouse-copier` создает в `$base-dir` подкаталоги `clickhouse-copier_YYYYMMHHSS_`. Если параметр не указан, то каталоги будут создаваться в каталоге, где `clickhouse-copier` был запущен. -## Формат zookeeper.xml +## Формат zookeeper.xml {#format-zookeeper-xml} -```xml +``` xml trace @@ -54,9 +55,9 @@ $ clickhouse-copier copier --daemon --config zookeeper.xml --task-path /task/pat ``` -## Конфигурация заданий на копирование +## Конфигурация заданий на копирование {#konfiguratsiia-zadanii-na-kopirovanie} -```xml +``` xml @@ -163,6 +164,6 @@ $ clickhouse-copier copier --daemon --config zookeeper.xml --task-path /task/pat ``` -`clickhouse-copier` отслеживает изменения `/task/path/description` и применяет их "на лету". Если вы поменяете, например, значение `max_workers`, то количество процессов, выполняющих задания, также изменится. +`clickhouse-copier` отслеживает изменения `/task/path/description` и применяет их «на лету». Если вы поменяете, например, значение `max_workers`, то количество процессов, выполняющих задания, также изменится. [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/utils/clickhouse-copier/) diff --git a/docs/ru/operations/utils/clickhouse-local.md b/docs/ru/operations/utils/clickhouse-local.md index 02752dfe8bc..bcc34deea4e 100644 --- a/docs/ru/operations/utils/clickhouse-local.md +++ b/docs/ru/operations/utils/clickhouse-local.md @@ -1,5 +1,4 @@ - -# clickhouse-local +# clickhouse-local {#clickhouse-local} Принимает на вход данные, которые можно представить в табличном виде и выполняет над ними операции, заданные на [языке запросов](../../query_language/index.md) ClickHouse. @@ -7,53 +6,52 @@ `clickhouse-local` при настройке по умолчанию не имеет доступа к данным, которыми управляет сервер ClickHouse, установленный на этом же хосте, однако можно подключить конфигурацию сервера с помощью ключа `--config-file`. -!!! warning +!!! warning "Warning" Мы не рекомендуем подключать серверную конфигурацию к `clickhouse-local`, поскольку данные можно легко повредить неосторожными действиями. -## Вызов программы +## Вызов программы {#vyzov-programmy} Основной формат вызова: -```bash +``` bash $ clickhouse-local --structure "table_structure" --input-format "format_of_incoming_data" -q "query" ``` Ключи команды: -- `-S`, `--structure` — структура таблицы, в которую будут помещены входящие данные. -- `-if`, `--input-format` — формат входящих данных. По умолчанию — `TSV`. -- `-f`, `--file` — путь к файлу с данными. По умолчанию — `stdin`. -- `-q`, `--query` — запросы на выполнение. Разделитель запросов — `;`. -- `-N`, `--table` — имя таблицы, в которую будут помещены входящие данные. По умолчанию - `table`. -- `-of`, `--format`, `--output-format` — формат выходных данных. По умолчанию — `TSV`. -- `--stacktrace` — вывод отладочной информации при исключениях. -- `--verbose` — подробный вывод при выполнении запроса. -- `-s` — отключает вывод системных логов в `stderr`. -- `--config-file` — путь к файлу конфигурации. По умолчанию `clickhouse-local` запускается с пустой конфигурацией. Конфигурационный файл имеет тот же формат, что и для сервера ClickHouse и в нём можно использовать все конфигурационные параметры сервера. Обычно подключение конфигурации не требуется, если требуется установить отдельный параметр, то это можно сделать ключом с именем параметра. 
-- `--help` — вывод справочной информации о `clickhouse-local`. +- `-S`, `--structure` — структура таблицы, в которую будут помещены входящие данные. +- `-if`, `--input-format` — формат входящих данных. По умолчанию — `TSV`. +- `-f`, `--file` — путь к файлу с данными. По умолчанию — `stdin`. +- `-q`, `--query` — запросы на выполнение. Разделитель запросов — `;`. +- `-N`, `--table` — имя таблицы, в которую будут помещены входящие данные. По умолчанию - `table`. +- `-of`, `--format`, `--output-format` — формат выходных данных. По умолчанию — `TSV`. +- `--stacktrace` — вывод отладочной информации при исключениях. +- `--verbose` — подробный вывод при выполнении запроса. +- `-s` — отключает вывод системных логов в `stderr`. +- `--config-file` — путь к файлу конфигурации. По умолчанию `clickhouse-local` запускается с пустой конфигурацией. Конфигурационный файл имеет тот же формат, что и для сервера ClickHouse и в нём можно использовать все конфигурационные параметры сервера. Обычно подключение конфигурации не требуется, если требуется установить отдельный параметр, то это можно сделать ключом с именем параметра. +- `--help` — вывод справочной информации о `clickhouse-local`. +## Примеры вызова {#primery-vyzova} -## Примеры вызова - -```bash +``` bash $ echo -e "1,2\n3,4" | clickhouse-local -S "a Int64, b Int64" -if "CSV" -q "SELECT * FROM table" Read 2 rows, 32.00 B in 0.000 sec., 5182 rows/sec., 80.97 KiB/sec. -1 2 -3 4 +1 2 +3 4 ``` Вызов выше эквивалентен следующему: -```bash +``` bash $ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin); SELECT a, b FROM table; DROP TABLE table" Read 2 rows, 32.00 B in 0.000 sec., 4987 rows/sec., 77.93 KiB/sec. -1 2 -3 4 +1 2 +3 4 ``` А теперь давайте выведем на экран объём оперативной памяти, занимаемой пользователями (Unix): -```bash +``` bash $ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' | clickhouse-local -S "user String, mem Float64" -q "SELECT user, round(sum(mem), 2) as memTotal FROM table GROUP BY user ORDER BY memTotal DESC FORMAT Pretty" Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec. ┏━━━━━━━━━━┳━━━━━━━━━━┓ diff --git a/docs/ru/operations/utils/index.md b/docs/ru/operations/utils/index.md index ef17f5d99c2..91ee649ee3a 100644 --- a/docs/ru/operations/utils/index.md +++ b/docs/ru/operations/utils/index.md @@ -1,6 +1,6 @@ -# Утилиты ClickHouse +# Утилиты ClickHouse {#utility-clickhouse} -* [clickhouse-local](clickhouse-local.md) -* [clickhouse-copier](clickhouse-copier.md) - копирует (и перешардирует) данные с одного кластера на другой. +- [clickhouse-local](clickhouse-local.md) +- [clickhouse-copier](clickhouse-copier.md) - копирует (и перешардирует) данные с одного кластера на другой. [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/utils/) diff --git a/docs/ru/query_language/agg_functions/combinators.md b/docs/ru/query_language/agg_functions/combinators.md index 18fd5fabe55..2cc94a18521 100644 --- a/docs/ru/query_language/agg_functions/combinators.md +++ b/docs/ru/query_language/agg_functions/combinators.md @@ -1,4 +1,4 @@ -# Комбинаторы агрегатных функций {#aggregate_functions_combinators} +# Комбинаторы агрегатных функций {#aggregate-functions-combinators} К имени агрегатной функции может быть приписан некоторый суффикс. При этом, работа агрегатной функции некоторым образом модифицируется. 
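As a quick illustration of the suffix mechanism, here is a hedged sketch using the `-If` combinator discussed below; the `orders` table, the `amount` column and the `status` values are invented for the example:

``` sql
-- sum plus the -If suffix: only rows where the condition holds are aggregated.
SELECT
    sumIf(amount, status = 'paid')   AS paid_total,
    sumIf(amount, status = 'refund') AS refund_total
FROM orders
```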
@@ -11,7 +11,7 @@ С помощью условных агрегатных функций, вы можете вычислить агрегаты сразу для нескольких условий, не используя подзапросы и `JOIN`-ы. Например, в Яндекс.Метрике, условные агрегатные функции используются для реализации функциональности сравнения сегментов. -## -Array +## -Array {#array} К имени любой агрегатной функции может быть приписан суффикс -Array. В этом случае, агрегатная функция вместо аргументов типов T принимает аргументы типов Array(T) (массивы). Если агрегатная функция принимает несколько аргументов, то это должны быть массивы одинаковых длин. При обработке массивов, агрегатная функция работает, как исходная агрегатная функция по всем элементам массивов. @@ -21,60 +21,55 @@ Комбинаторы -If и -Array можно сочетать. При этом, должен сначала идти Array, а потом If. Примеры: `uniqArrayIf(arr, cond)`, `quantilesTimingArrayIf(level1, level2)(arr, cond)`. Из-за такого порядка получается, что аргумент cond не должен быть массивом. -## -State +## -State {#state} В случае применения этого комбинатора, агрегатная функция возвращает не готовое значение (например, в случае функции [uniq](reference.md#agg_function-uniq) — количество уникальных значений), а промежуточное состояние агрегации (например, в случае функции `uniq` — хэш-таблицу для расчёта количества уникальных значений), которое имеет тип `AggregateFunction(...)` и может использоваться для дальнейшей обработки или может быть сохранено в таблицу для последующей доагрегации. Для работы с промежуточными состояниями предназначены: -- Движок таблиц [AggregatingMergeTree](../../operations/table_engines/aggregatingmergetree.md). -- Функция [finalizeAggregation](../functions/other_functions.md#function-finalizeaggregation). -- Функция [runningAccumulate](../functions/other_functions.md#function-runningaccumulate). -- Комбинатор [-Merge](#aggregate_functions_combinators_merge). -- Комбинатор [-MergeState](#aggregate_functions_combinators_mergestate). +- Движок таблиц [AggregatingMergeTree](../../operations/table_engines/aggregatingmergetree.md). +- Функция [finalizeAggregation](../functions/other_functions.md#function-finalizeaggregation). +- Функция [runningAccumulate](../functions/other_functions.md#function-runningaccumulate). +- Комбинатор [-Merge](#aggregate_functions_combinators_merge). +- Комбинатор [-MergeState](#aggregate_functions_combinators_mergestate). - -## -Merge {#aggregate_functions_combinators_merge} +## -Merge {#aggregate-functions-combinators-merge} В случае применения этого комбинатора, агрегатная функция будет принимать в качестве аргумента промежуточное состояние агрегации, доагрегировать (объединять вместе) эти состояния, и возвращать готовое значение. -## -MergeState {#aggregate_functions_combinators_mergestate} +## -MergeState {#aggregate-functions-combinators-mergestate} Выполняет слияние промежуточных состояний агрегации, аналогично комбинатору -Merge, но возвращает не готовое значение, а промежуточное состояние агрегации, аналогично комбинатору -State. -## -ForEach +## -ForEach {#foreach} Преобразует агрегатную функцию для таблиц в агрегатную функцию для массивов, которая применяет агрегирование для соответствующих элементов массивов и возвращает массив результатов. Например, `sumForEach` для массивов `[1, 2]`, `[3, 4, 5]` и `[6, 7]` даст результат `[10, 13, 5]`, сложив соответственные элементы массивов. 
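The quoted `sumForEach` result can be reproduced with a self-contained query over inline arrays (a sketch added here, not taken from the original page):

``` sql
-- Sums the arrays element by element; position 3 is present in only one array,
-- so the expected result is [10, 13, 5].
SELECT sumForEach(arr)
FROM
(
    SELECT [1, 2] AS arr
    UNION ALL SELECT [3, 4, 5] AS arr
    UNION ALL SELECT [6, 7] AS arr
)
```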
- -## -Resample {#agg_functions-combinator-resample} - +## -Resample {#agg-functions-combinator-resample} Позволяет поделить данные на группы, а затем по-отдельности агрегирует данные для этих групп. Группы образуются разбиением значений одного из столбцов на интервалы. -```sql +``` sql Resample(start, end, step)(, resampling_key) ``` **Параметры** -- `start` — начальное значение для интервала значений `resampling_key`. -- `stop` — конечное значение для интервала значений `resampling_key`. Интервал не включает значение `stop` (`[start, stop)`). -- `step` — шаг деления полного интервала на подинтервалы. Функция `aggFunction` выполняется для каждого из подинтервалов независимо. -- `resampling_key` — столбец, значения которого используются для разделения данных на интервалы. -- `aggFunction_params` — параметры `aggFunction`. - +- `start` — начальное значение для интервала значений `resampling_key`. +- `stop` — конечное значение для интервала значений `resampling_key`. Интервал не включает значение `stop` (`[start, stop)`). +- `step` — шаг деления полного интервала на подинтервалы. Функция `aggFunction` выполняется для каждого из подинтервалов независимо. +- `resampling_key` — столбец, значения которого используются для разделения данных на интервалы. +- `aggFunction_params` — параметры `aggFunction`. **Возвращаемые значения** -- Массив результатов `aggFunction` для каждого подинтервала. +- Массив результатов `aggFunction` для каждого подинтервала. **Пример** - Рассмотрим таблицу `people` со следующими данными: -```text +``` text ┌─name───┬─age─┬─wage─┐ │ John │ 16 │ 10 │ │ Alice │ 30 │ 15 │ @@ -89,10 +84,11 @@ Чтобы собрать имена в массив, возьмём агрегатную функцию [groupArray](reference.md#agg_function-grouparray). Она принимает один аргумент. В нашем случае, это столбец `name`. Функция `groupArrayResample` должна использовать столбец `age` для агрегирования имён по возрасту. Чтобы определить необходимые интервалы, передадим в функцию `groupArrayResample` аргументы `30, 75, 30`. -```sql +``` sql SELECT groupArrayResample(30, 75, 30)(name, age) from people ``` -```text + +``` text ┌─groupArrayResample(30, 75, 30)(name, age)─────┐ │ [['Alice','Mary','Evelyn'],['David','Brian']] │ └───────────────────────────────────────────────┘ @@ -104,14 +100,14 @@ SELECT groupArrayResample(30, 75, 30)(name, age) from people Теперь посчитаем общее количество людей и их среднюю заработную плату в заданных возрастных интервалах. - -```sql +``` sql SELECT countResample(30, 75, 30)(name, age) AS amount, avgResample(30, 75, 30)(wage, age) AS avg_wage FROM people ``` -```text + +``` text ┌─amount─┬─avg_wage──────────────────┐ │ [3,2] │ [11.5,12.949999809265137] │ └────────┴───────────────────────────┘ diff --git a/docs/ru/query_language/agg_functions/index.md b/docs/ru/query_language/agg_functions/index.md index f1adc7f1c78..e0c07a1c95e 100644 --- a/docs/ru/query_language/agg_functions/index.md +++ b/docs/ru/query_language/agg_functions/index.md @@ -1,13 +1,13 @@ -# Агрегатные функции {#aggregate_functions} +# Агрегатные функции {#aggregate-functions} Агрегатные функции работают в [привычном](http://www.sql-tutorial.com/sql-aggregate-functions-sql-tutorial) для специалистов по базам данных смысле. ClickHouse поддерживает также: -- [Параметрические агрегатные функции](parametric_functions.md#aggregate_functions_parametric), которые помимо столбцов принимаю и другие параметры. -- [Комбинаторы](combinators.md#aggregate_functions_combinators), которые изменяют поведение агрегатных функций. 
+- [Параметрические агрегатные функции](parametric_functions.md#aggregate_functions_parametric), которые помимо столбцов принимаю и другие параметры. +- [Комбинаторы](combinators.md#aggregate_functions_combinators), которые изменяют поведение агрегатных функций. -## Обработка NULL +## Обработка NULL {#obrabotka-null} При агрегации все `NULL` пропускаются. @@ -15,7 +15,7 @@ ClickHouse поддерживает также: Рассмотрим таблицу: -```text +``` text ┌─x─┬────y─┐ │ 1 │ 2 │ │ 2 │ ᴺᵁᴸᴸ │ @@ -27,25 +27,25 @@ ClickHouse поддерживает также: Выполним суммирование значений в столбце `y`: -```sql +``` sql SELECT sum(y) FROM t_null_big ``` -```text + +``` text ┌─sum(y)─┐ │ 7 │ └────────┘ - ``` Функция `sum` работает с `NULL` как с `0`. В частности, это означает, что если на вход в функцию подать выборку, где все значения `NULL`, то результат будет `0`, а не `NULL`. - Теперь с помощью функции `groupArray` сформируем массив из столбца `y`: -```sql +``` sql SELECT groupArray(y) FROM t_null_big ``` -```text + +``` text ┌─groupArray(y)─┐ │ [2,2,3] │ └───────────────┘ diff --git a/docs/ru/query_language/agg_functions/parametric_functions.md b/docs/ru/query_language/agg_functions/parametric_functions.md index 54cd282c733..fe817e6b5a9 100644 --- a/docs/ru/query_language/agg_functions/parametric_functions.md +++ b/docs/ru/query_language/agg_functions/parametric_functions.md @@ -1,15 +1,13 @@ -# Параметрические агрегатные функции {#aggregate_functions_parametric} +# Параметрические агрегатные функции {#aggregate-functions-parametric} Некоторые агрегатные функции могут принимать не только столбцы-аргументы (по которым производится свёртка), но и набор параметров - констант для инициализации. Синтаксис - две пары круглых скобок вместо одной. Первая - для параметров, вторая - для аргументов. -## histogram +## histogram {#histogram} Рассчитывает адаптивную гистограмму. Не гарантирует точного результата. -``` -histogram(number_of_bins)(values) -``` - + histogram(number_of_bins)(values) + Функция использует [A Streaming Parallel Decision Tree Algorithm](http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf). Границы столбцов устанавливаются по мере поступления новых данных в функцию. В общем случае столбцы имею разную ширину. **Параметры** @@ -19,27 +17,28 @@ histogram(number_of_bins)(values) **Возвращаемые значения** -- [Массив](../../data_types/array.md) [кортежей](../../data_types/tuple.md) следующего вида: +- [Массив](../../data_types/array.md) [кортежей](../../data_types/tuple.md) следующего вида: - ``` - [(lower_1, upper_1, height_1), ... (lower_N, upper_N, height_N)] - ``` + ``` + [(lower_1, upper_1, height_1), ... (lower_N, upper_N, height_N)] + ``` - - `lower` — нижняя граница корзины. - - `upper` — верхняя граница корзины. - - `height` — количество значений в корзине. + - `lower` — нижняя граница корзины. + - `upper` — верхняя граница корзины. + - `height` — количество значений в корзине. 
**Пример** -```sql -SELECT histogram(5)(number + 1) +``` sql +SELECT histogram(5)(number + 1) FROM ( - SELECT * - FROM system.numbers + SELECT * + FROM system.numbers LIMIT 20 ) ``` -```text + +``` text ┌─histogram(5)(plus(number, 1))───────────────────────────────────────────┐ │ [(1,4.5,4),(4.5,8.5,4),(8.5,12.75,4.125),(12.75,17,4.625),(17,20,3.25)] │ └─────────────────────────────────────────────────────────────────────────┘ @@ -47,19 +46,20 @@ FROM ( С помощью функции [bar](../functions/other_functions.md#function-bar) можно визуализировать гистограмму, например: -```sql +``` sql WITH histogram(5)(rand() % 100) AS hist -SELECT - arrayJoin(hist).3 AS height, +SELECT + arrayJoin(hist).3 AS height, bar(height, 0, 6, 5) AS bar -FROM +FROM ( SELECT * FROM system.numbers LIMIT 20 ) ``` -```text + +``` text ┌─height─┬─bar───┐ │ 2.125 │ █▋ │ │ 3.25 │ ██▌ │ @@ -71,48 +71,46 @@ FROM В этом случае необходимо помнить, что границы корзин гистограммы не известны. -## sequenceMatch(pattern)(timestamp, cond1, cond2, ...) {#function-sequencematch} +## sequenceMatch(pattern)(timestamp, cond1, cond2, …) {#function-sequencematch} Проверяет, содержит ли последовательность событий цепочку, которая соответствует указанному шаблону. -```sql +``` sql sequenceMatch(pattern)(timestamp, cond1, cond2, ...) ``` !!! warning "Предупреждение" События, произошедшие в одну и ту же секунду, располагаются в последовательности в неопределенном порядке, что может повлиять на результат работы функции. - **Параметры** -- `pattern` — строка с шаблоном. Смотрите [Синтаксис шаблонов](#sequence-function-pattern-syntax). +- `pattern` — строка с шаблоном. Смотрите [Синтаксис шаблонов](#sequence-function-pattern-syntax). -- `timestamp` — столбец, содержащий метки времени. Типичный тип данных столбца — `Date` или `DateTime`. Также можно использовать любой из поддержанных типов данных [UInt](../../data_types/int_uint.md). - -- `cond1`, `cond2` — условия, описывающие цепочку событий. Тип данных — `UInt8`. Можно использовать до 32 условий. Функция учитывает только те события, которые указаны в условиях. Функция пропускает данные из последовательности, если они не описаны ни в одном из условий. +- `timestamp` — столбец, содержащий метки времени. Типичный тип данных столбца — `Date` или `DateTime`. Также можно использовать любой из поддержанных типов данных [UInt](../../data_types/int_uint.md). +- `cond1`, `cond2` — условия, описывающие цепочку событий. Тип данных — `UInt8`. Можно использовать до 32 условий. Функция учитывает только те события, которые указаны в условиях. Функция пропускает данные из последовательности, если они не описаны ни в одном из условий. **Возвращаемые значения** -- 1, если цепочка событий, соответствующая шаблону найдена. -- 0, если цепочка событий, соответствующая шаблону не найдена. +- 1, если цепочка событий, соответствующая шаблону найдена. +- 0, если цепочка событий, соответствующая шаблону не найдена. Тип: `UInt8`. **Синтаксис шаблонов** -- `(?N)` — соответствует условию на позиции `N`. Условия пронумерованы по порядку в диапазоне `[1, 32]`. Например, `(?1)` соответствует условию, заданному параметром `cond1`. +- `(?N)` — соответствует условию на позиции `N`. Условия пронумерованы по порядку в диапазоне `[1, 32]`. Например, `(?1)` соответствует условию, заданному параметром `cond1`. -- `.*` — соответствует любому количеству событий. Для этого элемента шаблона не надо задавать условия. +- `.*` — соответствует любому количеству событий. Для этого элемента шаблона не надо задавать условия. 
-- `(?t operator value)` — устанавливает время в секундах, которое должно разделять два события. Например, шаблон `(?1)(?t>1800)(?2)` соответствует событиям, которые произошли более чем через 1800 секунд друг от друга. Между этими событиями может находиться произвольное количество любых событий. Операторы могут быть `>=`, `>`, `<`, `<=`. +- `(?t operator value)` — устанавливает время в секундах, которое должно разделять два события. Например, шаблон `(?1)(?t>1800)(?2)` соответствует событиям, которые произошли более чем через 1800 секунд друг от друга. Между этими событиями может находиться произвольное количество любых событий. Операторы могут быть `>=`, `>`, `<`, `<=`. **Примеры** Пусть таблица `t` содержит следующие данные: -```text +``` text ┌─time─┬─number─┐ │ 1 │ 1 │ │ 2 │ 3 │ @@ -122,10 +120,11 @@ sequenceMatch(pattern)(timestamp, cond1, cond2, ...) Выполним запрос: -```sql +``` sql SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2) FROM t ``` -```text + +``` text ┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2))─┐ │ 1 │ └───────────────────────────────────────────────────────────────────────┘ @@ -133,10 +132,11 @@ SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2) FROM t Функция нашла цепочку событий, в которой число 2 следует за числом 1. Число 3 между ними было пропущено, поскольку оно не было использовано ни в одном из условий. -```sql +``` sql SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 3) FROM t ``` -```text + +``` text ┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2), equals(number, 3))─┐ │ 0 │ └──────────────────────────────────────────────────────────────────────────────────────────┘ @@ -144,43 +144,42 @@ SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 3) FROM В этом случае функция не может найти цепочку событий, соответствующую шаблону, поскольку событие для числа 3 произошло между 1 и 2. Если бы в этом же случае мы бы проверяли условие на событие для числа 4, то цепочка бы соответствовала шаблону. -```sql +``` sql SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM t ``` -```text + +``` text ┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2), equals(number, 4))─┐ │ 1 │ └──────────────────────────────────────────────────────────────────────────────────────────┘ ``` - **Смотрите также** -- [sequenceCount](#function-sequencecount) +- [sequenceCount](#function-sequencecount) - -## sequenceCount(pattern)(time, cond1, cond2, ...) {#function-sequencecount} +## sequenceCount(pattern)(time, cond1, cond2, …) {#function-sequencecount} Вычисляет количество цепочек событий, соответствующих шаблону. Функция обнаруживает только непересекающиеся цепочки событий. Она начитает искать следующую цепочку только после того, как полностью совпала текущая цепочка событий. !!! warning "Предупреждение" События, произошедшие в одну и ту же секунду, располагаются в последовательности в неопределенном порядке, что может повлиять на результат работы функции. -```sql +``` sql sequenceCount(pattern)(timestamp, cond1, cond2, ...) ``` **Параметры** -- `pattern` — строка с шаблоном. Смотрите [Синтаксис шаблонов](#sequence-function-pattern-syntax). +- `pattern` — строка с шаблоном. Смотрите [Синтаксис шаблонов](#sequence-function-pattern-syntax). -- `timestamp` — столбец, содержащий метки времени. Типичный тип данных столбца — `Date` или `DateTime`. 
Также можно использовать любой из поддержанных типов данных [UInt](../../data_types/int_uint.md). +- `timestamp` — столбец, содержащий метки времени. Типичный тип данных столбца — `Date` или `DateTime`. Также можно использовать любой из поддержанных типов данных [UInt](../../data_types/int_uint.md). -- `cond1`, `cond2` — условия, описывающие цепочку событий. Тип данных — `UInt8`. Можно использовать до 32 условий. Функция учитывает только те события, которые указаны в условиях. Функция пропускает данные из последовательности, если они не описаны ни в одном из условий. +- `cond1`, `cond2` — условия, описывающие цепочку событий. Тип данных — `UInt8`. Можно использовать до 32 условий. Функция учитывает только те события, которые указаны в условиях. Функция пропускает данные из последовательности, если они не описаны ни в одном из условий. **Возвращаемое значение** -- Число непересекающихся цепочек событий, соответствущих шаблону. +- Число непересекающихся цепочек событий, соответствущих шаблону. Тип: `UInt64`. @@ -188,7 +187,7 @@ sequenceCount(pattern)(timestamp, cond1, cond2, ...) Пусть таблица `t` содержит следующие данные: -```text +``` text ┌─time─┬─number─┐ │ 1 │ 1 │ │ 2 │ 3 │ @@ -201,10 +200,11 @@ sequenceCount(pattern)(timestamp, cond1, cond2, ...) Вычислим сколько раз число 2 стоит после числа 1, причем между 1 и 2 могут быть любые числа: -```sql +``` sql SELECT sequenceCount('(?1).*(?2)')(time, number = 1, number = 2) FROM t ``` -```text + +``` text ┌─sequenceCount('(?1).*(?2)')(time, equals(number, 1), equals(number, 2))─┐ │ 2 │ └─────────────────────────────────────────────────────────────────────────┘ @@ -212,7 +212,7 @@ SELECT sequenceCount('(?1).*(?2)')(time, number = 1, number = 2) FROM t **Смотрите также** -- [sequenceMatch](#function-sequencematch) +- [sequenceMatch](#function-sequencematch) ## windowFunnel {#windowfunnel} @@ -220,24 +220,24 @@ SELECT sequenceCount('(?1).*(?2)')(time, number = 1, number = 2) FROM t Функция работает по алгоритму: -- Функция отыскивает данные, на которых срабатывает первое условие из цепочки, и присваивает счетчику событий значение 1. С этого же момента начинается отсчет времени скользящего окна. +- Функция отыскивает данные, на которых срабатывает первое условие из цепочки, и присваивает счетчику событий значение 1. С этого же момента начинается отсчет времени скользящего окна. -- Если в пределах окна последовательно попадаются события из цепочки, то счетчик увеличивается. Если последовательность событий нарушается, то счетчик не растет. +- Если в пределах окна последовательно попадаются события из цепочки, то счетчик увеличивается. Если последовательность событий нарушается, то счетчик не растет. -- Если в данных оказалось несколько цепочек разной степени завершенности, то функция выдаст только размер самой длинной цепочки. +- Если в данных оказалось несколько цепочек разной степени завершенности, то функция выдаст только размер самой длинной цепочки. **Синтаксис** -```sql +``` sql windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN) ``` **Параметры** -- `window` — ширина скользящего окна по времени в секундах. [UInt](../../data_types/int_uint.md). -- `mode` - необязательный параметр. Если установлено значение `'strict'`, то функция `windowFunnel()` применяет условия только для уникальных значений. -- `timestamp` — имя столбца, содержащего временные отметки. [Date](../../data_types/date.md), [DateTime](../../data_types/datetime.md#data_type-datetime) и другие параметры с типом `Integer`. 
В случае хранения меток времени в столбцах с типом `UInt64`, максимально допустимое значение соответствует ограничению для типа `Int64`, т.е. равно `2^63-1`. -- `cond` — условия или данные, описывающие цепочку событий. [UInt8](../../data_types/int_uint.md). +- `window` — ширина скользящего окна по времени в секундах. [UInt](../../data_types/int_uint.md). +- `mode` - необязательный параметр. Если установлено значение `'strict'`, то функция `windowFunnel()` применяет условия только для уникальных значений. +- `timestamp` — имя столбца, содержащего временные отметки. [Date](../../data_types/date.md), [DateTime](../../data_types/datetime.md#data_type-datetime) и другие параметры с типом `Integer`. В случае хранения меток времени в столбцах с типом `UInt64`, максимально допустимое значение соответствует ограничению для типа `Int64`, т.е. равно `2^63-1`. +- `cond` — условия или данные, описывающие цепочку событий. [UInt8](../../data_types/int_uint.md). **Возвращаемое значение** @@ -251,14 +251,14 @@ windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN) Зададим следующую цепочку событий: -1. Пользователь вошел в личный кабинет (`eventID = 1001`). -2. Пользователь ищет телефон (`eventID = 1003, product = 'phone'`). -3. Пользователь сделал заказ (`eventID = 1009`) -4. Пользователь сделал повторный заказ (`eventID = 1010`). +1. Пользователь вошел в личный кабинет (`eventID = 1001`). +2. Пользователь ищет телефон (`eventID = 1003, product = 'phone'`). +3. Пользователь сделал заказ (`eventID = 1009`) +4. Пользователь сделал повторный заказ (`eventID = 1010`). Входная таблица: -```text +``` text ┌─event_date─┬─user_id─┬───────────timestamp─┬─eventID─┬─product─┐ │ 2019-01-28 │ 1 │ 2019-01-29 10:00:00 │ 1003 │ phone │ └────────────┴─────────┴─────────────────────┴─────────┴─────────┘ @@ -277,7 +277,7 @@ windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN) Запрос: -```sql +``` sql SELECT level, count() AS c @@ -296,29 +296,29 @@ ORDER BY level ASC ## retention {#retention} -Аналитическая функция, которая показывает, насколько +Аналитическая функция, которая показывает, насколько выдерживаются те или иные условия, например, удержание динамики/уровня [посещаемости сайта](https://yandex.ru/support/partner2/statistics/metrika-visitors-statistics.html?lang=ru). Функция принимает набор (от 1 до 32) логических условий, как в [WHERE](../select.md#select-where), и применяет их к заданному набору данных. Условия, кроме первого, применяются попарно: результат второго будет истинным, если истинно первое и второе, третьего - если истинно первое и третье и т. д. -**Синтаксис** +**Синтаксис** -```sql +``` sql retention(cond1, cond2, ..., cond32) ``` -**Параметры** +**Параметры** -- `cond` — вычисляемое условие или выражение, которое возвращает `UInt8` результат (1/0). +- `cond` — вычисляемое условие или выражение, которое возвращает `UInt8` результат (1/0). **Возвращаемое значение** Массив из 1 или 0. -- 1 — условие выполнено. -- 0 — условие не выполнено. +- 1 — условие выполнено. +- 0 — условие не выполнено. Тип: `UInt8`. @@ -326,9 +326,9 @@ retention(cond1, cond2, ..., cond32) Рассмотрим пример расчета функции `retention` для определения посещаемости сайта. -**1.** Создадим таблицу для илюстрации примера. +**1.** Создадим таблицу для илюстрации примера. 
-```sql +``` sql CREATE TABLE retention_test(date Date, uid Int32)ENGINE = Memory; INSERT INTO retention_test SELECT '2020-01-01', number FROM numbers(5); @@ -340,13 +340,13 @@ INSERT INTO retention_test SELECT '2020-01-03', number FROM numbers(15); Запрос: -```sql +``` sql SELECT * FROM retention_test ``` Ответ: -```text +``` text ┌───────date─┬─uid─┐ │ 2020-01-01 │ 0 │ │ 2020-01-01 │ 1 │ @@ -389,7 +389,7 @@ SELECT * FROM retention_test Запрос: -```sql +``` sql SELECT uid, retention(date = '2020-01-01', date = '2020-01-02', date = '2020-01-03') AS r @@ -401,7 +401,7 @@ ORDER BY uid ASC Результат: -```text +``` text ┌─uid─┬─r───────┐ │ 0 │ [1,1,1] │ │ 1 │ [1,1,1] │ @@ -425,7 +425,7 @@ ORDER BY uid ASC Запрос: -```sql +``` sql SELECT sum(r[1]) AS r1, sum(r[2]) AS r2, @@ -443,7 +443,7 @@ FROM Результат: -```text +``` text ┌─r1─┬─r2─┬─r3─┐ │ 5 │ 5 │ 5 │ └────┴────┴────┘ @@ -451,11 +451,11 @@ FROM Где: -- `r1` - количество уникальных посетителей за 2020-01-01 (`cond1`). -- `r2` - количество уникальных посетителей в период между 2020-01-01 и 2020-01-02 (`cond1` и `cond2`). -- `r3` - количество уникальных посетителей в период между 2020-01-01 и 2020-01-03 (`cond1` и `cond3`). +- `r1` - количество уникальных посетителей за 2020-01-01 (`cond1`). +- `r2` - количество уникальных посетителей в период между 2020-01-01 и 2020-01-02 (`cond1` и `cond2`). +- `r3` - количество уникальных посетителей в период между 2020-01-01 и 2020-01-03 (`cond1` и `cond3`). -## uniqUpTo(N)(x) +## uniqUpTo(N)(x) {#uniquptonx} Вычисляет количество различных значений аргумента, если оно меньше или равно N. В случае, если количество различных значений аргумента больше N, возвращает N + 1. @@ -471,7 +471,7 @@ FROM Пример применения: -```text +``` text Задача: показывать в отчёте только поисковые фразы, по которым было хотя бы 5 уникальных посетителей. Решение: пишем в запросе GROUP BY SearchPhrase HAVING uniqUpTo(4)(UserID) >= 5 ``` diff --git a/docs/ru/query_language/agg_functions/reference.md b/docs/ru/query_language/agg_functions/reference.md index f4ee9cffad2..63a8b064e05 100644 --- a/docs/ru/query_language/agg_functions/reference.md +++ b/docs/ru/query_language/agg_functions/reference.md @@ -1,31 +1,31 @@ -# Справочник функций +# Справочник функций {#spravochnik-funktsii} -## count {#agg_function-count} +## count {#agg-function-count} Вычисляет количество строк или не NULL значений . ClickHouse поддерживает следующие виды синтаксиса для `count`: -- `count(expr)` или `COUNT(DISTINCT expr)`. -- `count()` или `COUNT(*)`. Синтаксис `count()` специфичен для ClickHouse. +- `count(expr)` или `COUNT(DISTINCT expr)`. +- `count()` или `COUNT(*)`. Синтаксис `count()` специфичен для ClickHouse. **Параметры** Функция может принимать: -- Ноль параметров. -- Одно [выражение](../syntax.md#syntax-expressions). +- Ноль параметров. +- Одно [выражение](../syntax.md#syntax-expressions). **Возвращаемое значение** -- Если функция вызывается без параметров, она вычисляет количество строк. -- Если передаётся [выражение](../syntax.md#syntax-expressions) , то функция вычисляет количество раз, когда выражение возвращает не NULL. Если выражение возвращает значение типа [Nullable](../../data_types/nullable.md), то результат `count` не становится `Nullable`. Функция возвращает 0, если выражение возвращает `NULL` для всех строк. +- Если функция вызывается без параметров, она вычисляет количество строк. +- Если передаётся [выражение](../syntax.md#syntax-expressions) , то функция вычисляет количество раз, когда выражение возвращает не NULL. 
Если выражение возвращает значение типа [Nullable](../../data_types/nullable.md), то результат `count` не становится `Nullable`. Функция возвращает 0, если выражение возвращает `NULL` для всех строк. В обоих случаях тип возвращаемого значения [UInt64](../../data_types/int_uint.md). **Подробности** -ClickHouse поддерживает синтаксис `COUNT(DISTINCT ...)`. Поведение этой конструкции зависит от настройки [count_distinct_implementation](../../operations/settings/settings.md#settings-count_distinct_implementation). Она определяет, какая из функций [uniq*](#agg_function-uniq) используется для выполнения операции. По умолчанию — функция [uniqExact](#agg_function-uniqexact). +ClickHouse поддерживает синтаксис `COUNT(DISTINCT ...)`. Поведение этой конструкции зависит от настройки [count\_distinct\_implementation](../../operations/settings/settings.md#settings-count_distinct_implementation). Она определяет, какая из функций [uniq\*](#agg_function-uniq) используется для выполнения операции. По умолчанию — функция [uniqExact](#agg_function-uniqexact). Запрос `SELECT count() FROM table` не оптимизирован, поскольку количество записей в таблице не хранится отдельно. Он выбирает небольшой столбец из таблицы и подсчитывает количество значений в нём. @@ -33,11 +33,11 @@ ClickHouse поддерживает синтаксис `COUNT(DISTINCT ...)`. П Пример 1: -```sql +``` sql SELECT count() FROM t ``` -```text +``` text ┌─count()─┐ │ 5 │ └─────────┘ @@ -45,21 +45,21 @@ SELECT count() FROM t Пример 2: -```sql +``` sql SELECT name, value FROM system.settings WHERE name = 'count_distinct_implementation' ``` -```text +``` text ┌─name──────────────────────────┬─value─────┐ │ count_distinct_implementation │ uniqExact │ └───────────────────────────────┴───────────┘ ``` -```sql +``` sql SELECT count(DISTINCT num) FROM t ``` -```text +``` text ┌─uniqExact(num)─┐ │ 3 │ └────────────────┘ @@ -67,7 +67,7 @@ SELECT count(DISTINCT num) FROM t Этот пример показывает, что `count(DISTINCT num)` выполняется с помощью функции `uniqExact` в соответствии со значением настройки `count_distinct_implementation`. -## any(x) {#agg_function-any} +## any(x) {#agg-function-any} Выбирает первое попавшееся значение. Порядок выполнения запроса может быть произвольным и даже каждый раз разным, поэтому результат данной функции недетерминирован. @@ -77,43 +77,43 @@ SELECT count(DISTINCT num) FROM t При наличии в запросе `SELECT` секции `GROUP BY` или хотя бы одной агрегатной функции, ClickHouse (в отличие от, например, MySQL) требует, чтобы все выражения в секциях `SELECT`, `HAVING`, `ORDER BY` вычислялись из ключей или из агрегатных функций. То есть, каждый выбираемый из таблицы столбец, должен использоваться либо в ключах, либо внутри агрегатных функций. Чтобы получить поведение, как в MySQL, вы можете поместить остальные столбцы в агрегатную функцию `any`. -## anyHeavy(x) {#agg_function-anyHeavy} +## anyHeavy(x) {#agg-function-anyheavy} -Выбирает часто встречающееся значение с помощью алгоритма "[heavy hitters](http://www.cs.umd.edu/~samir/498/karp.pdf)". Если существует значение, которое встречается чаще, чем в половине случаев, в каждом потоке выполнения запроса, то возвращается данное значение. В общем случае, результат недетерминирован. +Выбирает часто встречающееся значение с помощью алгоритма «[heavy hitters](http://www.cs.umd.edu/~samir/498/karp.pdf)». Если существует значение, которое встречается чаще, чем в половине случаев, в каждом потоке выполнения запроса, то возвращается данное значение. В общем случае, результат недетерминирован. 
-```sql +``` sql anyHeavy(column) ``` **Аргументы** -- `column` — имя столбца. +- `column` — имя столбца. **Пример** Возьмём набор данных [OnTime](../../getting_started/example_datasets/ontime.md) и выберем произвольное часто встречающееся значение в столбце `AirlineID`. -```sql +``` sql SELECT anyHeavy(AirlineID) AS res FROM ontime ``` -```text +``` text ┌───res─┐ │ 19690 │ └───────┘ ``` -## anyLast(x) {#agg_function-anyLast} +## anyLast(x) {#agg-function-anylast} Выбирает последнее попавшееся значение. Результат так же недетерминирован, как и для функции `any`. -##groupBitAnd +## groupBitAnd {#groupbitand} Применяет побитовое `И` для последовательности чисел. -```sql +``` sql groupBitAnd(expr) ``` @@ -129,7 +129,7 @@ groupBitAnd(expr) Тестовые данные: -```text +``` text binary decimal 00101100 = 44 00011100 = 28 @@ -139,7 +139,7 @@ binary decimal Запрос: -```sql +``` sql SELECT groupBitAnd(num) FROM t ``` @@ -147,16 +147,16 @@ SELECT groupBitAnd(num) FROM t Результат: -```text +``` text binary decimal 00000100 = 4 ``` -##groupBitOr +## groupBitOr {#groupbitor} Применяет побитовое `ИЛИ` для последовательности чисел. -```sql +``` sql groupBitOr(expr) ``` @@ -172,7 +172,7 @@ groupBitOr(expr) Тестовые данные: -```text +``` text binary decimal 00101100 = 44 00011100 = 28 @@ -182,7 +182,7 @@ binary decimal Запрос: -```sql +``` sql SELECT groupBitOr(num) FROM t ``` @@ -190,16 +190,16 @@ SELECT groupBitOr(num) FROM t Результат: -```text +``` text binary decimal 01111101 = 125 ``` -##groupBitXor +## groupBitXor {#groupbitxor} Применяет побитовое `ИСКЛЮЧАЮЩЕЕ ИЛИ` для последовательности чисел. -```sql +``` sql groupBitXor(expr) ``` @@ -215,7 +215,7 @@ groupBitXor(expr) Тестовые данные: -```text +``` text binary decimal 00101100 = 44 00011100 = 28 @@ -225,7 +225,7 @@ binary decimal Запрос: -```sql +``` sql SELECT groupBitXor(num) FROM t ``` @@ -233,16 +233,16 @@ SELECT groupBitXor(num) FROM t Результат: -```text +``` text binary decimal 01101000 = 104 ``` -## groupBitmap +## groupBitmap {#groupbitmap} Bitmap или агрегатные вычисления для столбца с типом данных `UInt*`, возвращают кардинальность в виде значения типа UInt64, если добавить суффикс -State, то возвращают [объект bitmap](../functions/bitmap_functions.md). -```sql +``` sql groupBitmap(expr) ``` @@ -258,7 +258,7 @@ groupBitmap(expr) Тестовые данные: -```text +``` text UserID 1 1 @@ -268,73 +268,73 @@ UserID Запрос: -```sql +``` sql SELECT groupBitmap(UserID) as num FROM t ``` Результат: -```text +``` text num 3 ``` -## min(x) {#agg_function-min} +## min(x) {#agg-function-min} Вычисляет минимум. -## max(x) {#agg_function-max} +## max(x) {#agg-function-max} Вычисляет максимум. -## argMin(arg, val) {#agg_function-argMin} +## argMin(arg, val) {#agg-function-argmin} Вычисляет значение arg при минимальном значении val. Если есть несколько разных значений arg для минимальных значений val, то выдаётся первое попавшееся из таких значений. **Пример:** -```text +``` text ┌─user─────┬─salary─┐ │ director │ 5000 │ │ manager │ 3000 │ │ worker │ 1000 │ └──────────┴────────┘ ``` -```sql + +``` sql SELECT argMin(user, salary) FROM salary ``` -```text + +``` text ┌─argMin(user, salary)─┐ │ worker │ └──────────────────────┘ ``` -## argMax(arg, val) {#agg_function-argMax} +## argMax(arg, val) {#agg-function-argmax} Вычисляет значение arg при максимальном значении val. Если есть несколько разных значений arg для максимальных значений val, то выдаётся первое попавшееся из таких значений. 
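For symmetry with the `argMin` example above, the same `salary` table can be queried with `argMax` (this query and its expected output are added here for illustration):

``` sql
SELECT argMax(user, salary) FROM salary
```

For the data shown it would return `director`, the `user` value on the row with the maximum `salary`.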
- -## sum(x) {#agg_function-sum} +## sum(x) {#agg-function-sum} Вычисляет сумму. Работает только для чисел. -## sumWithOverflow(x) {#agg_function-sumWithOverflow} +## sumWithOverflow(x) {#agg-function-sumwithoverflow} Вычисляет сумму чисел, используя для результата тот же тип данных, что и для входных параметров. Если сумма выйдет за максимальное значение для заданного типа данных, то функция вернёт ошибку. Работает только для чисел. +## sumMap(key, value) {#summapkey-value} -## sumMap(key, value) - -Производит суммирование массива 'value' по соответствующим ключам заданным в массиве 'key'. -Количество элементов в 'key' и 'value' должно быть одинаковым для каждой строки, для которой происходит суммирование. +Производит суммирование массива ‘value’ по соответствующим ключам заданным в массиве ‘key’. +Количество элементов в ‘key’ и ‘value’ должно быть одинаковым для каждой строки, для которой происходит суммирование. Возвращает кортеж из двух массивов - ключи в отсортированном порядке и значения, просуммированные по соответствующим ключам. Пример: -```sql +``` sql CREATE TABLE sum_map( date Date, timeslot DateTime, @@ -355,18 +355,18 @@ FROM sum_map GROUP BY timeslot ``` -```text +``` text ┌────────────timeslot─┬─sumMap(statusMap.status, statusMap.requests)─┐ │ 2000-01-01 00:00:00 │ ([1,2,3,4,5],[10,10,20,10,10]) │ │ 2000-01-01 00:01:00 │ ([4,5,6,7,8],[10,10,20,10,10]) │ └─────────────────────┴──────────────────────────────────────────────┘ ``` -## skewPop +## skewPop {#skewpop} Вычисляет [коэффициент асимметрии](https://ru.wikipedia.org/wiki/Коэффициент_асимметрии) для последовательности. -```sql +``` sql skewPop(expr) ``` @@ -380,17 +380,17 @@ skewPop(expr) **Пример** -```sql +``` sql SELECT skewPop(value) FROM series_with_value_column ``` -## skewSamp +## skewSamp {#skewsamp} -Вычисляет [выборочный коэффициент асимметрии](https://ru.wikipedia.org/wiki/Статистика_(функция_выборки)) для последовательности. +Вычисляет [выборочный коэффициент асимметрии](https://ru.wikipedia.org/wiki/Статистика_(функция_выборки)) для последовательности. Он представляет собой несмещенную оценку асимметрии случайной величины, если переданные значения образуют ее выборку. -```sql +``` sql skewSamp(expr) ``` @@ -404,15 +404,15 @@ skewSamp(expr) **Пример** -```sql +``` sql SELECT skewSamp(value) FROM series_with_value_column ``` -## kurtPop +## kurtPop {#kurtpop} Вычисляет [коэффициент эксцесса](https://ru.wikipedia.org/wiki/Коэффициент_эксцесса) последовательности. -```sql +``` sql kurtPop(expr) ``` @@ -426,17 +426,17 @@ kurtPop(expr) **Пример** -```sql +``` sql SELECT kurtPop(value) FROM series_with_value_column ``` -## kurtSamp +## kurtSamp {#kurtsamp} Вычисляет [выборочный коэффициент эксцесса](https://ru.wikipedia.org/wiki/Статистика_(функция_выборки)) для последовательности. Он представляет собой несмещенную оценку эксцесса случайной величины, если переданные значения образуют ее выборку. -```sql +``` sql kurtSamp(expr) ``` @@ -450,18 +450,18 @@ kurtSamp(expr) **Пример** -```sql +``` sql SELECT kurtSamp(value) FROM series_with_value_column ``` -## timeSeriesGroupSum(uid, timestamp, value) {#agg_function-timeseriesgroupsum} +## timeSeriesGroupSum(uid, timestamp, value) {#agg-function-timeseriesgroupsum} `timeSeriesGroupSum` агрегирует временные ряды в которых не совпадают моменты. Функция использует линейную интерполяцию между двумя значениями времени, а затем суммирует значения для одного и того же момента (как измеренные так и интерполированные) по всем рядам. 
-- `uid` уникальный идентификатор временного ряда, `UInt64`. -- `timestamp` имеет тип `Int64` чтобы можно было учитывать милли и микросекунды. -- `value` представляет собой значение метрики. +- `uid` уникальный идентификатор временного ряда, `UInt64`. +- `timestamp` имеет тип `Int64` чтобы можно было учитывать милли и микросекунды. +- `value` представляет собой значение метрики. Функция возвращает массив кортежей с парами `(timestamp, aggregated_value)`. @@ -469,7 +469,7 @@ SELECT kurtSamp(value) FROM series_with_value_column Пример: -```text +``` text ┌─uid─┬─timestamp─┬─value─┐ │ 1 │ 2 │ 0.2 │ │ 1 │ 7 │ 0.7 │ @@ -484,7 +484,7 @@ SELECT kurtSamp(value) FROM series_with_value_column └─────┴───────────┴───────┘ ``` -```sql +``` sql CREATE TABLE time_series( uid UInt64, timestamp Int64, @@ -502,32 +502,32 @@ FROM ( И результат будет: -```text +``` text [(2,0.2),(3,0.9),(7,2.1),(8,2.4),(12,3.6),(17,5.1),(18,5.4),(24,7.2),(25,2.5)] ``` -## timeSeriesGroupRateSum(uid, ts, val) {#agg_function-timeseriesgroupratesum} +## timeSeriesGroupRateSum(uid, ts, val) {#agg-function-timeseriesgroupratesum} Аналогично timeSeriesGroupRateSum, timeSeriesGroupRateSum будет вычислять производные по timestamp для рядов, а затем суммировать полученные производные для всех рядов для одного значения timestamp. Также ряды должны быть отсортированы по возрастанию timestamp. Для пример из описания timeSeriesGroupRateSum результат будет следующим: -```text +``` text [(2,0),(3,0.1),(7,0.3),(8,0.3),(12,0.3),(17,0.3),(18,0.3),(24,0.3),(25,0.1)] ``` -## avg(x) {#agg_function-avg} +## avg(x) {#agg-function-avg} Вычисляет среднее. Работает только для чисел. Результат всегда Float64. -## uniq {#agg_function-uniq} +## uniq {#agg-function-uniq} Приближённо вычисляет количество различных значений аргумента. -```sql +``` sql uniq(x[, ...]) ``` @@ -537,34 +537,34 @@ uniq(x[, ...]) **Возвращаемое значение** -- Значение с типом данных [UInt64](../../data_types/int_uint.md). +- Значение с типом данных [UInt64](../../data_types/int_uint.md). **Детали реализации** Функция: -- Вычисляет хэш для всех параметров агрегации, а затем использует его в вычислениях. +- Вычисляет хэш для всех параметров агрегации, а затем использует его в вычислениях. -- Использует адаптивный алгоритм выборки. В качестве состояния вычисления функция использует выборку хэш-значений элементов размером до 65536. +- Использует адаптивный алгоритм выборки. В качестве состояния вычисления функция использует выборку хэш-значений элементов размером до 65536. - Этот алгоритм очень точен и очень эффективен по использованию CPU. Если запрос содержит небольшое количество этих функций, использование `uniq` почти так же эффективно, как и использование других агрегатных функций. + Этот алгоритм очень точен и очень эффективен по использованию CPU. Если запрос содержит небольшое количество этих функций, использование `uniq` почти так же эффективно, как и использование других агрегатных функций. -- Результат детерминирован (не зависит от порядка выполнения запроса). +- Результат детерминирован (не зависит от порядка выполнения запроса). Эту функцию рекомендуется использовать практически во всех сценариях. 
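**Пример**

Минимальный набросок на табличной функции `numbers`; при таком небольшом числе различных значений приближённый результат совпадает с точным.

Запрос:

``` sql
SELECT uniq(number % 10) AS u FROM numbers(1000)
```

Результат:

``` text
┌──u─┐
│ 10 │
└────┘
```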
**Смотрите также** -- [uniqCombined](#agg_function-uniqcombined) -- [uniqCombined64](#agg_function-uniqcombined64) -- [uniqHLL12](#agg_function-uniqhll12) -- [uniqExact](#agg_function-uniqexact) +- [uniqCombined](#agg_function-uniqcombined) +- [uniqCombined64](#agg_function-uniqcombined64) +- [uniqHLL12](#agg_function-uniqhll12) +- [uniqExact](#agg_function-uniqexact) -## uniqCombined {#agg_function-uniqcombined} +## uniqCombined {#agg-function-uniqcombined} Приближённо вычисляет количество различных значений аргумента. -```sql +``` sql uniqCombined(HLL_precision)(x[, ...]) ``` @@ -574,49 +574,49 @@ uniqCombined(HLL_precision)(x[, ...]) Функция принимает переменное число входных параметров. Параметры могут быть числовых типов, а также `Tuple`, `Array`, `Date`, `DateTime`, `String`. -`HLL_precision` — это логарифм по основанию 2 от числа ячеек в [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog). Необязательный, можно использовать функцию как `uniqCombined (x [,...])`. Для `HLL_precision` значение по умолчанию — 17, что фактически составляет 96 КБ пространства (2^17 ячеек, 6 бит каждая). +`HLL_precision` — это логарифм по основанию 2 от числа ячеек в [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog). Необязательный, можно использовать функцию как `uniqCombined (x [,...])`. Для `HLL_precision` значение по умолчанию — 17, что фактически составляет 96 КБ пространства (2^17 ячеек, 6 бит каждая). **Возвращаемое значение** -- Число типа [UInt64](../../data_types/int_uint.md). +- Число типа [UInt64](../../data_types/int_uint.md). **Детали реализации** Функция: -- Вычисляет хэш (64-битный для `String` и 32-битный для всех остальных типов) для всех параметров агрегации, а затем использует его в вычислениях. +- Вычисляет хэш (64-битный для `String` и 32-битный для всех остальных типов) для всех параметров агрегации, а затем использует его в вычислениях. -- Используется комбинация трёх алгоритмов: массив, хэш-таблица и HyperLogLog с таблицей коррекции погрешности. +- Используется комбинация трёх алгоритмов: массив, хэш-таблица и HyperLogLog с таблицей коррекции погрешности. - Для небольшого количества различных значений используется массив. Если размер набора больше, используется хэш-таблица. При дальнейшем увеличении количества значений, используется структура HyperLogLog, имеющая фиксированный размер в памяти. + Для небольшого количества различных значений используется массив. Если размер набора больше, используется хэш-таблица. При дальнейшем увеличении количества значений, используется структура HyperLogLog, имеющая фиксированный размер в памяти. -- Результат детерминирован (не зависит от порядка выполнения запроса). +- Результат детерминирован (не зависит от порядка выполнения запроса). !!! note "Note" Так как используется 32-битный хэш для не-`String` типов, результат будет иметь очень очень большую ошибку для количества разичных элементов существенно больше `UINT_MAX` (ошибка быстро растёт начиная с нескольких десятков миллиардов различных значений), таким образом в этом случае нужно использовать [uniqCombined64](#agg_function-uniqcombined64) По сравнению с функцией [uniq](#agg_function-uniq), `uniqCombined`: -- Потребляет в несколько раз меньше памяти. -- Вычисляет с в несколько раз более высокой точностью. -- Обычно имеет немного более низкую производительность. В некоторых сценариях `uniqCombined` может показывать более высокую производительность, чем `uniq`, например, в случае распределенных запросов, при которых по сети передаётся большое количество состояний агрегации. 
+- Потребляет в несколько раз меньше памяти. +- Вычисляет с в несколько раз более высокой точностью. +- Обычно имеет немного более низкую производительность. В некоторых сценариях `uniqCombined` может показывать более высокую производительность, чем `uniq`, например, в случае распределенных запросов, при которых по сети передаётся большое количество состояний агрегации. **Смотрите также** -- [uniq](#agg_function-uniq) -- [uniqCombined64](#agg_function-uniqcombined64) -- [uniqHLL12](#agg_function-uniqhll12) -- [uniqExact](#agg_function-uniqexact) +- [uniq](#agg_function-uniq) +- [uniqCombined64](#agg_function-uniqcombined64) +- [uniqHLL12](#agg_function-uniqhll12) +- [uniqExact](#agg_function-uniqexact) -## uniqCombined64 {#agg_function-uniqcombined64} +## uniqCombined64 {#agg-function-uniqcombined64} Использует 64-битный хэш для всех типов, в отличие от [uniqCombined](#agg_function-uniqcombined). -## uniqHLL12 {#agg_function-uniqhll12} +## uniqHLL12 {#agg-function-uniqhll12} Вычисляет приблизительное число различных значений аргументов, используя алгоритм [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog). -```sql +``` sql uniqHLL12(x[, ...]) ``` @@ -626,33 +626,33 @@ uniqHLL12(x[, ...]) **Возвращаемое значение** -- Значение хэша с типом данных [UInt64](../../data_types/int_uint.md). +- Значение хэша с типом данных [UInt64](../../data_types/int_uint.md). **Детали реализации** Функция: -- Вычисляет хэш для всех параметров агрегации, а затем использует его в вычислениях. +- Вычисляет хэш для всех параметров агрегации, а затем использует его в вычислениях. -- Использует алгоритм HyperLogLog для аппроксимации числа различных значений аргументов. +- Использует алгоритм HyperLogLog для аппроксимации числа различных значений аргументов. - Используется 212 5-битовых ячеек. Размер состояния чуть больше 2.5 КБ. Результат не точный (ошибка до ~10%) для небольших множеств (<10K элементов). Однако для множеств большой кардинальности (10K - 100M) результат довольно точен (ошибка до ~1.6%). Начиная с 100M ошибка оценки будет только расти и для множеств огромной кардинальности (1B+ элементов) функция возвращает результат с очень большой неточностью. + Используется 212 5-битовых ячеек. Размер состояния чуть больше 2.5 КБ. Результат не точный (ошибка до ~10%) для небольших множеств (<10K элементов). Однако для множеств большой кардинальности (10K - 100M) результат довольно точен (ошибка до ~1.6%). Начиная с 100M ошибка оценки будет только расти и для множеств огромной кардинальности (1B+ элементов) функция возвращает результат с очень большой неточностью. -- Результат детерминирован (не зависит от порядка выполнения запроса). +- Результат детерминирован (не зависит от порядка выполнения запроса). Мы не рекомендуем использовать эту функцию. В большинстве случаев используйте функцию [uniq](#agg_function-uniq) или [uniqCombined](#agg_function-uniqcombined). **Смотрите также** -- [uniq](#agg_function-uniq) -- [uniqCombined](#agg_function-uniqcombined) -- [uniqExact](#agg_function-uniqexact) +- [uniq](#agg_function-uniq) +- [uniqCombined](#agg_function-uniqcombined) +- [uniqExact](#agg_function-uniqexact) -## uniqExact {#agg_function-uniqexact} +## uniqExact {#agg-function-uniqexact} Вычисляет точное количество различных значений аргументов. 
-```sql +``` sql uniqExact(x[, ...]) ``` @@ -666,11 +666,11 @@ uniqExact(x[, ...]) **Смотрите также** -- [uniq](#agg_function-uniq) -- [uniqCombined](#agg_function-uniqcombined) -- [uniqHLL12](#agg_function-uniqhll12) +- [uniq](#agg_function-uniq) +- [uniqCombined](#agg_function-uniqcombined) +- [uniqHLL12](#agg_function-uniqhll12) -## groupArray(x), groupArray(max_size)(x) {#agg_function-grouparray} +## groupArray(x), groupArray(max\_size)(x) {#agg-function-grouparray} Составляет массив из значений аргумента. Значения в массив могут быть добавлены в любом (недетерминированном) порядке. @@ -680,8 +680,7 @@ uniqExact(x[, ...]) В некоторых случаях, вы всё же можете рассчитывать на порядок выполнения запроса. Это — случаи, когда `SELECT` идёт из подзапроса, в котором используется `ORDER BY`. - -## groupArrayInsertAt(x) +## groupArrayInsertAt(x) {#grouparrayinsertatx} Вставляет в массив значение в заданную позицию. @@ -689,14 +688,14 @@ uniqExact(x[, ...]) Опциональные параметры: -- Значение по умолчанию для подстановки на пустые позиции. -- Длина результирующего массива. Например, если вы хотите получать массивы одинакового размера для всех агрегатных ключей. При использовании этого параметра значение по умолчанию задавать обязательно. +- Значение по умолчанию для подстановки на пустые позиции. +- Длина результирующего массива. Например, если вы хотите получать массивы одинакового размера для всех агрегатных ключей. При использовании этого параметра значение по умолчанию задавать обязательно. -## groupArrayMovingSum {#agg_function-grouparraymovingsum} +## groupArrayMovingSum {#agg-function-grouparraymovingsum} Вычисляет скользящую сумму входных значений. -```sql +``` sql groupArrayMovingSum(numbers_for_summing) groupArrayMovingSum(window_size)(numbers_for_summing) ``` @@ -705,18 +704,18 @@ groupArrayMovingSum(window_size)(numbers_for_summing) **Параметры** -- `numbers_for_summing` — [выражение](../syntax.md#syntax-expressions), возвращающее значение числового типа. -- `window_size` — размер окна. +- `numbers_for_summing` — [выражение](../syntax.md#syntax-expressions), возвращающее значение числового типа. +- `window_size` — размер окна. **Возвращаемые значения** -- Массив того же размера и типа, что и входные данные. +- Массив того же размера и типа, что и входные данные. **Пример** Таблица с исходными данными: -```sql +``` sql CREATE TABLE t ( `int` UInt8, @@ -726,7 +725,7 @@ CREATE TABLE t ENGINE = TinyLog ``` -```text +``` text ┌─int─┬─float─┬──dec─┐ │ 1 │ 1.1 │ 1.10 │ │ 2 │ 2.2 │ 2.20 │ @@ -737,7 +736,7 @@ ENGINE = TinyLog Запросы: -```sql +``` sql SELECT groupArrayMovingSum(int) AS I, groupArrayMovingSum(float) AS F, @@ -745,13 +744,13 @@ SELECT FROM t ``` -```text +``` text ┌─I──────────┬─F───────────────────────────────┬─D──────────────────────┐ │ [1,3,7,14] │ [1.1,3.3000002,7.7000003,15.47] │ [1.10,3.30,7.70,15.47] │ └────────────┴─────────────────────────────────┴────────────────────────┘ ``` -```sql +``` sql SELECT groupArrayMovingSum(2)(int) AS I, groupArrayMovingSum(2)(float) AS F, @@ -759,31 +758,29 @@ SELECT FROM t ``` -```text +``` text ┌─I──────────┬─F───────────────────────────────┬─D──────────────────────┐ │ [1,3,6,11] │ [1.1,3.3000002,6.6000004,12.17] │ [1.10,3.30,6.60,12.17] │ └────────────┴─────────────────────────────────┴────────────────────────┘ ``` -## groupArrayMovingAvg {#agg_function-grouparraymovingavg} +## groupArrayMovingAvg {#agg-function-grouparraymovingavg} Вычисляет скользящее среднее для входных значений. 
-``` -groupArrayMovingAvg(numbers_for_summing) -groupArrayMovingAvg(window_size)(numbers_for_summing) -``` + groupArrayMovingAvg(numbers_for_summing) + groupArrayMovingAvg(window_size)(numbers_for_summing) Функция может принимать размер окна в качестве параметра. Если окно не указано, то функция использует размер окна, равный количеству строк в столбце. **Параметры** -- `numbers_for_summing` — [выражение](../syntax.md#syntax-expressions), возвращающее значение числового типа. -- `window_size` — размер окна. +- `numbers_for_summing` — [выражение](../syntax.md#syntax-expressions), возвращающее значение числового типа. +- `window_size` — размер окна. **Возвращаемые значения** -- Массив того же размера и типа, что и входные данные. +- Массив того же размера и типа, что и входные данные. Функция использует [округление к меньшему по модулю](https://ru.wikipedia.org/wiki/Округление#Методы). Оно усекает десятичные разряды, незначимые для результирующего типа данных. @@ -791,7 +788,7 @@ groupArrayMovingAvg(window_size)(numbers_for_summing) Таблица с исходными данными: -```sql +``` sql CREATE TABLE t ( `int` UInt8, @@ -801,7 +798,7 @@ CREATE TABLE t ENGINE = TinyLog ``` -```text +``` text ┌─int─┬─float─┬──dec─┐ │ 1 │ 1.1 │ 1.10 │ │ 2 │ 2.2 │ 2.20 │ @@ -812,7 +809,7 @@ ENGINE = TinyLog Запросы: -```sql +``` sql SELECT groupArrayMovingAvg(int) AS I, groupArrayMovingAvg(float) AS F, @@ -820,13 +817,13 @@ SELECT FROM t ``` -```text +``` text ┌─I─────────┬─F───────────────────────────────────┬─D─────────────────────┐ │ [0,0,1,3] │ [0.275,0.82500005,1.9250001,3.8675] │ [0.27,0.82,1.92,3.86] │ └───────────┴─────────────────────────────────────┴───────────────────────┘ ``` -```sql +``` sql SELECT groupArrayMovingAvg(2)(int) AS I, groupArrayMovingAvg(2)(float) AS F, @@ -834,13 +831,13 @@ SELECT FROM t ``` -```text +``` text ┌─I─────────┬─F────────────────────────────────┬─D─────────────────────┐ │ [0,1,3,5] │ [0.55,1.6500001,3.3000002,6.085] │ [0.55,1.65,3.30,6.08] │ └───────────┴──────────────────────────────────┴───────────────────────┘ ``` -## groupUniqArray(x), groupUniqArray(max_size)(x) +## groupUniqArray(x), groupUniqArray(max\_size)(x) {#groupuniqarrayx-groupuniqarraymax-sizex} Составляет массив из различных значений аргумента. Расход оперативной памяти такой же, как у функции `uniqExact`. @@ -854,36 +851,34 @@ FROM t Внутренние состояния функций `quantile*` не объединяются, если они используются в одном запросе. Если вам необходимо вычислить квантили нескольких уровней, используйте функцию [quantiles](#quantiles), это повысит эффективность запроса. -**Синтаксис** +**Синтаксис** -```sql +``` sql quantile(level)(expr) ``` Алиас: `median`. -**Параметры** +**Параметры** -- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). -- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../data_types/index.md#data_types) или типов [Date](../../data_types/date.md), [DateTime](../../data_types/datetime.md). +- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). 
+- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../data_types/index.md#data_types) или типов [Date](../../data_types/date.md), [DateTime](../../data_types/datetime.md). +**Возвращаемое значение** -**Возвращаемое значение** - -- Приблизительный квантиль заданного уровня. +- Приблизительный квантиль заданного уровня. Тип: -- [Float64](../../data_types/float.md) для входных данных числового типа. -- [Date](../../data_types/date.md), если входные значения имеют тип `Date`. -- [DateTime](../../data_types/datetime.md), если входные значения имеют тип `DateTime`. - +- [Float64](../../data_types/float.md) для входных данных числового типа. +- [Date](../../data_types/date.md), если входные значения имеют тип `Date`. +- [DateTime](../../data_types/datetime.md), если входные значения имеют тип `DateTime`. **Пример** Входная таблица: -```text +``` text ┌─val─┐ │ 1 │ │ 1 │ @@ -894,13 +889,13 @@ quantile(level)(expr) Запрос: -```sql +``` sql SELECT quantile(val) FROM t ``` Результат: -```text +``` text ┌─quantile(val)─┐ │ 1.5 │ └───────────────┘ @@ -908,9 +903,8 @@ SELECT quantile(val) FROM t **Смотрите также** -- [median](#median) -- [quantiles](#quantiles) - +- [median](#median) +- [quantiles](#quantiles) ## quantileDeterministic {#quantiledeterministic} @@ -920,37 +914,35 @@ SELECT quantile(val) FROM t Внутренние состояния функций `quantile*` не объединяются, если они используются в одном запросе. Если вам необходимо вычислить квантили нескольких уровней, используйте функцию [quantiles](#quantiles), это повысит эффективность запроса. -**Синтаксис** +**Синтаксис** -```sql +``` sql quantileDeterministic(level)(expr, determinator) ``` Алиас: `medianDeterministic`. -**Параметры** +**Параметры** -- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). -- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../data_types/index.md#data_types) или типов [Date](../../data_types/date.md), [DateTime](../../data_types/datetime.md). -- `determinator` — Число, хэш которого используется при сэмплировании в алгоритме reservoir sampling, чтобы сделать результат детерминированным. В качестве детерминатора можно использовать любое определённое положительное число, например, идентификатор пользователя или события. Если одно и то же значение детерминатора попадается в выборке слишком часто, то функция выдаёт некорректный результат. +- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). +- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../data_types/index.md#data_types) или типов [Date](../../data_types/date.md), [DateTime](../../data_types/datetime.md). +- `determinator` — Число, хэш которого используется при сэмплировании в алгоритме reservoir sampling, чтобы сделать результат детерминированным. В качестве детерминатора можно использовать любое определённое положительное число, например, идентификатор пользователя или события. 
Если одно и то же значение детерминатора попадается в выборке слишком часто, то функция выдаёт некорректный результат. **Возвращаемое значение** -- Приблизительный квантиль заданного уровня. - +- Приблизительный квантиль заданного уровня. Тип: -- [Float64](../../data_types/float.md) для входных данных числового типа. -- [Date](../../data_types/date.md) если входные значения имеют тип `Date`. -- [DateTime](../../data_types/datetime.md) если входные значения имеют тип `DateTime`. - +- [Float64](../../data_types/float.md) для входных данных числового типа. +- [Date](../../data_types/date.md) если входные значения имеют тип `Date`. +- [DateTime](../../data_types/datetime.md) если входные значения имеют тип `DateTime`. **Пример** Входная таблица: -```text +``` text ┌─val─┐ │ 1 │ │ 1 │ @@ -961,13 +953,13 @@ quantileDeterministic(level)(expr, determinator) Запрос: -```sql +``` sql SELECT quantileDeterministic(val, 1) FROM t ``` Результат: -```text +``` text ┌─quantileDeterministic(val, 1)─┐ │ 1.5 │ └───────────────────────────────┘ @@ -975,9 +967,8 @@ SELECT quantileDeterministic(val, 1) FROM t **Смотрите также** -- [median](#median) -- [quantiles](#quantiles) - +- [median](#median) +- [quantiles](#quantiles) ## quantileExact {#quantileexact} @@ -987,41 +978,40 @@ SELECT quantileDeterministic(val, 1) FROM t Внутренние состояния функций `quantile*` не объединяются, если они используются в одном запросе. Если вам необходимо вычислить квантили нескольких уровней, используйте функцию [quantiles](#quantiles), это повысит эффективность запроса. -**Синтаксис** +**Синтаксис** -```sql +``` sql quantileExact(level)(expr) ``` Алиас: `medianExact`. -**Параметры** - -- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). -- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../data_types/index.md#data_types) или типов [Date](../../data_types/date.md), [DateTime](../../data_types/datetime.md). +**Параметры** +- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). +- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../data_types/index.md#data_types) или типов [Date](../../data_types/date.md), [DateTime](../../data_types/datetime.md). **Возвращаемое значение** -- Квантиль заданного уровня. +- Квантиль заданного уровня. Тип: -- [Float64](../../data_types/float.md) для входных данных числового типа. -- [Date](../../data_types/date.md) если входные значения имеют тип `Date`. -- [DateTime](../../data_types/datetime.md) если входные значения имеют тип `DateTime`. +- [Float64](../../data_types/float.md) для входных данных числового типа. +- [Date](../../data_types/date.md) если входные значения имеют тип `Date`. +- [DateTime](../../data_types/datetime.md) если входные значения имеют тип `DateTime`. 
**Пример** Запрос: -```sql +``` sql SELECT quantileExact(number) FROM numbers(10) ``` Результат: -```text +``` text ┌─quantileExact(number)─┐ │ 5 │ └───────────────────────┘ @@ -1029,8 +1019,8 @@ SELECT quantileExact(number) FROM numbers(10) **Смотрите также** -- [median](#median) -- [quantiles](#quantiles) +- [median](#median) +- [quantiles](#quantiles) ## quantileExactWeighted {#quantileexactweighted} @@ -1040,35 +1030,35 @@ SELECT quantileExact(number) FROM numbers(10) Внутренние состояния функций `quantile*` не объединяются, если они используются в одном запросе. Если вам необходимо вычислить квантили нескольких уровней, используйте функцию [quantiles](#quantiles), это повысит эффективность запроса. -**Синтаксис** +**Синтаксис** -```sql +``` sql quantileExactWeighted(level)(expr, weight) ``` Алиас: `medianExactWeighted`. -**Параметры** +**Параметры** -- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). -- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../data_types/index.md#data_types) или типов [Date](../../data_types/date.md), [DateTime](../../data_types/datetime.md). -- `weight` — Столбец с весам элементов последовательности. Вес — это количество повторений элемента в последовательности. +- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). +- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../data_types/index.md#data_types) или типов [Date](../../data_types/date.md), [DateTime](../../data_types/datetime.md). +- `weight` — Столбец с весам элементов последовательности. Вес — это количество повторений элемента в последовательности. **Возвращаемое значение** -- Quantile of the specified level. +- Quantile of the specified level. Тип: -- [Float64](../../data_types/float.md) для входных данных числового типа. -- [Date](../../data_types/date.md) если входные значения имеют тип `Date`. -- [DateTime](../../data_types/datetime.md) если входные значения имеют тип `DateTime`. +- [Float64](../../data_types/float.md) для входных данных числового типа. +- [Date](../../data_types/date.md) если входные значения имеют тип `Date`. +- [DateTime](../../data_types/datetime.md) если входные значения имеют тип `DateTime`. **Пример** Входная таблица: -```text +``` text ┌─n─┬─val─┐ │ 0 │ 3 │ │ 1 │ 2 │ @@ -1079,22 +1069,22 @@ quantileExactWeighted(level)(expr, weight) Запрос: -```sql +``` sql SELECT quantileExactWeighted(n, val) FROM t ``` Результат: -```text +``` text ┌─quantileExactWeighted(n, val)─┐ │ 1 │ └───────────────────────────────┘ ``` -**Смотрите также** +**Смотрите также** -- [median](#median) -- [quantiles](#quantiles) +- [median](#median) +- [quantiles](#quantiles) ## quantileTiming {#quantiletiming} @@ -1106,7 +1096,7 @@ SELECT quantileExactWeighted(n, val) FROM t **Синтаксис** -```sql +``` sql quantileTiming(level)(expr) ``` @@ -1114,18 +1104,19 @@ quantileTiming(level)(expr) **Параметры** -- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. 
Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). -- `expr` — [Выражение](../syntax.md#syntax-expressions) над значения столбца, которые возвращают данные типа [Float*](../../data_types/float.md). +- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). - - Если в функцию передать отрицательные значения, то её поведение не определено. - - Если значение больше, чем 30 000 (например, время загрузки страницы превышает 30 секунд), то оно приравнивается к 30 000. +- `expr` — [Выражение](../syntax.md#syntax-expressions) над значения столбца, которые возвращают данные типа [Float\*](../../data_types/float.md). + + - Если в функцию передать отрицательные значения, то её поведение не определено. + - Если значение больше, чем 30 000 (например, время загрузки страницы превышает 30 секунд), то оно приравнивается к 30 000. **Точность** Вычисления точны при соблюдении следующих условий: -- Размер выборки не превышает 5670 элементов. -- Размер выборки превышает 5670 элементов, но значение каждого элемента не больше 1024. +- Размер выборки не превышает 5670 элементов. +- Размер выборки превышает 5670 элементов, но значение каждого элемента не больше 1024. В противном случае, результат вычисления округляется до ближайшего множителя числа 16. @@ -1134,7 +1125,7 @@ quantileTiming(level)(expr) **Возвращаемое значение** -- Квантиль заданного уровня. +- Квантиль заданного уровня. Тип: `Float32`. @@ -1145,7 +1136,7 @@ quantileTiming(level)(expr) Входная таблица: -```text +``` text ┌─response_time─┐ │ 72 │ │ 112 │ @@ -1161,13 +1152,13 @@ quantileTiming(level)(expr) Запрос: -```sql +``` sql SELECT quantileTiming(response_time) FROM t ``` Результат: -```text +``` text ┌─quantileTiming(response_time)─┐ │ 126 │ └───────────────────────────────┘ @@ -1175,8 +1166,8 @@ SELECT quantileTiming(response_time) FROM t **Смотрите также** -- [median](#median) -- [quantiles](#quantiles) +- [median](#median) +- [quantiles](#quantiles) ## quantileTimingWeighted {#quantiletimingweighted} @@ -1188,7 +1179,7 @@ SELECT quantileTiming(response_time) FROM t **Синтаксис** -```sql +``` sql quantileTimingWeighted(level)(expr, weight) ``` @@ -1196,20 +1187,21 @@ quantileTimingWeighted(level)(expr, weight) **Параметры** -- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). -- `expr` — [Выражение](../syntax.md#syntax-expressions) над значения столбца, которые возвращают данные типа [Float*](../../data_types/float.md). +- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). - - Если в функцию передать отрицательные значения, то её поведение не определено. - - Если значение больше, чем 30 000 (например, время загрузки страницы превышает 30 секунд), то оно приравнивается к 30 000. 
+- `expr` — [Выражение](../syntax.md#syntax-expressions) над значения столбца, которые возвращают данные типа [Float\*](../../data_types/float.md). -- `weight` — Столбец с весам элементов последовательности. Вес — это количество повторений элемента в последовательности. + - Если в функцию передать отрицательные значения, то её поведение не определено. + - Если значение больше, чем 30 000 (например, время загрузки страницы превышает 30 секунд), то оно приравнивается к 30 000. + +- `weight` — Столбец с весам элементов последовательности. Вес — это количество повторений элемента в последовательности. **Точность** Вычисления точны при соблюдении следующих условий: -- Размер выборки не превышает 5670 элементов. -- Размер выборки превышает 5670 элементов, но значение каждого элемента не больше 1024. +- Размер выборки не превышает 5670 элементов. +- Размер выборки превышает 5670 элементов, но значение каждого элемента не больше 1024. В противном случае, результат вычисления округляется до ближайшего множителя числа 16. @@ -1218,7 +1210,7 @@ quantileTimingWeighted(level)(expr, weight) **Возвращаемое значение** -- Квантиль заданного уровня. +- Квантиль заданного уровня. Тип: `Float32`. @@ -1229,7 +1221,7 @@ quantileTimingWeighted(level)(expr, weight) Входная таблица: -```text +``` text ┌─response_time─┬─weight─┐ │ 68 │ 1 │ │ 104 │ 2 │ @@ -1242,13 +1234,13 @@ quantileTimingWeighted(level)(expr, weight) Запрос: -```sql +``` sql SELECT quantileTimingWeighted(response_time, weight) FROM t ``` Результат: -```text +``` text ┌─quantileTimingWeighted(response_time, weight)─┐ │ 112 │ └───────────────────────────────────────────────┘ @@ -1256,57 +1248,53 @@ SELECT quantileTimingWeighted(response_time, weight) FROM t **Смотрите также** -- [median](#median) -- [quantiles](#quantiles) - +- [median](#median) +- [quantiles](#quantiles) ## quantileTDigest {#quantiletdigest} Приблизительно вычисляет [квантиль](https://ru.wikipedia.org/wiki/Квантиль) числовой последовательности, используя алгоритм [t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf). -Максимальная ошибка 1%. Потребление памяти — `log(n)`, где `n` — число значений. Результат не детерминирован и зависит от порядка выполнения запроса. +Максимальная ошибка 1%. Потребление памяти — `log(n)`, где `n` — число значений. Результат не детерминирован и зависит от порядка выполнения запроса. Производительность функции ниже, чем производительность функции [quantile](#quantile) или [quantileTiming](#quantiletiming). По соотношению размера состояния к точности вычисления, эта функция значительно превосходит `quantile`. Внутренние состояния функций `quantile*` не объединяются, если они используются в одном запросе. Если вам необходимо вычислить квантили нескольких уровней, используйте функцию [quantiles](#quantiles), это повысит эффективность запроса. -**Синтаксис** +**Синтаксис** -```sql +``` sql quantileTDigest(level)(expr) ``` Алиас: `medianTDigest`. -**Параметры** - -- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). -- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../data_types/index.md#data_types) или типов [Date](../../data_types/date.md), [DateTime](../../data_types/datetime.md). +**Параметры** +- `level` — Уровень квантили. Опционально. 
Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). +- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../data_types/index.md#data_types) или типов [Date](../../data_types/date.md), [DateTime](../../data_types/datetime.md). **Возвращаемое значение** -- Приблизительную квантиль заданного уровня. - +- Приблизительную квантиль заданного уровня. Тип: -- [Float64](../../data_types/float.md) для входных данных числового типа. -- [Date](../../data_types/date.md) если входные значения имеют тип `Date`. -- [DateTime](../../data_types/datetime.md) если входные значения имеют тип `DateTime`. - +- [Float64](../../data_types/float.md) для входных данных числового типа. +- [Date](../../data_types/date.md) если входные значения имеют тип `Date`. +- [DateTime](../../data_types/datetime.md) если входные значения имеют тип `DateTime`. **Пример** Запрос: -```sql +``` sql SELECT quantileTDigest(number) FROM numbers(10) ``` Результат: -```text +``` text ┌─quantileTDigest(number)─┐ │ 4.5 │ └─────────────────────────┘ @@ -1314,55 +1302,54 @@ SELECT quantileTDigest(number) FROM numbers(10) **Смотрите также** -- [median](#median) -- [quantiles](#quantiles) +- [median](#median) +- [quantiles](#quantiles) ## quantileTDigestWeighted {#quantiletdigestweighted} Приблизительно вычисляет [квантиль](https://ru.wikipedia.org/wiki/Квантиль) числовой последовательности, используя алгоритм [t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf). Функция учитывает вес каждого элемента последовательности. - -Максимальная ошибка 1%. Потребление памяти — `log(n)`, где `n` — число значений. Результат не детерминирован и зависит от порядка выполнения запроса. + +Максимальная ошибка 1%. Потребление памяти — `log(n)`, где `n` — число значений. Результат не детерминирован и зависит от порядка выполнения запроса. Производительность функции ниже, чем производительность функции [quantile](#quantile) или [quantileTiming](#quantiletiming). По соотношению размера состояния к точности вычисления, эта функция значительно превосходит `quantile`. Внутренние состояния функций `quantile*` не объединяются, если они используются в одном запросе. Если вам необходимо вычислить квантили нескольких уровней, используйте функцию [quantiles](#quantiles), это повысит эффективность запроса. -**Синтаксис** +**Синтаксис** -```sql +``` sql quantileTDigestWeighted(level)(expr, weight) ``` Алиас: `medianTDigest`. -**Параметры** +**Параметры** -- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). -- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../data_types/index.md#data_types) или типов [Date](../../data_types/date.md), [DateTime](../../data_types/datetime.md). -- `weight` — Столбец с весам элементов последовательности. Вес — это количество повторений элемента в последовательности. +- `level` — Уровень квантили. Опционально. Константное значение с плавающей запятой от 0 до 1. Мы рекомендуем использовать значение `level` из диапазона `[0.01, 0.99]`. Значение по умолчанию: 0.5. 
При `level=0.5` функция вычисляет [медиану](https://ru.wikipedia.org/wiki/Медиана_(статистика)). +- `expr` — Выражение над значениями столбца, которое возвращает данные [числовых типов](../../data_types/index.md#data_types) или типов [Date](../../data_types/date.md), [DateTime](../../data_types/datetime.md). +- `weight` — Столбец с весам элементов последовательности. Вес — это количество повторений элемента в последовательности. **Возвращаемое значение** -- Приблизительный квантиль заданного уровня. - +- Приблизительный квантиль заданного уровня. Тип: -- [Float64](../../data_types/float.md) для входных данных числового типа. -- [Date](../../data_types/date.md) если входные значения имеют тип `Date`. -- [DateTime](../../data_types/datetime.md) если входные значения имеют тип `DateTime`. +- [Float64](../../data_types/float.md) для входных данных числового типа. +- [Date](../../data_types/date.md) если входные значения имеют тип `Date`. +- [DateTime](../../data_types/datetime.md) если входные значения имеют тип `DateTime`. **Пример** Запрос: -```sql +``` sql SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) ``` Результат: -```text +``` text ┌─quantileTDigestWeighted(number, 1)─┐ │ 4.5 │ └────────────────────────────────────┘ @@ -1370,9 +1357,8 @@ SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) **Смотрите также** -- [median](#median) -- [quantiles](#quantiles) - +- [median](#median) +- [quantiles](#quantiles) ## median {#median} @@ -1380,20 +1366,20 @@ SELECT quantileTDigestWeighted(number, 1) FROM numbers(10) Functions: -- `median` — алиас [quantile](#quantile). -- `medianDeterministic` — алиас [quantileDeterministic](#quantiledeterministic). -- `medianExact` — алиас [quantileExact](#quantileexact). -- `medianExactWeighted` — алиас [quantileExactWeighted](#quantileexactweighted). -- `medianTiming` — алиас [quantileTiming](#quantiletiming). -- `medianTimingWeighted` — алиас [quantileTimingWeighted](#quantiletimingweighted). -- `medianTDigest` — алиас [quantileTDigest](#quantiletdigest). -- `medianTDigestWeighted` — алиас [quantileTDigestWeighted](#quantiletdigestweighted). +- `median` — алиас [quantile](#quantile). +- `medianDeterministic` — алиас [quantileDeterministic](#quantiledeterministic). +- `medianExact` — алиас [quantileExact](#quantileexact). +- `medianExactWeighted` — алиас [quantileExactWeighted](#quantileexactweighted). +- `medianTiming` — алиас [quantileTiming](#quantiletiming). +- `medianTimingWeighted` — алиас [quantileTimingWeighted](#quantiletimingweighted). +- `medianTDigest` — алиас [quantileTDigest](#quantiletdigest). +- `medianTDigestWeighted` — алиас [quantileTDigestWeighted](#quantiletdigestweighted). **Пример** Входная таблица: -```text +``` text ┌─val─┐ │ 1 │ │ 1 │ @@ -1404,23 +1390,23 @@ Functions: Запрос: -```sql +``` sql SELECT medianDeterministic(val, 1) FROM t ``` Результат: -```text +``` text ┌─medianDeterministic(val, 1)─┐ │ 1.5 │ └─────────────────────────────┘ ``` -## quantiles(level1, level2, ...)(x) {#quantiles} +## quantiles(level1, level2, …)(x) {#quantiles} Для всех quantile-функций, также присутствуют соответствующие quantiles-функции: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`. Эти функции за один проход вычисляют все квантили перечисленных уровней и возвращают массив вычисленных значений. -## varSamp(x) +## varSamp(x) {#varsampx} Вычисляет величину `Σ((x - x̅)^2) / (n - 1)`, где `n` - размер выборки, `x̅`- среднее значение `x`. 
@@ -1428,27 +1414,27 @@ SELECT medianDeterministic(val, 1) FROM t Возвращает `Float64`. В случае, когда `n <= 1`, возвращается `+∞`. -## varPop(x) +## varPop(x) {#varpopx} Вычисляет величину `Σ((x - x̅)^2) / n`, где `n` - размер выборки, `x̅`- среднее значение `x`. То есть, дисперсию для множества значений. Возвращает `Float64`. -## stddevSamp(x) +## stddevSamp(x) {#stddevsampx} Результат равен квадратному корню от `varSamp(x)`. -## stddevPop(x) +## stddevPop(x) {#stddevpopx} Результат равен квадратному корню от `varPop(x)`. -## topK(N)(column) +## topK(N)(column) {#topkncolumn} Возвращает массив наиболее часто встречающихся значений в указанном столбце. Результирующий массив упорядочен по убыванию частоты значения (не по самим значениям). Реализует [Filtered Space-Saving](http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf) алгоритм для анализа TopK, на основе reduce-and-combine алгоритма из методики [Parallel Space Saving](https://arxiv.org/pdf/1401.0702.pdf). -```sql +``` sql topK(N)(column) ``` @@ -1458,19 +1444,19 @@ topK(N)(column) **Аргументы** -- 'N' - Количество значений. -- 'x' – Столбец. +- ‘N’ - Количество значений. +- ‘x’ – Столбец. **Пример** Возьмём набор данных [OnTime](../../getting_started/example_datasets/ontime.md) и выберем 3 наиболее часто встречающихся значения в столбце `AirlineID`. -```sql +``` sql SELECT topK(3)(AirlineID) AS res FROM ontime ``` -```text +``` text ┌─res─────────────────┐ │ [19393,19790,19805] │ └─────────────────────┘ @@ -1482,66 +1468,65 @@ FROM ontime **Синтаксис** -```sql +``` sql topKWeighted(N)(x, weight) ``` **Параметры** -- `N` — Количество элементов для выдачи. +- `N` — Количество элементов для выдачи. **Аргументы** -- `x` – значение. -- `weight` — вес. [UInt8](../../data_types/int_uint.md). +- `x` – значение. +- `weight` — вес. [UInt8](../../data_types/int_uint.md). **Возвращаемое значение** Возвращает массив значений с максимально приближенной суммой весов. - **Пример** Запрос: -```sql +``` sql SELECT topKWeighted(10)(number, number) FROM numbers(1000) ``` Результат: -```text +``` text ┌─topKWeighted(10)(number, number)──────────┐ │ [999,998,997,996,995,994,993,992,991,990] │ └───────────────────────────────────────────┘ ``` -## covarSamp(x, y) +## covarSamp(x, y) {#covarsampx-y} Вычисляет величину `Σ((x - x̅)(y - y̅)) / (n - 1)`. Возвращает Float64. В случае, когда `n <= 1`, возвращается +∞. -## covarPop(x, y) +## covarPop(x, y) {#covarpopx-y} Вычисляет величину `Σ((x - x̅)(y - y̅)) / n`. -## corr(x, y) +## corr(x, y) {#corrx-y} Вычисляет коэффициент корреляции Пирсона: `Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)^2) * Σ((y - y̅)^2))`. -## simpleLinearRegression +## simpleLinearRegression {#simplelinearregression} Выполняет простую (одномерную) линейную регрессию. -```sql +``` sql simpleLinearRegression(x, y) ``` Параметры: -- `x` — столбец со значениями зависимой переменной. -- `y` — столбец со значениями наблюдаемой переменной. +- `x` — столбец со значениями зависимой переменной. +- `y` — столбец со значениями наблюдаемой переменной. 
Возвращаемые значения: @@ -1549,45 +1534,44 @@ simpleLinearRegression(x, y) **Примеры** -```sql +``` sql SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [0, 1, 2, 3]) ``` -```text +``` text ┌─arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [0, 1, 2, 3])─┐ │ (1,0) │ └───────────────────────────────────────────────────────────────────┘ ``` -```sql +``` sql SELECT arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6]) ``` -```text +``` text ┌─arrayReduce('simpleLinearRegression', [0, 1, 2, 3], [3, 4, 5, 6])─┐ │ (1,3) │ └───────────────────────────────────────────────────────────────────┘ ``` -## stochasticLinearRegression {#agg_functions-stochasticlinearregression} +## stochasticLinearRegression {#agg-functions-stochasticlinearregression} Функция реализует стохастическую линейную регрессию. Поддерживает пользовательские параметры для скорости обучения, коэффициента регуляризации L2, размера mini-batch и имеет несколько методов обновления весов ([Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (по умолчанию), [simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf)). -### Параметры {#agg_functions-stochasticlinearregression-parameters} +### Параметры {#agg-functions-stochasticlinearregression-parameters} Есть 4 настраиваемых параметра. Они передаются в функцию последовательно, однако не обязательно указывать все, используются значения по умолчанию, однако хорошая модель требует некоторой настройки параметров. -```text +``` text stochasticLinearRegression(1.0, 1.0, 10, 'SGD') ``` -1. Скорость обучения — коэффициент длины шага, при выполнении градиентного спуска. Слишком большая скорость обучения может привести к бесконечным весам модели. По умолчанию `0.00001`. -2. Коэффициент регуляризации l2. Помогает предотвратить подгонку. По умолчанию `0.1`. -3. Размер mini-batch задаёт количество элементов, чьи градиенты будут вычислены и просуммированы при выполнении одного шага градиентного спуска. Чистый стохастический спуск использует один элемент, однако использование mini-batch (около 10 элементов) делает градиентные шаги более стабильными. По умолчанию `15`. -4. Метод обновления весов, можно выбрать один из следующих: `Adam` (по умолчанию), `SGD`, `Momentum`, `Nesterov`. `Momentum` и `Nesterov` более требовательные к вычислительным ресурсам и памяти, однако они имеют высокую скорость схождения и устойчивости методов стохастического градиента. +1. Скорость обучения — коэффициент длины шага, при выполнении градиентного спуска. Слишком большая скорость обучения может привести к бесконечным весам модели. По умолчанию `0.00001`. +2. Коэффициент регуляризации l2. Помогает предотвратить подгонку. По умолчанию `0.1`. +3. Размер mini-batch задаёт количество элементов, чьи градиенты будут вычислены и просуммированы при выполнении одного шага градиентного спуска. Чистый стохастический спуск использует один элемент, однако использование mini-batch (около 10 элементов) делает градиентные шаги более стабильными. По умолчанию `15`. +4. Метод обновления весов, можно выбрать один из следующих: `Adam` (по умолчанию), `SGD`, `Momentum`, `Nesterov`. `Momentum` и `Nesterov` более требовательные к вычислительным ресурсам и памяти, однако они имеют высокую скорость схождения и устойчивости методов стохастического градиента. 
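Параметры передаются последовательно; хвостовые параметры можно опустить, в этом случае для них используются значения по умолчанию. Набросок для иллюстрации (таблица `train_data` и её столбцы `target`, `param1`, `param2` описаны ниже, в разделе «Использование»):

``` sql
-- Все четыре параметра заданы явно.
SELECT stochasticLinearRegression(0.01, 0.1, 15, 'Adam')(target, param1, param2) FROM train_data

-- Указана только скорость обучения; остальные параметры берутся по умолчанию.
SELECT stochasticLinearRegression(0.01)(target, param1, param2) FROM train_data
```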
- -### Использование {#agg_functions-stochasticlinearregression-usage} +### Использование {#agg-functions-stochasticlinearregression-usage} `stochasticLinearRegression` используется на двух этапах: построение модели и предсказание новых данных. Чтобы построить модель и сохранить её состояние для дальнейшего использования, мы используем комбинатор `-State`. Для прогнозирования мы используем функцию [evalMLMethod](../functions/machine_learning_functions.md#machine_learning_methods-evalmlmethod), которая принимает в качестве аргументов состояние и свойства для прогнозирования. @@ -1598,7 +1582,7 @@ stochasticLinearRegression(1.0, 1.0, 10, 'SGD') Пример запроса: -```sql +``` sql CREATE TABLE IF NOT EXISTS train_data ( param1 Float64, @@ -1618,7 +1602,7 @@ AS state FROM train_data; После сохранения состояния в таблице мы можем использовать его несколько раз для прогнозирования или смёржить с другими состояниями и создать новые, улучшенные модели. -```sql +``` sql WITH (SELECT state FROM your_model) AS model SELECT evalMLMethod(model, param1, param2) FROM test_data ``` @@ -1627,76 +1611,83 @@ evalMLMethod(model, param1, param2) FROM test_data `test_data` — это таблица, подобная `train_data`, но при этом может не содержать целевое значение. -### Примечания {#agg_functions-stochasticlinearregression-notes} +### Примечания {#agg-functions-stochasticlinearregression-notes} -1. Объединить две модели можно следующим запросом: +1. Объединить две модели можно следующим запросом: - ```sql + + + ``` sql SELECT state1 + state2 FROM your_models ``` - где таблица `your_models` содержит обе модели. Запрос вернёт новый объект `AggregateFunctionState`. + где таблица `your_models` содержит обе модели. Запрос вернёт новый объект `AggregateFunctionState`. -2. Пользователь может получать веса созданной модели для своих целей без сохранения модели, если не использовать комбинатор `-State`. +1. Пользователь может получать веса созданной модели для своих целей без сохранения модели, если не использовать комбинатор `-State`. - ```sql + + + ``` sql SELECT stochasticLinearRegression(0.01)(target, param1, param2) FROM train_data ``` - Подобный запрос строит модель и возвращает её веса, отвечающие параметрам моделей и смещение. Таким образом, в приведенном выше примере запрос вернет столбец с тремя значениями. + Подобный запрос строит модель и возвращает её веса, отвечающие параметрам моделей и смещение. Таким образом, в приведенном выше примере запрос вернет столбец с тремя значениями. **Смотрите также** -- [stochasticLogisticRegression](#agg_functions-stochasticlogisticregression) -- [Отличие линейной от логистической регрессии.](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) +- [stochasticLogisticRegression](#agg_functions-stochasticlogisticregression) +- [Отличие линейной от логистической регрессии.](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) - -## stochasticLogisticRegression {#agg_functions-stochasticlogisticregression} +## stochasticLogisticRegression {#agg-functions-stochasticlogisticregression} Функция реализует стохастическую логистическую регрессию. Её можно использовать для задачи бинарной классификации, функция поддерживает те же пользовательские параметры, что и stochasticLinearRegression и работает таким же образом. 
-### Параметры {#agg_functions-stochasticlogisticregression-parameters} +### Параметры {#agg-functions-stochasticlogisticregression-parameters} Параметры те же, что и в stochasticLinearRegression: `learning rate`, `l2 regularization coefficient`, `mini-batch size`, `method for updating weights`. Смотрите раздел [parameters](#agg_functions-stochasticlinearregression-parameters). -```text +``` text stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') ``` -1. Построение модели +1. Построение модели - Смотрите раздел `Построение модели` в описании [stochasticLinearRegression](#stochasticlinearregression-usage-fitting) . + - Прогнозируемые метки должны быть в диапазоне [-1, 1]. + Смотрите раздел `Построение модели` в описании [stochasticLinearRegression](#stochasticlinearregression-usage-fitting) . -2. Прогнозирование + Прогнозируемые метки должны быть в диапазоне \[-1, 1\]. - Используя сохраненное состояние, можно предсказать вероятность наличия у объекта метки `1`. +1. Прогнозирование - ```sql + + + Используя сохраненное состояние, можно предсказать вероятность наличия у объекта метки `1`. + + ``` sql WITH (SELECT state FROM your_model) AS model SELECT evalMLMethod(model, param1, param2) FROM test_data ``` - Запрос возвращает столбец вероятностей. Обратите внимание, что первый аргумент `evalMLMethod` это объект `AggregateFunctionState`, далее идут столбцы свойств. + Запрос возвращает столбец вероятностей. Обратите внимание, что первый аргумент `evalMLMethod` это объект `AggregateFunctionState`, далее идут столбцы свойств. - Мы также можем установить границу вероятности, которая присваивает элементам различные метки. + Мы также можем установить границу вероятности, которая присваивает элементам различные метки. - ```sql + ``` sql SELECT ans < 1.1 AND ans > 0.5 FROM (WITH (SELECT state FROM your_model) AS model SELECT evalMLMethod(model, param1, param2) AS ans FROM test_data) ``` - Тогда результатом будут метки. + Тогда результатом будут метки. - `test_data` — это таблица, подобная `train_data`, но при этом может не содержать целевое значение. + `test_data` — это таблица, подобная `train_data`, но при этом может не содержать целевое значение. **Смотрите также** -- [stochasticLinearRegression](#agg_functions-stochasticlinearregression) -- [Отличие линейной от логистической регрессии](https://moredez.ru/q/51225972/) +- [stochasticLinearRegression](#agg_functions-stochasticlinearregression) +- [Отличие линейной от логистической регрессии](https://moredez.ru/q/51225972/) [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/agg_functions/reference/) diff --git a/docs/ru/query_language/alter.md b/docs/ru/query_language/alter.md index 7736456d960..3034b11f270 100644 --- a/docs/ru/query_language/alter.md +++ b/docs/ru/query_language/alter.md @@ -1,12 +1,12 @@ -## ALTER {#query_language_queries_alter} +## ALTER {#query-language-queries-alter} Запрос `ALTER` поддерживается только для таблиц типа `*MergeTree`, а также `Merge` и `Distributed`. Запрос имеет несколько вариантов. -### Манипуляции со столбцами +### Манипуляции со столбцами {#manipuliatsii-so-stolbtsami} Изменение структуры таблицы. -```sql +``` sql ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|CLEAR|COMMENT|MODIFY COLUMN ... 
``` @@ -15,17 +15,17 @@ ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|CLEAR|COMMENT|MODIFY COLUMN Существуют следующие действия: -- [ADD COLUMN](#alter_add-column) — добавляет столбец в таблицу; -- [DROP COLUMN](#alter_drop-column) — удаляет столбец; -- [CLEAR COLUMN](#alter_clear-column) — сбрасывает все значения в столбце для заданной партиции; -- [COMMENT COLUMN](#alter_comment-column) — добавляет комментарий к столбцу; -- [MODIFY COLUMN](#alter_modify-column) — изменяет тип столбца, выражение для значения по умолчанию и TTL. +- [ADD COLUMN](#alter_add-column) — добавляет столбец в таблицу; +- [DROP COLUMN](#alter_drop-column) — удаляет столбец; +- [CLEAR COLUMN](#alter_clear-column) — сбрасывает все значения в столбце для заданной партиции; +- [COMMENT COLUMN](#alter_comment-column) — добавляет комментарий к столбцу; +- [MODIFY COLUMN](#alter_modify-column) — изменяет тип столбца, выражение для значения по умолчанию и TTL. Подробное описание для каждого действия приведено ниже. -#### ADD COLUMN {#alter_add-column} +#### ADD COLUMN {#alter-add-column} -```sql +``` sql ADD COLUMN [IF NOT EXISTS] name [type] [default_expr] [codec] [AFTER name_after] ``` @@ -39,13 +39,13 @@ ADD COLUMN [IF NOT EXISTS] name [type] [default_expr] [codec] [AFTER name_after] Пример: -```sql +``` sql ALTER TABLE visits ADD COLUMN browser String AFTER user_id ``` -#### DROP COLUMN {#alter_drop-column} +#### DROP COLUMN {#alter-drop-column} -```sql +``` sql DROP COLUMN [IF EXISTS] name ``` @@ -55,28 +55,29 @@ DROP COLUMN [IF EXISTS] name Пример: -```sql +``` sql ALTER TABLE visits DROP COLUMN browser ``` -#### CLEAR COLUMN {#alter_clear-column} +#### CLEAR COLUMN {#alter-clear-column} -```sql +``` sql CLEAR COLUMN [IF EXISTS] name IN PARTITION partition_name ``` + Сбрасывает все значения в столбце для заданной партиции. Если указано `IF EXISTS`, запрос не будет возвращать ошибку, если столбца не существует. Как корректно задать имя партиции, см. в разделе [Как задавать имя партиции в запросах ALTER](#alter-how-to-specify-part-expr). Пример: -```sql +``` sql ALTER TABLE visits CLEAR COLUMN browser IN PARTITION tuple() ``` -#### COMMENT COLUMN {#alter_comment-column} +#### COMMENT COLUMN {#alter-comment-column} -```sql +``` sql COMMENT COLUMN [IF EXISTS] name 'Text comment' ``` @@ -88,23 +89,25 @@ COMMENT COLUMN [IF EXISTS] name 'Text comment' Пример: -```sql +``` sql ALTER TABLE visits COMMENT COLUMN browser 'Столбец показывает, из каких браузеров пользователи заходили на сайт.' ``` -#### MODIFY COLUMN {#alter_modify-column} +#### MODIFY COLUMN {#alter-modify-column} -```sql +``` sql MODIFY COLUMN [IF EXISTS] name [type] [default_expr] [TTL] ``` Запрос изменяет следующие свойства столбца `name`: -- Тип -- Значение по умолчанию -- TTL +- Тип - Примеры изменения TTL столбца смотрите в разделе [TTL столбца](../operations/table_engines/mergetree.md#mergetree-column-ttl). +- Значение по умолчанию + +- TTL + + Примеры изменения TTL столбца смотрите в разделе [TTL столбца](../operations/table_engines/mergetree.md#mergetree-column-ttl). Если указано `IF EXISTS`, запрос не возвращает ошибку, если столбца не существует. 
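Помимо смены типа (см. пример ниже), тем же запросом можно поменять значение по умолчанию и TTL столбца. Ниже — условный набросок: имена `visits`, `browser` и `event_date` используются как гипотетические, при этом `event_date` должен быть столбцом типа Date или DateTime.

``` sql
-- изменить значение по умолчанию столбца
ALTER TABLE visits MODIFY COLUMN browser String DEFAULT 'unknown';

-- задать TTL столбца (event_date — гипотетический столбец с датой)
ALTER TABLE visits MODIFY COLUMN browser String TTL event_date + INTERVAL 30 DAY;
```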
@@ -112,7 +115,7 @@ MODIFY COLUMN [IF EXISTS] name [type] [default_expr] [TTL] Пример запроса: -```sql +``` sql ALTER TABLE visits MODIFY COLUMN browser Array(String) ``` @@ -120,17 +123,17 @@ ALTER TABLE visits MODIFY COLUMN browser Array(String) Выполнение производится в несколько стадий: -- подготовка временных (новых) файлов с изменёнными данными; -- переименование старых файлов; -- переименование временных (новых) файлов в старые; -- удаление старых файлов. +- подготовка временных (новых) файлов с изменёнными данными; +- переименование старых файлов; +- переименование временных (новых) файлов в старые; +- удаление старых файлов. Из них, длительной является только первая стадия. Если на этой стадии возникнет сбой, то данные не поменяются. Если на одной из следующих стадий возникнет сбой, то данные будет можно восстановить вручную. За исключением случаев, когда старые файлы удалены из файловой системы, а данные для новых файлов не доехали на диск и потеряны. Запрос `ALTER` на изменение столбцов реплицируется. Соответствующие инструкции сохраняются в ZooKeeper, и затем каждая реплика их применяет. Все запросы `ALTER` выполняются в одном и том же порядке. Запрос ждёт выполнения соответствующих действий на всех репликах. Но при этом, запрос на изменение столбцов в реплицируемой таблице можно прервать, и все действия будут осуществлены асинхронно. -#### Ограничения запроса ALTER +#### Ограничения запроса ALTER {#ogranicheniia-zaprosa-alter} Запрос `ALTER` позволяет создавать и удалять отдельные элементы (столбцы) вложенных структур данных, но не вложенные структуры данных целиком. Для добавления вложенной структуры данных, вы можете добавить столбцы с именем вида `name.nested_name` и типом `Array(T)` - вложенная структура данных полностью эквивалентна нескольким столбцам-массивам с именем, имеющим одинаковый префикс до точки. @@ -142,11 +145,11 @@ ALTER TABLE visits MODIFY COLUMN browser Array(String) Для таблиц, которые не хранят данные самостоятельно (типа [Merge](../operations/table_engines/merge.md) и [Distributed](../operations/table_engines/distributed.md)), `ALTER` всего лишь меняет структуру таблицы, но не меняет структуру подчинённых таблиц. Для примера, при ALTER-е таблицы типа `Distributed`, вам также потребуется выполнить запрос `ALTER` для таблиц на всех удалённых серверах. -### Манипуляции с ключевыми выражениями таблиц +### Манипуляции с ключевыми выражениями таблиц {#manipuliatsii-s-kliuchevymi-vyrazheniiami-tablits} Поддерживается операция: -```sql +``` sql MODIFY ORDER BY new_expression ``` @@ -158,13 +161,15 @@ MODIFY ORDER BY new_expression сортировки, разрешено добавлять в ключ только новые столбцы (т.е. столбцы, добавляемые командой `ADD COLUMN` в том же запросе `ALTER`), у которых нет выражения по умолчанию. -### Манипуляции с индексами +### Манипуляции с индексами {#manipuliatsii-s-indeksami} Добавить или удалить индекс можно с помощью операций -```sql + +``` sql ALTER TABLE [db].name ADD INDEX name expression TYPE type GRANULARITY value [AFTER name] ALTER TABLE [db].name DROP INDEX name ``` + Поддерживается только таблицами семейства `*MergeTree`. Команда `ADD INDEX` добавляет описание индексов в метаданные, а `DROP INDEX` удаляет индекс из метаданных и стирает файлы индекса с диска, поэтому они легковесные и работают мгновенно. @@ -173,12 +178,13 @@ ALTER TABLE [db].name DROP INDEX name Запрос на изменение индексов реплицируется, сохраняя новые метаданные в ZooKeeper и применяя изменения на всех репликах. 
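Условный пример работы с индексами (имена таблицы, столбца, индекса и тип индекса здесь гипотетические); запрос `MATERIALIZE INDEX` из раздела о мутациях ниже позволяет достроить индекс для уже записанных данных.

``` sql
-- добавить вторичный индекс по выражению user_agent
ALTER TABLE hits ADD INDEX idx_user_agent user_agent TYPE bloom_filter GRANULARITY 4;

-- достроить индекс для уже существующей партиции (мутация)
ALTER TABLE hits MATERIALIZE INDEX idx_user_agent IN PARTITION 201901;

-- удалить индекс
ALTER TABLE hits DROP INDEX idx_user_agent;
```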
-### Манипуляции с ограничениями (constraints) +### Манипуляции с ограничениями (constraints) {#manipuliatsii-s-ogranicheniiami-constraints} Про ограничения подробнее написано [тут](create.md#constraints). Добавить или удалить ограничение можно с помощью запросов -```sql + +``` sql ALTER TABLE [db].name ADD CONSTRAINT constraint_name CHECK expression; ALTER TABLE [db].name DROP CONSTRAINT constraint_name; ``` @@ -189,25 +195,25 @@ ALTER TABLE [db].name DROP CONSTRAINT constraint_name; Запрос на изменение ограничений для Replicated таблиц реплицируется, сохраняя новые метаданные в ZooKeeper и применяя изменения на всех репликах. -### Манипуляции с партициями и кусками {#alter_manipulations-with-partitions} +### Манипуляции с партициями и кусками {#alter-manipulations-with-partitions} Для работы с [партициями](../operations/table_engines/custom_partitioning_key.md) доступны следующие операции: -- [DETACH PARTITION](#alter_detach-partition) – перенести партицию в директорию `detached`; -- [DROP PARTITION](#alter_drop-partition) – удалить партицию; -- [ATTACH PARTITION|PART](#alter_attach-partition) – добавить партицию/кусок в таблицу из директории `detached`; -- [ATTACH PARTITION FROM](#alter_attach-partition-from) – скопировать партицию из другой таблицы; -- [REPLACE PARTITION](#alter_replace-partition) – скопировать партицию из другой таблицы с заменой; -- [MOVE PARTITION TO TABLE] (#alter_move_to_table-partition) - переместить партицию в другую таблицу; -- [CLEAR COLUMN IN PARTITION](#alter_clear-column-partition) – удалить все значения в столбце для заданной партиции; -- [CLEAR INDEX IN PARTITION](#alter_clear-index-partition) - очистить построенные вторичные индексы для заданной партиции; -- [FREEZE PARTITION](#alter_freeze-partition) – создать резервную копию партиции; -- [FETCH PARTITION](#alter_fetch-partition) – скачать партицию с другого сервера; -- [MOVE PARTITION|PART](#alter_move-partition) – переместить партицию/кускок на другой диск или том. +- [DETACH PARTITION](#alter_detach-partition) – перенести партицию в директорию `detached`; +- [DROP PARTITION](#alter_drop-partition) – удалить партицию; +- [ATTACH PARTITION\|PART](#alter_attach-partition) – добавить партицию/кусок в таблицу из директории `detached`; +- [ATTACH PARTITION FROM](#alter_attach-partition-from) – скопировать партицию из другой таблицы; +- [REPLACE PARTITION](#alter_replace-partition) – скопировать партицию из другой таблицы с заменой; +- [MOVE PARTITION TO TABLE](#alter_move_to_table-partition) - переместить партицию в другую таблицу; +- [CLEAR COLUMN IN PARTITION](#alter_clear-column-partition) – удалить все значения в столбце для заданной партиции; +- [CLEAR INDEX IN PARTITION](#alter_clear-index-partition) - очистить построенные вторичные индексы для заданной партиции; +- [FREEZE PARTITION](#alter_freeze-partition) – создать резервную копию партиции; +- [FETCH PARTITION](#alter_fetch-partition) – скачать партицию с другого сервера; +- [MOVE PARTITION\|PART](#alter_move-partition) – переместить партицию/кусок на другой диск или том.
-#### DETACH PARTITION {#alter_detach-partition} +#### DETACH PARTITION {#alter-detach-partition} -```sql +``` sql ALTER TABLE table_name DETACH PARTITION partition_expr ``` @@ -215,7 +221,7 @@ ALTER TABLE table_name DETACH PARTITION partition_expr Пример: -```sql +``` sql ALTER TABLE visits DETACH PARTITION 201901 ``` @@ -225,9 +231,9 @@ ALTER TABLE visits DETACH PARTITION 201901 Запрос реплицируется — данные будут перенесены в директорию `detached` и забыты на всех репликах. Обратите внимание, запрос может быть отправлен только на реплику-лидер. Чтобы узнать, является ли реплика лидером, выполните запрос `SELECT` к системной таблице [system.replicas](../operations/system_tables.md#system_tables-replicas). Либо можно выполнить запрос `DETACH` на всех репликах — тогда на всех репликах, кроме реплики-лидера, запрос вернет ошибку. -#### DROP PARTITION {#alter_drop-partition} +#### DROP PARTITION {#alter-drop-partition} -```sql +``` sql ALTER TABLE table_name DROP PARTITION partition_expr ``` @@ -237,24 +243,24 @@ ALTER TABLE table_name DROP PARTITION partition_expr Запрос реплицируется — данные будут удалены на всех репликах. -#### DROP DETACHED PARTITION|PART {#alter_drop-detached} +#### DROP DETACHED PARTITION\|PART {#alter-drop-detached} -```sql +``` sql ALTER TABLE table_name DROP DETACHED PARTITION|PART partition_expr ``` Удаляет из `detached` кусок или все куски, принадлежащие партиции. Подробнее о том, как корректно задать имя партиции, см. в разделе [Как задавать имя партиции в запросах ALTER](#alter-how-to-specify-part-expr). -#### ATTACH PARTITION|PART {#alter_attach-partition} +#### ATTACH PARTITION\|PART {#alter-attach-partition} -```sql +``` sql ALTER TABLE table_name ATTACH PARTITION|PART partition_expr ``` Добавляет данные в таблицу из директории `detached`. Можно добавить данные как для целой партиции, так и для отдельного куска. Примеры: -```sql +``` sql ALTER TABLE visits ATTACH PARTITION 201901; ALTER TABLE visits ATTACH PART 201901_2_2_0; ``` @@ -265,9 +271,9 @@ ALTER TABLE visits ATTACH PART 201901_2_2_0; Это означает, что вы можете разместить данные в директории `detached` на одной реплике и с помощью запроса `ALTER ... ATTACH` добавить их в таблицу на всех репликах. -#### ATTACH PARTITION FROM {#alter_attach-partition-from} +#### ATTACH PARTITION FROM {#alter-attach-partition-from} -```sql +``` sql ALTER TABLE table2 ATTACH PARTITION partition_expr FROM table1 ``` @@ -275,15 +281,14 @@ ALTER TABLE table2 ATTACH PARTITION partition_expr FROM table1 Следует иметь в виду: -- Таблицы должны иметь одинаковую структуру. -- Для таблиц должен быть задан одинаковый ключ партиционирования. +- Таблицы должны иметь одинаковую структуру. +- Для таблиц должен быть задан одинаковый ключ партиционирования. Подробнее о том, как корректно задать имя партиции, см. в разделе [Как задавать имя партиции в запросах ALTER](#alter-how-to-specify-part-expr). +#### REPLACE PARTITION {#alter-replace-partition} -#### REPLACE PARTITION {#alter_replace-partition} - -```sql +``` sql ALTER TABLE table2 REPLACE PARTITION partition_expr FROM table1 ``` @@ -291,12 +296,12 @@ ALTER TABLE table2 REPLACE PARTITION partition_expr FROM table1 Следует иметь в виду: -- Таблицы должны иметь одинаковую структуру. -- Для таблиц должен быть задан одинаковый ключ партиционирования. +- Таблицы должны иметь одинаковую структуру. +- Для таблиц должен быть задан одинаковый ключ партиционирования. Подробнее о том, как корректно задать имя партиции, см. 
в разделе [Как задавать имя партиции в запросах ALTER](#alter-how-to-specify-part-expr). -#### MOVE PARTITION TO TABLE {#alter_move_to_table-partition} +#### MOVE PARTITION TO TABLE {#alter-move-to-table-partition} ``` sql ALTER TABLE table_source MOVE PARTITION partition_expr TO TABLE table_dest @@ -306,13 +311,12 @@ ALTER TABLE table_source MOVE PARTITION partition_expr TO TABLE table_dest Следует иметь в виду: -- Таблицы должны иметь одинаковую структуру. -- Для таблиц должен быть задан одинаковый ключ партиционирования. +- Таблицы должны иметь одинаковую структуру. +- Для таблиц должен быть задан одинаковый ключ партиционирования. +#### CLEAR COLUMN IN PARTITION {#alter-clear-column-partition} -#### CLEAR COLUMN IN PARTITION {#alter_clear-column-partition} - -```sql +``` sql ALTER TABLE table_name CLEAR COLUMN column_name IN PARTITION partition_expr ``` @@ -320,21 +324,21 @@ ALTER TABLE table_name CLEAR COLUMN column_name IN PARTITION partition_expr Пример: -```sql +``` sql ALTER TABLE visits CLEAR COLUMN hour in PARTITION 201902 ``` -#### CLEAR INDEX IN PARTITION {#alter_clear-index-partition} +#### CLEAR INDEX IN PARTITION {#alter-clear-index-partition} -```sql +``` sql ALTER TABLE table_name CLEAR INDEX index_name IN PARTITION partition_expr ``` Работает как `CLEAR COLUMN`, но сбрасывает индексы вместо данных в столбцах. -#### FREEZE PARTITION {#alter_freeze-partition} +#### FREEZE PARTITION {#alter-freeze-partition} -```sql +``` sql ALTER TABLE table_name FREEZE [PARTITION partition_expr] ``` @@ -343,17 +347,17 @@ ALTER TABLE table_name FREEZE [PARTITION partition_expr] !!! note "Примечание" Создание резервной копии не требует остановки сервера. -Для таблиц старого стиля имя партиций можно задавать в виде префикса (например, '2019'). В этом случае резервные копии будут созданы для всех соответствующих партиций. Подробнее о том, как корректно задать имя партиции, см. в разделе [Как задавать имя партиции в запросах ALTER](#alter-how-to-specify-part-expr). +Для таблиц старого стиля имя партиций можно задавать в виде префикса (например, ‘2019’). В этом случае резервные копии будут созданы для всех соответствующих партиций. Подробнее о том, как корректно задать имя партиции, см. в разделе [Как задавать имя партиции в запросах ALTER](#alter-how-to-specify-part-expr). Запрос делает следующее — для текущего состояния таблицы он формирует жесткие ссылки на данные в этой таблице. Ссылки размещаются в директории `/var/lib/clickhouse/shadow/N/...`, где: -- `/var/lib/clickhouse/` — рабочая директория ClickHouse, заданная в конфигурационном файле; -- `N` — инкрементальный номер резервной копии. +- `/var/lib/clickhouse/` — рабочая директория ClickHouse, заданная в конфигурационном файле; +- `N` — инкрементальный номер резервной копии. !!! note "Примечание" При использовании [нескольких дисков для хранения данных таблицы](../operations/table_engines/mergetree.md#table_engine-mergetree-multiple-volumes) директория `shadow/N` появляется на каждом из дисков, на которых были куски, попавшие под выражение `PARTITION`. -Структура директорий внутри резервной копии такая же, как внутри `/var/lib/clickhouse/`. Запрос выполнит 'chmod' для всех файлов, запрещая запись в них. +Структура директорий внутри резервной копии такая же, как внутри `/var/lib/clickhouse/`. Запрос выполнит ‘chmod’ для всех файлов, запрещая запись в них. Обратите внимание, запрос `ALTER TABLE t FREEZE PARTITION` не реплицируется. Он создает резервную копию только на локальном сервере. 
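Например, резервную копию одной партиции можно создать так (набросок; имена таблицы и партиции условные):

``` sql
ALTER TABLE visits FREEZE PARTITION 201902;
```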
После создания резервной копии данные из `/var/lib/clickhouse/shadow/` можно скопировать на удалённый сервер, а локальную копию удалить. @@ -363,17 +367,17 @@ ALTER TABLE table_name FREEZE [PARTITION partition_expr] Чтобы восстановить данные из резервной копии, выполните следующее: -1. Создайте таблицу, если она ещё не существует. Запрос на создание можно взять из .sql файла (замените в нём `ATTACH` на `CREATE`). -2. Скопируйте данные из директории `data/database/table/` внутри резервной копии в директорию `/var/lib/clickhouse/data/database/table/detached/`. -3. С помощью запросов `ALTER TABLE t ATTACH PARTITION` добавьте данные в таблицу. +1. Создайте таблицу, если она ещё не существует. Запрос на создание можно взять из .sql файла (замените в нём `ATTACH` на `CREATE`). +2. Скопируйте данные из директории `data/database/table/` внутри резервной копии в директорию `/var/lib/clickhouse/data/database/table/detached/`. +3. С помощью запросов `ALTER TABLE t ATTACH PARTITION` добавьте данные в таблицу. Восстановление данных из резервной копии не требует остановки сервера. Подробнее о резервном копировании и восстановлении данных читайте в разделе [Резервное копирование данных](../operations/backup.md). -#### FETCH PARTITION {#alter_fetch-partition} +#### FETCH PARTITION {#alter-fetch-partition} -```sql +``` sql ALTER TABLE table_name FETCH PARTITION partition_expr FROM 'path-in-zookeeper' ``` @@ -381,40 +385,42 @@ ALTER TABLE table_name FETCH PARTITION partition_expr FROM 'path-in-zookeeper' Запрос выполняет следующее: -1. Загружает партицию с указанного шарда. Путь к шарду задается в секции `FROM` ('path-in-zookeeper'). Обратите внимание, нужно задавать путь к шарду в ZooKeeper. -2. Помещает загруженные данные в директорию `detached` таблицы `table_name`. Чтобы прикрепить эти данные к таблице, используйте запрос [ATTACH PARTITION|PART](#alter_attach-partition). +1. Загружает партицию с указанного шарда. Путь к шарду задается в секции `FROM` (‘path-in-zookeeper’). Обратите внимание, нужно задавать путь к шарду в ZooKeeper. +2. Помещает загруженные данные в директорию `detached` таблицы `table_name`. Чтобы прикрепить эти данные к таблице, используйте запрос [ATTACH PARTITION\|PART](#alter_attach-partition). Например: -```sql +``` sql ALTER TABLE users FETCH PARTITION 201902 FROM '/clickhouse/tables/01-01/visits'; ALTER TABLE users ATTACH PARTITION 201902; ``` + Следует иметь в виду: -- Запрос `ALTER TABLE t FETCH PARTITION` не реплицируется. Он загружает партицию в директорию `detached` только на локальном сервере. -- Запрос `ALTER TABLE t ATTACH` реплицируется — он добавляет данные в таблицу сразу на всех репликах. На одной из реплик данные будут добавлены из директории `detached`, а на других — из соседних реплик. +- Запрос `ALTER TABLE t FETCH PARTITION` не реплицируется. Он загружает партицию в директорию `detached` только на локальном сервере. +- Запрос `ALTER TABLE t ATTACH` реплицируется — он добавляет данные в таблицу сразу на всех репликах. На одной из реплик данные будут добавлены из директории `detached`, а на других — из соседних реплик. Перед загрузкой данных система проверяет, существует ли партиция и совпадает ли её структура со структурой таблицы. При этом автоматически выбирается наиболее актуальная реплика среди всех живых реплик. Несмотря на то что запрос называется `ALTER TABLE`, он не изменяет структуру таблицы и не изменяет сразу доступные данные в таблице. 
-#### MOVE PARTITION|PART {#alter_move-partition} +#### MOVE PARTITION\|PART {#alter-move-partition} Перемещает партицию или кусок данных на другой том или диск для таблиц с движком `MergeTree`. Смотрите [Хранение данных таблицы на нескольких блочных устройствах](../operations/table_engines/mergetree.md#table_engine-mergetree-multiple-volumes). -```sql +``` sql ALTER TABLE table_name MOVE PARTITION|PART partition_expr TO DISK|VOLUME 'disk_name' ``` + Запрос `ALTER TABLE t MOVE`: -- Не реплицируется, т.к. на разных репликах могут быть различные конфигурации политик хранения. -- Возвращает ошибку, если указан несконфигурированный том или диск. Ошибка также возвращается в случае невыполнения условий перемещения данных, которые указаны в конфигурации политики хранения. -- Может возвращать ошибку в случае, когда перемещаемые данные уже оказались перемещены в результате фонового процесса, конкурентного запроса `ALTER TABLE t MOVE` или как часть результата фоновой операции слияния. В данном случае никаких дополнительных действий от пользователя не требуется. +- Не реплицируется, т.к. на разных репликах могут быть различные конфигурации политик хранения. +- Возвращает ошибку, если указан несконфигурированный том или диск. Ошибка также возвращается в случае невыполнения условий перемещения данных, которые указаны в конфигурации политики хранения. +- Может возвращать ошибку в случае, когда перемещаемые данные уже оказались перемещены в результате фонового процесса, конкурентного запроса `ALTER TABLE t MOVE` или как часть результата фоновой операции слияния. В данном случае никаких дополнительных действий от пользователя не требуется. Примеры: -```sql +``` sql ALTER TABLE hits MOVE PART '20190301_14343_16206_438' TO VOLUME 'slow' ALTER TABLE hits MOVE PARTITION '2019-09-01' TO DISK 'fast_ssd' ``` @@ -423,10 +429,10 @@ ALTER TABLE hits MOVE PARTITION '2019-09-01' TO DISK 'fast_ssd' Чтобы задать нужную партицию в запросах `ALTER ... PARTITION`, можно использовать: -- Имя партиции. Посмотреть имя партиции можно в столбце `partition` системной таблицы [system.parts](../operations/system_tables.md#system_tables-parts). Например, `ALTER TABLE visits DETACH PARTITION 201901`. -- Произвольное выражение из столбцов исходной таблицы. Также поддерживаются константы и константные выражения. Например, `ALTER TABLE visits DETACH PARTITION toYYYYMM(toDate('2019-01-25'))`. -- Строковый идентификатор партиции. Идентификатор партиции используется для именования кусков партиции на файловой системе и в ZooKeeper. В запросах `ALTER` идентификатор партиции нужно указывать в секции `PARTITION ID`, в одинарных кавычках. Например, `ALTER TABLE visits DETACH PARTITION ID '201901'`. -- Для запросов [ATTACH PART](#alter_attach-partition) и [DROP DETACHED PART](#alter_drop-detached): чтобы задать имя куска партиции, используйте строковой литерал со значением из столбца `name` системной таблицы [system.detached_parts](../operations/system_tables.md#system_tables-detached_parts). Например, `ALTER TABLE visits ATTACH PART '201901_1_1_0'`. +- Имя партиции. Посмотреть имя партиции можно в столбце `partition` системной таблицы [system.parts](../operations/system_tables.md#system_tables-parts). Например, `ALTER TABLE visits DETACH PARTITION 201901`. +- Произвольное выражение из столбцов исходной таблицы. Также поддерживаются константы и константные выражения. Например, `ALTER TABLE visits DETACH PARTITION toYYYYMM(toDate('2019-01-25'))`. +- Строковый идентификатор партиции. 
Идентификатор партиции используется для именования кусков партиции на файловой системе и в ZooKeeper. В запросах `ALTER` идентификатор партиции нужно указывать в секции `PARTITION ID`, в одинарных кавычках. Например, `ALTER TABLE visits DETACH PARTITION ID '201901'`. +- Для запросов [ATTACH PART](#alter_attach-partition) и [DROP DETACHED PART](#alter_drop-detached): чтобы задать имя куска партиции, используйте строковой литерал со значением из столбца `name` системной таблицы [system.detached\_parts](../operations/system_tables.md#system_tables-detached_parts). Например, `ALTER TABLE visits ATTACH PART '201901_1_1_0'`. Использование кавычек в имени партиций зависит от типа данных столбца, по которому задано партиционирование. Например, для столбца с типом `String` имя партиции необходимо указывать в кавычках (одинарных). Для типов `Date` и `Int*` кавычки указывать не нужно. @@ -434,21 +440,21 @@ ALTER TABLE hits MOVE PARTITION '2019-09-01' TO DISK 'fast_ssd' Правила, сформулированные выше, актуальны также для запросов [OPTIMIZE](misc.md#misc_operations-optimize). Чтобы указать единственную партицию непартиционированной таблицы, укажите `PARTITION tuple()`. Например: -```sql +``` sql OPTIMIZE TABLE table_not_partitioned PARTITION tuple() FINAL; ``` Примеры запросов `ALTER ... PARTITION` можно посмотреть в тестах: [`00502_custom_partitioning_local`](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00502_custom_partitioning_local.sql) и [`00502_custom_partitioning_replicated_zookeeper`](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00502_custom_partitioning_replicated_zookeeper.sql). -### Манипуляции с TTL таблицы +### Манипуляции с TTL таблицы {#manipuliatsii-s-ttl-tablitsy} Вы можете изменить [TTL для таблицы](../operations/table_engines/mergetree.md#mergetree-table-ttl) запросом следующего вида: -```sql +``` sql ALTER TABLE table-name MODIFY TTL ttl-expression ``` -### Синхронность запросов ALTER +### Синхронность запросов ALTER {#sinkhronnost-zaprosov-alter} Для нереплицируемых таблиц, все запросы `ALTER` выполняются синхронно. Для реплицируемых таблиц, запрос всего лишь добавляет инструкцию по соответствующим действиям в `ZooKeeper`, а сами действия осуществляются при первой возможности. Но при этом, запрос может ждать завершения выполнения этих действий на всех репликах. @@ -463,19 +469,19 @@ ALTER TABLE table-name MODIFY TTL ttl-expression На данный момент доступны команды: -```sql +``` sql ALTER TABLE [db.]table DELETE WHERE filter_expr ``` Выражение `filter_expr` должно иметь тип `UInt8`. Запрос удаляет строки таблицы, для которых это выражение принимает ненулевое значение. -```sql +``` sql ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] WHERE filter_expr ``` Выражение `filter_expr` должно иметь тип `UInt8`. Запрос изменяет значение указанных столбцов на вычисленное значение соответствующих выражений в каждой строке, для которой `filter_expr` принимает ненулевое значение. Вычисленные значения преобразуются к типу столбца с помощью оператора `CAST`. Изменение столбцов, которые используются при вычислении первичного ключа или ключа партиционирования, не поддерживается. 
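Условные примеры таких мутаций (имена таблицы и столбцов гипотетические):

``` sql
-- удалить строки, удовлетворяющие условию
ALTER TABLE visits DELETE WHERE event_date < '2019-01-01';

-- заменить значения столбца в отфильтрованных строках
ALTER TABLE visits UPDATE browser = 'unknown' WHERE browser = '';
```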
-```sql +``` sql ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name ``` diff --git a/docs/ru/query_language/create.md b/docs/ru/query_language/create.md index ec334533aef..a121a504c8d 100644 --- a/docs/ru/query_language/create.md +++ b/docs/ru/query_language/create.md @@ -1,36 +1,36 @@ -## CREATE DATABASE {#query_language-create-database} +## CREATE DATABASE {#query-language-create-database} Создает базу данных. -```sql +``` sql CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] [ENGINE = engine(...)] ``` -### Секции +### Секции {#sektsii} -- `IF NOT EXISTS` +- `IF NOT EXISTS` - Если база данных с именем `db_name` уже существует, то ClickHouse не создаёт базу данных и: - - Не генерирует исключение, если секция указана. - - Генерирует исключение, если секция не указана. + Если база данных с именем `db_name` уже существует, то ClickHouse не создаёт базу данных и: + - Не генерирует исключение, если секция указана. + - Генерирует исключение, если секция не указана. -- `ON CLUSTER` +- `ON CLUSTER` - ClickHouse создаёт базу данных `db_name` на всех серверах указанного кластера. + ClickHouse создаёт базу данных `db_name` на всех серверах указанного кластера. -- `ENGINE` +- `ENGINE` - - [MySQL](../database_engines/mysql.md) + - [MySQL](../database_engines/mysql.md) - Позволяет получать данные с удаленного сервера MySQL. + Позволяет получать данные с удаленного сервера MySQL. - По умолчанию ClickHouse использует собственный [движок баз данных](../database_engines/index.md). + По умолчанию ClickHouse использует собственный [движок баз данных](../database_engines/index.md). ## CREATE TABLE {#create-table-query} Запрос `CREATE TABLE` может иметь несколько форм. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec] [TTL expr1], @@ -45,19 +45,19 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Описание столбца, это `name type`, в простейшем случае. Пример: `RegionID UInt32`. Также могут быть указаны выражения для значений по умолчанию - смотрите ниже. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name AS [db2.]name2 [ENGINE = engine] ``` Создаёт таблицу с такой же структурой, как другая таблица. Можно указать другой движок для таблицы. Если движок не указан, то будет выбран такой же движок, как у таблицы `db2.name2`. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name AS table_function() ``` Создаёт таблицу с такой же структурой и данными, как результат соответствующей табличной функцией. -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = engine AS SELECT ... ``` @@ -107,7 +107,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = engine AS SELECT ... Наряду с объявлением столбцов можно объявить ограничения на значения в столбцах таблицы: -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec] [TTL expr1], @@ -121,7 +121,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Добавление большого числа ограничений может негативно повлиять на производительность `INSERT` запросов. -### Выражение для TTL +### Выражение для TTL {#vyrazhenie-dlia-ttl} Определяет время хранения значений. Может быть указано только для таблиц семейства MergeTree. Подробнее смотрите в [TTL для столбцов и таблиц](../operations/table_engines/mergetree.md#table_engine-mergetree-ttl). 
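Для наглядности — условный набросок таблицы сразу с ограничением и с TTL таблицы (все имена и значения гипотетические):

``` sql
CREATE TABLE constraints_ttl_example
(
    event_date Date,
    user_id UInt64,
    price Float64,
    CONSTRAINT price_is_non_negative CHECK price >= 0
)
ENGINE = MergeTree()
ORDER BY user_id
TTL event_date + INTERVAL 90 DAY;
```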
@@ -129,7 +129,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] По умолчанию, ClickHouse применяет к столбцу метод сжатия, определённый в [конфигурации сервера](../operations/server_settings/settings.md#compression). Кроме этого, можно задать метод сжатия для каждого отдельного столбца в запросе `CREATE TABLE`. -```sql +``` sql CREATE TABLE codec_example ( dt Date CODEC(ZSTD), @@ -144,15 +144,15 @@ ENGINE = Если задать кодек для столбца, то кодек по умолчанию не применяется. Кодеки можно последовательно комбинировать, например, `CODEC(Delta, ZSTD)`. Чтобы выбрать наиболее подходящую для вашего проекта комбинацию кодеков, необходимо провести сравнительные тесты, подобные тем, что описаны в статье Altinity [New Encodings to Improve ClickHouse Efficiency](https://www.altinity.com/blog/2019/7/new-encodings-to-improve-clickhouse). -!!!warning "Предупреждение" +!!! warning "Предупреждение" Нельзя распаковать базу данных ClickHouse с помощью сторонних утилит наподобие `lz4`. Необходимо использовать специальную утилиту [clickhouse-compressor](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/programs/compressor). Сжатие поддерживается для следующих движков таблиц: -- [MergeTree family](../operations/table_engines/mergetree.md) -- [Log family](../operations/table_engines/log_family.md) -- [Set](../operations/table_engines/set.md) -- [Join](../operations/table_engines/join.md) +- [MergeTree family](../operations/table_engines/mergetree.md) +- [Log family](../operations/table_engines/log_family.md) +- [Set](../operations/table_engines/set.md) +- [Join](../operations/table_engines/join.md) ClickHouse поддерживает кодеки общего назначения и специализированные кодеки. @@ -162,14 +162,14 @@ ClickHouse поддерживает кодеки общего назначени Специализированные кодеки: -- `Delta(delta_bytes)` — Метод, в котором исходные значения заменяются разностью двух соседних значений, за исключением первого значения, которое остаётся неизменным. Для хранения разниц используется до `delta_bytes`, т.е. `delta_bytes` — это максимальный размер исходных данных. Возможные значения `delta_bytes`: 1, 2, 4, 8. Значение по умолчанию для `delta_bytes` равно `sizeof(type)`, если результат 1, 2, 4, or 8. Во всех других случаях — 1. -- `DoubleDelta` — Вычисляется разницу от разниц и сохраняет её в компакном бинарном виде. Оптимальная степень сжатия достигается для монотонных последовательностей с постоянным шагом, наподобие временных рядов. Можно использовать с любым типом данных фиксированного размера. Реализует алгоритм, используемый в TSDB Gorilla, поддерживает 64-битные типы данных. Использует 1 дополнительный бит для 32-байтовых значений: 5-битные префиксы вместо 4-битных префиксов. Подробнее читайте в разделе "Compressing Time Stamps" документа [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). -- `Gorilla` — Вычисляет XOR между текущим и предыдущим значением и записывает результат в компактной бинарной форме. Еффективно сохраняет ряды медленно изменяющихся чисел с плавающей запятой, поскольку наилучший коэффициент сжатия достигается, если соседние значения одинаковые. Реализует алгоритм, используемый в TSDB Gorilla, адаптируя его для работы с 64-битными значениями. Подробнее читайте в разделе "Compressing Values" документа [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). 
-- `T64` — Метод сжатия который обрезает неиспользуемые старшие биты целочисленных значений (включая `Enum`, `Date` и `DateTime`). На каждом шаге алгоритма, кодек помещает блок из 64 значений в матрицу 64✕64, транспонирует её, обрезает неиспользуемые биты, а то, что осталось возвращает в виде последовательности. Неиспользуемые биты, это биты, которые не изменяются от минимального к максимальному на всём диапазоне значений куска данных. +- `Delta(delta_bytes)` — Метод, в котором исходные значения заменяются разностью двух соседних значений, за исключением первого значения, которое остаётся неизменным. Для хранения разниц используется до `delta_bytes`, т.е. `delta_bytes` — это максимальный размер исходных данных. Возможные значения `delta_bytes`: 1, 2, 4, 8. Значение по умолчанию для `delta_bytes` равно `sizeof(type)`, если результат 1, 2, 4, or 8. Во всех других случаях — 1. +- `DoubleDelta` — Вычисляется разницу от разниц и сохраняет её в компакном бинарном виде. Оптимальная степень сжатия достигается для монотонных последовательностей с постоянным шагом, наподобие временных рядов. Можно использовать с любым типом данных фиксированного размера. Реализует алгоритм, используемый в TSDB Gorilla, поддерживает 64-битные типы данных. Использует 1 дополнительный бит для 32-байтовых значений: 5-битные префиксы вместо 4-битных префиксов. Подробнее читайте в разделе «Compressing Time Stamps» документа [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). +- `Gorilla` — Вычисляет XOR между текущим и предыдущим значением и записывает результат в компактной бинарной форме. Еффективно сохраняет ряды медленно изменяющихся чисел с плавающей запятой, поскольку наилучший коэффициент сжатия достигается, если соседние значения одинаковые. Реализует алгоритм, используемый в TSDB Gorilla, адаптируя его для работы с 64-битными значениями. Подробнее читайте в разделе «Compressing Values» документа [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). +- `T64` — Метод сжатия который обрезает неиспользуемые старшие биты целочисленных значений (включая `Enum`, `Date` и `DateTime`). На каждом шаге алгоритма, кодек помещает блок из 64 значений в матрицу 64✕64, транспонирует её, обрезает неиспользуемые биты, а то, что осталось возвращает в виде последовательности. Неиспользуемые биты, это биты, которые не изменяются от минимального к максимальному на всём диапазоне значений куска данных. Кодеки `DoubleDelta` и `Gorilla` используются в TSDB Gorilla как компоненты алгоритма сжатия. Подход Gorilla эффективен в сценариях, когда данные представляют собой медленно изменяющиеся во времени величины. Метки времени эффективно сжимаются кодеком `DoubleDelta`, а значения кодеком `Gorilla`. Например, чтобы создать эффективно хранящуюся таблицу, используйте следующую конфигурацию: -```sql +``` sql CREATE TABLE codec_example ( timestamp DateTime CODEC(DoubleDelta), @@ -182,28 +182,27 @@ ENGINE = MergeTree() Кодеки: -- `NONE` — без сжатия. -- `LZ4` — [алгоритм сжатия без потерь](https://github.com/lz4/lz4) используемый по умолчанию. Применяет быстрое сжатие LZ4. -- `LZ4HC[(level)]` — алгоритм LZ4 HC (high compression) с настраиваемым уровнем сжатия. Уровень по умолчанию — 9. Настройка `level <= 0` устанавливает уровень сжания по умолчанию. Возможные уровни сжатия: [1, 12]. Рекомендуемый диапазон уровней: [4, 9]. 
-- `ZSTD[(level)]` — [алгоритм сжатия ZSTD](https://en.wikipedia.org/wiki/Zstandard) с настраиваемым уровнем сжатия `level`. Возможные уровни сжатия: [1, 22]. Уровень сжатия по умолчанию: 1. +- `NONE` — без сжатия. +- `LZ4` — [алгоритм сжатия без потерь](https://github.com/lz4/lz4) используемый по умолчанию. Применяет быстрое сжатие LZ4. +- `LZ4HC[(level)]` — алгоритм LZ4 HC (high compression) с настраиваемым уровнем сжатия. Уровень по умолчанию — 9. Настройка `level <= 0` устанавливает уровень сжания по умолчанию. Возможные уровни сжатия: \[1, 12\]. Рекомендуемый диапазон уровней: \[4, 9\]. +- `ZSTD[(level)]` — [алгоритм сжатия ZSTD](https://en.wikipedia.org/wiki/Zstandard) с настраиваемым уровнем сжатия `level`. Возможные уровни сжатия: \[1, 22\]. Уровень сжатия по умолчанию: 1. -Высокие уровни сжатия полезны для ассимметричных сценариев, подобных "один раз сжал, много раз распаковал". Высокие уровни сжатия подразумеваю лучшее сжатие, но большее использование CPU. +Высокие уровни сжатия полезны для ассимметричных сценариев, подобных «один раз сжал, много раз распаковал». Высокие уровни сжатия подразумеваю лучшее сжатие, но большее использование CPU. - -## Временные таблицы +## Временные таблицы {#vremennye-tablitsy} ClickHouse поддерживает временные таблицы со следующими характеристиками: -- Временные таблицы исчезают после завершения сессии, в том числе при обрыве соединения. -- Временная таблица использует только модуль памяти. -- Невозможно указать базу данных для временной таблицы. Она создается вне баз данных. -- Невозможно создать временную таблицу распределнным DDL запросом на всех серверах кластера (с опцией `ON CLUSTER`): такая таблица существует только в рамках существующей сессии. -- Если временная таблица имеет то же имя, что и некоторая другая, то, при упоминании в запросе без указания БД, будет использована временная таблица. -- При распределённой обработке запроса, используемые в запросе временные таблицы, передаются на удалённые серверы. +- Временные таблицы исчезают после завершения сессии, в том числе при обрыве соединения. +- Временная таблица использует только модуль памяти. +- Невозможно указать базу данных для временной таблицы. Она создается вне баз данных. +- Невозможно создать временную таблицу распределнным DDL запросом на всех серверах кластера (с опцией `ON CLUSTER`): такая таблица существует только в рамках существующей сессии. +- Если временная таблица имеет то же имя, что и некоторая другая, то, при упоминании в запросе без указания БД, будет использована временная таблица. +- При распределённой обработке запроса, используемые в запросе временные таблицы, передаются на удалённые серверы. Чтобы создать временную таблицу, используйте следующий синтаксис: -```sql +``` sql CREATE TEMPORARY TABLE [IF NOT EXISTS] table_name ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -216,21 +215,21 @@ CREATE TEMPORARY TABLE [IF NOT EXISTS] table_name Вместо временных можно использовать обычные таблицы с [ENGINE = Memory](../operations/table_engines/memory.md). -## Распределенные DDL запросы (секция ON CLUSTER) +## Распределенные DDL запросы (секция ON CLUSTER) {#raspredelennye-ddl-zaprosy-sektsiia-on-cluster} Запросы `CREATE`, `DROP`, `ALTER`, `RENAME` поддерживают возможность распределенного выполнения на кластере. 
Например, следующий запрос создает распределенную (Distributed) таблицу `all_hits` на каждом хосте в `cluster`: -```sql +``` sql CREATE TABLE IF NOT EXISTS all_hits ON CLUSTER cluster (p Date, i Int32) ENGINE = Distributed(cluster, default, hits) ``` Для корректного выполнения таких запросов необходимо на каждом хосте иметь одинаковое определение кластера (для упрощения синхронизации конфигов можете использовать подстановки из ZooKeeper). Также необходимо подключение к ZooKeeper серверам. Локальная версия запроса в конечном итоге будет выполнена на каждом хосте кластера, даже если некоторые хосты в данный момент не доступны. Гарантируется упорядоченность выполнения запросов в рамках одного хоста. -## CREATE VIEW +## CREATE VIEW {#create-view} -```sql +``` sql CREATE [MATERIALIZED] VIEW [IF NOT EXISTS] [db.]table_name [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ... ``` @@ -240,19 +239,19 @@ CREATE [MATERIALIZED] VIEW [IF NOT EXISTS] [db.]table_name [TO[db.]name] [ENGINE Для примера, пусть вы создали представление: -```sql +``` sql CREATE VIEW view AS SELECT ... ``` и написали запрос: -```sql +``` sql SELECT a, b, c FROM view ``` Этот запрос полностью эквивалентен использованию подзапроса: -```sql +``` sql SELECT a, b, c FROM (SELECT ...) ``` @@ -266,9 +265,9 @@ SELECT a, b, c FROM (SELECT ...) Если указано POPULATE, то при создании представления, в него будут вставлены имеющиеся данные таблицы, как если бы был сделан запрос `CREATE TABLE ... AS SELECT ...` . Иначе, представление будет содержать только данные, вставляемые в таблицу после создания представления. Не рекомендуется использовать POPULATE, так как вставляемые в таблицу данные во время создания представления, не попадут в него. -Запрос `SELECT` может содержать `DISTINCT`, `GROUP BY`, `ORDER BY`, `LIMIT`... Следует иметь ввиду, что соответствующие преобразования будут выполняться независимо, на каждый блок вставляемых данных. Например, при наличии `GROUP BY`, данные будут агрегироваться при вставке, но только в рамках одной пачки вставляемых данных. Далее, данные не будут доагрегированы. Исключение - использование ENGINE, производящего агрегацию данных самостоятельно, например, `SummingMergeTree`. +Запрос `SELECT` может содержать `DISTINCT`, `GROUP BY`, `ORDER BY`, `LIMIT`… Следует иметь ввиду, что соответствующие преобразования будут выполняться независимо, на каждый блок вставляемых данных. Например, при наличии `GROUP BY`, данные будут агрегироваться при вставке, но только в рамках одной пачки вставляемых данных. Далее, данные не будут доагрегированы. Исключение - использование ENGINE, производящего агрегацию данных самостоятельно, например, `SummingMergeTree`. -Недоработано выполнение запросов `ALTER` над материализованными представлениями, поэтому они могут быть неудобными для использования. Если материализованное представление использует конструкцию ``TO [db.]name``, то можно выполнить ``DETACH`` представления, ``ALTER`` для целевой таблицы и последующий ``ATTACH`` ранее отсоединенного (``DETACH``) представления. +Недоработано выполнение запросов `ALTER` над материализованными представлениями, поэтому они могут быть неудобными для использования. Если материализованное представление использует конструкцию `TO [db.]name`, то можно выполнить `DETACH` представления, `ALTER` для целевой таблицы и последующий `ATTACH` ранее отсоединенного (`DETACH`) представления. Представления выглядят так же, как обычные таблицы. Например, они перечисляются в результате запроса `SHOW TABLES`. 
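Условный набросок описанного выше приёма `DETACH` / `ALTER` / `ATTACH` для материализованного представления, созданного с конструкцией `TO` (все имена гипотетические):

``` sql
-- отсоединяем материализованное представление
DETACH TABLE mv;

-- изменяем целевую таблицу
ALTER TABLE dest ADD COLUMN browser String;

-- присоединяем представление обратно
ATTACH TABLE mv;
```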
@@ -276,7 +275,7 @@ SELECT a, b, c FROM (SELECT ...) ## CREATE DICTIONARY {#create-dictionary-query} -```sql +``` sql CREATE DICTIONARY [IF NOT EXISTS] [db.]dictionary_name ( key1 type1 [DEFAULT|EXPRESSION expr1] [HIERARCHICAL|INJECTIVE|IS_OBJECT_ID], diff --git a/docs/ru/query_language/dicts/external_dicts.md b/docs/ru/query_language/dicts/external_dicts.md index 7e38cd9221d..586e443e3a9 100644 --- a/docs/ru/query_language/dicts/external_dicts.md +++ b/docs/ru/query_language/dicts/external_dicts.md @@ -1,19 +1,19 @@ -# Внешние словари {#dicts-external_dicts} +# Внешние словари {#dicts-external-dicts} -Существует возможность подключать собственные словари из различных источников данных. Источником данных для словаря может быть локальный текстовый/исполняемый файл, HTTP(s) ресурс или другая СУБД. Подробнее смотрите в разделе "[Источники внешних словарей](external_dicts_dict_sources.md)". +Существует возможность подключать собственные словари из различных источников данных. Источником данных для словаря может быть локальный текстовый/исполняемый файл, HTTP(s) ресурс или другая СУБД. Подробнее смотрите в разделе «[Источники внешних словарей](external_dicts_dict_sources.md)». ClickHouse: - Полностью или частично хранит словари в оперативной памяти. - Периодически обновляет их и динамически подгружает отсутствующие значения. - Позволяет создавать внешние словари с помощью xml-файлов или [DDL-запросов](../create.md#create-dictionary-query). -Конфигурация внешних словарей может находится в одном или нескольких xml-файлах. Путь к конфигурации указывается в параметре [dictionaries_config](../../operations/server_settings/settings.md). +Конфигурация внешних словарей может находится в одном или нескольких xml-файлах. Путь к конфигурации указывается в параметре [dictionaries\_config](../../operations/server_settings/settings.md). -Словари могут загружаться при старте сервера или при первом использовании, в зависимости от настройки [dictionaries_lazy_load](../../operations/server_settings/settings.md). +Словари могут загружаться при старте сервера или при первом использовании, в зависимости от настройки [dictionaries\_lazy\_load](../../operations/server_settings/settings.md). Конфигурационный файл словарей имеет вид: -```xml +``` xml Необязательный элемент с любым содержимым. Игнорируется сервером ClickHouse. @@ -40,14 +40,13 @@ ClickHouse: !!! attention "Внимание" Можно преобразовывать значения по небольшому словарю, описав его в запросе `SELECT` (см. функцию [transform](../functions/other_functions.md)). Эта функциональность не связана с внешними словарями. 
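К замечанию выше — условный пример такого «словаря» прямо в запросе через функцию [transform](../functions/other_functions.md); имена столбца и таблицы гипотетические:

``` sql
SELECT transform(country_code, ['RU', 'GB'], ['Россия', 'Великобритания'], 'Другое') AS country_name
FROM visits;
```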
- ## Смотрите также {#ext-dicts-see-also} -- [Настройка внешнего словаря](external_dicts_dict.md) -- [Хранение словарей в памяти](external_dicts_dict_layout.md) -- [Обновление словарей](external_dicts_dict_lifetime.md) -- [Источники внешних словарей](external_dicts_dict_sources.md) -- [Ключ и поля словаря](external_dicts_dict_structure.md) -- [Функции для работы с внешними словарями](../functions/ext_dict_functions.md#ext_dict_functions) +- [Настройка внешнего словаря](external_dicts_dict.md) +- [Хранение словарей в памяти](external_dicts_dict_layout.md) +- [Обновление словарей](external_dicts_dict_lifetime.md) +- [Источники внешних словарей](external_dicts_dict_sources.md) +- [Ключ и поля словаря](external_dicts_dict_structure.md) +- [Функции для работы с внешними словарями](../functions/ext_dict_functions.md#ext_dict_functions) [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/dicts/external_dicts/) diff --git a/docs/ru/query_language/dicts/external_dicts_dict.md b/docs/ru/query_language/dicts/external_dicts_dict.md index 61946c10ee8..dcb8c6652b0 100644 --- a/docs/ru/query_language/dicts/external_dicts_dict.md +++ b/docs/ru/query_language/dicts/external_dicts_dict.md @@ -1,8 +1,8 @@ -# Настройка внешнего словаря {#dicts-external_dicts_dict} +# Настройка внешнего словаря {#dicts-external-dicts-dict} XML-конфигурация словаря имеет следующую структуру: -```xml +``` xml dict_name @@ -26,7 +26,7 @@ XML-конфигурация словаря имеет следующую стр Соответствующий [DDL-запрос](../create.md#create-dictionary-query) имеет следующий вид: -```sql +``` sql CREATE DICTIONARY dict_name ( ... -- attributes @@ -37,10 +37,10 @@ LAYOUT(...) -- Memory layout configuration LIFETIME(...) -- Lifetime of dictionary in memory ``` -- `name` — Идентификатор, под которым словарь будет доступен для использования. Используйте символы `[a-zA-Z0-9_\-]`. -- [source](external_dicts_dict_sources.md) — Источник словаря. -- [layout](external_dicts_dict_layout.md) — Размещение словаря в памяти. -- [structure](external_dicts_dict_structure.md) — Структура словаря. Ключ и атрибуты, которые можно получить по ключу. -- [lifetime](external_dicts_dict_lifetime.md) — Периодичность обновления словарей. +- `name` — Идентификатор, под которым словарь будет доступен для использования. Используйте символы `[a-zA-Z0-9_\-]`. +- [source](external_dicts_dict_sources.md) — Источник словаря. +- [layout](external_dicts_dict_layout.md) — Размещение словаря в памяти. +- [structure](external_dicts_dict_structure.md) — Структура словаря. Ключ и атрибуты, которые можно получить по ключу. +- [lifetime](external_dicts_dict_lifetime.md) — Периодичность обновления словарей. [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/dicts/external_dicts_dict/) diff --git a/docs/ru/query_language/dicts/external_dicts_dict_hierarchical.md b/docs/ru/query_language/dicts/external_dicts_dict_hierarchical.md index b0d8125177b..335b882a97c 100644 --- a/docs/ru/query_language/dicts/external_dicts_dict_hierarchical.md +++ b/docs/ru/query_language/dicts/external_dicts_dict_hierarchical.md @@ -1,17 +1,17 @@ -# Иерархические словари +# Иерархические словари {#ierarkhicheskie-slovari} ClickHouse поддерживает иерархические словари с [числовыми ключом](external_dicts_dict_structure.md#ext_dict-numeric-key). 
Рассмотрим следующую структуру: -```text +``` text 0 (Common parent) │ ├── 1 (Russia) │ │ -│ └── 2 (Moscow) +│ └── 2 (Moscow) │ │ -│ └── 3 (Center) +│ └── 3 (Center) │ └── 4 (Great Britain) │ @@ -20,13 +20,13 @@ ClickHouse поддерживает иерархические словари с Эту иерархию можно выразить в виде следующей таблицы-словаря. -region_id | parent_region | region_name -----|--------------|------ -1 | 0 | Russia -2 | 1 | Moscow -3 | 2 | Center -4 | 0 | Great Britain -5 | 4 | London +| region\_id | parent\_region | region\_name | +|------------|----------------|---------------| +| 1 | 0 | Russia | +| 2 | 1 | Moscow | +| 3 | 2 | Center | +| 4 | 0 | Great Britain | +| 5 | 4 | London | Таблица содержит столбец `parent_region`, содержащий ключ ближайшего предка для текущего элемента. @@ -36,7 +36,7 @@ ClickHouse поддерживает свойство [hierarchical](external_dic Структура словаря для нашего примера может выглядеть следующим образом: -```xml +``` xml diff --git a/docs/ru/query_language/dicts/external_dicts_dict_layout.md b/docs/ru/query_language/dicts/external_dicts_dict_layout.md index 0b0f8ab4b82..2344577ccb9 100644 --- a/docs/ru/query_language/dicts/external_dicts_dict_layout.md +++ b/docs/ru/query_language/dicts/external_dicts_dict_layout.md @@ -1,26 +1,26 @@ -# Хранение словарей в памяти {#dicts-external_dicts_dict_layout} +# Хранение словарей в памяти {#dicts-external-dicts-dict-layout} Словари можно размещать в памяти множеством способов. -Рекомендуем [flat](#flat), [hashed](#hashed) и [complex_key_hashed](#complex-key-hashed). Скорость обработки словарей при этом максимальна. +Рекомендуем [flat](#flat), [hashed](#hashed) и [complex\_key\_hashed](#complex-key-hashed). Скорость обработки словарей при этом максимальна. -Размещение с кэшированием не рекомендуется использовать из-за потенциально низкой производительности и сложностей в подборе оптимальных параметров. Читайте об этом подробнее в разделе "[cache](#cache)". +Размещение с кэшированием не рекомендуется использовать из-за потенциально низкой производительности и сложностей в подборе оптимальных параметров. Читайте об этом подробнее в разделе «[cache](#cache)». Повысить производительность словарей можно следующими способами: -- Вызывать функцию для работы со словарём после `GROUP BY`. -- Помечать извлекаемые атрибуты как инъективные. Атрибут называется инъективным, если разным ключам соответствуют разные значения атрибута. Тогда при использовании в `GROUP BY` функции, достающей значение атрибута по ключу, эта функция автоматически выносится из `GROUP BY`. +- Вызывать функцию для работы со словарём после `GROUP BY`. +- Помечать извлекаемые атрибуты как инъективные. Атрибут называется инъективным, если разным ключам соответствуют разные значения атрибута. Тогда при использовании в `GROUP BY` функции, достающей значение атрибута по ключу, эта функция автоматически выносится из `GROUP BY`. При ошибках работы со словарями ClickHouse генерирует исключения. Например, в следующих ситуациях: -- При обращении к словарю, который не удалось загрузить. -- При ошибке запроса к `cached`-словарю. +- При обращении к словарю, который не удалось загрузить. +- При ошибке запроса к `cached`-словарю. Список внешних словарей и их статус можно посмотреть в таблице `system.dictionaries`. Общий вид конфигурации: -```xml +``` xml ... @@ -34,28 +34,27 @@ ``` - Соответствущий [DDL-запрос](../create.md#create-dictionary-query): -```sql +``` sql CREATE DICTIONARY (...) ... LAYOUT(LAYOUT_TYPE(param value)) -- layout settings ... 
``` -## Способы размещения словарей в памяти +## Способы размещения словарей в памяти {#sposoby-razmeshcheniia-slovarei-v-pamiati} -- [flat](#flat) -- [hashed](#hashed) -- [sparse_hashed](#dicts-external_dicts_dict_layout-sparse_hashed) -- [cache](#cache) -- [range_hashed](#range-hashed) -- [complex_key_hashed](#complex-key-hashed) -- [complex_key_cache](#complex-key-cache) -- [ip_trie](#ip-trie) +- [flat](#flat) +- [hashed](#hashed) +- [sparse\_hashed](#dicts-external_dicts_dict_layout-sparse_hashed) +- [cache](#cache) +- [range\_hashed](#range-hashed) +- [complex\_key\_hashed](#complex-key-hashed) +- [complex\_key\_cache](#complex-key-cache) +- [ip\_trie](#ip-trie) -### flat +### flat {#flat} Словарь полностью хранится в оперативной памяти в виде плоских массивов. Объём памяти, занимаемой словарём пропорционален размеру самого большого по размеру ключа. @@ -67,7 +66,7 @@ LAYOUT(LAYOUT_TYPE(param value)) -- layout settings Пример конфигурации: -```xml +``` xml @@ -75,11 +74,11 @@ LAYOUT(LAYOUT_TYPE(param value)) -- layout settings или -```sql +``` sql LAYOUT(FLAT()) ``` -### hashed +### hashed {#hashed} Словарь полностью хранится в оперативной памяти в виде хэш-таблиц. Словарь может содержать произвольное количество элементов с произвольными идентификаторами. На практике, количество ключей может достигать десятков миллионов элементов. @@ -87,7 +86,7 @@ LAYOUT(FLAT()) Пример конфигурации: -```xml +``` xml @@ -95,17 +94,17 @@ LAYOUT(FLAT()) или -```sql +``` sql LAYOUT(HASHED()) ``` -### sparse_hashed {#dicts-external_dicts_dict_layout-sparse_hashed} +### sparse\_hashed {#dicts-external-dicts-dict-layout-sparse-hashed} Аналогичен `hashed`, но при этом занимает меньше места в памяти и генерирует более высокую загрузку CPU. Пример конфигурации: -```xml +``` xml @@ -113,17 +112,17 @@ LAYOUT(HASHED()) или -```sql +``` sql LAYOUT(SPARSE_HASHED()) ``` -### complex_key_hashed +### complex\_key\_hashed {#complex-key-hashed} Тип размещения предназначен для использования с составными [ключами](external_dicts_dict_structure.md). Аналогичен `hashed`. Пример конфигурации: -```xml +``` xml @@ -131,12 +130,11 @@ LAYOUT(SPARSE_HASHED()) или -```sql +``` sql LAYOUT(COMPLEX_KEY_HASHED()) ``` - -### range_hashed +### range\_hashed {#range-hashed} Словарь хранится в оперативной памяти в виде хэш-таблицы с упорядоченным массивом диапазонов и соответствующих им значений. @@ -144,7 +142,7 @@ LAYOUT(COMPLEX_KEY_HASHED()) Пример: таблица содержит скидки для каждого рекламодателя в виде: -```text +``` text +---------------+---------------------+-------------------+--------+ | advertiser id | discount start date | discount end date | amount | +===============+=====================+===================+========+ @@ -156,11 +154,11 @@ LAYOUT(COMPLEX_KEY_HASHED()) +---------------+---------------------+-------------------+--------+ ``` -Чтобы использовать выборку по диапазонам дат, необходимо в [structure](external_dicts_dict_structure.md) определить элементы `range_min`, `range_max`. В этих элементах должны присутствовать элементы `name` и `type` (если `type` не указан, будет использован тип по умолчанию -- Date). `type` может быть любым численным типом (Date/DateTime/UInt64/Int32/др.). +Чтобы использовать выборку по диапазонам дат, необходимо в [structure](external_dicts_dict_structure.md) определить элементы `range_min`, `range_max`. В этих элементах должны присутствовать элементы `name` и `type` (если `type` не указан, будет использован тип по умолчанию – Date). 
`type` может быть любым численным типом (Date/DateTime/UInt64/Int32/др.). Пример: -```xml +``` xml Id @@ -171,14 +169,14 @@ LAYOUT(COMPLEX_KEY_HASHED()) last - Date + Date ... ``` или -```sql +``` sql CREATE DICTIONARY somedict ( id UInt64, first Date, @@ -191,19 +189,19 @@ RANGE(MIN first MAX last) Для работы с такими словарями в функцию `dictGetT` необходимо передавать дополнительный аргумент, для которого подбирается диапазон: - dictGetT('dict_name', 'attr_name', id, date) + dictGetT('dict_name', 'attr_name', id, date) Функция возвращает значение для заданных `id` и диапазона дат, в который входит переданная дата. Особенности алгоритма: -- Если не найден `id` или для найденного `id` не найден диапазон, то возвращается значение по умолчанию для словаря. -- Если есть перекрывающиеся диапазоны, то можно использовать любой подходящий. -- Если граница диапазона `NULL` или некорректная дата (1900-01-01, 2039-01-01), то диапазон считается открытым. Диапазон может быть открытым с обеих сторон. +- Если не найден `id` или для найденного `id` не найден диапазон, то возвращается значение по умолчанию для словаря. +- Если есть перекрывающиеся диапазоны, то можно использовать любой подходящий. +- Если граница диапазона `NULL` или некорректная дата (1900-01-01, 2039-01-01), то диапазон считается открытым. Диапазон может быть открытым с обеих сторон. Пример конфигурации: -```xml +``` xml @@ -219,7 +217,7 @@ RANGE(MIN first MAX last) StartTimeStamp - UInt64 + UInt64 EndTimeStamp @@ -238,7 +236,7 @@ RANGE(MIN first MAX last) или -```sql +``` sql CREATE DICTIONARY somedict( Abcdef UInt64, StartTimeStamp UInt64, @@ -249,7 +247,7 @@ PRIMARY KEY Abcdef RANGE(MIN StartTimeStamp MAX EndTimeStamp) ``` -### cache +### cache {#cache} Словарь хранится в кэше, состоящем из фиксированного количества ячеек. Ячейки содержат часто используемые элементы. @@ -265,7 +263,7 @@ RANGE(MIN StartTimeStamp MAX EndTimeStamp) Пример настройки: -```xml +``` xml @@ -276,33 +274,31 @@ RANGE(MIN StartTimeStamp MAX EndTimeStamp) или -```sql +``` sql LAYOUT(CACHE(SIZE_IN_CELLS 1000000000)) ``` Укажите достаточно большой размер кэша. Количество ячеек следует подобрать экспериментальным путём: -1. Выставить некоторое значение. -2. Запросами добиться полной заполненности кэша. -3. Оценить потребление оперативной памяти с помощью таблицы `system.dictionaries`. -4. Увеличивать/уменьшать количество ячеек до получения требуемого расхода оперативной памяти. +1. Выставить некоторое значение. +2. Запросами добиться полной заполненности кэша. +3. Оценить потребление оперативной памяти с помощью таблицы `system.dictionaries`. +4. Увеличивать/уменьшать количество ячеек до получения требуемого расхода оперативной памяти. -!!! warning +!!! warning "Warning" Не используйте в качестве источника ClickHouse, поскольку он медленно обрабатывает запросы со случайным чтением. - -### complex_key_cache +### complex\_key\_cache {#complex-key-cache} Тип размещения предназначен для использования с составными [ключами](external_dicts_dict_structure.md). Аналогичен `cache`. - -### ip_trie +### ip\_trie {#ip-trie} Тип размещения предназначен для сопоставления префиксов сети (IP адресов) с метаданными, такими как ASN. 
Пример: таблица содержит префиксы сети и соответствующие им номера AS и коды стран: -```text +``` text +-----------------+-------+--------+ | prefix | asn | cca2 | +=================+=======+========+ @@ -320,7 +316,7 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000)) Пример: -```xml +``` xml @@ -343,7 +339,7 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000)) или -```sql +``` sql CREATE DICTIONARY somedict ( prefix String, asn UInt32, @@ -356,13 +352,13 @@ PRIMARY KEY prefix Для запросов необходимо использовать те же функции (`dictGetT` с кортежем), что и для словарей с составными ключами: -```sql +``` sql dictGetT('dict_name', 'attr_name', tuple(ip)) ``` Функция принимает либо `UInt32` для IPv4, либо `FixedString(16)` для IPv6: -```sql +``` sql dictGetString('prefix', 'asn', tuple(IPv6StringToNum('2001:db8::1'))) ``` diff --git a/docs/ru/query_language/dicts/external_dicts_dict_lifetime.md b/docs/ru/query_language/dicts/external_dicts_dict_lifetime.md index d18ac8b38af..e886a3e7ef6 100644 --- a/docs/ru/query_language/dicts/external_dicts_dict_lifetime.md +++ b/docs/ru/query_language/dicts/external_dicts_dict_lifetime.md @@ -1,5 +1,4 @@ - -# Обновление словарей +# Обновление словарей {#obnovlenie-slovarei} ClickHouse периодически обновляет словари. Интервал обновления для полностью загружаемых словарей и интервал инвалидации для кэшируемых словарей определяется в теге `` в секундах. @@ -7,30 +6,30 @@ ClickHouse периодически обновляет словари. Инте Пример настройки: -```xml +``` xml ... 300 ... ``` + или -```sql +``` sql CREATE DICTIONARY (...) ... LIFETIME(300) ... ``` - Настройка `0` запрещает обновление словарей. Можно задать интервал, внутри которого ClickHouse равномерно-случайно выберет время для обновления. Это необходимо для распределения нагрузки на источник словаря при обновлении на большом количестве серверов. Пример настройки: -```xml +``` xml ... @@ -43,24 +42,24 @@ LIFETIME(300) или -```sql +``` sql LIFETIME(MIN 300 MAX 360) ``` При обновлении словарей сервер ClickHouse применяет различную логику в зависимости от типа [источника](external_dicts_dict_sources.md): -> - У текстового файла проверяется время модификации. Если время изменилось по отношению к запомненному ранее, то словарь обновляется. -> - Для таблиц типа MyISAM, время модификации проверяется запросом `SHOW TABLE STATUS`. -> - Словари из других источников по умолчанию обновляются каждый раз. +> - У текстового файла проверяется время модификации. Если время изменилось по отношению к запомненному ранее, то словарь обновляется. +> - Для таблиц типа MyISAM, время модификации проверяется запросом `SHOW TABLE STATUS`. +> - Словари из других источников по умолчанию обновляются каждый раз. Для источников MySQL (InnoDB), ODBC и ClickHouse можно настроить запрос, который позволит обновлять словари только в случае их фактического изменения, а не каждый раз. Чтобы это сделать необходимо выполнить следующие условия/действия: -> - В таблице словаря должно быть поле, которое гарантированно изменяется при обновлении данных в источнике. -> - В настройках источника указывается запрос, который получает изменяющееся поле. Результат запроса сервер ClickHouse интерпретирует как строку и если эта строка изменилась по отношению к предыдущему состоянию, то словарь обновляется. Запрос следует указывать в поле `` настроек [источника](external_dicts_dict_sources.md). +> - В таблице словаря должно быть поле, которое гарантированно изменяется при обновлении данных в источнике. +> - В настройках источника указывается запрос, который получает изменяющееся поле. 
Результат запроса сервер ClickHouse интерпретирует как строку и если эта строка изменилась по отношению к предыдущему состоянию, то словарь обновляется. Запрос следует указывать в поле `` настроек [источника](external_dicts_dict_sources.md). Пример настройки: -```xml +``` xml ... @@ -73,7 +72,7 @@ LIFETIME(MIN 300 MAX 360) или -```sql +``` sql ... SOURCE(ODBC(... invalidate_query 'SELECT update_time FROM dictionary_source where id = 1')) ... diff --git a/docs/ru/query_language/dicts/external_dicts_dict_sources.md b/docs/ru/query_language/dicts/external_dicts_dict_sources.md index 58caf501847..402f1ac345f 100644 --- a/docs/ru/query_language/dicts/external_dicts_dict_sources.md +++ b/docs/ru/query_language/dicts/external_dicts_dict_sources.md @@ -1,11 +1,10 @@ - -# Источники внешних словарей {#dicts-external_dicts_dict_sources} +# Источники внешних словарей {#dicts-external-dicts-dict-sources} Внешний словарь можно подключить из множества источников. Общий вид XML-конфигурации: -```xml +``` xml ... @@ -22,34 +21,32 @@ Аналогичный [DDL-запрос](../create.md#create-dictionary-query): -```sql +``` sql CREATE DICTIONARY dict_name (...) ... SOURCE(SOURCE_TYPE(param1 val1 ... paramN valN)) -- Source configuration ... ``` - Источник настраивается в разделе `source`. Типы источников (`source_type`): -- [Локальный файл](#dicts-external_dicts_dict_sources-local_file) -- [Исполняемый файл](#dicts-external_dicts_dict_sources-executable) -- [HTTP(s)](#dicts-external_dicts_dict_sources-http) +- [Локальный файл](#dicts-external_dicts_dict_sources-local_file) +- [Исполняемый файл](#dicts-external_dicts_dict_sources-executable) +- [HTTP(s)](#dicts-external_dicts_dict_sources-http) - СУБД: - - [ODBC](#dicts-external_dicts_dict_sources-odbc) - - [MySQL](#dicts-external_dicts_dict_sources-mysql) - - [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) - - [MongoDB](#dicts-external_dicts_dict_sources-mongodb) - - [Redis](#dicts-external_dicts_dict_sources-redis) + - [ODBC](#dicts-external_dicts_dict_sources-odbc) + - [MySQL](#dicts-external_dicts_dict_sources-mysql) + - [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) + - [MongoDB](#dicts-external_dicts_dict_sources-mongodb) + - [Redis](#dicts-external_dicts_dict_sources-redis) - -## Локальный файл {#dicts-external_dicts_dict_sources-local_file} +## Локальный файл {#dicts-external-dicts-dict-sources-local-file} Пример настройки: -```xml +``` xml /opt/dictionaries/os.tsv @@ -60,23 +57,22 @@ SOURCE(SOURCE_TYPE(param1 val1 ... paramN valN)) -- Source configuration или -```sql +``` sql SOURCE(FILE(path '/opt/dictionaries/os.tsv' format 'TabSeparated')) ``` Поля настройки: -- `path` — Абсолютный путь к файлу. -- `format` — Формат файла. Поддерживаются все форматы, описанные в разделе "[Форматы](../../interfaces/formats.md#formats)". +- `path` — Абсолютный путь к файлу. +- `format` — Формат файла. Поддерживаются все форматы, описанные в разделе «[Форматы](../../interfaces/formats.md#formats)». - -## Исполняемый файл {#dicts-external_dicts_dict_sources-executable} +## Исполняемый файл {#dicts-external-dicts-dict-sources-executable} Работа с исполняемым файлом зависит от [размещения словаря в памяти](external_dicts_dict_layout.md). Если тип размещения словаря `cache` и `complex_key_cache`, то ClickHouse запрашивает необходимые ключи, отправляя запрос в `STDIN` исполняемого файла. 
Пример настройки: -```xml +``` xml cat /opt/dictionaries/os.tsv @@ -87,23 +83,22 @@ SOURCE(FILE(path '/opt/dictionaries/os.tsv' format 'TabSeparated')) или -```sql +``` sql SOURCE(EXECUTABLE(command 'cat /opt/dictionaries/os.tsv' format 'TabSeparated')) ``` Поля настройки: - `command` — Абсолютный путь к исполняемому файлу или имя файла (если каталог программы прописан в `PATH`). -- `format` — Формат файла. Поддерживаются все форматы, описанные в разделе "[Форматы](../../interfaces/formats.md#formats)". +- `format` — Формат файла. Поддерживаются все форматы, описанные в разделе «[Форматы](../../interfaces/formats.md#formats)». - -## HTTP(s) {#dicts-external_dicts_dict_sources-http} +## HTTP(s) {#dicts-external-dicts-dict-sources-http} Работа с HTTP(s) сервером зависит от [размещения словаря в памяти](external_dicts_dict_layout.md). Если тип размещения словаря `cache` и `complex_key_cache`, то ClickHouse запрашивает необходимые ключи, отправляя запрос методом `POST`. Пример настройки: -```xml +``` xml http://[::1]/os.tsv @@ -124,7 +119,7 @@ SOURCE(EXECUTABLE(command 'cat /opt/dictionaries/os.tsv' format 'TabSeparated')) или -```sql +``` sql SOURCE(HTTP( url 'http://[::1]/os.tsv' format 'TabSeparated' @@ -138,16 +133,15 @@ SOURCE(HTTP( Поля настройки: - `url` — URL источника. -- `format` — Формат файла. Поддерживаются все форматы, описанные в разделе "[Форматы](../../interfaces/formats.md#formats)". +- `format` — Формат файла. Поддерживаются все форматы, описанные в разделе «[Форматы](../../interfaces/formats.md#formats)». - -## ODBC {#dicts-external_dicts_dict_sources-odbc} +## ODBC {#dicts-external-dicts-dict-sources-odbc} Этим способом можно подключить любую базу данных, имеющую ODBC драйвер. Пример настройки: -```xml +``` xml DatabaseName @@ -160,7 +154,7 @@ SOURCE(HTTP( или -```sql +``` sql SOURCE(ODBC( db 'DatabaseName' table 'SchemaName.TableName' @@ -171,25 +165,25 @@ SOURCE(ODBC( Поля настройки: -- `db` — имя базы данных. Не указывать, если имя базы задано в параметрах. ``. -- `table` — имя таблицы и схемы, если она есть. -- `connection_string` — строка соединения. -- `invalidate_query` — запрос для проверки статуса словаря. Необязательный параметр. Читайте подробнее в разделе [Обновление словарей](external_dicts_dict_lifetime.md). +- `db` — имя базы данных. Не указывать, если имя базы задано в параметрах. ``. +- `table` — имя таблицы и схемы, если она есть. +- `connection_string` — строка соединения. +- `invalidate_query` — запрос для проверки статуса словаря. Необязательный параметр. Читайте подробнее в разделе [Обновление словарей](external_dicts_dict_lifetime.md). ClickHouse получает от ODBC-драйвера информацию о квотировании и квотирует настройки в запросах к драйверу, поэтому имя таблицы нужно указывать в соответствии с регистром имени таблицы в базе данных. Если у вас есть проблемы с кодировками при использовании Oracle, ознакомьтесь с соответствующим разделом [FAQ](../../faq/general.md#oracle-odbc-encodings). -### Выявленная уязвимость в функционировании ODBC словарей +### Выявленная уязвимость в функционировании ODBC словарей {#vyiavlennaia-uiazvimost-v-funktsionirovanii-odbc-slovarei} -!!! attention +!!! attention "Attention" При соединении с базой данных через ODBC можно заменить параметр соединения `Servername`. В этом случае, значения `USERNAME` и `PASSWORD` из `odbc.ini` отправляются на удаленный сервер и могут быть скомпрометированы. **Пример небезопасного использования** Сконфигурируем unixODBC для работы с PostgreSQL. 
Содержимое `/etc/odbc.ini`: -```text +``` text [gregtest] Driver = /usr/lib/psqlodbca.so Servername = localhost @@ -202,23 +196,25 @@ PASSWORD = test Если выполнить запрос вида: -```sql -SELECT * FROM odbc('DSN=gregtest;Servername=some-server.com', 'test_db'); +``` sql +SELECT * FROM odbc('DSN=gregtest;Servername=some-server.com', 'test_db'); ``` то ODBC драйвер отправит значения `USERNAME` и `PASSWORD` из `odbc.ini` на `some-server.com`. -### Пример подключения PostgreSQL +### Пример подключения PostgreSQL {#primer-podkliucheniia-postgresql} ОС Ubuntu. Установка unixODBC и ODBC-драйвера для PostgreSQL: : -```bash + +``` bash $ sudo apt-get install -y unixodbc odbcinst odbc-postgresql ``` + Настройка `/etc/odbc.ini` (или `~/.odbc.ini`): -```text +``` text [DEFAULT] Driver = myconnection @@ -239,7 +235,7 @@ $ sudo apt-get install -y unixodbc odbcinst odbc-postgresql Конфигурация словаря в ClickHouse: -```xml +``` xml table_name @@ -274,7 +270,7 @@ $ sudo apt-get install -y unixodbc odbcinst odbc-postgresql или -```sql +``` sql CREATE DICTIONARY table_name ( id UInt64, some_column UInt64 DEFAULT 0 @@ -298,7 +294,7 @@ $ sudo apt-get install tdsodbc freetds-bin sqsh Настройка драйвера: : -```bash +``` bash $ cat /etc/freetds/freetds.conf ... @@ -333,7 +329,7 @@ $ sudo apt-get install tdsodbc freetds-bin sqsh Настройка словаря в ClickHouse: -```xml +``` xml test @@ -369,7 +365,7 @@ $ sudo apt-get install tdsodbc freetds-bin sqsh или -```sql +``` sql CREATE DICTIONARY test ( k UInt64, s String DEFAULT '' @@ -380,15 +376,13 @@ LAYOUT(FLAT()) LIFETIME(MIN 300 MAX 360) ``` +## СУБД {#subd} -## СУБД - - -### MySQL {#dicts-external_dicts_dict_sources-mysql} +### MySQL {#dicts-external-dicts-dict-sources-mysql} Пример настройки: -```xml +``` xml 3306 @@ -412,7 +406,7 @@ LIFETIME(MIN 300 MAX 360) или -```sql +``` sql SOURCE(MYSQL( port 3306 user 'clickhouse' @@ -428,24 +422,30 @@ SOURCE(MYSQL( Поля настройки: -- `port` — порт сервера MySQL. Можно указать для всех реплик или для каждой в отдельности (внутри ``). -- `user` — имя пользователя MySQL. Можно указать для всех реплик или для каждой в отдельности (внутри ``). -- `password` — пароль пользователя MySQL. Можно указать для всех реплик или для каждой в отдельности (внутри ``). -- `replica` — блок конфигурации реплики. Блоков может быть несколько. +- `port` — порт сервера MySQL. Можно указать для всех реплик или для каждой в отдельности (внутри ``). - - `replica/host` — хост MySQL. - - `replica/priority` — приоритет реплики. При попытке соединения ClickHouse обходит реплики в соответствии с приоритетом. Чем меньше цифра, тем выше приоритет. +- `user` — имя пользователя MySQL. Можно указать для всех реплик или для каждой в отдельности (внутри ``). -- `db` — имя базы данных. -- `table` — имя таблицы. -- `where` — условие выбора. Синтаксис условия совпадает с синтаксисом секции `WHERE` в MySQL, например, `id > 10 AND id < 20`. Необязательный параметр. -- `invalidate_query` — запрос для проверки статуса словаря. Необязательный параметр. Читайте подробнее в разделе [Обновление словарей](external_dicts_dict_lifetime.md). +- `password` — пароль пользователя MySQL. Можно указать для всех реплик или для каждой в отдельности (внутри ``). + +- `replica` — блок конфигурации реплики. Блоков может быть несколько. + + - `replica/host` — хост MySQL. + - `replica/priority` — приоритет реплики. При попытке соединения ClickHouse обходит реплики в соответствии с приоритетом. Чем меньше цифра, тем выше приоритет. + +- `db` — имя базы данных. + +- `table` — имя таблицы. 
+ +- `where` — условие выбора. Синтаксис условия совпадает с синтаксисом секции `WHERE` в MySQL, например, `id > 10 AND id < 20`. Необязательный параметр. + +- `invalidate_query` — запрос для проверки статуса словаря. Необязательный параметр. Читайте подробнее в разделе [Обновление словарей](external_dicts_dict_lifetime.md). MySQL можно подключить на локальном хосте через сокеты, для этого необходимо задать `host` и `socket`. Пример настройки: -```xml +``` xml localhost @@ -462,7 +462,7 @@ MySQL можно подключить на локальном хосте чер или -```sql +``` sql SOURCE(MYSQL( host 'localhost' socket '/path/to/socket/file.sock' @@ -475,12 +475,11 @@ SOURCE(MYSQL( )) ``` - -### ClickHouse {#dicts-external_dicts_dict_sources-clickhouse} +### ClickHouse {#dicts-external-dicts-dict-sources-clickhouse} Пример настройки: -```xml +``` xml example01-01-1 @@ -496,7 +495,7 @@ SOURCE(MYSQL( или -```sql +``` sql SOURCE(CLICKHOUSE( host 'example01-01-1' port 9000 @@ -519,12 +518,11 @@ SOURCE(CLICKHOUSE( - `where` — условие выбора. Может отсутствовать. - `invalidate_query` — запрос для проверки статуса словаря. Необязательный параметр. Читайте подробнее в разделе [Обновление словарей](external_dicts_dict_lifetime.md). - -### MongoDB {#dicts-external_dicts_dict_sources-mongodb} +### MongoDB {#dicts-external-dicts-dict-sources-mongodb} Пример настройки: -```xml +``` xml localhost @@ -539,7 +537,7 @@ SOURCE(CLICKHOUSE( или -```sql +``` sql SOURCE(MONGO( host 'localhost' port 27017 @@ -552,18 +550,18 @@ SOURCE(MONGO( Поля настройки: -- `host` — хост MongoDB. -- `port` — порт сервера MongoDB. -- `user` — имя пользователя MongoDB. -- `password` — пароль пользователя MongoDB. -- `db` — имя базы данных. -- `collection` — имя коллекции. +- `host` — хост MongoDB. +- `port` — порт сервера MongoDB. +- `user` — имя пользователя MongoDB. +- `password` — пароль пользователя MongoDB. +- `db` — имя базы данных. +- `collection` — имя коллекции. -### Redis {#dicts-external_dicts_dict_sources-redis} +### Redis {#dicts-external-dicts-dict-sources-redis} Пример настройки: -```xml +``` xml localhost @@ -576,7 +574,7 @@ SOURCE(MONGO( или -```sql +``` sql SOURCE(REDIS( host 'localhost' port 6379 @@ -587,9 +585,9 @@ SOURCE(REDIS( Поля настройки: -- `host` – хост Redis. -- `port` – порт сервера Redis. -- `storage_type` – способ хранения ключей. Необходимо использовать `simple` для источников с одним столбцом ключей, `hash_map` -- для источников с двумя столбцами ключей. Источники с более, чем двумя столбцами ключей, не поддерживаются. Может отсутствовать, значение по умолчанию `simple`. -- `db_index` – номер базы данных. Может отсутствовать, значение по умолчанию 0. +- `host` – хост Redis. +- `port` – порт сервера Redis. +- `storage_type` – способ хранения ключей. Необходимо использовать `simple` для источников с одним столбцом ключей, `hash_map` – для источников с двумя столбцами ключей. Источники с более, чем двумя столбцами ключей, не поддерживаются. Может отсутствовать, значение по умолчанию `simple`. +- `db_index` – номер базы данных. Может отсутствовать, значение по умолчанию 0. 
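Для наглядности ниже приведён условный набросок DDL-запроса, собирающий перечисленные выше параметры источника Redis в одно объявление словаря. Имена `redis_dict`, `key` и `value` выбраны произвольно; предполагается, что для строкового (нечислового) ключа используется один из способов размещения `complex_key_*`.

``` sql
-- условный пример: словарь поверх Redis с параметрами из примера выше
CREATE DICTIONARY redis_dict (
    key String,              -- ключ в Redis
    value String DEFAULT ''  -- строковое значение
)
PRIMARY KEY key
SOURCE(REDIS(host 'localhost' port 6379 storage_type 'simple' db_index 0))
LAYOUT(COMPLEX_KEY_HASHED()) -- предположительно: для нечислового ключа нужен complex_key_* layout
LIFETIME(MIN 300 MAX 360)
```

Значения атрибутов такого словаря затем можно получать функциями `dictGet*`, передавая составной ключ кортежем, например `dictGetString('redis_dict', 'value', tuple('some_key'))`.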
[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/dicts/external_dicts_dict_sources/) diff --git a/docs/ru/query_language/dicts/external_dicts_dict_structure.md b/docs/ru/query_language/dicts/external_dicts_dict_structure.md index f9ab9d30dac..9e4634aeaa8 100644 --- a/docs/ru/query_language/dicts/external_dicts_dict_structure.md +++ b/docs/ru/query_language/dicts/external_dicts_dict_structure.md @@ -1,10 +1,10 @@ -# Ключ и поля словаря +# Ключ и поля словаря {#kliuch-i-polia-slovaria} Секция `` описывает ключ словаря и поля, доступные для запросов. Описание в формате XML: -```xml +``` xml @@ -23,13 +23,12 @@ Атрибуты описываются элементами: -- `` — [столбец с ключом](external_dicts_dict_structure.md#ext_dict_structure-key). -- `` — [столбец данных](external_dicts_dict_structure.md#ext_dict_structure-attributes). Можно задать несколько атрибутов. - +- `` — [столбец с ключом](external_dicts_dict_structure.md#ext_dict_structure-key). +- `` — [столбец данных](external_dicts_dict_structure.md#ext_dict_structure-attributes). Можно задать несколько атрибутов. Создание словаря запросом: -```sql +``` sql CREATE DICTIONARY dict_name ( Id UInt64, -- attributes @@ -40,29 +39,28 @@ PRIMARY KEY Id Атрибуты задаются в теле запроса: -- `PRIMARY KEY` — [столбец с ключом](external_dicts_dict_structure.md#ext_dict_structure-key) -- `AttrName AttrType` — [столбец данных](external_dicts_dict_structure.md#ext_dict_structure-attributes). Можно задать несколько столбцов. +- `PRIMARY KEY` — [столбец с ключом](external_dicts_dict_structure.md#ext_dict_structure-key) +- `AttrName AttrType` — [столбец данных](external_dicts_dict_structure.md#ext_dict_structure-attributes). Можно задать несколько столбцов. - -## Ключ {#ext_dict_structure-key} +## Ключ {#ext-dict-structure-key} ClickHouse поддерживает следующие виды ключей: -- Числовой ключ. `UInt64`. Описывается в теге `` или ключевым словом `PRIMARY KEY`. -- Составной ключ. Набор значений разного типа. Описывается в теге `` или ключевым словом `PRIMARY KEY`. +- Числовой ключ. `UInt64`. Описывается в теге `` или ключевым словом `PRIMARY KEY`. +- Составной ключ. Набор значений разного типа. Описывается в теге `` или ключевым словом `PRIMARY KEY`. Структура может содержать либо `` либо ``. DDL-запрос может содержать только `PRIMARY KEY`. !!! warning "Обратите внимание" Ключ не надо дополнительно описывать в атрибутах. -### Числовой ключ {#ext_dict-numeric-key} +### Числовой ключ {#ext-dict-numeric-key} Тип: `UInt64`. Пример конфигурации: -```xml +``` xml Id @@ -70,11 +68,11 @@ ClickHouse поддерживает следующие виды ключей: Поля конфигурации: -- `name` — имя столбца с ключами. +- `name` — имя столбца с ключами. Для DDL-запроса: -```sql +``` sql CREATE DICTIONARY ( Id UInt64, ... @@ -83,10 +81,9 @@ PRIMARY KEY Id ... ``` -- `PRIMARY KEY` – имя столбца с ключами. +- `PRIMARY KEY` – имя столбца с ключами. - -### Составной ключ +### Составной ключ {#sostavnoi-kliuch} Ключом может быть кортеж (`tuple`) из полей произвольных типов. В этом случае [layout](external_dicts_dict_layout.md) должен быть `complex_key_hashed` или `complex_key_cache`. @@ -95,7 +92,7 @@ PRIMARY KEY Id Структура ключа задаётся в элементе ``. Поля ключа задаются в том же формате, что и [атрибуты](external_dicts_dict_structure.md) словаря. Пример: -```xml +``` xml @@ -113,7 +110,7 @@ PRIMARY KEY Id или -```sql +``` sql CREATE DICTIONARY ( field1 String, field2 String @@ -125,12 +122,11 @@ PRIMARY KEY field1, field2 При запросе в функции `dictGet*` в качестве ключа передаётся кортеж. 
Пример: `dictGetString('dict_name', 'attr_name', tuple('string for field1', num_for_field2))`. - -## Атрибуты {#ext_dict_structure-attributes} +## Атрибуты {#ext-dict-structure-attributes} Пример конфигурации: -```xml +``` xml ... @@ -147,27 +143,25 @@ PRIMARY KEY field1, field2 или -```sql +``` sql CREATE DICTIONARY somename ( Name ClickHouseDataType DEFAULT '' EXPRESSION rand64() HIERARCHICAL INJECTIVE IS_OBJECT_ID ) ``` - Поля конфигурации: -| Тег | Описание | Обязательный | -| ---- | ------------- | --------- | -| `name` | Имя столбца. | Да | -| `type` | Тип данных ClickHouse.
ClickHouse пытается привести значение из словаря к заданному типу данных. Например, в случае MySQL, в таблице-источнике поле может быть `TEXT`, `VARCHAR`, `BLOB`, но загружено может быть как `String`. [Nullable](../../data_types/nullable.md) не поддерживается. | Да | -| `null_value` | Значение по умолчанию для несуществующего элемента.
В примере это пустая строка. Нельзя указать значение `NULL`. | Да | -| `expression` | [Выражение](../syntax.md#syntax-expressions), которое ClickHouse выполняет со значением.
Выражением может быть имя столбца в удаленной SQL базе. Таким образом, вы можете использовать его для создания псевдонима удаленного столбца.

Значение по умолчанию: нет выражения. | Нет | - `hierarchical` | Если `true`, то атрибут содержит ключ предка для текущего элемента. Смотрите [Иерархические словари](external_dicts_dict_hierarchical.md).

Default value: `false`. | No| `injective` | Признак [инъективности](https://ru.wikipedia.org/wiki/Инъекция_(математика)) отображения `id -> attribute`.
Если `true`, то обращения к словарям с включенной инъективностью могут быть автоматически переставлены ClickHouse за стадию `GROUP BY`, что как правило существенно сокращает их количество.

Значение по умолчанию: `false`. | Нет | -| `is_object_id` | Признак того, что запрос выполняется к документу MongoDB по `ObjectID`.

Значение по умолчанию: `false`. | Нет | +| Тег | Описание | Обязательный | +|------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------| +| `name` | Имя столбца. | Да | +| `type` | Тип данных ClickHouse.
ClickHouse пытается привести значение из словаря к заданному типу данных. Например, в случае MySQL, в таблице-источнике поле может быть `TEXT`, `VARCHAR`, `BLOB`, но загружено может быть как `String`. [Nullable](../../data_types/nullable.md) не поддерживается. | Да | +| `null_value` | Значение по умолчанию для несуществующего элемента.
В примере это пустая строка. Нельзя указать значение `NULL`. | Да | +| `expression` | [Выражение](../syntax.md#syntax-expressions), которое ClickHouse выполняет со значением.
Выражением может быть имя столбца в удаленной SQL базе. Таким образом, вы можете использовать его для создания псевдонима удаленного столбца.

Значение по умолчанию: нет выражения. | Нет | +| `hierarchical` | Если `true`, то атрибут содержит ключ предка для текущего элемента. Смотрите [Иерархические словари](external_dicts_dict_hierarchical.md).

Значение по умолчанию: `false`. | Нет |
+| `injective` | Признак [инъективности](https://ru.wikipedia.org/wiki/Инъекция_(математика)) отображения `id -> attribute`.<br/>Если `true`, то обращения к словарям с включенной инъективностью могут быть автоматически переставлены ClickHouse за стадию `GROUP BY`, что как правило существенно сокращает их количество.<br/><br/>Значение по умолчанию: `false`. | Нет |
+| `is_object_id` | Признак того, что запрос выполняется к документу MongoDB по `ObjectID`.

Значение по умолчанию: `false`. | Нет | +## Смотрите также {#smotrite-takzhe} -## Смотрите также - - - [Функции для работы с внешними словарями](../functions/ext_dict_functions.md). +- [Функции для работы с внешними словарями](../functions/ext_dict_functions.md). [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/dicts/external_dicts_dict_structure/) diff --git a/docs/ru/query_language/dicts/index.md b/docs/ru/query_language/dicts/index.md index 815029c245c..e53b7f394d8 100644 --- a/docs/ru/query_language/dicts/index.md +++ b/docs/ru/query_language/dicts/index.md @@ -1,4 +1,4 @@ -# Словари +# Словари {#slovari} Словарь — это отображение (`ключ -> атрибуты`), которое удобно использовать для различного вида справочников. @@ -8,7 +8,7 @@ ClickHouse поддерживает специальные функции для ClickHouse поддерживает: -- [Встроенные словари](internal_dicts.md#internal_dicts) со специфическим [набором функций](../functions/ym_dict_functions.md). -- [Подключаемые (внешние) словари](external_dicts.md) с [набором функций](../functions/ext_dict_functions.md). +- [Встроенные словари](internal_dicts.md#internal_dicts) со специфическим [набором функций](../functions/ym_dict_functions.md). +- [Подключаемые (внешние) словари](external_dicts.md) с [набором функций](../functions/ext_dict_functions.md). [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/dicts/) diff --git a/docs/ru/query_language/dicts/internal_dicts.md b/docs/ru/query_language/dicts/internal_dicts.md index 4773b03e634..e2118b6000a 100644 --- a/docs/ru/query_language/dicts/internal_dicts.md +++ b/docs/ru/query_language/dicts/internal_dicts.md @@ -1,15 +1,15 @@ -# Встроенные словари {#internal_dicts} +# Встроенные словари {#internal-dicts} ClickHouse содержит встроенную возможность работы с геобазой. Это позволяет: -- для идентификатора региона получить его имя на нужном языке; -- по идентификатору региона получить идентификатор города, области, федерального округа, страны, континента; -- проверить, что один регион входит в другой; -- получить цепочку родительских регионов. +- для идентификатора региона получить его имя на нужном языке; +- по идентификатору региона получить идентификатор города, области, федерального округа, страны, континента; +- проверить, что один регион входит в другой; +- получить цепочку родительских регионов. -Все функции поддерживают "транслокальность", то есть возможность использовать одновременно разные точки зрения на принадлежность регионов. Подробнее смотрите в разделе "Функции для работы со словарями Яндекс.Метрики". +Все функции поддерживают «транслокальность», то есть возможность использовать одновременно разные точки зрения на принадлежность регионов. Подробнее смотрите в разделе «Функции для работы со словарями Яндекс.Метрики». В пакете по умолчанию, встроенные словари выключены. Для включения, раскомментируйте параметры `path_to_regions_hierarchy_file` и `path_to_regions_names_files` в конфигурационном файле сервера. @@ -24,15 +24,15 @@ ClickHouse содержит встроенную возможность рабо `regions_hierarchy*.txt`: TabSeparated (без заголовка), столбцы: -- идентификатор региона (`UInt32`); -- идентификатор родительского региона (`UInt32`); -- тип региона (`UInt8`): 1 - континент, 3 - страна, 4 - федеральный округ, 5 - область, 6 - город; остальные типы не имеют значения; -- население (`UInt32`) - не обязательный столбец. 
+- идентификатор региона (`UInt32`); +- идентификатор родительского региона (`UInt32`); +- тип региона (`UInt8`): 1 - континент, 3 - страна, 4 - федеральный округ, 5 - область, 6 - город; остальные типы не имеют значения; +- население (`UInt32`) - не обязательный столбец. `regions_names_*.txt`: TabSeparated (без заголовка), столбцы: -- идентификатор региона (`UInt32`); -- имя региона (`String`) - не может содержать табы или переводы строк, даже экранированные. +- идентификатор региона (`UInt32`); +- имя региона (`String`) - не может содержать табы или переводы строк, даже экранированные. Для хранения в оперативке используется плоский массив. Поэтому, идентификаторы не должны быть больше миллиона. diff --git a/docs/ru/query_language/functions/arithmetic_functions.md b/docs/ru/query_language/functions/arithmetic_functions.md index 7298d6ac6f8..7d979f05a5c 100644 --- a/docs/ru/query_language/functions/arithmetic_functions.md +++ b/docs/ru/query_language/functions/arithmetic_functions.md @@ -1,14 +1,14 @@ -# Арифметические функции +# Арифметические функции {#arifmeticheskie-funktsii} -Для всех арифметических функций, тип результата вычисляется, как минимальный числовой тип, который может вместить результат, если такой тип есть. Минимум берётся одновременно по числу бит, знаковости и "плавучести". Если бит не хватает, то берётся тип максимальной битности. +Для всех арифметических функций, тип результата вычисляется, как минимальный числовой тип, который может вместить результат, если такой тип есть. Минимум берётся одновременно по числу бит, знаковости и «плавучести». Если бит не хватает, то берётся тип максимальной битности. Пример: -```sql +``` sql SELECT toTypeName(0), toTypeName(0 + 0), toTypeName(0 + 0 + 0), toTypeName(0 + 0 + 0 + 0) ``` -```text +``` text ┌─toTypeName(0)─┬─toTypeName(plus(0, 0))─┬─toTypeName(plus(plus(0, 0), 0))─┬─toTypeName(plus(plus(plus(0, 0), 0), 0))─┐ │ UInt8 │ UInt16 │ UInt32 │ UInt64 │ └───────────────┴────────────────────────┴─────────────────────────────────┴──────────────────────────────────────────┘ @@ -18,57 +18,59 @@ SELECT toTypeName(0), toTypeName(0 + 0), toTypeName(0 + 0 + 0), toTypeName(0 + 0 Переполнение производится также, как в C++. -## plus(a, b), оператор a + b +## plus(a, b), оператор a + b {#plusa-b-operator-a-b} Вычисляет сумму чисел. Также можно складывать целые числа с датой и датой-с-временем. В случае даты, прибавление целого числа означает прибавление соответствующего количества дней. В случае даты-с-временем - прибавление соответствующего количества секунд. -## minus(a, b), оператор a - b +## minus(a, b), оператор a - b {#minusa-b-operator-a-b} Вычисляет разность чисел. Результат всегда имеет знаковый тип. Также можно вычитать целые числа из даты и даты-с-временем. Смысл аналогичен - смотрите выше для plus. -## multiply(a, b), оператор a \* b +## multiply(a, b), оператор a \* b {#multiplya-b-operator-a-b} Вычисляет произведение чисел. -## divide(a, b), оператор a / b +## divide(a, b), оператор a / b {#dividea-b-operator-a-b} Вычисляет частное чисел. Тип результата всегда является типом с плавающей запятой. То есть, деление не целочисленное. Для целочисленного деления, используйте функцию intDiv. При делении на ноль получится inf, -inf или nan. -## intDiv(a, b) +## intDiv(a, b) {#intdiva-b} Вычисляет частное чисел. Деление целочисленное, с округлением вниз (по абсолютному значению). При делении на ноль или при делении минимального отрицательного числа на минус единицу, кидается исключение. 
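Небольшая иллюстрация различия между обычным делением (`divide`, оператор `/`) и `intDiv`; ожидаемый результат указан в комментариях:

``` sql
SELECT
    7 / 2        AS float_quotient, -- 3.5: обычное деление всегда даёт число с плавающей запятой
    intDiv(7, 2) AS int_quotient    -- 3: целочисленное деление
```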
-## intDivOrZero(a, b) +## intDivOrZero(a, b) {#intdivorzeroa-b} Отличается от intDiv тем, что при делении на ноль или при делении минимального отрицательного числа на минус единицу, возвращается ноль. -## modulo(a, b), оператор a % b +## modulo(a, b), оператор a % b {#moduloa-b-operator-a-b} Вычисляет остаток от деления. Если аргументы - числа с плавающей запятой, то они предварительно преобразуются в целые числа, путём отбрасывания дробной части. Берётся остаток в том же смысле, как это делается в C++. По факту, для отрицательных чисел, используется truncated division. При делении на ноль или при делении минимального отрицательного числа на минус единицу, кидается исключение. -## negate(a), оператор -a +## negate(a), оператор -a {#negatea-operator-a} Вычисляет число, обратное по знаку. Результат всегда имеет знаковый тип. -## abs(a) {#arithm_func-abs} +## abs(a) {#arithm-func-abs} -Вычисляет абсолютное значение для числа a. То есть, если a < 0, то возвращает -a. +Вычисляет абсолютное значение для числа a. То есть, если a \< 0, то возвращает -a. Для беззнаковых типов ничего не делает. Для чисел типа целых со знаком, возвращает число беззнакового типа. -## gcd(a, b) +## gcd(a, b) {#gcda-b} + Вычисляет наибольший общий делитель чисел. При делении на ноль или при делении минимального отрицательного числа на минус единицу, кидается исключение. -## lcm(a, b) +## lcm(a, b) {#lcma-b} + Вычисляет наименьшее общее кратное чисел. При делении на ноль или при делении минимального отрицательного числа на минус единицу, кидается исключение. diff --git a/docs/ru/query_language/functions/array_functions.md b/docs/ru/query_language/functions/array_functions.md index 9cdec5266a4..28fffc41c5e 100644 --- a/docs/ru/query_language/functions/array_functions.md +++ b/docs/ru/query_language/functions/array_functions.md @@ -1,4 +1,4 @@ -# Функции по работе с массивами +# Функции по работе с массивами {#funktsii-po-rabote-s-massivami} ## empty {#function-empty} @@ -12,63 +12,65 @@ Тип результата - UInt8. Функция также работает для строк. -## length {#array_functions-length} +## length {#array-functions-length} Возвращает количество элементов в массиве. Тип результата - UInt64. Функция также работает для строк. -## emptyArrayUInt8, emptyArrayUInt16, emptyArrayUInt32, emptyArrayUInt64 +## emptyArrayUInt8, emptyArrayUInt16, emptyArrayUInt32, emptyArrayUInt64 {#emptyarrayuint8-emptyarrayuint16-emptyarrayuint32-emptyarrayuint64} -## emptyArrayInt8, emptyArrayInt16, emptyArrayInt32, emptyArrayInt64 +## emptyArrayInt8, emptyArrayInt16, emptyArrayInt32, emptyArrayInt64 {#emptyarrayint8-emptyarrayint16-emptyarrayint32-emptyarrayint64} -## emptyArrayFloat32, emptyArrayFloat64 +## emptyArrayFloat32, emptyArrayFloat64 {#emptyarrayfloat32-emptyarrayfloat64} -## emptyArrayDate, emptyArrayDateTime +## emptyArrayDate, emptyArrayDateTime {#emptyarraydate-emptyarraydatetime} -## emptyArrayString +## emptyArrayString {#emptyarraystring} Принимает ноль аргументов и возвращает пустой массив соответствующего типа. -## emptyArrayToSingle +## emptyArrayToSingle {#emptyarraytosingle} Принимает пустой массив и возвращает массив из одного элемента, равного значению по умолчанию. -## range(N) +## range(N) {#rangen} Возвращает массив чисел от 0 до N-1. На всякий случай, если на блок данных, создаются массивы суммарной длины больше 100 000 000 элементов, то кидается исключение. -## array(x1, ...), оператор \[x1, ...\] +## array(x1, …), оператор \[x1, …\] {#arrayx1-operator-x1} Создаёт массив из аргументов функции. 
Аргументы должны быть константами и иметь типы, для которых есть наименьший общий тип. Должен быть передан хотя бы один аргумент, так как иначе непонятно, какого типа создавать массив. То есть, с помощью этой функции невозможно создать пустой массив (для этого используйте функции emptyArray\*, описанные выше). Возвращает результат типа Array(T), где T - наименьший общий тип от переданных аргументов. -## arrayConcat +## arrayConcat {#arrayconcat} Объединяет массивы, переданные в качестве аргументов. -```sql +``` sql arrayConcat(arrays) ``` **Параметры** -- `arrays` – произвольное количество элементов типа [Array](../../data_types/array.md) -**Пример** +- `arrays` – произвольное количество элементов типа [Array](../../data_types/array.md) + **Пример** -```sql + + +``` sql SELECT arrayConcat([1, 2], [3, 4], [5, 6]) AS res ``` -```text +``` text ┌─res───────────┐ │ [1,2,3,4,5,6] │ └───────────────┘ ``` -## arrayElement(arr, n), operator arr[n] +## arrayElement(arr, n), operator arr\[n\] {#arrayelementarr-n-operator-arrn} Достаёт элемент с индексом n из массива arr. n должен быть любым целочисленным типом. Индексы в массиве начинаются с единицы. @@ -76,45 +78,46 @@ SELECT arrayConcat([1, 2], [3, 4], [5, 6]) AS res Если индекс выходит за границы массива, то возвращается некоторое значение по умолчанию (0 для чисел, пустая строка для строк и т. п.), кроме случая с неконстантным массивом и константным индексом 0 (в этом случае будет ошибка `Array indices are 1-based`). -## has(arr, elem) +## has(arr, elem) {#hasarr-elem} Проверяет наличие элемента elem в массиве arr. Возвращает 0, если элемента в массиве нет, или 1, если есть. `NULL` обрабатывается как значение. -```sql +``` sql SELECT has([1, 2, NULL], NULL) ``` -```text + +``` text ┌─has([1, 2, NULL], NULL)─┐ │ 1 │ └─────────────────────────┘ ``` -## hasAll +## hasAll {#hasall} Проверяет, является ли один массив подмножеством другого. -```sql +``` sql hasAll(set, subset) ``` **Параметры** -- `set` – массив любого типа с набором элементов. -- `subset` – массив любого типа со значениями, которые проверяются на вхождение в `set`. +- `set` – массив любого типа с набором элементов. +- `subset` – массив любого типа со значениями, которые проверяются на вхождение в `set`. **Возвращаемые значения** -- `1`, если `set` содержит все элементы из `subset`. -- `0`, в противном случае. +- `1`, если `set` содержит все элементы из `subset`. +- `0`, в противном случае. **Особенности** -- Пустой массив является подмножеством любого массива. -- `NULL` обрабатывается как значение. -- Порядок значений в обоих массивах не имеет значения. +- Пустой массив является подмножеством любого массива. +- `NULL` обрабатывается как значение. +- Порядок значений в обоих массивах не имеет значения. **Примеры** @@ -130,28 +133,28 @@ hasAll(set, subset) `SELECT hasAll([[1, 2], [3, 4]], [[1, 2], [3, 5]])` возвращает 0. -## hasAny +## hasAny {#hasany} Проверяет, имеют ли два массива хотя бы один общий элемент. -```sql +``` sql hasAny(array1, array2) ``` **Параметры** -- `array1` – массив любого типа с набором элементов. -- `array2` – массив любого типа с набором элементов. +- `array1` – массив любого типа с набором элементов. +- `array2` – массив любого типа с набором элементов. **Возвращаемые значения** -- `1`, если `array1` и `array2` имеют хотя бы один одинаковый элемент. -- `0`, в противном случае. +- `1`, если `array1` и `array2` имеют хотя бы один одинаковый элемент. +- `0`, в противном случае. **Особенности** -- `NULL` обрабатывается как значение. 
-- Порядок значений в обоих массивах не имеет значения. +- `NULL` обрабатывается как значение. +- Порядок значений в обоих массивах не имеет значения. **Примеры** @@ -165,16 +168,17 @@ hasAny(array1, array2) `SELECT hasAll([[1, 2], [3, 4]], [[1, 2], [1, 2]])` возвращает `1`. -## indexOf(arr, x) +## indexOf(arr, x) {#indexofarr-x} Возвращает индекс первого элемента x (начиная с 1), если он есть в массиве, или 0, если его нет. Пример: -```sql +``` sql SELECT indexOf([1, 3, NULL, NULL], NULL) ``` -```text + +``` text ┌─indexOf([1, 3, NULL, NULL], NULL)─┐ │ 3 │ └───────────────────────────────────┘ @@ -182,30 +186,31 @@ SELECT indexOf([1, 3, NULL, NULL], NULL) Элементы, равные `NULL`, обрабатываются как обычные значения. -## countEqual(arr, x) +## countEqual(arr, x) {#countequalarr-x} -Возвращает количество элементов массива, равных x. Эквивалентно arrayCount(elem -> elem = x, arr). +Возвращает количество элементов массива, равных x. Эквивалентно arrayCount(elem -\> elem = x, arr). `NULL` обрабатывается как значение. Пример: -```sql +``` sql SELECT countEqual([1, 2, NULL, NULL], NULL) ``` -```text + +``` text ┌─countEqual([1, 2, NULL, NULL], NULL)─┐ │ 2 │ └──────────────────────────────────────┘ ``` -## arrayEnumerate(arr) {#array_functions-arrayenumerate} +## arrayEnumerate(arr) {#array-functions-arrayenumerate} -Возвращает массив \[1, 2, 3, ..., length(arr)\] +Возвращает массив \[1, 2, 3, …, length(arr)\] Эта функция обычно используется совместно с ARRAY JOIN. Она позволяет, после применения ARRAY JOIN, посчитать что-либо только один раз для каждого массива. Пример: -```sql +``` sql SELECT count() AS Reaches, countIf(num = 1) AS Hits @@ -217,7 +222,7 @@ WHERE CounterID = 160656 LIMIT 10 ``` -```text +``` text ┌─Reaches─┬──Hits─┐ │ 95606 │ 31406 │ └─────────┴───────┘ @@ -225,7 +230,7 @@ LIMIT 10 В этом примере, Reaches - число достижений целей (строк, получившихся после применения ARRAY JOIN), а Hits - число хитов (строк, которые были до ARRAY JOIN). В данном случае, тот же результат можно получить проще: -```sql +``` sql SELECT sum(length(GoalsReached)) AS Reaches, count() AS Hits @@ -233,7 +238,7 @@ FROM test.hits WHERE (CounterID = 160656) AND notEmpty(GoalsReached) ``` -```text +``` text ┌─Reaches─┬──Hits─┐ │ 95606 │ 31406 │ └─────────┴───────┘ @@ -241,7 +246,7 @@ WHERE (CounterID = 160656) AND notEmpty(GoalsReached) Также эта функция может быть использована в функциях высшего порядка. Например, с её помощью можно достать индексы массива для элементов, удовлетворяющих некоторому условию. -## arrayEnumerateUniq(arr, ...) +## arrayEnumerateUniq(arr, …) {#arrayenumerateuniqarr} Возвращает массив, такого же размера, как исходный, где для каждого элемента указано, какой он по счету среди элементов с таким же значением. Например: arrayEnumerateUniq(\[10, 20, 10, 30\]) = \[1, 1, 2, 1\]. @@ -249,7 +254,7 @@ WHERE (CounterID = 160656) AND notEmpty(GoalsReached) Эта функция полезна при использовании ARRAY JOIN и агрегации по элементам массива. Пример: -```sql +``` sql SELECT Goals.ID AS GoalID, sum(Sign) AS Reaches, @@ -264,7 +269,7 @@ ORDER BY Reaches DESC LIMIT 10 ``` -```text +``` text ┌──GoalID─┬─Reaches─┬─Visits─┐ │ 53225 │ 3214 │ 1097 │ │ 2825062 │ 3188 │ 1097 │ @@ -283,11 +288,11 @@ LIMIT 10 Функция arrayEnumerateUniq может принимать несколько аргументов - массивов одинаковых размеров. В этом случае, уникальность считается для кортежей элементов на одинаковых позициях всех массивов. 
-```sql +``` sql SELECT arrayEnumerateUniq([1, 1, 1, 2, 2, 2], [1, 1, 2, 1, 1, 2]) AS res ``` -```text +``` text ┌─res───────────┐ │ [1,2,1,1,2,1] │ └───────────────┘ @@ -295,119 +300,119 @@ SELECT arrayEnumerateUniq([1, 1, 1, 2, 2, 2], [1, 1, 2, 1, 1, 2]) AS res Это нужно при использовании ARRAY JOIN с вложенной структурой данных и затем агрегации по нескольким элементам этой структуры. -## arrayPopBack +## arrayPopBack {#arraypopback} Удаляет последний элемент из массива. -```sql +``` sql arrayPopBack(array) ``` **Параметры** -- `array` - Массив. +- `array` - Массив. **Пример** -```sql +``` sql SELECT arrayPopBack([1, 2, 3]) AS res ``` -text -``` -┌─res───┐ -│ [1,2] │ -└───────┘ -``` -## arrayPopFront +text + + ┌─res───┐ + │ [1,2] │ + └───────┘ + +## arrayPopFront {#arraypopfront} Удаляет первый элемент из массива. -```sql +``` sql arrayPopFront(array) ``` **Параметры** -- `array` - Массив. +- `array` - Массив. **Пример** -```sql +``` sql SELECT arrayPopFront([1, 2, 3]) AS res ``` -```text +``` text ┌─res───┐ │ [2,3] │ └───────┘ ``` -## arrayPushBack +## arrayPushBack {#arraypushback} Добавляет один элемент в конец массива. -```sql +``` sql arrayPushBack(array, single_value) ``` **Параметры** -- `array` - Массив. -- `single_value` - Одиночное значение. В массив с числам можно добавить только числа, в массив со строками только строки. При добавлении чисел ClickHouse автоматически приводит тип `single_value` к типу данных массива. Подробнее о типах данных в ClickHouse читайте в разделе "[Типы данных](../../data_types/index.md#data_types)". Может быть равно `NULL`. Функция добавит элемент `NULL` в массив, а тип элементов массива преобразует в `Nullable`. +- `array` - Массив. +- `single_value` - Одиночное значение. В массив с числам можно добавить только числа, в массив со строками только строки. При добавлении чисел ClickHouse автоматически приводит тип `single_value` к типу данных массива. Подробнее о типах данных в ClickHouse читайте в разделе «[Типы данных](../../data_types/index.md#data_types)». Может быть равно `NULL`. Функция добавит элемент `NULL` в массив, а тип элементов массива преобразует в `Nullable`. **Пример** -```sql +``` sql SELECT arrayPushBack(['a'], 'b') AS res ``` -```text +``` text ┌─res───────┐ │ ['a','b'] │ └───────────┘ ``` -## arrayPushFront +## arrayPushFront {#arraypushfront} Добавляет один элемент в начало массива. -```sql +``` sql arrayPushFront(array, single_value) ``` **Параметры** -- `array` - Массив. -- `single_value` - Одиночное значение. В массив с числам можно добавить только числа, в массив со строками только строки. При добавлении чисел ClickHouse автоматически приводит тип `single_value` к типу данных массива. Подробнее о типах данных в ClickHouse читайте в разделе "[Типы данных](../../data_types/index.md#data_types)". Может быть равно `NULL`. Функция добавит элемент `NULL` в массив, а тип элементов массива преобразует в `Nullable`. +- `array` - Массив. +- `single_value` - Одиночное значение. В массив с числам можно добавить только числа, в массив со строками только строки. При добавлении чисел ClickHouse автоматически приводит тип `single_value` к типу данных массива. Подробнее о типах данных в ClickHouse читайте в разделе «[Типы данных](../../data_types/index.md#data_types)». Может быть равно `NULL`. Функция добавит элемент `NULL` в массив, а тип элементов массива преобразует в `Nullable`. 
**Пример** -```sql +``` sql SELECT arrayPushFront(['b'], 'a') AS res ``` -```text +``` text ┌─res───────┐ │ ['a','b'] │ └───────────┘ ``` -## arrayResize +## arrayResize {#arrayresize} Изменяет длину массива. -```sql +``` sql arrayResize(array, size[, extender]) ``` **Параметры** -- `array` — массив. -- `size` — необходимая длина массива. - - Если `size` меньше изначального размера массива, то массив обрезается справа. - - Если `size` больше изначального размера массива, массив дополняется справа значениями `extender` или значениями по умолчанию для типа данных элементов массива. -- `extender` — значение для дополнения массива. Может быть `NULL`. +- `array` — массив. +- `size` — необходимая длина массива. + - Если `size` меньше изначального размера массива, то массив обрезается справа. + - Если `size` больше изначального размера массива, массив дополняется справа значениями `extender` или значениями по умолчанию для типа данных элементов массива. +- `extender` — значение для дополнения массива. Может быть `NULL`. **Возвращаемое значение:** @@ -415,45 +420,47 @@ arrayResize(array, size[, extender]) **Примеры вызовов** -```sql +``` sql SELECT arrayResize([1], 3) ``` -```text + +``` text ┌─arrayResize([1], 3)─┐ │ [1,0,0] │ └─────────────────────┘ ``` -```sql +``` sql SELECT arrayResize([1], 3, NULL) ``` -```text + +``` text ┌─arrayResize([1], 3, NULL)─┐ │ [1,NULL,NULL] │ └───────────────────────────┘ ``` -## arraySlice +## arraySlice {#arrayslice} Возвращает срез массива. -```sql +``` sql arraySlice(array, offset[, length]) ``` **Параметры** -- `array` - Массив данных. -- `offset` - Отступ от края массива. Положительное значение - отступ слева, отрицательное значение - отступ справа. Отсчет элементов массива начинается с 1. -- `length` - Длина необходимого среза. Если указать отрицательное значение, то функция вернёт открытый срез `[offset, array_length - length)`. Если не указать значение, то функция вернёт срез `[offset, the_end_of_array]`. +- `array` - Массив данных. +- `offset` - Отступ от края массива. Положительное значение - отступ слева, отрицательное значение - отступ справа. Отсчет элементов массива начинается с 1. +- `length` - Длина необходимого среза. Если указать отрицательное значение, то функция вернёт открытый срез `[offset, array_length - length)`. Если не указать значение, то функция вернёт срез `[offset, the_end_of_array]`. **Пример** -```sql +``` sql SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res ``` -```text +``` text ┌─res────────┐ │ [2,NULL,4] │ └────────────┘ @@ -461,16 +468,17 @@ SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res Элементы массива равные `NULL` обрабатываются как обычные значения. -## arraySort([func,] arr, ...) {#array_functions-sort} +## arraySort(\[func,\] arr, …) {#array-functions-sort} Возвращает массив `arr`, отсортированный в восходящем порядке. Если задана функция `func`, то порядок сортировки определяется результатом применения этой функции на элементы массива `arr`. Если `func` принимает несколько аргументов, то в функцию `arraySort` нужно передавать несколько массивов, которые будут соответствовать аргументам функции `func`. Подробные примеры рассмотрены в конце описания `arraySort`. 
Пример сортировки целочисленных значений: -```sql +``` sql SELECT arraySort([1, 3, 3, 0]) ``` -```text + +``` text ┌─arraySort([1, 3, 3, 0])─┐ │ [0,1,3,3] │ └─────────────────────────┘ @@ -478,10 +486,11 @@ SELECT arraySort([1, 3, 3, 0]) Пример сортировки строковых значений: -```sql +``` sql SELECT arraySort(['hello', 'world', '!']) ``` -```text + +``` text ┌─arraySort(['hello', 'world', '!'])─┐ │ ['!','hello','world'] │ └────────────────────────────────────┘ @@ -489,63 +498,68 @@ SELECT arraySort(['hello', 'world', '!']) Значения `NULL`, `NaN` и `Inf` сортируются по следующему принципу: -```sql +``` sql SELECT arraySort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]); ``` -```text + +``` text ┌─arraySort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf])─┐ │ [-inf,-4,1,2,3,inf,nan,nan,NULL,NULL] │ └───────────────────────────────────────────────────────────┘ ``` -- Значения `-Inf` идут в начале массива. -- Значения `NULL` идут в конце массива. -- Значения `NaN` идут перед `NULL`. -- Значения `Inf` идут перед `NaN`. +- Значения `-Inf` идут в начале массива. +- Значения `NULL` идут в конце массива. +- Значения `NaN` идут перед `NULL`. +- Значения `Inf` идут перед `NaN`. Функция `arraySort` является [функцией высшего порядка](higher_order_functions.md) — в качестве первого аргумента ей можно передать лямбда-функцию. В этом случае порядок сортировки определяется результатом применения лямбда-функции на элементы массива. -Рассмотрим пример: +Рассмотрим пример: -```sql +``` sql SELECT arraySort((x) -> -x, [1, 2, 3]) as res; ``` -```text + +``` text ┌─res─────┐ │ [3,2,1] │ └─────────┘ ``` -Для каждого элемента исходного массива лямбда-функция возвращает ключ сортировки, то есть [1 –> -1, 2 –> -2, 3 –> -3]. Так как `arraySort` сортирует элементы в порядке возрастания ключей, результат будет [3, 2, 1]. Как можно заметить, функция `x –> -x` устанавливает [обратный порядок сортировки](#array_functions-reverse-sort). +Для каждого элемента исходного массива лямбда-функция возвращает ключ сортировки, то есть \[1 –\> -1, 2 –\> -2, 3 –\> -3\]. Так как `arraySort` сортирует элементы в порядке возрастания ключей, результат будет \[3, 2, 1\]. Как можно заметить, функция `x –> -x` устанавливает [обратный порядок сортировки](#array_functions-reverse-sort). Лямбда-функция может принимать несколько аргументов. В этом случае, в функцию `arraySort` нужно передавать несколько массивов, которые будут соответствовать аргументам лямбда-функции (массивы должны быть одинаковой длины). Следует иметь в виду, что результат будет содержать элементы только из первого массива; элементы из всех последующих массивов будут задавать ключи сортировки. Например: -```sql +``` sql SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; ``` -```text +``` text ┌─res────────────────┐ │ ['world', 'hello'] │ └────────────────────┘ ``` -Элементы, указанные во втором массиве ([2,1]), определяют ключ сортировки для элементов из исходного массива (['hello', 'world']), то есть ['hello' –> 2, 'world' –> 1]. Так как лямбда-функция не использует `x`, элементы исходного массива не влияют на порядок сортировки. Таким образом, 'hello' будет вторым элементом в отсортированном массиве, а 'world' — первым. +Элементы, указанные во втором массиве (\[2,1\]), определяют ключ сортировки для элементов из исходного массива (\[‘hello’, ‘world’\]), то есть \[‘hello’ –\> 2, ‘world’ –\> 1\]. Так как лямбда-функция не использует `x`, элементы исходного массива не влияют на порядок сортировки. 
Таким образом, ‘hello’ будет вторым элементом в отсортированном массиве, а ‘world’ — первым. Ниже приведены другие примеры. -```sql +``` sql SELECT arraySort((x, y) -> y, [0, 1, 2], ['c', 'b', 'a']) as res; ``` -```text + +``` text ┌─res─────┐ │ [2,1,0] │ └─────────┘ ``` -```sql + +``` sql SELECT arraySort((x, y) -> -y, [0, 1, 2], [1, 2, 3]) as res; ``` -```text + +``` text ┌─res─────┐ │ [2,1,0] │ └─────────┘ @@ -554,16 +568,17 @@ SELECT arraySort((x, y) -> -y, [0, 1, 2], [1, 2, 3]) as res; !!! note "Примечание" Для улучшения эффективности сортировки применяется [преобразование Шварца](https://ru.wikipedia.org/wiki/%D0%9F%D1%80%D0%B5%D0%BE%D0%B1%D1%80%D0%B0%D0%B7%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_%D0%A8%D0%B2%D0%B0%D1%80%D1%86%D0%B0). -## arrayReverseSort([func,] arr, ...) {#array_functions-reverse-sort} +## arrayReverseSort(\[func,\] arr, …) {#array-functions-reverse-sort} Возвращает массив `arr`, отсортированный в нисходящем порядке. Если указана функция `func`, то массив `arr` сначала сортируется в порядке, который определяется функцией `func`, а затем отсортированный массив переворачивается. Если функция `func` принимает несколько аргументов, то в функцию `arrayReverseSort` необходимо передавать несколько массивов, которые будут соответствовать аргументам функции `func`. Подробные примеры рассмотрены в конце описания функции `arrayReverseSort`. - + Пример сортировки целочисленных значений: - -```sql + +``` sql SELECT arrayReverseSort([1, 3, 3, 0]); ``` -```text + +``` text ┌─arrayReverseSort([1, 3, 3, 0])─┐ │ [3,3,1,0] │ └────────────────────────────────┘ @@ -571,37 +586,40 @@ SELECT arrayReverseSort([1, 3, 3, 0]); Пример сортировки строковых значений: -```sql +``` sql SELECT arrayReverseSort(['hello', 'world', '!']); ``` -```text + +``` text ┌─arrayReverseSort(['hello', 'world', '!'])─┐ │ ['world','hello','!'] │ └───────────────────────────────────────────┘ -``` +``` Значения `NULL`, `NaN` и `Inf` сортируются в следующем порядке: -```sql +``` sql SELECT arrayReverseSort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]) as res; ``` -```text + +``` text ┌─res───────────────────────────────────┐ │ [inf,3,2,1,-4,-inf,nan,nan,NULL,NULL] │ └───────────────────────────────────────┘ ``` -- Значения `Inf` идут в начале массива. -- Значения `NULL` идут в конце массива. -- Значения `NaN` идут перед `NULL`. -- Значения `-Inf` идут перед `NaN`. +- Значения `Inf` идут в начале массива. +- Значения `NULL` идут в конце массива. +- Значения `NaN` идут перед `NULL`. +- Значения `-Inf` идут перед `NaN`. Функция `arrayReverseSort` является [функцией высшего порядка](higher_order_functions.md). Вы можете передать ей в качестве первого аргумента лямбда-функцию. Например: -```sql +``` sql SELECT arrayReverseSort((x) -> -x, [1, 2, 3]) as res; ``` -```text + +``` text ┌─res─────┐ │ [1,2,3] │ └─────────┘ @@ -609,15 +627,16 @@ SELECT arrayReverseSort((x) -> -x, [1, 2, 3]) as res; В этом примере, порядок сортировки устанавливается следующим образом: -1. Сначала исходный массив ([1, 2, 3]) сортируется в том порядке, который определяется лямбда-функцией. Результатом будет массив [3, 2, 1]. -2. Массив, который был получен на предыдущем шаге, переворачивается. То есть, получается массив [1, 2, 3]. - +1. Сначала исходный массив (\[1, 2, 3\]) сортируется в том порядке, который определяется лямбда-функцией. Результатом будет массив \[3, 2, 1\]. +2. Массив, который был получен на предыдущем шаге, переворачивается. То есть, получается массив \[1, 2, 3\]. + Лямбда-функция может принимать на вход несколько аргументов. 
В этом случае, в функцию `arrayReverseSort` нужно передавать несколько массивов, которые будут соответствовать аргументам лямбда-функции (массивы должны быть одинаковой длины). Следует иметь в виду, что результат будет содержать элементы только из первого массива; элементы из всех последующих массивов будут определять ключи сортировки. Например: - -```sql + +``` sql SELECT arrayReverseSort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; ``` -```text + +``` text ┌─res───────────────┐ │ ['hello','world'] │ └───────────────────┘ @@ -625,52 +644,55 @@ SELECT arrayReverseSort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; В этом примере, массив сортируется следующим образом: -1. Сначала массив сортируется в том порядке, который определяется лямбда-функцией. Элементы, указанные во втором массиве ([2,1]), определяют ключи сортировки соответствующих элементов из исходного массива (['hello', 'world']). То есть, будет массив ['world', 'hello']. -2. Массив, который был отсортирован на предыдущем шаге, переворачивается. Получается массив ['hello', 'world']. +1. Сначала массив сортируется в том порядке, который определяется лямбда-функцией. Элементы, указанные во втором массиве (\[2,1\]), определяют ключи сортировки соответствующих элементов из исходного массива (\[‘hello’, ‘world’\]). То есть, будет массив \[‘world’, ‘hello’\]. +2. Массив, который был отсортирован на предыдущем шаге, переворачивается. Получается массив \[‘hello’, ‘world’\]. Ниже приведены ещё примеры. -```sql +``` sql SELECT arrayReverseSort((x, y) -> y, [0, 1, 2], ['c', 'b', 'a']) as res; ``` -```text + +``` text ┌─res─────┐ │ [0,1,2] │ └─────────┘ ``` -```sql + +``` sql SELECT arrayReverseSort((x, y) -> -y, [4, 3, 5], [1, 2, 3]) AS res; ``` -```text + +``` text ┌─res─────┐ │ [4,3,5] │ └─────────┘ ``` -## arrayUniq(arr, ...) {#array_functions-arrayuniq} +## arrayUniq(arr, …) {#array-functions-arrayuniq} Если передан один аргумент, считает количество разных элементов в массиве. Если передано несколько аргументов, считает количество разных кортежей из элементов на соответствующих позициях в нескольких массивах. -Если необходимо получить список уникальных элементов массива, можно воспользоваться arrayReduce('groupUniqArray', arr). +Если необходимо получить список уникальных элементов массива, можно воспользоваться arrayReduce(‘groupUniqArray’, arr). -## arrayJoin(arr) {#array_functions-arrayjoin} +## arrayJoin(arr) {#array-functions-arrayjoin} -Особенная функция. Смотрите раздел ["Функция arrayJoin"](array_join.md#functions_arrayjoin). +Особенная функция. Смотрите раздел [«Функция arrayJoin»](array_join.md#functions_arrayjoin). ## arrayDifference {#arraydifference} Вычисляет разность между соседними элементами массива. Возвращает массив, где первым элементом будет 0, вторым – разность `a[1] - a[0]` и т. д. Тип элементов результирующего массива определяется правилами вывода типов при вычитании (напр. `UInt8` - `UInt8` = `Int16`). -**Синтаксис** +**Синтаксис** -```sql +``` sql arrayDifference(array) ``` -**Параметры** +**Параметры** -- `array` – [Массив](https://clickhouse.yandex/docs/ru/data_types/array/). +- `array` – [Массив](https://clickhouse.yandex/docs/ru/data_types/array/). 
**Возвращаемое значение** @@ -680,13 +702,13 @@ arrayDifference(array) Запрос: -```sql +``` sql SELECT arrayDifference([1, 2, 3, 4]) ``` Ответ: -```text +``` text ┌─arrayDifference([1, 2, 3, 4])─┐ │ [0,1,1,1] │ └───────────────────────────────┘ @@ -696,13 +718,13 @@ SELECT arrayDifference([1, 2, 3, 4]) Запрос: -```sql +``` sql SELECT arrayDifference([0, 10000000000000000000]) ``` Ответ: -```text +``` text ┌─arrayDifference([0, 10000000000000000000])─┐ │ [0,-8446744073709551616] │ └────────────────────────────────────────────┘ @@ -711,16 +733,16 @@ SELECT arrayDifference([0, 10000000000000000000]) ## arrayDistinct {#arraydistinct} Принимает массив, возвращает массив, содержащий уникальные элементы. - -**Синтаксис** -```sql +**Синтаксис** + +``` sql arrayDistinct(array) ``` **Параметры** -- `array` – [Массив](https://clickhouse.yandex/docs/ru/data_types/array/). +- `array` – [Массив](https://clickhouse.yandex/docs/ru/data_types/array/). **Возвращаемое значение** @@ -730,63 +752,63 @@ arrayDistinct(array) Запрос: -```sql +``` sql SELECT arrayDistinct([1, 2, 2, 3, 1]) ``` Ответ: -```text +``` text ┌─arrayDistinct([1, 2, 2, 3, 1])─┐ │ [1,2,3] │ └────────────────────────────────┘ ``` -## arrayEnumerateDense(arr) {#array_functions-arrayenumeratedense} +## arrayEnumerateDense(arr) {#array-functions-arrayenumeratedense} Возвращает массив того же размера, что и исходный массив, с индексами исходного массива, указывающими, где каждый элемент впервые появляется в исходном массиве. Пример: -```sql +``` sql SELECT arrayEnumerateDense([10, 20, 10, 30]) ``` -```text +``` text ┌─arrayEnumerateDense([10, 20, 10, 30])─┐ │ [1,2,1,3] │ └───────────────────────────────────────┘ ``` -## arrayIntersect(arr) {#array_functions-arrayintersect} +## arrayIntersect(arr) {#array-functions-arrayintersect} Принимает несколько массивов, возвращает массив с элементами, присутствующими во всех исходных массивах. Элементы на выходе следуют в порядке следования в первом массиве. Пример: -```sql +``` sql SELECT arrayIntersect([1, 2], [1, 3], [2, 3]) AS no_intersect, arrayIntersect([1, 2], [1, 3], [1, 4]) AS intersect ``` -```text +``` text ┌─no_intersect─┬─intersect─┐ │ [] │ [1] │ └──────────────┴───────────┘ ``` -## arrayReduce(agg_func, arr1, ...) {#array_functions-arrayreduce} +## arrayReduce(agg\_func, arr1, …) {#array-functions-arrayreduce} Применяет агрегатную функцию к элементам массива и возвращает ее результат. Имя агрегирующей функции передается как строка в одинарных кавычках `'max'`, `'sum'`. При использовании параметрических агрегатных функций, параметр указывается после имени функции в круглых скобках `'uniqUpTo(6)'`. 
Пример: -```sql +``` sql SELECT arrayReduce('max', [1, 2, 3]) ``` -```text +``` text ┌─arrayReduce('max', [1, 2, 3])─┐ │ 3 │ └───────────────────────────────┘ @@ -796,11 +818,11 @@ SELECT arrayReduce('max', [1, 2, 3]) Пример: -```sql +``` sql SELECT arrayReduce('maxIf', [3, 5], [1, 0]) ``` -```text +``` text ┌─arrayReduce('maxIf', [3, 5], [1, 0])─┐ │ 3 │ └──────────────────────────────────────┘ @@ -808,35 +830,35 @@ SELECT arrayReduce('maxIf', [3, 5], [1, 0]) Пример с параметрической агрегатной функцией: -```sql +``` sql SELECT arrayReduce('uniqUpTo(3)', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) ``` -```text +``` text ┌─arrayReduce('uniqUpTo(3)', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])─┐ │ 4 │ └─────────────────────────────────────────────────────────────┘ ``` -## arrayReverse(arr) {#array_functions-arrayreverse} +## arrayReverse(arr) {#array-functions-arrayreverse} Возвращает массив того же размера, что и исходный массив, содержащий элементы в обратном порядке. Пример: -```sql + +``` sql SELECT arrayReverse([1, 2, 3]) ``` -```text +``` text ┌─arrayReverse([1, 2, 3])─┐ │ [3,2,1] │ └─────────────────────────┘ ``` -## reverse(arr) {#array_functions-reverse} - -Синоним для ["arrayReverse"](#array_functions-arrayreverse) +## reverse(arr) {#array-functions-reverse} +Синоним для [«arrayReverse»](#array_functions-arrayreverse) ## arrayFlatten {#arrayflatten} @@ -844,30 +866,30 @@ SELECT arrayReverse([1, 2, 3]) Функция: -- Оперирует с массивами любой вложенности. -- Не изменяет массив, если он уже плоский. +- Оперирует с массивами любой вложенности. +- Не изменяет массив, если он уже плоский. Результирующий массив содержит все элементы исходных массивов. -**Синтаксис** +**Синтаксис** -```sql +``` sql flatten(array_of_arrays) ``` Синоним: `flatten`. - **Параметры** -- `array_of_arrays` — [Массивов](../../data_types/array.md) массивов. Например, `[[1,2,3], [4,5]]`. +- `array_of_arrays` — [Массивов](../../data_types/array.md) массивов. Например, `[[1,2,3], [4,5]]`. **Примеры** -```sql +``` sql SELECT flatten([[[1]], [[2], [3]]]) ``` -```text + +``` text ┌─flatten(array(array([1]), array([2], [3])))─┐ │ [1,2,3] │ └─────────────────────────────────────────────┘ @@ -879,7 +901,7 @@ SELECT flatten([[[1]], [[2], [3]]]) **Синтаксис** -```sql +``` sql arrayCompact(arr) ``` @@ -897,13 +919,13 @@ arrayCompact(arr) Запрос: -```sql +``` sql SELECT arrayCompact([1, 1, nan, nan, 2, 3, 3, 3]) ``` Ответ: -```text +``` text ┌─arrayCompact([1, 1, nan, nan, 2, 3, 3, 3])─┐ │ [1,nan,nan,2,3] │ └────────────────────────────────────────────┘ diff --git a/docs/ru/query_language/functions/array_join.md b/docs/ru/query_language/functions/array_join.md index 56a5fa40a9b..3a2e95b057e 100644 --- a/docs/ru/query_language/functions/array_join.md +++ b/docs/ru/query_language/functions/array_join.md @@ -1,4 +1,4 @@ -# Функция arrayJoin {#functions_arrayjoin} +# Функция arrayJoin {#functions-arrayjoin} Это совсем необычная функция. 
@@ -15,11 +15,11 @@ Пример: -```sql +``` sql SELECT arrayJoin([1, 2, 3] AS src) AS dst, 'Hello', src ``` -```text +``` text ┌─dst─┬─\'Hello\'─┬─src─────┐ │ 1 │ Hello │ [1,2,3] │ │ 2 │ Hello │ [1,2,3] │ diff --git a/docs/ru/query_language/functions/bit_functions.md b/docs/ru/query_language/functions/bit_functions.md index 3b8563f3587..435ba9baeec 100644 --- a/docs/ru/query_language/functions/bit_functions.md +++ b/docs/ru/query_language/functions/bit_functions.md @@ -1,20 +1,20 @@ -# Битовые функции +# Битовые функции {#bitovye-funktsii} Битовые функции работают для любой пары типов из UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64. Тип результата - целое число, битность которого равна максимальной битности аргументов. Если хотя бы один аргумент знаковый, то результат - знаковое число. Если аргумент - число с плавающей запятой - оно приводится к Int64. -## bitAnd(a, b) +## bitAnd(a, b) {#bitanda-b} -## bitOr(a, b) +## bitOr(a, b) {#bitora-b} -## bitXor(a, b) +## bitXor(a, b) {#bitxora-b} -## bitNot(a) +## bitNot(a) {#bitnota} -## bitShiftLeft(a, b) +## bitShiftLeft(a, b) {#bitshiftlefta-b} -## bitShiftRight(a, b) +## bitShiftRight(a, b) {#bitshiftrighta-b} ## bitTest {#bittest} @@ -22,14 +22,14 @@ **Синтаксис** -```sql +``` sql SELECT bitTest(number, index) ``` **Параметры** -- `number` – целое число. -- `index` – position of bit. +- `number` – целое число. +- `index` – position of bit. **Возвращаемое значение** @@ -43,13 +43,13 @@ SELECT bitTest(number, index) Запрос: -```sql +``` sql SELECT bitTest(43, 1) ``` Ответ: -```text +``` text ┌─bitTest(43, 1)─┐ │ 1 │ └────────────────┘ @@ -59,13 +59,13 @@ SELECT bitTest(43, 1) Запрос: -```sql +``` sql SELECT bitTest(43, 2) ``` Ответ: -```text +``` text ┌─bitTest(43, 2)─┐ │ 0 │ └────────────────┘ @@ -82,16 +82,16 @@ SELECT bitTest(43, 2) 1 AND 0 = 0 1 AND 1 = 1 -**Синтаксис** +**Синтаксис** -```sql +``` sql SELECT bitTestAll(number, index1, index2, index3, index4, ...) ``` -**Параметры** +**Параметры** -- `number` – целое число. -- `index1`, `index2`, `index3`, `index4` – позиция бита. Например, конъюнкция для набора позиций `index1`, `index2`, `index3`, `index4` является истинной, если все его позиции истинны `index1` ⋀ `index2` ⋀ `index3` ⋀ `index4`. +- `number` – целое число. +- `index1`, `index2`, `index3`, `index4` – позиция бита. Например, конъюнкция для набора позиций `index1`, `index2`, `index3`, `index4` является истинной, если все его позиции истинны `index1` ⋀ `index2` ⋀ `index3` ⋀ `index4`. **Возвращаемое значение** @@ -105,13 +105,13 @@ SELECT bitTestAll(number, index1, index2, index3, index4, ...) Запрос: -```sql +``` sql SELECT bitTestAll(43, 0, 1, 3, 5) ``` Ответ: -```text +``` text ┌─bitTestAll(43, 0, 1, 3, 5)─┐ │ 1 │ └────────────────────────────┘ @@ -121,13 +121,13 @@ SELECT bitTestAll(43, 0, 1, 3, 5) Запрос: -```sql +``` sql SELECT bitTestAll(43, 0, 1, 3, 5, 2) ``` Ответ: -```text +``` text ┌─bitTestAll(43, 0, 1, 3, 5, 2)─┐ │ 0 │ └───────────────────────────────┘ @@ -144,16 +144,16 @@ SELECT bitTestAll(43, 0, 1, 3, 5, 2) 1 OR 0 = 1 1 OR 1 = 1 -**Синтаксис** +**Синтаксис** -```sql +``` sql SELECT bitTestAny(number, index1, index2, index3, index4, ...) ``` -**Параметры** +**Параметры** -- `number` – целое число. -- `index1`, `index2`, `index3`, `index4` – позиции бита. +- `number` – целое число. +- `index1`, `index2`, `index3`, `index4` – позиции бита. **Возвращаемое значение** @@ -167,13 +167,13 @@ SELECT bitTestAny(number, index1, index2, index3, index4, ...) 
Запрос: -```sql +``` sql SELECT bitTestAny(43, 0, 2) ``` Ответ: -```text +``` text ┌─bitTestAny(43, 0, 2)─┐ │ 1 │ └──────────────────────┘ @@ -183,13 +183,13 @@ SELECT bitTestAny(43, 0, 2) Запрос: -```sql +``` sql SELECT bitTestAny(43, 4, 2) ``` Ответ: -```text +``` text ┌─bitTestAny(43, 4, 2)─┐ │ 0 │ └──────────────────────┘ diff --git a/docs/ru/query_language/functions/bitmap_functions.md b/docs/ru/query_language/functions/bitmap_functions.md index 5c1943f4e7a..5a58fd075b8 100644 --- a/docs/ru/query_language/functions/bitmap_functions.md +++ b/docs/ru/query_language/functions/bitmap_functions.md @@ -1,48 +1,48 @@ -# Функции для битмапов +# Функции для битмапов {#funktsii-dlia-bitmapov} -## bitmapBuild {#bitmap_functions-bitmapbuild} +## bitmapBuild {#bitmap-functions-bitmapbuild} Создаёт битовый массив из массива целочисленных значений. -```sql +``` sql bitmapBuild(array) ``` **Параметры** -- `array` – массив типа `UInt*`. +- `array` – массив типа `UInt*`. **Пример** -```sql +``` sql SELECT bitmapBuild([1, 2, 3, 4, 5]) AS res, toTypeName(res) ``` -```text + +``` text ┌─res─┬─toTypeName(bitmapBuild([1, 2, 3, 4, 5]))─────┐ │  │ AggregateFunction(groupBitmap, UInt8) │ └─────┴──────────────────────────────────────────────┘ ``` - -## bitmapToArray +## bitmapToArray {#bitmaptoarray} Преобразует битовый массив в массив целочисленных значений. -```sql +``` sql bitmapToArray(bitmap) ``` **Параметры** -- `bitmap` – битовый массив. +- `bitmap` – битовый массив. **Пример** -```sql +``` sql SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res ``` -```text +``` text ┌─res─────────┐ │ [1,2,3,4,5] │ └─────────────┘ @@ -54,15 +54,15 @@ SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res **Синтаксис** -```sql +``` sql bitmapSubsetLimit(bitmap, range_start, cardinality_limit) ``` **Параметры** -- `bitmap` – Битмап. [Bitmap object](#bitmap_functions-bitmapbuild). -- `range_start` – Начальная точка подмножества. [UInt32](../../data_types/int_uint.md). -- `cardinality_limit` – Верхний предел подмножества. [UInt32](../../data_types/int_uint.md). +- `bitmap` – Битмап. [Bitmap object](#bitmap_functions-bitmapbuild). +- `range_start` – Начальная точка подмножества. [UInt32](../../data_types/int_uint.md). +- `cardinality_limit` – Верхний предел подмножества. [UInt32](../../data_types/int_uint.md). **Возвращаемое значение** @@ -74,54 +74,55 @@ bitmapSubsetLimit(bitmap, range_start, cardinality_limit) Запрос: -```sql +``` sql SELECT bitmapToArray(bitmapSubsetLimit(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res ``` Ответ: -```text +``` text ┌─res───────────────────────┐ │ [30,31,32,33,100,200,500] │ └───────────────────────────┘ ``` -## bitmapContains {#bitmap_functions-bitmapcontains} +## bitmapContains {#bitmap-functions-bitmapcontains} Проверяет вхождение элемента в битовый массив. -```sql +``` sql bitmapContains(haystack, needle) ``` **Параметры** -- `haystack` – [объект Bitmap](#bitmap_functions-bitmapbuild), в котором функция ищет значение. -- `needle` – значение, которое функция ищет. Тип — [UInt32](../../data_types/int_uint.md). +- `haystack` – [объект Bitmap](#bitmap_functions-bitmapbuild), в котором функция ищет значение. +- `needle` – значение, которое функция ищет. Тип — [UInt32](../../data_types/int_uint.md). **Возвращаемые значения** -- 0 — если в `haystack` нет `needle`. -- 1 — если в `haystack` есть `needle`. +- 0 — если в `haystack` нет `needle`. 
+- 1 — если в `haystack` есть `needle`. Тип — `UInt8`. **Пример** -```sql +``` sql SELECT bitmapContains(bitmapBuild([1,5,7,9]), toUInt32(9)) AS res ``` -```text + +``` text ┌─res─┐ │ 1 │ └─────┘ ``` -## bitmapHasAny +## bitmapHasAny {#bitmaphasany} Проверяет, имеют ли два битовых массива хотя бы один общий элемент. -```sql +``` sql bitmapHasAny(bitmap1, bitmap2) ``` @@ -129,261 +130,261 @@ bitmapHasAny(bitmap1, bitmap2) **Параметры** -- `bitmap*` – массив любого типа с набором элементов. +- `bitmap*` – массив любого типа с набором элементов. **Возвращаемые значения** -- `1`, если `bitmap1` и `bitmap2` имеют хотя бы один одинаковый элемент. -- `0`, в противном случае. +- `1`, если `bitmap1` и `bitmap2` имеют хотя бы один одинаковый элемент. +- `0`, в противном случае. **Пример** -```sql +``` sql SELECT bitmapHasAny(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res ``` -```text +``` text ┌─res─┐ │ 1 │ └─────┘ ``` -## bitmapHasAll +## bitmapHasAll {#bitmaphasall} Аналогично функции `hasAll(array, array)` возвращает 1 если первый битовый массив содержит все элементы второго, 0 в противном случае. Если второй аргумент является пустым битовым массивом, то возвращает 1. -```sql +``` sql bitmapHasAll(bitmap,bitmap) ``` **Параметры** -- `bitmap` – битовый массив. +- `bitmap` – битовый массив. **Пример** -```sql +``` sql SELECT bitmapHasAll(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res ``` -```text +``` text ┌─res─┐ │ 0 │ └─────┘ ``` -## bitmapAnd +## bitmapAnd {#bitmapand} Логическое И для двух битовых массивов. Результат — новый битовый массив. -```sql +``` sql bitmapAnd(bitmap,bitmap) ``` **Параметры** -- `bitmap` – битовый массив. +- `bitmap` – битовый массив. **Пример** -```sql +``` sql SELECT bitmapToArray(bitmapAnd(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res ``` -```text +``` text ┌─res─┐ │ [3] │ └─────┘ ``` -## bitmapOr +## bitmapOr {#bitmapor} Логическое ИЛИ для двух битовых массивов. Результат — новый битовый массив. -```sql +``` sql bitmapOr(bitmap,bitmap) ``` **Параметры** -- `bitmap` – битовый массив. +- `bitmap` – битовый массив. **Пример** -```sql +``` sql SELECT bitmapToArray(bitmapOr(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res ``` -```text +``` text ┌─res─────────┐ │ [1,2,3,4,5] │ └─────────────┘ ``` -## bitmapXor +## bitmapXor {#bitmapxor} Логическое исключающее ИЛИ для двух битовых массивов. Результат — новый битовый массив. -```sql +``` sql bitmapXor(bitmap,bitmap) ``` **Параметры** -- `bitmap` – битовый массив. +- `bitmap` – битовый массив. **Пример** -```sql +``` sql SELECT bitmapToArray(bitmapXor(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res ``` -```text +``` text ┌─res───────┐ │ [1,2,4,5] │ └───────────┘ ``` -## bitmapAndnot +## bitmapAndnot {#bitmapandnot} Логическое отрицание И для двух битовых массивов. Результат — новый битовый массив. -```sql +``` sql bitmapAndnot(bitmap,bitmap) ``` **Параметры** -- `bitmap` – битовый массив. +- `bitmap` – битовый массив. **Пример** -```sql +``` sql SELECT bitmapToArray(bitmapAndnot(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res ``` -```text +``` text ┌─res───┐ │ [1,2] │ └───────┘ ``` -## bitmapCardinality +## bitmapCardinality {#bitmapcardinality} Возвращает кардинальность битового массива в виде значения типа `UInt64`. -```sql +``` sql bitmapCardinality(bitmap) ``` **Параметры** -- `bitmap` – битовый массив. +- `bitmap` – битовый массив. 
**Пример** -```sql +``` sql SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res ``` -```text +``` text ┌─res─┐ │ 5 │ └─────┘ ``` -## bitmapAndCardinality +## bitmapAndCardinality {#bitmapandcardinality} Выполняет логическое И и возвращает кардинальность (`UInt64`) результирующего битового массива. -```sql +``` sql bitmapAndCardinality(bitmap,bitmap) ``` **Параметры** -- `bitmap` – битовый массив. +- `bitmap` – битовый массив. **Пример** -```sql +``` sql SELECT bitmapAndCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` -```text +``` text ┌─res─┐ │ 1 │ └─────┘ ``` -## bitmapOrCardinality +## bitmapOrCardinality {#bitmaporcardinality} Выполняет логическое ИЛИ и возвращает кардинальность (`UInt64`) результирующего битового массива. -```sql +``` sql bitmapOrCardinality(bitmap,bitmap) ``` **Параметры** -- `bitmap` – битовый массив. +- `bitmap` – битовый массив. **Пример** -```sql +``` sql SELECT bitmapOrCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` -```text +``` text ┌─res─┐ │ 5 │ └─────┘ ``` -## bitmapXorCardinality +## bitmapXorCardinality {#bitmapxorcardinality} Выполняет логическое исключающее ИЛИ и возвращает кардинальность (`UInt64`) результирующего битового массива. -```sql +``` sql bitmapXorCardinality(bitmap,bitmap) ``` **Параметры** -- `bitmap` – битовый массив. +- `bitmap` – битовый массив. **Пример** -```sql +``` sql SELECT bitmapXorCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` -```text +``` text ┌─res─┐ │ 4 │ └─────┘ ``` -## bitmapAndnotCardinality +## bitmapAndnotCardinality {#bitmapandnotcardinality} Выполняет логическое отрицание И и возвращает кардинальность (`UInt64`) результирующего битового массива. -```sql +``` sql bitmapAndnotCardinality(bitmap,bitmap) ``` **Параметры** -- `bitmap` – битовый массив. +- `bitmap` – битовый массив. **Пример** -```sql +``` sql SELECT bitmapAndnotCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` -```text +``` text ┌─res─┐ │ 2 │ └─────┘ diff --git a/docs/ru/query_language/functions/comparison_functions.md b/docs/ru/query_language/functions/comparison_functions.md index 62d889f72ce..16ae7ee4cae 100644 --- a/docs/ru/query_language/functions/comparison_functions.md +++ b/docs/ru/query_language/functions/comparison_functions.md @@ -1,14 +1,13 @@ -# Функции сравнения - +# Функции сравнения {#funktsii-sravneniia} Функции сравнения возвращают всегда 0 или 1 (UInt8). Сравнивать можно следующие типы: -- числа; -- строки и фиксированные строки; -- даты; -- даты-с-временем; +- числа; +- строки и фиксированные строки; +- даты; +- даты-с-временем; внутри каждой группы, но не из разных групп. @@ -16,7 +15,7 @@ Строки сравниваются побайтово. Более короткая строка меньше всех строк, начинающихся с неё и содержащих ещё хотя бы один символ. -Замечание. До версии 1.1.54134 сравнение знаковых и беззнаковых целых чисел производилось также, как в C++. То есть, вы могли получить неверный результат в таких случаях: SELECT 9223372036854775807 > -1. С версии 1.1.54134 поведение изменилось и стало математически корректным. +Замечание. До версии 1.1.54134 сравнение знаковых и беззнаковых целых чисел производилось также, как в C++. То есть, вы могли получить неверный результат в таких случаях: SELECT 9223372036854775807 \> -1. С версии 1.1.54134 поведение изменилось и стало математически корректным. 
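+Иллюстративный набросок (в предположении, что используется версия новее 1.1.54134; формат вывода показан схематично):
+
+``` sql
+SELECT 9223372036854775807 > -1 AS res -- сравнение знакового и беззнакового чисел
+```
+
+``` text
+┌─res─┐
+│   1 │
+└─────┘
+```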
## equals, оператор a = b и a == b {#function-equals} diff --git a/docs/ru/query_language/functions/conditional_functions.md b/docs/ru/query_language/functions/conditional_functions.md index 2e0593c765b..f2c97330d20 100644 --- a/docs/ru/query_language/functions/conditional_functions.md +++ b/docs/ru/query_language/functions/conditional_functions.md @@ -1,12 +1,12 @@ -# Условные функции +# Условные функции {#uslovnye-funktsii} ## if {#if} Условное выражение. В отличии от большинства систем, ClickHouse всегда считает оба выражения `then` и `else`. -**Синтаксис** +**Синтаксис** -```sql +``` sql SELECT if(cond, then, else) ``` @@ -14,9 +14,9 @@ SELECT if(cond, then, else) **Параметры** -- `cond` – Условие, которое может быть равно 0 или нет. Может быть [UInt8](../../data_types/int_uint.md) или `NULL`. -- `then` - Возвращается результат выражения, если условие `cond` истинно. -- `else` - Возвращается результат выражения, если условие `cond` ложно. +- `cond` – Условие, которое может быть равно 0 или нет. Может быть [UInt8](../../data_types/int_uint.md) или `NULL`. +- `then` - Возвращается результат выражения, если условие `cond` истинно. +- `else` - Возвращается результат выражения, если условие `cond` ложно. **Возвращаемые значения** @@ -26,13 +26,13 @@ SELECT if(cond, then, else) Запрос: -```sql +``` sql SELECT if(1, plus(2, 2), plus(2, 6)) ``` Ответ: -```text +``` text ┌─plus(2, 2)─┐ │ 4 │ └────────────┘ @@ -40,13 +40,13 @@ SELECT if(1, plus(2, 2), plus(2, 6)) Запрос: -```sql +``` sql SELECT if(0, plus(2, 2), plus(2, 6)) ``` Ответ: -```text +``` text ┌─plus(2, 6)─┐ │ 8 │ └────────────┘ @@ -60,27 +60,25 @@ SELECT if(0, plus(2, 2), plus(2, 6)) Возвращает `then`, если `cond` верно (больше нуля), в остальных случаях возвращает `else`. -* `cond` должно быть типа `UInt8`, `then` и `else` должны относиться к наименьшему общему типу. +- `cond` должно быть типа `UInt8`, `then` и `else` должны относиться к наименьшему общему типу. -* `then` и `else` могут быть `NULL`. +- `then` и `else` могут быть `NULL`. **Смотрите также** -- [ifNotFinite](other_functions.md#ifnotfinite). +- [ifNotFinite](other_functions.md#ifnotfinite). -## multiIf +## multiIf {#multiif} Позволяет более компактно записать оператор [CASE](../operators.md#operator_case) в запросе. -``` -multiIf(cond_1, then_1, cond_2, then_2...else) -``` + multiIf(cond_1, then_1, cond_2, then_2...else) **Параметры** -- `cond_N` — Условие, при выполнении которого функция вернёт `then_N`. -- `then_N` — Результат функции при выполнении. -- `else` — Результат функции, если ни одно из условий не выполнено. +- `cond_N` — Условие, при выполнении которого функция вернёт `then_N`. +- `then_N` — Результат функции при выполнении. +- `else` — Результат функции, если ни одно из условий не выполнено. Функция принимает `2N+1` параметров. @@ -92,7 +90,7 @@ multiIf(cond_1, then_1, cond_2, then_2...else) Рассмотрим таблицу -```text +``` text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -101,7 +99,7 @@ multiIf(cond_1, then_1, cond_2, then_2...else) Выполним запрос `SELECT multiIf(isNull(y), x, y < 3, y, NULL) FROM t_null`. 
Результат: -```text +``` text ┌─multiIf(isNull(y), x, less(y, 3), y, NULL)─┐ │ 1 │ │ ᴺᵁᴸᴸ │ diff --git a/docs/ru/query_language/functions/date_time_functions.md b/docs/ru/query_language/functions/date_time_functions.md index df274a360bb..51a097fb31b 100644 --- a/docs/ru/query_language/functions/date_time_functions.md +++ b/docs/ru/query_language/functions/date_time_functions.md @@ -1,10 +1,10 @@ -# Функции для работы с датами и временем +# Функции для работы с датами и временем {#funktsii-dlia-raboty-s-datami-i-vremenem} Поддержка часовых поясов Все функции по работе с датой и временем, для которых это имеет смысл, могут принимать второй, необязательный аргумент - имя часового пояса. Пример: Asia/Yekaterinburg. В этом случае, они используют не локальный часовой пояс (по умолчанию), а указанный. -```sql +``` sql SELECT toDateTime('2016-06-15 23:00:00') AS time, toDate(time) AS date_local, @@ -12,7 +12,7 @@ SELECT toString(time, 'US/Samoa') AS time_samoa ``` -```text +``` text ┌────────────────time─┬─date_local─┬─date_yekat─┬─time_samoa──────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-16 │ 2016-06-15 09:00:00 │ └─────────────────────┴────────────┴────────────┴─────────────────────┘ @@ -20,45 +20,52 @@ SELECT Поддерживаются только часовые пояса, отличающиеся от UTC на целое число часов. -## toYear +## toYear {#toyear} + Переводит дату или дату-с-временем в число типа UInt16, содержащее номер года (AD). -## toMonth +## toMonth {#tomonth} + Переводит дату или дату-с-временем в число типа UInt8, содержащее номер месяца (1-12). -## toDayOfMonth +## toDayOfMonth {#todayofmonth} + Переводит дату или дату-с-временем в число типа UInt8, содержащее номер дня в месяце (1-31). -## toDayOfWeek +## toDayOfWeek {#todayofweek} + Переводит дату или дату-с-временем в число типа UInt8, содержащее номер дня в неделе (понедельник - 1, воскресенье - 7). -## toHour +## toHour {#tohour} + Переводит дату-с-временем в число типа UInt8, содержащее номер часа в сутках (0-23). Функция исходит из допущения, что перевод стрелок вперёд, если осуществляется, то на час, в два часа ночи, а перевод стрелок назад, если осуществляется, то на час, в три часа ночи (что, в общем, не верно - даже в Москве два раза перевод стрелок был осуществлён в другое время). -## toMinute +## toMinute {#tominute} + Переводит дату-с-временем в число типа UInt8, содержащее номер минуты в часе (0-59). -## toSecond +## toSecond {#tosecond} + Переводит дату-с-временем в число типа UInt8, содержащее номер секунды в минуте (0-59). Секунды координации не учитываются. -## toUnixTimestamp {#to_unix_timestamp} +## toUnixTimestamp {#to-unix-timestamp} For DateTime argument: converts value to its internal numeric representation (Unix Timestamp). For String argument: parse datetime from string according to the timezone (optional second argument, server timezone is used by default) and returns the corresponding unix timestamp. For Date argument: the behaviour is unspecified. -**Syntax** +**Syntax** -```sql +``` sql toUnixTimestamp(datetime) toUnixTimestamp(str, [timezone]) ``` **Returned value** -- Returns the unix timestamp. +- Returns the unix timestamp. Type: `UInt32`. @@ -66,97 +73,118 @@ Type: `UInt32`. Query: -```sql +``` sql SELECT toUnixTimestamp('2017-11-05 08:07:47', 'Asia/Tokyo') AS unix_timestamp ``` Result: -```text +``` text ┌─unix_timestamp─┐ │ 1509836867 │ └────────────────┘ ``` -## toStartOfYear +## toStartOfYear {#tostartofyear} + Округляет дату или дату-с-временем вниз до первого дня года. Возвращается дата. 
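+Набросок примера (дата выбрана произвольно, вывод показан схематично):
+
+``` sql
+SELECT toStartOfYear(toDate('2016-06-15')) AS res
+```
+
+``` text
+┌────────res─┐
+│ 2016-01-01 │
+└────────────┘
+```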
-## toStartOfQuarter +## toStartOfQuarter {#tostartofquarter} + Округляет дату или дату-с-временем вниз до первого дня квартала. Первый день квартала - это одно из 1 января, 1 апреля, 1 июля, 1 октября. Возвращается дата. -## toStartOfMonth +## toStartOfMonth {#tostartofmonth} + Округляет дату или дату-с-временем вниз до первого дня месяца. Возвращается дата. -!!! attention - Возвращаемое значение для некорректных дат зависит от реализации. ClickHouse может вернуть нулевую дату, выбросить исключение, или выполнить "естественное" перетекание дат между месяцами. +!!! attention "Attention" + Возвращаемое значение для некорректных дат зависит от реализации. ClickHouse может вернуть нулевую дату, выбросить исключение, или выполнить «естественное» перетекание дат между месяцами. + +## toMonday {#tomonday} -## toMonday Округляет дату или дату-с-временем вниз до ближайшего понедельника. Возвращается дата. -## toStartOfDay +## toStartOfDay {#tostartofday} + Округляет дату-с-временем вниз до начала дня. Возвращается дата-с-временем. -## toStartOfHour +## toStartOfHour {#tostartofhour} + Округляет дату-с-временем вниз до начала часа. -## toStartOfMinute +## toStartOfMinute {#tostartofminute} + Округляет дату-с-временем вниз до начала минуты. -## toStartOfFiveMinute +## toStartOfFiveMinute {#tostartoffiveminute} + Округляет дату-с-временем вниз до начала пятиминутного интервала. -## toStartOfTenMinutes +## toStartOfTenMinutes {#tostartoftenminutes} + Округляет дату-с-временем вниз до начала десятиминутного интервала. -## toStartOfFifteenMinutes +## toStartOfFifteenMinutes {#tostartoffifteenminutes} + Округляет дату-с-временем вниз до начала пятнадцатиминутного интервала. -## toStartOfInterval(time_or_data, INTERVAL x unit [, time_zone]) -Обобщение остальных функций `toStartOf*`. Например, -`toStartOfInterval(t, INTERVAL 1 year)` возвращает то же самое, что и `toStartOfYear(t)`, -`toStartOfInterval(t, INTERVAL 1 month)` возвращает то же самое, что и `toStartOfMonth(t)`, -`toStartOfInterval(t, INTERVAL 1 day)` возвращает то же самое, что и `toStartOfDay(t)`, +## toStartOfInterval(time\_or\_data, INTERVAL x unit \[, time\_zone\]) {#tostartofintervaltime-or-data-interval-x-unit-time-zone} + +Обобщение остальных функций `toStartOf*`. Например, +`toStartOfInterval(t, INTERVAL 1 year)` возвращает то же самое, что и `toStartOfYear(t)`, +`toStartOfInterval(t, INTERVAL 1 month)` возвращает то же самое, что и `toStartOfMonth(t)`, +`toStartOfInterval(t, INTERVAL 1 day)` возвращает то же самое, что и `toStartOfDay(t)`, `toStartOfInterval(t, INTERVAL 15 minute)` возвращает то же самое, что и `toStartOfFifteenMinutes(t)`, и т.п. -## toTime +## toTime {#totime} + Переводит дату-с-временем на некоторую фиксированную дату, сохраняя при этом время. -## toRelativeYearNum +## toRelativeYearNum {#torelativeyearnum} + Переводит дату-с-временем или дату в номер года, начиная с некоторого фиксированного момента в прошлом. -## toRelativeMonthNum +## toRelativeMonthNum {#torelativemonthnum} + Переводит дату-с-временем или дату в номер месяца, начиная с некоторого фиксированного момента в прошлом. -## toRelativeWeekNum +## toRelativeWeekNum {#torelativeweeknum} + Переводит дату-с-временем или дату в номер недели, начиная с некоторого фиксированного момента в прошлом. -## toRelativeDayNum +## toRelativeDayNum {#torelativedaynum} + Переводит дату-с-временем или дату в номер дня, начиная с некоторого фиксированного момента в прошлом. 
-## toRelativeHourNum +## toRelativeHourNum {#torelativehournum} + Переводит дату-с-временем в номер часа, начиная с некоторого фиксированного момента в прошлом. -## toRelativeMinuteNum +## toRelativeMinuteNum {#torelativeminutenum} + Переводит дату-с-временем в номер минуты, начиная с некоторого фиксированного момента в прошлом. -## toRelativeSecondNum +## toRelativeSecondNum {#torelativesecondnum} + Переводит дату-с-временем в номер секунды, начиная с некоторого фиксированного момента в прошлом. -## now +## now {#now} Принимает ноль аргументов и возвращает текущее время на один из моментов выполнения запроса. Функция возвращает константу, даже если запрос выполнялся долго. -## today +## today {#today} + Принимает ноль аргументов и возвращает текущую дату на один из моментов выполнения запроса. То же самое, что toDate(now()) -## yesterday +## yesterday {#yesterday} + Принимает ноль аргументов и возвращает вчерашнюю дату на один из моментов выполнения запроса. Делает то же самое, что today() - 1. @@ -166,30 +194,32 @@ Result: **Синтаксис** -```sql +``` sql dateDiff('unit', startdate, enddate, [timezone]) ``` **Параметры** -- `unit` — Единица измерения времени, в которой будет вычислена разница между `startdate` и `enddate`. [String](../syntax.md#syntax-string-literal). +- `unit` — Единица измерения времени, в которой будет вычислена разница между `startdate` и `enddate`. [String](../syntax.md#syntax-string-literal). - Поддерживаемые значения: + Поддерживаемые значения: - | unit | - | ------ | - |second | - |minute | - |hour | - |day | - |week | - |month | - |quarter | - |year | + | unit | + | ------ | + |second | + |minute | + |hour | + |day | + |week | + |month | + |quarter | + |year | -- `startdate` — Первая дата. [Date](../../data_types/date.md) или [DateTime](../../data_types/datetime.md). -- `enddate` — Вторая дата. [Date](../../data_types/date.md) или [DateTime](../../data_types/datetime.md). -- `timezone` — Опциональный параметр. Если определен, применяется к обоим значениям: `startdate` и `enddate`. Если не определен, используются часовые пояса `startdate` и `enddate`. Если часовые пояса не совпадают, вернется неожидаемый результат. +- `startdate` — Первая дата. [Date](../../data_types/date.md) или [DateTime](../../data_types/datetime.md). + +- `enddate` — Вторая дата. [Date](../../data_types/date.md) или [DateTime](../../data_types/datetime.md). + +- `timezone` — Опциональный параметр. Если определен, применяется к обоим значениям: `startdate` и `enddate`. Если не определен, используются часовые пояса `startdate` и `enddate`. Если часовые пояса не совпадают, вернется неожидаемый результат. **Возвращаемое значение** @@ -201,57 +231,60 @@ dateDiff('unit', startdate, enddate, [timezone]) Запрос: -```sql +``` sql SELECT dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00')); ``` Ответ: -```text +``` text ┌─dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00'))─┐ │ 25 │ └────────────────────────────────────────────────────────────────────────────────────────┘ ``` -## timeSlot +## timeSlot {#timeslot} + Округляет время до получаса. Эта функция является специфичной для Яндекс.Метрики, так как пол часа - минимальное время, для которого, если соседние по времени хиты одного посетителя на одном счётчике отстоят друг от друга строго более, чем на это время, визит может быть разбит на два визита. 
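+Иллюстративный пример: все три функции можно вызвать в одном запросе; значения зависят от момента выполнения и часового пояса сервера, поэтому вывод не приводится.
+
+``` sql
+SELECT now(), today(), yesterday()
+```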
То есть, кортежи (номер счётчика, идентификатор посетителя, тайм-слот) могут использоваться для поиска хитов, входящий в соответствующий визит. -## timeSlots(StartTime, Duration,\[, Size\]) -Для интервала времени, начинающегося в 'StartTime' и продолжающегося 'Duration' секунд, возвращает массив моментов времени, состоящий из округлений вниз до 'Size' точек в секундах из этого интервала. 'Size' - необязательный параметр, константный UInt32, по умолчанию равен 1800. +## timeSlots(StartTime, Duration,\[, Size\]) {#timeslotsstarttime-duration-size} + +Для интервала времени, начинающегося в ‘StartTime’ и продолжающегося ‘Duration’ секунд, возвращает массив моментов времени, состоящий из округлений вниз до ‘Size’ точек в секундах из этого интервала. ‘Size’ - необязательный параметр, константный UInt32, по умолчанию равен 1800. Например, `timeSlots(toDateTime('2012-01-01 12:20:00'), toUInt32(600)) = [toDateTime('2012-01-01 12:00:00'), toDateTime('2012-01-01 12:30:00')]`. Это нужно для поиска хитов, входящих в соответствующий визит. ## formatDateTime(Time, Format\[, Timezone\]) {#formatdatetime} + Функция преобразования даты-с-временем в String согласно заданному шаблону. Важно - шаблон является константным выражением, т.е. невозможно использование разных шаблонов в одной колонке. Поддерживаемые модификаторы в шаблоне Format: -(колонка "Пример" показана для времени `2018-01-02 22:33:44`) +(колонка «Пример» показана для времени `2018-01-02 22:33:44`) -| Модификатор | Описание | Пример | -| ----------- | -------- | --------------- | -|%C|номер года, поделённый на 100 (00-99)|20 -|%d|день месяца, с ведущим нулём (01-31)|02 -|%D|короткая запись %m/%d/%y|01/02/18| -|%e|день месяца, с ведущим пробелом ( 1-31)| 2| -|%F|короткая запись %Y-%m-%d|2018-01-02 -|%H|час в 24-часовом формате (00-23)|22| -|%I|час в 12-часовом формате (01-12)|10| -|%j|номер дня в году, с ведущими нулями (001-366)|002| -|%m|месяц, с ведущим нулём (01-12)|01| -|%M|минуты, с ведущим нулём (00-59)|33| -|%n|символ переноса строки ('\n')|| -|%p|обозначения AM или PM|PM| -|%R|короткая запись %H:%M|22:33| -|%S|секунды, с ведущими нулями (00-59)|44| -|%t|символ табуляции ('\t')|| -|%T|формат времени ISO 8601, одинаковый с %H:%M:%S|22:33:44| -|%u|номер дня недели согласно ISO 8601, понедельник - 1, воскресенье - 7|2| -|%V|номер недели согласно ISO 8601 (01-53)|01| -|%w|номер дня недели, начиная с воскресенья (0-6)|2| -|%y|год, последние 2 цифры (00-99)|18| -|%Y|год, 4 цифры|2018| -|%%|символ %|%| +| Модификатор | Описание | Пример | +|-------------|----------------------------------------------------------------------|------------| +| %C | номер года, поделённый на 100 (00-99) | 20 | +| %d | день месяца, с ведущим нулём (01-31) | 02 | +| %D | короткая запись %m/%d/%y | 01/02/18 | +| %e | день месяца, с ведущим пробелом ( 1-31) | 2 | +| %F | короткая запись %Y-%m-%d | 2018-01-02 | +| %H | час в 24-часовом формате (00-23) | 22 | +| %I | час в 12-часовом формате (01-12) | 10 | +| %j | номер дня в году, с ведущими нулями (001-366) | 002 | +| %m | месяц, с ведущим нулём (01-12) | 01 | +| %M | минуты, с ведущим нулём (00-59) | 33 | +| %n | символ переноса строки (‘’) | | +| %p | обозначения AM или PM | PM | +| %R | короткая запись %H:%M | 22:33 | +| %S | секунды, с ведущими нулями (00-59) | 44 | +| %t | символ табуляции (’) | | +| %T | формат времени ISO 8601, одинаковый с %H:%M:%S | 22:33:44 | +| %u | номер дня недели согласно ISO 8601, понедельник - 1, воскресенье - 7 | 2 | +| %V | номер недели согласно ISO 8601 (01-53) | 01 | +| %w | номер 
дня недели, начиная с воскресенья (0-6) | 2 | +| %y | год, последние 2 цифры (00-99) | 18 | +| %Y | год, 4 цифры | 2018 | +| %% | символ % | % | [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/date_time_functions/) diff --git a/docs/ru/query_language/functions/encoding_functions.md b/docs/ru/query_language/functions/encoding_functions.md index a11a6f5feaa..e6fbeb133c5 100644 --- a/docs/ru/query_language/functions/encoding_functions.md +++ b/docs/ru/query_language/functions/encoding_functions.md @@ -1,4 +1,4 @@ -# Функции кодирования +# Функции кодирования {#funktsii-kodirovaniia} ## char {#char} @@ -6,17 +6,17 @@ **Синтаксис** -```sql +``` sql char(number_1, [number_2, ..., number_n]); ``` **Параметры** -- `number_1, number_2, ..., number_n` — Числовые аргументы, которые интерпретируются как целые числа. Типы: [Int](../../data_types/int_uint.md), [Float](../../data_types/float.md). +- `number_1, number_2, ..., number_n` — Числовые аргументы, которые интерпретируются как целые числа. Типы: [Int](../../data_types/int_uint.md), [Float](../../data_types/float.md). **Возвращаемое значение** -- строка из соответствующих байт. +- строка из соответствующих байт. Тип: `String`. @@ -24,13 +24,13 @@ char(number_1, [number_2, ..., number_n]); Запрос: -```sql +``` sql SELECT char(104.1, 101, 108.9, 108.9, 111) AS hello ``` Ответ: -```text +``` text ┌─hello─┐ │ hello │ └───────┘ @@ -39,13 +39,14 @@ SELECT char(104.1, 101, 108.9, 108.9, 111) AS hello Вы можете создать строку в произвольной кодировке, передав соответствующие байты. Пример для UTF-8: Запрос: -```sql + +``` sql SELECT char(0xD0, 0xBF, 0xD1, 0x80, 0xD0, 0xB8, 0xD0, 0xB2, 0xD0, 0xB5, 0xD1, 0x82) AS hello; ``` Ответ: -```text +``` text ┌─hello──┐ │ привет │ └────────┘ @@ -53,13 +54,13 @@ SELECT char(0xD0, 0xBF, 0xD1, 0x80, 0xD0, 0xB8, 0xD0, 0xB2, 0xD0, 0xB5, 0xD1, 0x Запрос: -```sql +``` sql SELECT char(0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD) AS hello; ``` Ответ: -```text +``` text ┌─hello─┐ │ 你好 │ └───────┘ @@ -67,17 +68,17 @@ SELECT char(0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD) AS hello; ## hex {#hex} -Returns a string containing the argument's hexadecimal representation. +Returns a string containing the argument’s hexadecimal representation. **Syntax** -```sql +``` sql hex(arg) ``` The function is using uppercase letters `A-F` and not using any prefixes (like `0x`) or suffixes (like `h`). -For integer arguments, it prints hex digits ("nibbles") from the most significant to least significant (big endian or "human readable" order). It starts with the most significant non-zero byte (leading zero bytes are omitted) but always prints both digits of every byte even if leading digit is zero. +For integer arguments, it prints hex digits («nibbles») from the most significant to least significant (big endian or «human readable» order). It starts with the most significant non-zero byte (leading zero bytes are omitted) but always prints both digits of every byte even if leading digit is zero. Example: @@ -85,13 +86,13 @@ Example: Query: -```sql +``` sql SELECT hex(1); ``` Result: -```text +``` text 01 ``` @@ -103,11 +104,11 @@ Values of floating point and Decimal types are encoded as their representation i **Parameters** -- `arg` — A value to convert to hexadecimal. Types: [String](../../data_types/string.md), [UInt](../../data_types/int_uint.md), [Float](../../data_types/float.md), [Decimal](../../data_types/decimal.md), [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). 
+- `arg` — A value to convert to hexadecimal. Types: [String](../../data_types/string.md), [UInt](../../data_types/int_uint.md), [Float](../../data_types/float.md), [Decimal](../../data_types/decimal.md), [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md). **Returned value** -- A string with the hexadecimal representation of the argument. +- A string with the hexadecimal representation of the argument. Type: `String`. @@ -115,13 +116,13 @@ Type: `String`. Query: -```sql +``` sql SELECT hex(toFloat32(number)) as hex_presentation FROM numbers(15, 2); ``` Result: -```text +``` text ┌─hex_presentation─┐ │ 00007041 │ │ 00008041 │ @@ -130,34 +131,38 @@ Result: Query: -```sql +``` sql SELECT hex(toFloat64(number)) as hex_presentation FROM numbers(15, 2); ``` Result: -```text +``` text ┌─hex_presentation─┐ │ 0000000000002E40 │ │ 0000000000003040 │ └──────────────────┘ ``` -## unhex(str) +## unhex(str) {#unhexstr} -Accepts a string containing any number of hexadecimal digits, and returns a string containing the corresponding bytes. Supports both uppercase and lowercase letters A-F. The number of hexadecimal digits does not have to be even. If it is odd, the last digit is interpreted as the least significant half of the 00-0F byte. If the argument string contains anything other than hexadecimal digits, some implementation-defined result is returned (an exception isn't thrown). -If you want to convert the result to a number, you can use the 'reverse' and 'reinterpretAsType' functions. +Accepts a string containing any number of hexadecimal digits, and returns a string containing the corresponding bytes. Supports both uppercase and lowercase letters A-F. The number of hexadecimal digits does not have to be even. If it is odd, the last digit is interpreted as the least significant half of the 00-0F byte. If the argument string contains anything other than hexadecimal digits, some implementation-defined result is returned (an exception isn’t thrown). +If you want to convert the result to a number, you can use the ‘reverse’ and ‘reinterpretAsType’ functions. + +## UUIDStringToNum(str) {#uuidstringtonumstr} -## UUIDStringToNum(str) Принимает строку, содержащую 36 символов в формате `123e4567-e89b-12d3-a456-426655440000`, и возвращает в виде набора байт в FixedString(16). -## UUIDNumToString(str) +## UUIDNumToString(str) {#uuidnumtostringstr} + Принимает значение типа FixedString(16). Возвращает строку из 36 символов в текстовом виде. -## bitmaskToList(num) +## bitmaskToList(num) {#bitmasktolistnum} + Принимает целое число. Возвращает строку, содержащую список степеней двойки, в сумме дающих исходное число; по возрастанию, в текстовом виде, через запятую, без пробелов. -## bitmaskToArray(num) +## bitmaskToArray(num) {#bitmasktoarraynum} + Принимает целое число. Возвращает массив чисел типа UInt64, содержащий степени двойки, в сумме дающих исходное число; числа в массиве идут по возрастанию. 
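+Набросок примера (вывод показан схематично): число 50 = 2 + 16 + 32, поэтому обе функции возвращают эти степени двойки по возрастанию.
+
+``` sql
+SELECT bitmaskToList(50) AS str_repr, bitmaskToArray(50) AS arr_repr
+```
+
+``` text
+┌─str_repr─┬─arr_repr──┐
+│ 2,16,32  │ [2,16,32] │
+└──────────┴───────────┘
+```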
[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/encoding_functions/) diff --git a/docs/ru/query_language/functions/ext_dict_functions.md b/docs/ru/query_language/functions/ext_dict_functions.md index ca2b461a0ed..5760735797c 100644 --- a/docs/ru/query_language/functions/ext_dict_functions.md +++ b/docs/ru/query_language/functions/ext_dict_functions.md @@ -1,30 +1,31 @@ -# Функции для работы с внешними словарями {#ext_dict_functions} +# Функции для работы с внешними словарями {#ext-dict-functions} Информацию о подключении и настройке внешних словарей смотрите в разделе [Внешние словари](../dicts/external_dicts.md). -## dictGet +## dictGet {#dictget} Извлекает значение из внешнего словаря. -```sql +``` sql dictGet('dict_name', 'attr_name', id_expr) dictGetOrDefault('dict_name', 'attr_name', id_expr, default_value_expr) ``` **Параметры** -- `dict_name` — имя словаря. [Строковый литерал](../syntax.md#syntax-string-literal). -- `attr_name` — имя столбца словаря. [Строковый литерал](../syntax.md#syntax-string-literal). -- `id_expr` — значение ключа словаря. [Выражение](../syntax.md#syntax-expressions), возвращающее значение типа [UInt64](../../data_types/int_uint.md) или [Tuple](../../data_types/tuple.md) в зависимости от конфигурации словаря. -- `default_value_expr` — значение, возвращаемое в том случае, когда словарь не содержит строки с заданным ключом `id_expr`. [Выражение](../syntax.md#syntax-expressions) возвращающее значение с типом данных, сконфигурированным для атрибута `attr_name`. +- `dict_name` — имя словаря. [Строковый литерал](../syntax.md#syntax-string-literal). +- `attr_name` — имя столбца словаря. [Строковый литерал](../syntax.md#syntax-string-literal). +- `id_expr` — значение ключа словаря. [Выражение](../syntax.md#syntax-expressions), возвращающее значение типа [UInt64](../../data_types/int_uint.md) или [Tuple](../../data_types/tuple.md) в зависимости от конфигурации словаря. +- `default_value_expr` — значение, возвращаемое в том случае, когда словарь не содержит строки с заданным ключом `id_expr`. [Выражение](../syntax.md#syntax-expressions) возвращающее значение с типом данных, сконфигурированным для атрибута `attr_name`. **Возвращаемое значение** -- Значение атрибута, соответствующее ключу `id_expr`, если ClickHouse смог привести это значение к [заданному типу данных](../dicts/external_dicts_dict_structure.md#ext_dict_structure-attributes). +- Значение атрибута, соответствующее ключу `id_expr`, если ClickHouse смог привести это значение к [заданному типу данных](../dicts/external_dicts_dict_structure.md#ext_dict_structure-attributes). -- Если ключа, соответствующего `id_expr` в словаре нет, то: - - `dictGet` возвращает содержимое элемента ``, указанного для атрибута в конфигурации словаря. - - `dictGetOrDefault` возвращает атрибут `default_value_expr`. +- Если ключа, соответствующего `id_expr` в словаре нет, то: + + - `dictGet` возвращает содержимое элемента ``, указанного для атрибута в конфигурации словаря. + - `dictGetOrDefault` возвращает атрибут `default_value_expr`. Если значение атрибута не удалось обработать или оно не соответствует типу данных атрибута, то ClickHouse генерирует исключение. 
@@ -32,7 +33,7 @@ dictGetOrDefault('dict_name', 'attr_name', id_expr, default_value_expr) Создадим текстовый файл `ext-dict-text.csv` со следующим содержимым: -```text +``` text 1,1 2,2 ``` @@ -41,7 +42,7 @@ dictGetOrDefault('dict_name', 'attr_name', id_expr, default_value_expr) Настройка внешнего словаря: -```xml +``` xml ext-dict-test @@ -71,7 +72,7 @@ dictGetOrDefault('dict_name', 'attr_name', id_expr, default_value_expr) Выполним запрос: -```sql +``` sql SELECT dictGetOrDefault('ext-dict-test', 'c1', number + 1, toUInt32(number * 10)) AS val, toTypeName(val) AS type @@ -79,7 +80,7 @@ FROM system.numbers LIMIT 3 ``` -```text +``` text ┌─val─┬─type───┐ │ 1 │ UInt32 │ │ 2 │ UInt32 │ @@ -89,25 +90,25 @@ LIMIT 3 **Смотрите также** -- [Внешние словари](../dicts/external_dicts.md) +- [Внешние словари](../dicts/external_dicts.md) -## dictHas +## dictHas {#dicthas} Проверяет, присутствует ли запись с указанным ключом в словаре. -```sql +``` sql dictHas('dict_name', id) ``` **Параметры** -- `dict_name` — имя словаря. [Строковый литерал](../syntax.md#syntax-string-literal). -- `id_expr` — значение ключа словаря. [Выражение](../syntax.md#syntax-expressions), возвращающее значение типа [UInt64](../../data_types/int_uint.md). +- `dict_name` — имя словаря. [Строковый литерал](../syntax.md#syntax-string-literal). +- `id_expr` — значение ключа словаря. [Выражение](../syntax.md#syntax-expressions), возвращающее значение типа [UInt64](../../data_types/int_uint.md). **Возвращаемое значение** -- 0, если ключа нет. -- 1, если ключ есть. +- 0, если ключа нет. +- 1, если ключ есть. Тип — `UInt8`. @@ -117,23 +118,22 @@ dictHas('dict_name', id) **Синтаксис** -```sql +``` sql dictGetHierarchy('dict_name', key) ``` **Параметры** -- `dict_name` — имя словаря. [Строковый литерал](../syntax.md#syntax-string-literal). -- `key` — значение ключа. [Выражение](../syntax.md#syntax-expressions), возвращающее значение типа [UInt64](../../data_types/int_uint.md). +- `dict_name` — имя словаря. [Строковый литерал](../syntax.md#syntax-string-literal). +- `key` — значение ключа. [Выражение](../syntax.md#syntax-expressions), возвращающее значение типа [UInt64](../../data_types/int_uint.md). **Возвращаемое значение** -- Цепочка предков заданного ключа. +- Цепочка предков заданного ключа. Type: [Array(UInt64)](../../data_types/array.md). - -## dictIsIn +## dictIsIn {#dictisin} Проверяет предка ключа по всей иерархической цепочке словаря. @@ -141,54 +141,55 @@ Type: [Array(UInt64)](../../data_types/array.md). **Параметры** -- `dict_name` — имя словаря. [Строковый литерал](../syntax.md#syntax-string-literal). -- `child_id_expr` — ключ для проверки. [Выражение](../syntax.md#syntax-expressions), возвращающее значение типа [UInt64](../../data_types/int_uint.md). -- `ancestor_id_expr` — предполагаемый предок ключа `child_id_expr`. [Выражение](../syntax.md#syntax-expressions), возвращающее значение типа [UInt64](../../data_types/int_uint.md). +- `dict_name` — имя словаря. [Строковый литерал](../syntax.md#syntax-string-literal). +- `child_id_expr` — ключ для проверки. [Выражение](../syntax.md#syntax-expressions), возвращающее значение типа [UInt64](../../data_types/int_uint.md). +- `ancestor_id_expr` — предполагаемый предок ключа `child_id_expr`. [Выражение](../syntax.md#syntax-expressions), возвращающее значение типа [UInt64](../../data_types/int_uint.md). **Возвращаемое значение** -- 0, если `child_id_expr` — не дочерний элемент `ancestor_id_expr`. 
-- 1, если `child_id_expr` — дочерний элемент `ancestor_id_expr` или если `child_id_expr` и есть `ancestor_id_expr`. +- 0, если `child_id_expr` — не дочерний элемент `ancestor_id_expr`. +- 1, если `child_id_expr` — дочерний элемент `ancestor_id_expr` или если `child_id_expr` и есть `ancestor_id_expr`. Тип — `UInt8`. -## Прочие функции {#ext_dict_functions-other} +## Прочие функции {#ext-dict-functions-other} ClickHouse поддерживает специализированные функции, которые приводят значения атрибутов словаря к определённому типу данных независимо от конфигурации словаря. Функции: -- `dictGetInt8`, `dictGetInt16`, `dictGetInt32`, `dictGetInt64` -- `dictGetUInt8`, `dictGetUInt16`, `dictGetUInt32`, `dictGetUInt64` -- `dictGetFloat32`, `dictGetFloat64` -- `dictGetDate` -- `dictGetDateTime` -- `dictGetUUID` -- `dictGetString` +- `dictGetInt8`, `dictGetInt16`, `dictGetInt32`, `dictGetInt64` +- `dictGetUInt8`, `dictGetUInt16`, `dictGetUInt32`, `dictGetUInt64` +- `dictGetFloat32`, `dictGetFloat64` +- `dictGetDate` +- `dictGetDateTime` +- `dictGetUUID` +- `dictGetString` Все эти функции можно использовать с модификатором `OrDefault`. Например, `dictGetDateOrDefault`. Синтаксис: -```sql +``` sql dictGet[Type]('dict_name', 'attr_name', id_expr) dictGet[Type]OrDefault('dict_name', 'attr_name', id_expr, default_value_expr) ``` **Параметры** -- `dict_name` — имя словаря. [Строковый литерал](../syntax.md#syntax-string-literal). -- `attr_name` — имя столбца словаря. [Строковый литерал](../syntax.md#syntax-string-literal). -- `id_expr` — значение ключа словаря. [Выражение](../syntax.md#syntax-expressions), возвращающее значение типа [UInt64](../../data_types/int_uint.md). -- `default_value_expr` — значение, возвращаемое в том случае, когда словарь не содержит строки с заданным ключом `id_expr`. [Выражение](../syntax.md#syntax-expressions) возвращающее значение с типом данных, сконфигурированным для атрибута `attr_name`. +- `dict_name` — имя словаря. [Строковый литерал](../syntax.md#syntax-string-literal). +- `attr_name` — имя столбца словаря. [Строковый литерал](../syntax.md#syntax-string-literal). +- `id_expr` — значение ключа словаря. [Выражение](../syntax.md#syntax-expressions), возвращающее значение типа [UInt64](../../data_types/int_uint.md). +- `default_value_expr` — значение, возвращаемое в том случае, когда словарь не содержит строки с заданным ключом `id_expr`. [Выражение](../syntax.md#syntax-expressions) возвращающее значение с типом данных, сконфигурированным для атрибута `attr_name`. **Возвращаемое значение** -- Если ClickHouse успешно обработал атрибут в соответствии с [заданным типом данных](../dicts/external_dicts_dict_structure.md#ext_dict_structure-attributes), то функции возвращают значение атрибута, соответствующее ключу `id_expr`. +- Если ClickHouse успешно обработал атрибут в соответствии с [заданным типом данных](../dicts/external_dicts_dict_structure.md#ext_dict_structure-attributes), то функции возвращают значение атрибута, соответствующее ключу `id_expr`. -- Если запрошенного `id_expr` нет в словаре, то: - - `dictGet[Type]` возвращает содержимое элемента ``, указанного для атрибута в конфигурации словаря. - - `dictGet[Type]OrDefault` возвращает аргумент `default_value_expr`. +- Если запрошенного `id_expr` нет в словаре, то: + + - `dictGet[Type]` возвращает содержимое элемента ``, указанного для атрибута в конфигурации словаря. + - `dictGet[Type]OrDefault` возвращает аргумент `default_value_expr`. 
Если значение атрибута не удалось обработать или оно не соответствует типу данных атрибута, то ClickHouse генерирует исключение. diff --git a/docs/ru/query_language/functions/functions_for_nulls.md b/docs/ru/query_language/functions/functions_for_nulls.md index 8558542a8f7..1782a5a0cf2 100644 --- a/docs/ru/query_language/functions/functions_for_nulls.md +++ b/docs/ru/query_language/functions/functions_for_nulls.md @@ -1,27 +1,27 @@ -# Функции для работы с Nullable-аргументами +# Функции для работы с Nullable-аргументами {#funktsii-dlia-raboty-s-nullable-argumentami} -## isNull +## isNull {#isnull} Проверяет является ли аргумент [NULL](../syntax.md#null). -```sql +``` sql isNull(x) ``` **Параметры** -- `x` — значение с не составным типом данных. +- `x` — значение с не составным типом данных. **Возвращаемое значение** -- `1`, если `x` — `NULL`. -- `0`, если `x` — не `NULL`. +- `1`, если `x` — `NULL`. +- `0`, если `x` — не `NULL`. **Пример** Входная таблица -```text +``` text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -30,37 +30,38 @@ isNull(x) Запрос -```sql +``` sql SELECT x FROM t_null WHERE isNull(y) ``` -```text + +``` text ┌─x─┐ │ 1 │ └───┘ ``` -## isNotNull +## isNotNull {#isnotnull} Проверяет не является ли аргумент [NULL](../syntax.md#null). -```sql +``` sql isNotNull(x) ``` **Параметры** -- `x` — значение с не составным типом данных. +- `x` — значение с не составным типом данных. **Возвращаемое значение** -- `0`, если `x` — `NULL`. -- `1`, если `x` — не `NULL`. +- `0`, если `x` — `NULL`. +- `1`, если `x` — не `NULL`. **Пример** Входная таблица -```text +``` text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -69,36 +70,38 @@ isNotNull(x) Запрос -```sql +``` sql SELECT x FROM t_null WHERE isNotNull(y) ``` -```text + +``` text ┌─x─┐ │ 2 │ └───┘ ``` -## coalesce +## coalesce {#coalesce} Последовательно слева-направо проверяет являются ли переданные аргументы `NULL` и возвращает первый не `NULL`. -```sql +``` sql coalesce(x,...) ``` + **Параметры** -- Произвольное количество параметров не составного типа. Все параметры должны быть совместимы по типу данных. +- Произвольное количество параметров не составного типа. Все параметры должны быть совместимы по типу данных. **Возвращаемые значения** -- Первый не `NULL` аргумент. -- `NULL`, если все аргументы — `NULL`. +- Первый не `NULL` аргумент. +- `NULL`, если все аргументы — `NULL`. **Пример** Рассмотрим адресную книгу, в которой может быть указано несколько способов связи с клиентом. -```text +``` text ┌─name─────┬─mail─┬─phone─────┬──icq─┐ │ client 1 │ ᴺᵁᴸᴸ │ 123-45-67 │ 123 │ │ client 2 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ @@ -109,58 +112,62 @@ coalesce(x,...) Получим из адресной книги первый доступный способ связаться с клиентом: -```sql +``` sql SELECT coalesce(mail, phone, CAST(icq,'Nullable(String)')) FROM aBook ``` -```text + +``` text ┌─name─────┬─coalesce(mail, phone, CAST(icq, 'Nullable(String)'))─┐ │ client 1 │ 123-45-67 │ │ client 2 │ ᴺᵁᴸᴸ │ └──────────┴──────────────────────────────────────────────────────┘ ``` -## ifNull +## ifNull {#ifnull} Возвращает альтернативное значение, если основной аргумент — `NULL`. -```sql +``` sql ifNull(x,alt) ``` **Параметры** -- `x` — значение для проверки на `NULL`, -- `alt` — значение, которое функция вернёт, если `x` — `NULL`. +- `x` — значение для проверки на `NULL`, +- `alt` — значение, которое функция вернёт, если `x` — `NULL`. **Возвращаемые значения** -- Значение `x`, если `x` — не `NULL`. -- Значение `alt`, если `x` — `NULL`. +- Значение `x`, если `x` — не `NULL`. +- Значение `alt`, если `x` — `NULL`. 
**Пример** -```sql +``` sql SELECT ifNull('a', 'b') ``` -```text + +``` text ┌─ifNull('a', 'b')─┐ │ a │ └──────────────────┘ ``` -```sql + +``` sql SELECT ifNull(NULL, 'b') ``` -```text + +``` text ┌─ifNull(NULL, 'b')─┐ │ b │ └───────────────────┘ ``` -## nullIf +## nullIf {#nullif} Возвращает `NULL`, если аргументы равны. -```sql +``` sql nullIf(x, y) ``` @@ -170,58 +177,63 @@ nullIf(x, y) **Возвращаемые значения** -- `NULL`, если аргументы равны. -- Значение `x`, если аргументы не равны. +- `NULL`, если аргументы равны. +- Значение `x`, если аргументы не равны. **Пример** -```sql +``` sql SELECT nullIf(1, 1) ``` -```text + +``` text ┌─nullIf(1, 1)─┐ │ ᴺᵁᴸᴸ │ └──────────────┘ ``` -```sql + +``` sql SELECT nullIf(1, 2) ``` -```text + +``` text ┌─nullIf(1, 2)─┐ │ 1 │ └──────────────┘ ``` -## assumeNotNull +## assumeNotNull {#assumenotnull} Приводит значение типа [Nullable](../../data_types/nullable.md) к не `Nullable`, если значение не `NULL`. -```sql +``` sql assumeNotNull(x) ``` **Параметры** -- `x` — исходное значение. +- `x` — исходное значение. **Возвращаемые значения** -- Исходное значение с не `Nullable` типом, если оно — не `NULL`. -- Значение по умолчанию для не `Nullable` типа, если исходное значение — `NULL`. +- Исходное значение с не `Nullable` типом, если оно — не `NULL`. +- Значение по умолчанию для не `Nullable` типа, если исходное значение — `NULL`. **Пример** Рассмотрим таблицу `t_null`. -```sql +``` sql SHOW CREATE TABLE t_null ``` -```text + +``` text ┌─statement─────────────────────────────────────────────────────────────────┐ │ CREATE TABLE default.t_null ( x Int8, y Nullable(Int8)) ENGINE = TinyLog │ └───────────────────────────────────────────────────────────────────────────┘ ``` -```text + +``` text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -230,55 +242,61 @@ SHOW CREATE TABLE t_null Применим функцию `assumeNotNull` к столбцу `y`. -```sql +``` sql SELECT assumeNotNull(y) FROM t_null ``` -```text + +``` text ┌─assumeNotNull(y)─┐ │ 0 │ │ 3 │ └──────────────────┘ ``` -```sql + +``` sql SELECT toTypeName(assumeNotNull(y)) FROM t_null ``` -```text + +``` text ┌─toTypeName(assumeNotNull(y))─┐ │ Int8 │ │ Int8 │ └──────────────────────────────┘ ``` -## toNullable +## toNullable {#tonullable} Преобразует тип аргумента к `Nullable`. -```sql +``` sql toNullable(x) ``` **Параметры** -- `x` — значение произвольного не составного типа. +- `x` — значение произвольного не составного типа. **Возвращаемое значение** -- Входное значение с типом не `Nullable`. +- Входное значение с типом не `Nullable`. **Пример** -```sql +``` sql SELECT toTypeName(10) ``` -```text + +``` text ┌─toTypeName(10)─┐ │ UInt8 │ └────────────────┘ ``` -```sql + +``` sql SELECT toTypeName(toNullable(10)) ``` -```text + +``` text ┌─toTypeName(toNullable(10))─┐ │ Nullable(UInt8) │ └────────────────────────────┘ diff --git a/docs/ru/query_language/functions/geo.md b/docs/ru/query_language/functions/geo.md index 3994534ae26..07e2e4c1da6 100644 --- a/docs/ru/query_language/functions/geo.md +++ b/docs/ru/query_language/functions/geo.md @@ -1,19 +1,19 @@ -# Функции для работы с географическими координатами +# Функции для работы с географическими координатами {#funktsii-dlia-raboty-s-geograficheskimi-koordinatami} -## greatCircleDistance +## greatCircleDistance {#greatcircledistance} Вычисляет расстояние между двумя точками на поверхности Земли по [формуле большого круга](https://en.wikipedia.org/wiki/Great-circle_distance). 
-```sql +``` sql greatCircleDistance(lon1Deg, lat1Deg, lon2Deg, lat2Deg) ``` **Входные параметры** -- `lon1Deg` — долгота первой точки в градусах. Диапазон — `[-180°, 180°]`. -- `lat1Deg` — широта первой точки в градусах. Диапазон — `[-90°, 90°]`. -- `lon2Deg` — долгота второй точки в градусах. Диапазон — `[-180°, 180°]`. -- `lat2Deg` — широта второй точки в градусах. Диапазон — `[-90°, 90°]`. +- `lon1Deg` — долгота первой точки в градусах. Диапазон — `[-180°, 180°]`. +- `lat1Deg` — широта первой точки в градусах. Диапазон — `[-90°, 90°]`. +- `lon2Deg` — долгота второй точки в градусах. Диапазон — `[-180°, 180°]`. +- `lat2Deg` — широта второй точки в градусах. Диапазон — `[-90°, 90°]`. Положительные значения соответствуют северной широте и восточной долготе, отрицательные — южной широте и западной долготе. @@ -25,30 +25,30 @@ greatCircleDistance(lon1Deg, lat1Deg, lon2Deg, lat2Deg) **Пример** -```sql +``` sql SELECT greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673) ``` -```text +``` text ┌─greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673)─┐ │ 14132374.194975413 │ └───────────────────────────────────────────────────────────────────┘ ``` -## greatCircleAngle +## greatCircleAngle {#greatcircleangle} Вычисляет угловое расстояние на сфере по [формуле большого круга](https://en.wikipedia.org/wiki/Great-circle_distance). -```sql +``` sql greatCircleDistance(lon1Deg, lat1Deg, lon2Deg, lat2Deg) ``` **Входные параметры** -- `lon1Deg` — долгота первой точки в градусах. -- `lat1Deg` — широта первой точки в градусах. -- `lon2Deg` — долгота второй точки в градусах. -- `lat2Deg` — широта второй точки в градусах. +- `lon1Deg` — долгота первой точки в градусах. +- `lat1Deg` — широта первой точки в градусах. +- `lon2Deg` — долгота второй точки в градусах. +- `lat2Deg` — широта второй точки в градусах. **Возвращаемое значение** @@ -56,31 +56,28 @@ greatCircleDistance(lon1Deg, lat1Deg, lon2Deg, lat2Deg) **Пример** -```sql +``` sql SELECT greatCircleAngle(0, 0, 45, 0) AS arc ``` -```text +``` text ┌─arc─┐ │ 45 │ └─────┘ ``` - -## pointInEllipses +## pointInEllipses {#pointinellipses} Проверяет, принадлежит ли точка хотя бы одному из эллипсов. Координаты — геометрические в декартовой системе координат. -``` -pointInEllipses(x, y, x₀, y₀, a₀, b₀,...,xₙ, yₙ, aₙ, bₙ) -``` + pointInEllipses(x, y, x₀, y₀, a₀, b₀,...,xₙ, yₙ, aₙ, bₙ) **Входные параметры** -- `x, y` — координаты точки на плоскости. -- `xᵢ, yᵢ` — координаты центра `i`-го эллипса. -- `aᵢ, bᵢ` — полуоси `i`-го эллипса (в единицах измерения координат x,y). +- `x, y` — координаты точки на плоскости. +- `xᵢ, yᵢ` — координаты центра `i`-го эллипса. +- `aᵢ, bᵢ` — полуоси `i`-го эллипса (в единицах измерения координат x,y). Входных параметров должно быть `2+4⋅n`, где `n` — количество эллипсов. @@ -90,29 +87,29 @@ pointInEllipses(x, y, x₀, y₀, a₀, b₀,...,xₙ, yₙ, aₙ, bₙ) **Пример** -```sql +``` sql SELECT pointInEllipses(10., 10., 10., 9.1, 1., 0.9999) ``` -```text +``` text ┌─pointInEllipses(10., 10., 10., 9.1, 1., 0.9999)─┐ │ 1 │ └─────────────────────────────────────────────────┘ ``` -## pointInPolygon +## pointInPolygon {#pointinpolygon} Проверяет, принадлежит ли точка многоугольнику на плоскости. -```sql +``` sql pointInPolygon((x, y), [(a, b), (c, d) ...], ...) ``` **Входные значения** -- `(x, y)` — координаты точки на плоскости. Тип данных — [Tuple](../../data_types/tuple.md) — кортеж из двух чисел. -- `[(a, b), (c, d) ...]` — вершины многоугольника. Тип данных — [Array](../../data_types/array.md). 
Каждая вершина представлена парой координат `(a, b)`. Вершины следует указывать в порядке обхода по или против часовой стрелки. Минимальное количество вершин — 3. Многоугольник должен быть константным. -- функция поддерживает также многоугольники с дырками (вырезанными кусками). Для этого случая, добавьте многоугольники, описывающие вырезанные куски, дополнительными аргументами функции. Функция не поддерживает не односвязные многоугольники. +- `(x, y)` — координаты точки на плоскости. Тип данных — [Tuple](../../data_types/tuple.md) — кортеж из двух чисел. +- `[(a, b), (c, d) ...]` — вершины многоугольника. Тип данных — [Array](../../data_types/array.md). Каждая вершина представлена парой координат `(a, b)`. Вершины следует указывать в порядке обхода по или против часовой стрелки. Минимальное количество вершин — 3. Многоугольник должен быть константным. +- функция поддерживает также многоугольники с дырками (вырезанными кусками). Для этого случая, добавьте многоугольники, описывающие вырезанные куски, дополнительными аргументами функции. Функция не поддерживает не односвязные многоугольники. **Возвращаемые значения** @@ -121,69 +118,69 @@ pointInPolygon((x, y), [(a, b), (c, d) ...], ...) **Пример** -```sql +``` sql SELECT pointInPolygon((3., 3.), [(6, 0), (8, 4), (5, 8), (0, 2)]) AS res ``` -```text +``` text ┌─res─┐ │ 1 │ └─────┘ ``` -## geohashEncode +## geohashEncode {#geohashencode} -Кодирует широту и долготу в строку geohash, смотрите [http://geohash.org/](http://geohash.org/), [https://en.wikipedia.org/wiki/Geohash](https://en.wikipedia.org/wiki/Geohash). +Кодирует широту и долготу в строку geohash, смотрите http://geohash.org/, https://en.wikipedia.org/wiki/Geohash. -```sql +``` sql geohashEncode(longitude, latitude, [precision]) ``` **Входные значения** -- longitude — долгота. Диапазон — `[-180°, 180°].` -- latitude — широта. Диапазон — `[-90°, 90°].` -- precision — длина результирующей строки, по умолчанию `12`. Опционально. Целое число в диапазоне `[1, 12]`. Любое значение меньше, чем `1` или больше `12` автоматически преобразуются в `12`. +- longitude — долгота. Диапазон — `[-180°, 180°].` +- latitude — широта. Диапазон — `[-90°, 90°].` +- precision — длина результирующей строки, по умолчанию `12`. Опционально. Целое число в диапазоне `[1, 12]`. Любое значение меньше, чем `1` или больше `12` автоматически преобразуются в `12`. **Возвращаемые значения** -- Строка с координатой, закодированной модифицированной версией алфавита base32. +- Строка с координатой, закодированной модифицированной версией алфавита base32. **Пример** -```sql +``` sql SELECT geohashEncode(-5.60302734375, 42.593994140625, 0) AS res ``` -```text +``` text ┌─res──────────┐ │ ezs42d000000 │ └──────────────┘ ``` -## geohashDecode +## geohashDecode {#geohashdecode} Декодирует любую строку, закодированную в geohash, на долготу и широту. -```sql +``` sql geohashDecode(geohash_string) ``` **Входные значения** -- `geohash_string` — строка, содержащая geohash. +- `geohash_string` — строка, содержащая geohash. **Возвращаемые значения** -- `(longitude, latitude)` — широта и долгота. Кортеж из двух значений типа `Float64`. +- `(longitude, latitude)` — широта и долгота. Кортеж из двух значений типа `Float64`. **Пример** -```sql +``` sql SELECT geohashDecode('ezs42') AS res ``` -```text +``` text ┌─res─────────────────────────────┐ │ (-5.60302734375,42.60498046875) │ └─────────────────────────────────┘ @@ -193,73 +190,75 @@ SELECT geohashDecode('ezs42') AS res Проверяет корректность H3-индекса. 
-```sql +``` sql h3IsValid(h3index) ``` **Входные значения** -- `h3index` — идентификатор шестиугольника. Тип данных — [UInt64](../../data_types/int_uint.md). +- `h3index` — идентификатор шестиугольника. Тип данных — [UInt64](../../data_types/int_uint.md). **Возвращаемые значения** -- 0 — число не является H3-индексом -- 1 — число является H3-индексом +- 0 — число не является H3-индексом +- 1 — число является H3-индексом Тип — [UInt8](../../data_types/int_uint.md). **Пример** -```sql +``` sql SELECT h3IsValid(630814730351855103) as h3IsValid ``` -```text + +``` text ┌─h3IsValid─┐ │ 1 │ └───────────┘ ``` -## h3GetResolution +## h3GetResolution {#h3getresolution} Извлекает разрешение H3-индекса. -```sql +``` sql h3GetResolution(h3index) ``` **Входные значения** -- `h3index` — идентификатор шестиугольника. Тип данных — [UInt64](../../data_types/int_uint.md). +- `h3index` — идентификатор шестиугольника. Тип данных — [UInt64](../../data_types/int_uint.md). **Возвращаемые значения** -- Разрешение сетки, от 0 до 15. -- Для несуществующего идентификатора может быть возвращено произвольное значение, используйте [h3IsValid](#h3isvalid) для проверки идентификаторов +- Разрешение сетки, от 0 до 15. +- Для несуществующего идентификатора может быть возвращено произвольное значение, используйте [h3IsValid](#h3isvalid) для проверки идентификаторов Тип — [UInt8](../../data_types/int_uint.md). **Пример** -```sql +``` sql SELECT h3GetResolution(639821929606596015) as resolution ``` -```text + +``` text ┌─resolution─┐ │ 14 │ └────────────┘ ``` -## h3EdgeAngle +## h3EdgeAngle {#h3edgeangle} Информирует о среднем размере стороны шестигранника H3 в градусах -```sql +``` sql h3EdgeAngle(resolution) ``` **Входные значения** -- `resolution` — требуемое разрешение индекса. Тип данных — [UInt8](../../data_types/int_uint.md). Диапазон возможных значений — `[0, 15]`. +- `resolution` — требуемое разрешение индекса. Тип данных — [UInt8](../../data_types/int_uint.md). Диапазон возможных значений — `[0, 15]`. **Возвращаемые значения** @@ -267,27 +266,27 @@ h3EdgeAngle(resolution) **Пример** -```sql +``` sql SELECT h3EdgeAngle(10) as edgeAngle ``` -```text + +``` text ┌─────────h3EdgeAngle(10)─┐ │ 0.0005927224846720883 │ └───────────────────────┘ ``` - -## h3EdgeLengthM +## h3EdgeLengthM {#h3edgelengthm} Информирует о среднем размере стороны шестигранника H3 в метрах -```sql +``` sql h3EdgeLengthM(resolution) ``` **Входные значения** -- `resolution` — требуемое разрешение индекса. Тип данных — [UInt8](../../data_types/int_uint.md). Диапазон возможных значений — `[0, 15]`. +- `resolution` — требуемое разрешение индекса. Тип данных — [UInt8](../../data_types/int_uint.md). Диапазон возможных значений — `[0, 15]`. **Возвращаемые значения** @@ -295,10 +294,11 @@ h3EdgeLengthM(resolution) **Пример** -```sql +``` sql SELECT h3EdgeLengthM(15) as edgeLengthM ``` -```text + +``` text ┌─edgeLengthM─┐ │ 0.509713273 │ └─────────────┘ @@ -312,22 +312,22 @@ SELECT h3EdgeLengthM(15) as edgeLengthM H3 индекс используется в основном для определения местоположения с помощью карт и других геопространственных манипуляций. -**Синтаксис** +**Синтаксис** -```sql +``` sql geoToH3(lon, lat, resolution) ``` **Параметры** -- `lon` — географическая долгота. Тип данных — [Float64](../../data_types/float.md). -- `lat` — географическая широта. Тип данных — [Float64](../../data_types/float.md). -- `resolution` — требуемое разрешение индекса. Тип данных — [UInt8](../../data_types/int_uint.md). Диапазон возможных значений — `[0, 15]`. 
+- `lon` — географическая долгота. Тип данных — [Float64](../../data_types/float.md). +- `lat` — географическая широта. Тип данных — [Float64](../../data_types/float.md). +- `resolution` — требуемое разрешение индекса. Тип данных — [UInt8](../../data_types/int_uint.md). Диапазон возможных значений — `[0, 15]`. **Возвращаемые значения** -- Порядковый номер шестиугольника. -- 0 в случае ошибки. +- Порядковый номер шестиугольника. +- 0 в случае ошибки. Тип — [UInt64](../../data_types/int_uint.md). @@ -335,29 +335,30 @@ geoToH3(lon, lat, resolution) Запрос: -```sql +``` sql SELECT geoToH3(37.79506683, 55.71290588, 15) as h3Index ``` Ответ: -```text +``` text ┌────────────h3Index─┐ │ 644325524701193974 │ └────────────────────┘ ``` -## h3kRing +## h3kRing {#h3kring} Возвращает H3-индексы шестиугольников в радиусе `k` от данного в произвольном порядке -```sql +``` sql h3kRing(h3index, k) ``` + **Входные значения** -- `h3index` — идентификатор шестиугольника. Тип данных — [UInt64](../../data_types/int_uint.md). -- `k` — радиус. Тип данных — [целое число](../../data_types/int_uint.md) +- `h3index` — идентификатор шестиугольника. Тип данных — [UInt64](../../data_types/int_uint.md). +- `k` — радиус. Тип данных — [целое число](../../data_types/int_uint.md) **Возвращаемые значения** @@ -365,10 +366,11 @@ h3kRing(h3index, k) **Пример** -```sql +``` sql SELECT arrayJoin(h3kRing(644325529233966508, 1)) AS h3index ``` -```text + +``` text ┌────────────h3index─┐ │ 644325529233966508 │ │ 644325529233966497 │ @@ -380,5 +382,4 @@ SELECT arrayJoin(h3kRing(644325529233966508, 1)) AS h3index └────────────────────┘ ``` - [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/geo/) diff --git a/docs/ru/query_language/functions/hash_functions.md b/docs/ru/query_language/functions/hash_functions.md index fb460bf25f5..736b9c4ebd7 100644 --- a/docs/ru/query_language/functions/hash_functions.md +++ b/docs/ru/query_language/functions/hash_functions.md @@ -1,12 +1,12 @@ -# Функции хэширования +# Функции хэширования {#funktsii-kheshirovaniia} Функции хэширования могут использоваться для детерминированного псевдослучайного разбрасывания элементов. -## halfMD5 {#hash_functions-halfmd5} +## halfMD5 {#hash-functions-halfmd5} [Интерпретирует](../../query_language/functions/type_conversion_functions.md#type_conversion_functions-reinterpretAsString) все входные параметры как строки и вычисляет хэш [MD5](https://ru.wikipedia.org/wiki/MD5) для каждой из них. Затем объединяет хэши, берет первые 8 байт хэша результирующей строки и интерпретирует их как значение типа `UInt64` с big-endian порядком байтов. -```sql +``` sql halfMD5(par1, ...) ``` @@ -23,27 +23,27 @@ halfMD5(par1, ...) **Пример** -```sql +``` sql SELECT halfMD5(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS halfMD5hash, toTypeName(halfMD5hash) AS type ``` -```text +``` text ┌────────halfMD5hash─┬─type───┐ │ 186182704141653334 │ UInt64 │ └────────────────────┴────────┘ ``` -## MD5 {#hash_functions-md5} +## MD5 {#hash-functions-md5} Вычисляет MD5 от строки и возвращает полученный набор байт в виде FixedString(16). Если вам не нужен конкретно MD5, а нужен неплохой криптографический 128-битный хэш, то используйте вместо этого функцию sipHash128. Если вы хотите получить такой же результат, как выдаёт утилита md5sum, напишите lower(hex(MD5(s))). -## sipHash64 {#hash_functions-siphash64} +## sipHash64 {#hash-functions-siphash64} Генерирует 64-х битное значение [SipHash](https://131002.net/siphash/). 
-```sql +``` sql sipHash64(par1,...) ``` @@ -51,10 +51,10 @@ sipHash64(par1,...) Функция [интерпретирует](../../query_language/functions/type_conversion_functions.md#type_conversion_functions-reinterpretAsString) все входные параметры как строки и вычисляет хэш MD5 для каждой из них. Затем комбинирует хэши по следующему алгоритму. -1. После хэширования всех входных параметров функция получает массив хэшей. -2. Функция принимает первый и второй элементы и вычисляет хэш для массива из них. -3. Затем функция принимает хэш-значение, вычисленное на предыдущем шаге, и третий элемент исходного хэш-массива, и вычисляет хэш для массива из них. -4. Предыдущий шаг повторяется для всех остальных элементов исходного хэш-массива. +1. После хэширования всех входных параметров функция получает массив хэшей. +2. Функция принимает первый и второй элементы и вычисляет хэш для массива из них. +3. Затем функция принимает хэш-значение, вычисленное на предыдущем шаге, и третий элемент исходного хэш-массива, и вычисляет хэш для массива из них. +4. Предыдущий шаг повторяется для всех остальных элементов исходного хэш-массива. **Параметры** @@ -66,31 +66,31 @@ sipHash64(par1,...) **Пример** -```sql +``` sql SELECT sipHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS SipHash, toTypeName(SipHash) AS type ``` -```text +``` text ┌──────────────SipHash─┬─type───┐ │ 13726873534472839665 │ UInt64 │ └──────────────────────┴────────┘ ``` -## sipHash128 {#hash_functions-siphash128} +## sipHash128 {#hash-functions-siphash128} Вычисляет SipHash от строки. Принимает аргумент типа String. Возвращает FixedString(16). Отличается от sipHash64 тем, что финальный xor-folding состояния делается только до 128 бит. -## cityHash64 +## cityHash64 {#cityhash64} Генерирует 64-х битное значение [CityHash](https://github.com/google/cityhash). -```sql +``` sql cityHash64(par1,...) ``` -Это не криптографическая хэш-функция. Она использует CityHash алгоритм для строковых параметров и зависящую от реализации быструю некриптографическую хэш-функцию для параметров с другими типами данных. Функция использует комбинатор CityHash для получения конечных результатов. +Это не криптографическая хэш-функция. Она использует CityHash алгоритм для строковых параметров и зависящую от реализации быструю некриптографическую хэш-функцию для параметров с другими типами данных. Функция использует комбинатор CityHash для получения конечных результатов. **Параметры** @@ -104,10 +104,11 @@ cityHash64(par1,...) Пример вызова: -```sql +``` sql SELECT cityHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS CityHash, toTypeName(CityHash) AS type ``` -```text + +``` text ┌─────────────CityHash─┬─type───┐ │ 12072650598913549138 │ UInt64 │ └──────────────────────┴────────┘ @@ -115,47 +116,47 @@ SELECT cityHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:0 А вот так вы можете вычислить чексумму всей таблицы с точностью до порядка строк: -```sql +``` sql SELECT groupBitXor(cityHash64(*)) FROM table ``` -## intHash32 +## intHash32 {#inthash32} Вычисляет 32-битный хэш-код от целого числа любого типа. Это сравнительно быстрая не криптографическая хэш-функция среднего качества для чисел. -## intHash64 +## intHash64 {#inthash64} Вычисляет 64-битный хэш-код от целого числа любого типа. Работает быстрее, чем intHash32. Качество среднее. 
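Иллюстративный набросок применения обеих функций к одному и тому же числу (конкретные значения хэшей не приводятся):

``` sql
SELECT
    intHash32(123456789) AS h32,  -- результат типа UInt32
    intHash64(123456789) AS h64   -- результат типа UInt64
```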
-## SHA1 +## SHA1 {#sha1} -## SHA224 +## SHA224 {#sha224} -## SHA256 +## SHA256 {#sha256} Вычисляет SHA-1, SHA-224, SHA-256 от строки и возвращает полученный набор байт в виде FixedString(20), FixedString(28), FixedString(32). Функция работает достаточно медленно (SHA-1 - примерно 5 миллионов коротких строк в секунду на одном процессорном ядре, SHA-224 и SHA-256 - примерно 2.2 миллионов). Рекомендуется использовать эти функции лишь в тех случаях, когда вам нужна конкретная хэш-функция и вы не можете её выбрать. Даже в этих случаях, рекомендуется применять функцию оффлайн - заранее вычисляя значения при вставке в таблицу, вместо того, чтобы применять её при SELECT-ах. -## URLHash(url\[, N\]) +## URLHash(url\[, N\]) {#urlhashurl-n} Быстрая не криптографическая хэш-функция неплохого качества для строки, полученной из URL путём некоторой нормализации. `URLHash(s)` - вычислить хэш от строки без одного завершающего символа `/`, `?` или `#` на конце, если там такой есть. `URLHash(s, N)` - вычислить хэш от строки до N-го уровня в иерархии URL, без одного завершающего символа `/`, `?` или `#` на конце, если там такой есть. Уровни аналогичные URLHierarchy. Функция специфична для Яндекс.Метрики. -## farmHash64 +## farmHash64 {#farmhash64} Генерирует 64-х битное значение [FarmHash](https://github.com/google/farmhash). -```sql +``` sql farmHash64(par1, ...) ``` -Из всех [доступных методов](https://github.com/google/farmhash/blob/master/src/farmhash.h) функция использует `Hash64`. +Из всех [доступных методов](https://github.com/google/farmhash/blob/master/src/farmhash.h) функция использует `Hash64`. **Параметры** @@ -167,21 +168,21 @@ farmHash64(par1, ...) **Пример** -```sql +``` sql SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS FarmHash, toTypeName(FarmHash) AS type ``` -```text +``` text ┌─────────────FarmHash─┬─type───┐ │ 17790458267262532859 │ UInt64 │ └──────────────────────┴────────┘ ``` -## javaHash {#hash_functions-javahash} +## javaHash {#hash-functions-javahash} Вычисляет [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) от строки. `JavaHash` не отличается ни скоростью, ни качеством, поэтому эту функцию следует считать устаревшей. Используйте эту функцию, если вам необходимо получить значение хэша по такому же алгоритму. -```sql +``` sql SELECT javaHash(''); ``` @@ -195,13 +196,13 @@ SELECT javaHash(''); Запрос: -```sql +``` sql SELECT javaHash('Hello, world!'); ``` Ответ: -```text +``` text ┌─javaHash('Hello, world!')─┐ │ -1880044555 │ └───────────────────────────┘ @@ -211,15 +212,15 @@ SELECT javaHash('Hello, world!'); Вычисляет [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) от строки, при допущении, что строка представлена в кодировке `UTF-16LE`. -**Синтаксис** +**Синтаксис** -```sql +``` sql javaHashUTF16LE(stringUtf16le) ``` **Параметры** -- `stringUtf16le` — строка в `UTF-16LE`. +- `stringUtf16le` — строка в `UTF-16LE`. **Возвращаемое значение** @@ -233,27 +234,27 @@ javaHashUTF16LE(stringUtf16le) Запрос: -```sql +``` sql SELECT javaHashUTF16LE(convertCharset('test', 'utf-8', 'utf-16le')) ``` Ответ: -```text +``` text ┌─javaHashUTF16LE(convertCharset('test', 'utf-8', 'utf-16le'))─┐ │ 3556498 │ └──────────────────────────────────────────────────────────────┘ ``` -## hiveHash {#hash_functions-hivehash} +## hiveHash {#hash-functions-hivehash} Вычисляет `HiveHash` от строки. 
-```sql +``` sql SELECT hiveHash(''); ``` -`HiveHash` — это результат [JavaHash](#hash_functions-javahash) с обнулённым битом знака числа. Функция используется в [Apache Hive](https://en.wikipedia.org/wiki/Apache_Hive) вплоть до версии 3.0. +`HiveHash` — это результат [JavaHash](#hash_functions-javahash) с обнулённым битом знака числа. Функция используется в [Apache Hive](https://en.wikipedia.org/wiki/Apache_Hive) вплоть до версии 3.0. **Возвращаемое значение** @@ -265,23 +266,23 @@ SELECT hiveHash(''); Запрос: -```sql +``` sql SELECT hiveHash('Hello, world!'); ``` Ответ: -```text +``` text ┌─hiveHash('Hello, world!')─┐ │ 267439093 │ └───────────────────────────┘ ``` -## metroHash64 +## metroHash64 {#metrohash64} -Генерирует 64-х битное значение [MetroHash](http://www.jandrewrogers.com/2015/05/27/metrohash/). +Генерирует 64-х битное значение [MetroHash](http://www.jandrewrogers.com/2015/05/27/metrohash/). -```sql +``` sql metroHash64(par1, ...) ``` @@ -295,27 +296,27 @@ metroHash64(par1, ...) **Пример** -```sql +``` sql SELECT metroHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MetroHash, toTypeName(MetroHash) AS type ``` -```text +``` text ┌────────────MetroHash─┬─type───┐ │ 14235658766382344533 │ UInt64 │ └──────────────────────┴────────┘ ``` -## jumpConsistentHash +## jumpConsistentHash {#jumpconsistenthash} Вычисляет JumpConsistentHash от значения типа UInt64. Имеет два параметра: ключ типа UInt64 и количество бакетов. Возвращает значение типа Int32. Дополнительные сведения смотрите по ссылке: [JumpConsistentHash](https://arxiv.org/pdf/1406.2294.pdf) -## murmurHash2_32, murmurHash2_64 +## murmurHash2\_32, murmurHash2\_64 {#murmurhash2-32-murmurhash2-64} Генерирует значение [MurmurHash2](https://github.com/aappleby/smhasher). -```sql +``` sql murmurHash2_32(par1, ...) murmurHash2_64(par1, ...) ``` @@ -326,26 +327,26 @@ murmurHash2_64(par1, ...) **Возвращаемое значение** -- Функция `murmurHash2_32` возвращает значение типа [UInt32](../../data_types/int_uint.md). -- Функция `murmurHash2_64` возвращает значение типа [UInt64](../../data_types/int_uint.md). +- Функция `murmurHash2_32` возвращает значение типа [UInt32](../../data_types/int_uint.md). +- Функция `murmurHash2_64` возвращает значение типа [UInt64](../../data_types/int_uint.md). **Пример** -```sql +``` sql SELECT murmurHash2_64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MurmurHash2, toTypeName(MurmurHash2) AS type ``` -```text +``` text ┌──────────MurmurHash2─┬─type───┐ │ 11832096901709403633 │ UInt64 │ └──────────────────────┴────────┘ ``` -## murmurHash3_32, murmurHash3_64 +## murmurHash3\_32, murmurHash3\_64 {#murmurhash3-32-murmurhash3-64} Генерирует значение [MurmurHash3](https://github.com/aappleby/smhasher). -```sql +``` sql murmurHash3_32(par1, ...) murmurHash3_64(par1, ...) ``` @@ -356,32 +357,32 @@ murmurHash3_64(par1, ...) **Возвращаемое значение** -- Функция `murmurHash3_32` возвращает значение типа [UInt32](../../data_types/int_uint.md). -- Функция `murmurHash3_64` возвращает значение типа [UInt64](../../data_types/int_uint.md). +- Функция `murmurHash3_32` возвращает значение типа [UInt32](../../data_types/int_uint.md). +- Функция `murmurHash3_64` возвращает значение типа [UInt64](../../data_types/int_uint.md). 
**Пример** -```sql +``` sql SELECT murmurHash3_32(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS MurmurHash3, toTypeName(MurmurHash3) AS type ``` -```text +``` text ┌─MurmurHash3─┬─type───┐ │ 2152717 │ UInt32 │ └─────────────┴────────┘ ``` -## murmurHash3_128 +## murmurHash3\_128 {#murmurhash3-128} Генерирует значение [MurmurHash3](https://github.com/aappleby/smhasher). -```sql +``` sql murmurHash3_128( expr ) ``` **Параметры** -- `expr` — [выражение](../syntax.md#syntax-expressions) возвращающее значение типа[String](../../data_types/string.md). +- `expr` — [выражение](../syntax.md#syntax-expressions) возвращающее значение типа[String](../../data_types/string.md). **Возвращаемое значение** @@ -389,21 +390,21 @@ murmurHash3_128( expr ) **Пример** -```sql +``` sql SELECT murmurHash3_128('example_string') AS MurmurHash3, toTypeName(MurmurHash3) AS type ``` -```text +``` text ┌─MurmurHash3──────┬─type────────────┐ │ 6�1�4"S5KT�~~q │ FixedString(16) │ └──────────────────┴─────────────────┘ ``` -## xxHash32, xxHash64 {#hash_functions-xxhash32-xxhash64} +## xxHash32, xxHash64 {#hash-functions-xxhash32-xxhash64} Вычисляет `xxHash` от строки. Предлагается в двух вариантах: 32 и 64 бита. -```sql +``` sql SELECT xxHash32(''); OR @@ -421,13 +422,13 @@ SELECT xxHash64(''); Запрос: -```sql +``` sql SELECT xxHash32('Hello, world!'); ``` Ответ: -```text +``` text ┌─xxHash32('Hello, world!')─┐ │ 834093149 │ └───────────────────────────┘ @@ -435,6 +436,6 @@ SELECT xxHash32('Hello, world!'); **Смотрите также** -- [xxHash](http://cyan4973.github.io/xxHash/). +- [xxHash](http://cyan4973.github.io/xxHash/). [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/hash_functions/) diff --git a/docs/ru/query_language/functions/higher_order_functions.md b/docs/ru/query_language/functions/higher_order_functions.md index cd5d8945aaf..93fa169f6fa 100644 --- a/docs/ru/query_language/functions/higher_order_functions.md +++ b/docs/ru/query_language/functions/higher_order_functions.md @@ -1,6 +1,6 @@ -# Функции высшего порядка {#higher_order_functions} +# Функции высшего порядка {#higher-order-functions} -## Оператор `->`, функция lambda(params, expr) +## Оператор `->`, функция lambda(params, expr) {#operator-funktsiia-lambdaparams-expr} Позволяет описать лямбда-функцию для передачи в функцию высшего порядка. Слева от стрелочки стоит формальный параметр - произвольный идентификатор, или несколько формальных параметров - произвольные идентификаторы в кортеже. Справа от стрелочки стоит выражение, в котором могут использоваться эти формальные параметры, а также любые столбцы таблицы. @@ -14,21 +14,22 @@ Для функций, перечисленных ниже, лямбда-функцию должна быть указана всегда: -- [arrayMap](#higher_order_functions-array-map) -- [arrayFilter](#higher_order_functions-array-filter) -- [arrayFirst](#higher_order_functions-array-first) -- [arrayFirstIndex](#higher_order_functions-array-first-index) +- [arrayMap](#higher_order_functions-array-map) +- [arrayFilter](#higher_order_functions-array-filter) +- [arrayFirst](#higher_order_functions-array-first) +- [arrayFirstIndex](#higher_order_functions-array-first-index) -### arrayMap(func, arr1, ...) {#higher_order_functions-array-map} +### arrayMap(func, arr1, …) {#higher-order-functions-array-map} Вернуть массив, полученный на основе результатов применения функции `func` к каждому элементу массива `arr`. 
Примеры: -```sql +``` sql SELECT arrayMap(x -> (x + 2), [1, 2, 3]) as res; ``` -```text + +``` text ┌─res─────┐ │ [3,4,5] │ └─────────┘ @@ -36,10 +37,11 @@ SELECT arrayMap(x -> (x + 2), [1, 2, 3]) as res; Следующий пример показывает, как создать кортежи из элементов разных массивов: -```sql +``` sql SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res ``` -```text + +``` text ┌─res─────────────────┐ │ [(1,4),(2,5),(3,6)] │ └─────────────────────┘ @@ -47,23 +49,23 @@ SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res Обратите внимание, что у функции `arrayMap` первый аргумент (лямбда-функция) не может быть опущен. -### arrayFilter(func, arr1, ...) {#higher_order_functions-array-filter} +### arrayFilter(func, arr1, …) {#higher-order-functions-array-filter} -Вернуть массив, содержащий только те элементы массива `arr1`, для которых функция `func` возвращает не 0. +Вернуть массив, содержащий только те элементы массива `arr1`, для которых функция `func` возвращает не 0. Примеры: -```sql +``` sql SELECT arrayFilter(x -> x LIKE '%World%', ['Hello', 'abc World']) AS res ``` -```text +``` text ┌─res───────────┐ │ ['abc World'] │ └───────────────┘ ``` -```sql +``` sql SELECT arrayFilter( (i, x) -> x LIKE '%World%', @@ -72,7 +74,7 @@ SELECT AS res ``` -```text +``` text ┌─res─┐ │ [2] │ └─────┘ @@ -80,59 +82,63 @@ SELECT Обратите внимание, что у функции `arrayFilter` первый аргумент (лямбда-функция) не может быть опущен. -### arrayCount(\[func,\] arr1, ...) {#higher_order_functions-array-count} +### arrayCount(\[func,\] arr1, …) {#higher-order-functions-array-count} + Вернуть количество элементов массива `arr`, для которых функция func возвращает не 0. Если func не указана - вернуть количество ненулевых элементов массива. -### arrayExists(\[func,\] arr1, ...) +### arrayExists(\[func,\] arr1, …) {#arrayexistsfunc-arr1} + Вернуть 1, если существует хотя бы один элемент массива `arr`, для которого функция func возвращает не 0. Иначе вернуть 0. -### arrayAll(\[func,\] arr1, ...) +### arrayAll(\[func,\] arr1, …) {#arrayallfunc-arr1} + Вернуть 1, если для всех элементов массива `arr`, функция `func` возвращает не 0. Иначе вернуть 0. -### arraySum(\[func,\] arr1, ...) {#higher_order_functions-array-sum} +### arraySum(\[func,\] arr1, …) {#higher-order-functions-array-sum} + Вернуть сумму значений функции `func`. Если функция не указана - просто вернуть сумму элементов массива. -### arrayFirst(func, arr1, ...) {#higher_order_functions-array-first} +### arrayFirst(func, arr1, …) {#higher-order-functions-array-first} + Вернуть первый элемент массива `arr1`, для которого функция func возвращает не 0. Обратите внимание, что у функции `arrayFirst` первый аргумент (лямбда-функция) не может быть опущен. -### arrayFirstIndex(func, arr1, ...) {#higher_order_functions-array-first-index} +### arrayFirstIndex(func, arr1, …) {#higher-order-functions-array-first-index} Вернуть индекс первого элемента массива `arr1`, для которого функция func возвращает не 0. Обратите внимание, что у функции `arrayFirstFilter` первый аргумент (лямбда-функция) не может быть опущен. -### arrayCumSum(\[func,\] arr1, ...) +### arrayCumSum(\[func,\] arr1, …) {#arraycumsumfunc-arr1} Возвращает массив из частичных сумм элементов исходного массива (сумма с накоплением). Если указана функция `func`, то значения элементов массива преобразуются этой функцией перед суммированием. 
Пример: -```sql +``` sql SELECT arrayCumSum([1, 1, 1, 1]) AS res ``` -```text +``` text ┌─res──────────┐ │ [1, 2, 3, 4] │ └──────────────┘ ``` +### arraySort(\[func,\] arr1, …) {#arraysortfunc-arr1} -### arraySort(\[func,\] arr1, ...) - -Возвращает отсортированный в восходящем порядке массив `arr1`. Если задана функция `func`, то порядок сортировки определяется результатом применения функции `func` на элементы массива (массивов). +Возвращает отсортированный в восходящем порядке массив `arr1`. Если задана функция `func`, то порядок сортировки определяется результатом применения функции `func` на элементы массива (массивов). Для улучшения эффективности сортировки применяется [Преобразование Шварца](https://ru.wikipedia.org/wiki/%D0%9F%D1%80%D0%B5%D0%BE%D0%B1%D1%80%D0%B0%D0%B7%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_%D0%A8%D0%B2%D0%B0%D1%80%D1%86%D0%B0). Пример: -```sql +``` sql SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]); ``` -```text +``` text ┌─res────────────────┐ │ ['world', 'hello'] │ └────────────────────┘ @@ -140,16 +146,17 @@ SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]); Подробная информация о методе `arraySort` приведена в разделе [Функции по работе с массивами](array_functions.md#array_functions-sort). -### arrayReverseSort(\[func,\] arr1, ...) +### arrayReverseSort(\[func,\] arr1, …) {#arrayreversesortfunc-arr1} -Возвращает отсортированный в нисходящем порядке массив `arr1`. Если задана функция `func`, то порядок сортировки определяется результатом применения функции `func` на элементы массива (массивов). +Возвращает отсортированный в нисходящем порядке массив `arr1`. Если задана функция `func`, то порядок сортировки определяется результатом применения функции `func` на элементы массива (массивов). Пример: -```sql +``` sql SELECT arrayReverseSort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; ``` -```text + +``` text ┌─res───────────────┐ │ ['hello','world'] │ └───────────────────┘ diff --git a/docs/ru/query_language/functions/in_functions.md b/docs/ru/query_language/functions/in_functions.md index b8b12ea3808..70c8c1bb6c3 100644 --- a/docs/ru/query_language/functions/in_functions.md +++ b/docs/ru/query_language/functions/in_functions.md @@ -1,15 +1,17 @@ -# Функции для реализации оператора IN. +# Функции для реализации оператора IN. {#funktsii-dlia-realizatsii-operatora-in} ## in, notIn, globalIn, globalNotIn {#in-functions} Смотрите раздел [Операторы IN](../select.md#select-in-operators). -## tuple(x, y, ...), оператор (x, y, ...) +## tuple(x, y, …), оператор (x, y, …) {#tuplex-y-operator-x-y} + Функция, позволяющая сгруппировать несколько столбцов. -Для столбцов, имеющих типы T1, T2, ... возвращает кортеж типа Tuple(T1, T2, ...), содержащий эти столбцы. Выполнение функции ничего не стоит. +Для столбцов, имеющих типы T1, T2, … возвращает кортеж типа Tuple(T1, T2, …), содержащий эти столбцы. Выполнение функции ничего не стоит. Кортежи обычно используются как промежуточное значение в качестве аргумента операторов IN, или для создания списка формальных параметров лямбда-функций. Кортежи не могут быть записаны в таблицу. -## tupleElement(tuple, n), оператор x.N +## tupleElement(tuple, n), оператор x.N {#tupleelementtuple-n-operator-x-n} + Функция, позволяющая достать столбец из кортежа. N - индекс столбца начиная с 1. N должно быть константой. N должно быть целым строго положительным числом не большим размера кортежа. Выполнение функции ничего не стоит. 
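Небольшой иллюстративный набросок для `tuple` и `tupleElement` (значения условные):

``` sql
SELECT
    tuple(1, 'hello') AS t,   -- кортеж из двух элементов
    tupleElement(t, 2) AS s,  -- вернёт 'hello'
    t.1 AS n                  -- оператор x.N, эквивалентно tupleElement(t, 1)
```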
diff --git a/docs/ru/query_language/functions/index.md b/docs/ru/query_language/functions/index.md index f978fb75096..1f7d736f691 100644 --- a/docs/ru/query_language/functions/index.md +++ b/docs/ru/query_language/functions/index.md @@ -1,24 +1,24 @@ -# Функции +# Функции {#funktsii} Функции бывают как минимум\* двух видов - обычные функции (называются просто, функциями) и агрегатные функции. Это совершенно разные вещи. Обычные функции работают так, как будто применяются к каждой строке по отдельности (для каждой строки, результат вычисления функции не зависит от других строк). Агрегатные функции аккумулируют множество значений из разных строк (то есть, зависят от целого множества строк). -В этом разделе речь пойдёт об обычных функциях. Для агрегатных функций, смотрите раздел "Агрегатные функции". +В этом разделе речь пойдёт об обычных функциях. Для агрегатных функций, смотрите раздел «Агрегатные функции». \* - есть ещё третий вид функций, к которым относится функция arrayJoin; также можно отдельно иметь ввиду табличные функции.\* -## Строгая типизация +## Строгая типизация {#strogaia-tipizatsiia} В ClickHouse, в отличие от стандартного SQL, типизация является строгой. То есть, не производится неявных преобразований между типами. Все функции работают для определённого набора типов. Это значит, что иногда вам придётся использовать функции преобразования типов. -## Склейка одинаковых выражений +## Склейка одинаковых выражений {#skleika-odinakovykh-vyrazhenii} Все выражения в запросе, имеющие одинаковые AST (одинаковую запись или одинаковый результат синтаксического разбора), считаются имеющими одинаковые значения. Такие выражения склеиваются и исполняются один раз. Одинаковые подзапросы тоже склеиваются. -## Типы результата +## Типы результата {#tipy-rezultata} Все функции возвращают одно (не несколько, не ноль) значение в качестве результата. Тип результата обычно определяется только типами аргументов, но не значениями аргументов. Исключение - функция tupleElement (оператор a.N), а также функция toFixedString. -## Константы +## Константы {#konstanty} Для простоты, некоторые функции могут работать только с константами в качестве некоторых аргументов. Например, правый аргумент оператора LIKE должен быть константой. Почти все функции возвращают константу для константных аргументов. Исключение - функции генерации случайных чисел. @@ -27,27 +27,27 @@ Функции могут быть по-разному реализованы для константных и не константных аргументов (выполняется разный код). Но результат работы для константы и полноценного столбца, содержащего только одно такое же значение, должен совпадать. -## Обработка NULL +## Обработка NULL {#obrabotka-null} Функции имеют следующие виды поведения: -- Если хотя бы один из аргументов функции — `NULL`, то результат функции тоже `NULL`. -- Специальное поведение, указанное в описании каждой функции отдельно. В исходном коде ClickHouse такие функции можно определить по свойству `UseDefaultImplementationForNulls=false`. +- Если хотя бы один из аргументов функции — `NULL`, то результат функции тоже `NULL`. +- Специальное поведение, указанное в описании каждой функции отдельно. В исходном коде ClickHouse такие функции можно определить по свойству `UseDefaultImplementationForNulls=false`. -## Неизменяемость +## Неизменяемость {#neizmeniaemost} Функции не могут поменять значения своих аргументов - любые изменения возвращаются в качестве результата. Соответственно, от порядка записи функций в запросе, результат вычислений отдельных функций не зависит. 
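К разделу «Обработка NULL» выше: минимальная иллюстрация поведения по умолчанию. Как и большинство функций, функция `plus` (оператор `+`) возвращает `NULL`, если хотя бы один из аргументов равен `NULL`.

``` sql
SELECT 1 + NULL AS res, isNull(res) AS res_is_null  -- res будет NULL, res_is_null = 1
```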
-## Обработка ошибок +## Обработка ошибок {#obrabotka-oshibok} Некоторые функции могут кидать исключения в случае ошибочных данных. В этом случае, выполнение запроса прерывается, и текст ошибки выводится клиенту. При распределённой обработке запроса, при возникновении исключения на одном из серверов, на другие серверы пытается отправиться просьба тоже прервать выполнение запроса. -## Вычисление выражений-аргументов +## Вычисление выражений-аргументов {#vychislenie-vyrazhenii-argumentov} В почти всех языках программирования, для некоторых операторов может не вычисляться один из аргументов. Обычно - для операторов `&&`, `||`, `?:`. Но в ClickHouse, аргументы функций (операторов) вычисляются всегда. Это связано с тем, что вычисления производятся не по отдельности для каждой строки, а сразу для целых кусочков столбцов. -## Выполнение функций при распределённой обработке запроса +## Выполнение функций при распределённой обработке запроса {#vypolnenie-funktsii-pri-raspredelionnoi-obrabotke-zaprosa} При распределённой обработке запроса, как можно большая часть стадий выполнения запроса производится на удалённых серверах, а оставшиеся стадии (слияние промежуточных результатов и всё, что дальше) - на сервере-инициаторе запроса. diff --git a/docs/ru/query_language/functions/introspection.md b/docs/ru/query_language/functions/introspection.md deleted file mode 120000 index b1a487e9c77..00000000000 --- a/docs/ru/query_language/functions/introspection.md +++ /dev/null @@ -1 +0,0 @@ -../../../en/query_language/functions/introspection.md \ No newline at end of file diff --git a/docs/ru/query_language/functions/introspection.md b/docs/ru/query_language/functions/introspection.md new file mode 100644 index 00000000000..0907dffcd48 --- /dev/null +++ b/docs/ru/query_language/functions/introspection.md @@ -0,0 +1,303 @@ +# Функции интроспекции {#introspection-functions} + +Функции из этого раздела могут использоваться для интроспекции [ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) и [DWARF](https://en.wikipedia.org/wiki/DWARF) в целях профилирования запросов. + +!!! warning "Предупреждение" + Эти функции выполняются медленно и могут приводить к нежелательным последствиям в плане безопасности. + +Для правильной работы функций интроспекции: + +- Установите пакет `clickhouse-common-static-dbg`. + +- Установите настройку [allow\_introspection\_functions](../../operations/settings/settings.md#settings-allow_introspection_functions) в 1. + +Из соображений безопасности данные функции отключены по умолчанию. + +ClickHouse сохраняет отчеты профилировщика в [журнал трассировки](../../operations/system_tables.md#system_tables-trace_log) в системной таблице. Убедитесь, что таблица и профилировщик настроены правильно. + +## addresssToLine {#addresstoline} + +Преобразует адрес виртуальной памяти внутри процесса сервера ClickHouse в имя файла и номер строки в исходном коде ClickHouse. + +Если вы используете официальные пакеты ClickHouse, вам необходимо установить следующий пакеты: `clickhouse-common-static-dbg`. + +**Синтаксис** + +``` sql +addressToLine(address_of_binary_instruction) +``` + +**Параметры** + +- `address_of_binary_instruction` ([Тип UInt64](../../data_types/int_uint.md))- Адрес инструкции в запущенном процессе. + +**Возвращаемое значение** + +- Имя файла исходного кода и номер строки в этом файле разделяются двоеточием. + + Например, `/build/obj-x86_64-linux-gnu/../dbms/src/Common/ThreadPool.cpp:199`, где `199` — номер строки. 
+ +- Имя бинарного файла, если функция не может найти отладочную информацию. + +- Пустая строка, если адрес не является допустимым. + +Тип: [String](../../data_types/string.md). + +**Пример** + +Включение функций самоанализа: + +``` sql +SET allow_introspection_functions=1 +``` + +Выбор первой строки из списка `trace_log` системная таблица: + +``` sql +SELECT * FROM system.trace_log LIMIT 1 \G +``` + +``` text +Row 1: +────── +event_date: 2019-11-19 +event_time: 2019-11-19 18:57:23 +revision: 54429 +timer_type: Real +thread_number: 48 +query_id: 421b6855-1858-45a5-8f37-f383409d6d72 +trace: [140658411141617,94784174532828,94784076370703,94784076372094,94784076361020,94784175007680,140658411116251,140658403895439] +``` + +То `trace` поле содержит трассировку стека в момент выборки. + +Получение имени файла исходного кода и номера строки для одного адреса: + +``` sql +SELECT addressToLine(94784076370703) \G +``` + +``` text +Row 1: +────── +addressToLine(94784076370703): /build/obj-x86_64-linux-gnu/../dbms/src/Common/ThreadPool.cpp:199 +``` + +Применение функции ко всему стектрейсу: + +``` sql +SELECT + arrayStringConcat(arrayMap(x -> addressToLine(x), trace), '\n') AS trace_source_code_lines +FROM system.trace_log +LIMIT 1 +\G +``` + +Функция [arrayMap](higher_order_functions.md#higher_order_functions-array-map) позволяет обрабатывать каждый отдельный элемент массива `trace` с помощью функции `addressToLine`. Результат этой обработки вы видите в виде `trace_source_code_lines` колонки выходных данных. + +``` text +Row 1: +────── +trace_source_code_lines: /lib/x86_64-linux-gnu/libpthread-2.27.so +/usr/lib/debug/usr/bin/clickhouse +/build/obj-x86_64-linux-gnu/../dbms/src/Common/ThreadPool.cpp:199 +/build/obj-x86_64-linux-gnu/../dbms/src/Common/ThreadPool.h:155 +/usr/include/c++/9/bits/atomic_base.h:551 +/usr/lib/debug/usr/bin/clickhouse +/lib/x86_64-linux-gnu/libpthread-2.27.so +/build/glibc-OTsEL5/glibc-2.27/misc/../sysdeps/unix/sysv/linux/x86_64/clone.S:97 +``` + +## addressToSymbol {#addresstosymbol} + +Преобразует адрес виртуальной памяти внутри серверного процесса ClickHouse в символ из объектных файлов ClickHouse. + +**Синтаксис** + +``` sql +addressToSymbol(address_of_binary_instruction) +``` + +**Параметры** + +- `address_of_binary_instruction` ([Тип uint64](../../data_types/int_uint.md)) — Адрес инструкции в запущенном процессе. + +**Возвращаемое значение** + +- Символ из объектных файлов ClickHouse. +- Пустая строка, если адрес не является допустимым. + +Тип: [String](../../data_types/string.md). + +**Пример** + +Включение функций самоанализа: + +``` sql +SET allow_introspection_functions=1 +``` + +Выбор первой строки из списка `trace_log` системная таблица: + +``` sql +SELECT * FROM system.trace_log LIMIT 1 \G +``` + +``` text +Row 1: +────── +event_date: 2019-11-20 +event_time: 2019-11-20 16:57:59 +revision: 54429 +timer_type: Real +thread_number: 48 +query_id: 724028bf-f550-45aa-910d-2af6212b94ac +trace: [94138803686098,94138815010911,94138815096522,94138815101224,94138815102091,94138814222988,94138806823642,94138814457211,94138806823642,94138814457211,94138806823642,94138806795179,94138806796144,94138753770094,94138753771646,94138753760572,94138852407232,140399185266395,140399178045583] +``` + +То `trace` поле содержит трассировку стека в момент выборки. 
+ +Получение символа для одного адреса: + +``` sql +SELECT addressToSymbol(94138803686098) \G +``` + +``` text +Row 1: +────── +addressToSymbol(94138803686098): _ZNK2DB24IAggregateFunctionHelperINS_20AggregateFunctionSumImmNS_24AggregateFunctionSumDataImEEEEE19addBatchSinglePlaceEmPcPPKNS_7IColumnEPNS_5ArenaE +``` + +Применение функции ко всей трассировке стека: + +``` sql +SELECT + arrayStringConcat(arrayMap(x -> addressToSymbol(x), trace), '\n') AS trace_symbols +FROM system.trace_log +LIMIT 1 +\G +``` + +То [arrayMap](higher_order_functions.md#higher_order_functions-array-map) функция позволяет обрабатывать каждый отдельный элемент системы. `trace` массив по типу `addressToSymbols` функция. Результат этой обработки вы видите в виде `trace_symbols` колонка выходных данных. + +``` text +Row 1: +────── +trace_symbols: _ZNK2DB24IAggregateFunctionHelperINS_20AggregateFunctionSumImmNS_24AggregateFunctionSumDataImEEEEE19addBatchSinglePlaceEmPcPPKNS_7IColumnEPNS_5ArenaE +_ZNK2DB10Aggregator21executeWithoutKeyImplERPcmPNS0_28AggregateFunctionInstructionEPNS_5ArenaE +_ZN2DB10Aggregator14executeOnBlockESt6vectorIN3COWINS_7IColumnEE13immutable_ptrIS3_EESaIS6_EEmRNS_22AggregatedDataVariantsERS1_IPKS3_SaISC_EERS1_ISE_SaISE_EERb +_ZN2DB10Aggregator14executeOnBlockERKNS_5BlockERNS_22AggregatedDataVariantsERSt6vectorIPKNS_7IColumnESaIS9_EERS6_ISB_SaISB_EERb +_ZN2DB10Aggregator7executeERKSt10shared_ptrINS_17IBlockInputStreamEERNS_22AggregatedDataVariantsE +_ZN2DB27AggregatingBlockInputStream8readImplEv +_ZN2DB17IBlockInputStream4readEv +_ZN2DB26ExpressionBlockInputStream8readImplEv +_ZN2DB17IBlockInputStream4readEv +_ZN2DB26ExpressionBlockInputStream8readImplEv +_ZN2DB17IBlockInputStream4readEv +_ZN2DB28AsynchronousBlockInputStream9calculateEv +_ZNSt17_Function_handlerIFvvEZN2DB28AsynchronousBlockInputStream4nextEvEUlvE_E9_M_invokeERKSt9_Any_data +_ZN14ThreadPoolImplI20ThreadFromGlobalPoolE6workerESt14_List_iteratorIS0_E +_ZZN20ThreadFromGlobalPoolC4IZN14ThreadPoolImplIS_E12scheduleImplIvEET_St8functionIFvvEEiSt8optionalImEEUlvE1_JEEEOS4_DpOT0_ENKUlvE_clEv +_ZN14ThreadPoolImplISt6threadE6workerESt14_List_iteratorIS0_E +execute_native_thread_routine +start_thread +clone +``` + +## demangle {#demangle} + +Преобразует символ, который вы можете получить с помощью [addressstosymbol](#addresstosymbol) функция имя функции C++. + +**Синтаксис** + +``` sql +demangle(symbol) +``` + +**Параметры** + +- `symbol` ([Строка](../../data_types/string.md)) - Символ из объектного файла. + +**Возвращаемое значение** + +- Имя функции C++. +- Пустая строка, если символ не является допустимым. + +Тип: [Строка](../../data_types/string.md). + +**Пример** + +Включение функций самоанализа: + +``` sql +SET allow_introspection_functions=1 +``` + +Выбор первой строки из списка `trace_log` системная таблица: + +``` sql +SELECT * FROM system.trace_log LIMIT 1 \G +``` + +``` text +Row 1: +────── +event_date: 2019-11-20 +event_time: 2019-11-20 16:57:59 +revision: 54429 +timer_type: Real +thread_number: 48 +query_id: 724028bf-f550-45aa-910d-2af6212b94ac +trace: [94138803686098,94138815010911,94138815096522,94138815101224,94138815102091,94138814222988,94138806823642,94138814457211,94138806823642,94138814457211,94138806823642,94138806795179,94138806796144,94138753770094,94138753771646,94138753760572,94138852407232,140399185266395,140399178045583] +``` + +То `trace` поле содержит трассировку стека в момент выборки. 
+ +Получение имени функции для одного адреса: + +``` sql +SELECT demangle(addressToSymbol(94138803686098)) \G +``` + +``` text +Row 1: +────── +demangle(addressToSymbol(94138803686098)): DB::IAggregateFunctionHelper > >::addBatchSinglePlace(unsigned long, char*, DB::IColumn const**, DB::Arena*) const +``` + +Применение функции ко всему стектрейсу: + +``` sql +SELECT + arrayStringConcat(arrayMap(x -> demangle(addressToSymbol(x)), trace), '\n') AS trace_functions +FROM system.trace_log +LIMIT 1 +\G +``` + +Функция [arrayMap](higher_order_functions.md#higher_order_functions-array-map) позволяет обрабатывать каждый отдельный элемент массива `trace` с помощью функции `demangle`. + +``` text +Row 1: +────── +trace_functions: DB::IAggregateFunctionHelper > >::addBatchSinglePlace(unsigned long, char*, DB::IColumn const**, DB::Arena*) const +DB::Aggregator::executeWithoutKeyImpl(char*&, unsigned long, DB::Aggregator::AggregateFunctionInstruction*, DB::Arena*) const +DB::Aggregator::executeOnBlock(std::vector::immutable_ptr, std::allocator::immutable_ptr > >, unsigned long, DB::AggregatedDataVariants&, std::vector >&, std::vector >, std::allocator > > >&, bool&) +DB::Aggregator::executeOnBlock(DB::Block const&, DB::AggregatedDataVariants&, std::vector >&, std::vector >, std::allocator > > >&, bool&) +DB::Aggregator::execute(std::shared_ptr const&, DB::AggregatedDataVariants&) +DB::AggregatingBlockInputStream::readImpl() +DB::IBlockInputStream::read() +DB::ExpressionBlockInputStream::readImpl() +DB::IBlockInputStream::read() +DB::ExpressionBlockInputStream::readImpl() +DB::IBlockInputStream::read() +DB::AsynchronousBlockInputStream::calculate() +std::_Function_handler::_M_invoke(std::_Any_data const&) +ThreadPoolImpl::worker(std::_List_iterator) +ThreadFromGlobalPool::ThreadFromGlobalPool::scheduleImpl(std::function, int, std::optional)::{lambda()#3}>(ThreadPoolImpl::scheduleImpl(std::function, int, std::optional)::{lambda()#3}&&)::{lambda()#1}::operator()() const +ThreadPoolImpl::worker(std::_List_iterator) +execute_native_thread_routine +start_thread +clone +``` diff --git a/docs/ru/query_language/functions/ip_address_functions.md b/docs/ru/query_language/functions/ip_address_functions.md index 342e07e72b6..87c1da4114b 100644 --- a/docs/ru/query_language/functions/ip_address_functions.md +++ b/docs/ru/query_language/functions/ip_address_functions.md @@ -1,20 +1,20 @@ -# Функции для работы с IP-адресами +# Функции для работы с IP-адресами {#funktsii-dlia-raboty-s-ip-adresami} -## IPv4NumToString(num) +## IPv4NumToString(num) {#ipv4numtostringnum} Принимает число типа UInt32. Интерпретирует его, как IPv4-адрес в big endian. Возвращает строку, содержащую соответствующий IPv4-адрес в формате A.B.C.D (числа в десятичной форме через точки). -## IPv4StringToNum(s) +## IPv4StringToNum(s) {#ipv4stringtonums} Функция, обратная к IPv4NumToString. Если IPv4 адрес в неправильном формате, то возвращает 0. -## IPv4NumToStringClassC(num) +## IPv4NumToStringClassC(num) {#ipv4numtostringclasscnum} Похоже на IPv4NumToString, но вместо последнего октета используется xxx. Пример: -```sql +``` sql SELECT IPv4NumToStringClassC(ClientIP) AS k, count() AS c @@ -24,7 +24,7 @@ ORDER BY c DESC LIMIT 10 ``` -```text +``` text ┌─k──────────────┬─────c─┐ │ 83.149.9.xxx │ 26238 │ │ 217.118.81.xxx │ 26074 │ @@ -41,22 +41,22 @@ LIMIT 10 В связи с тем, что использование xxx весьма необычно, это может быть изменено в дальнейшем. Вам не следует полагаться на конкретный вид этого фрагмента. 
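Для пары описанных выше функций `IPv4StringToNum` и `IPv4NumToString` преобразование в число и обратно может выглядеть, например, так (адрес взят произвольно):

``` sql
SELECT
    IPv4StringToNum('192.168.0.1') AS num,  -- числовое (UInt32) представление адреса
    IPv4NumToString(num) AS addr            -- снова строка '192.168.0.1'
```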
-### IPv6NumToString(x) +### IPv6NumToString(x) {#ipv6numtostringx} Принимает значение типа FixedString(16), содержащее IPv6-адрес в бинарном виде. Возвращает строку, содержащую этот адрес в текстовом виде. IPv6-mapped IPv4 адреса выводится в формате ::ffff:111.222.33.44. Примеры: -```sql +``` sql SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr ``` -```text +``` text ┌─addr─────────┐ │ 2a02:6b8::11 │ └──────────────┘ ``` -```sql +``` sql SELECT IPv6NumToString(ClientIP6 AS k), count() AS c @@ -67,7 +67,7 @@ ORDER BY c DESC LIMIT 10 ``` -```text +``` text ┌─IPv6NumToString(ClientIP6)──────────────┬─────c─┐ │ 2a02:2168:aaa:bbbb::2 │ 24695 │ │ 2a02:2698:abcd:abcd:abcd:abcd:8888:5555 │ 22408 │ @@ -82,7 +82,7 @@ LIMIT 10 └─────────────────────────────────────────┴───────┘ ``` -```sql +``` sql SELECT IPv6NumToString(ClientIP6 AS k), count() AS c @@ -93,7 +93,7 @@ ORDER BY c DESC LIMIT 10 ``` -```text +``` text ┌─IPv6NumToString(ClientIP6)─┬──────c─┐ │ ::ffff:94.26.111.111 │ 747440 │ │ ::ffff:37.143.222.4 │ 529483 │ @@ -108,30 +108,30 @@ LIMIT 10 └────────────────────────────┴────────┘ ``` -## IPv6StringToNum(s) +## IPv6StringToNum(s) {#ipv6stringtonums} Функция, обратная к IPv6NumToString. Если IPv6 адрес в неправильном формате, то возвращает строку из нулевых байт. HEX может быть в любом регистре. -## IPv4ToIPv6(x) +## IPv4ToIPv6(x) {#ipv4toipv6x} Принимает число типа `UInt32`. Интерпретирует его, как IPv4-адрес в [big endian](https://en.wikipedia.org/wiki/Endianness). Возвращает значение `FixedString(16)`, содержащее адрес IPv6 в двоичном формате. Примеры: -```sql +``` sql SELECT IPv6NumToString(IPv4ToIPv6(IPv4StringToNum('192.168.0.1'))) AS addr ``` -```text +``` text ┌─addr───────────────┐ │ ::ffff:192.168.0.1 │ └────────────────────┘ ``` -## cutIPv6(x, bitsToCutForIPv6, bitsToCutForIPv4) +## cutIPv6(x, bitsToCutForIPv6, bitsToCutForIPv4) {#cutipv6x-bitstocutforipv6-bitstocutforipv4} Принимает значение типа FixedString(16), содержащее IPv6-адрес в бинарном виде. Возвращает строку, содержащую адрес из указанного количества битов, удаленных в текстовом формате. Например: -```sql +``` sql WITH IPv6StringToNum('2001:0DB8:AC10:FE01:FEED:BABE:CAFE:F00D') AS ipv6, IPv4ToIPv6(IPv4StringToNum('192.168.0.1')) AS ipv4 @@ -140,43 +140,45 @@ SELECT cutIPv6(ipv4, 0, 2) ``` -```text +``` text ┌─cutIPv6(ipv6, 2, 0)─────────────────┬─cutIPv6(ipv4, 0, 2)─┐ │ 2001:db8:ac10:fe01:feed:babe:cafe:0 │ ::ffff:192.168.0.0 │ └─────────────────────────────────────┴─────────────────────┘ ``` -## IPv4CIDRToRange(ipv4, cidr), +## IPv4CIDRToRange(ipv4, cidr), {#ipv4cidrtorangeipv4-cidr} Принимает на вход IPv4 и значение `UInt8`, содержащее [CIDR](https://ru.wikipedia.org/wiki/Бесклассовая_адресация). Возвращает кортеж с двумя IPv4, содержащими нижний и более высокий диапазон подсети. -```sql +``` sql SELECT IPv4CIDRToRange(toIPv4('192.168.5.2'), 16) ``` -```text + +``` text ┌─IPv4CIDRToRange(toIPv4('192.168.5.2'), 16)─┐ │ ('192.168.0.0','192.168.255.255') │ └────────────────────────────────────────────┘ ``` -## IPv6CIDRToRange(ipv6, cidr), +## IPv6CIDRToRange(ipv6, cidr), {#ipv6cidrtorangeipv6-cidr} Принимает на вход IPv6 и значение `UInt8`, содержащее CIDR. Возвращает кортеж с двумя IPv6, содержащими нижний и более высокий диапазон подсети. 
-```sql +``` sql SELECT IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32) ``` -```text + +``` text ┌─IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32)─┐ │ ('2001:db8::','2001:db8:ffff:ffff:ffff:ffff:ffff:ffff') │ └────────────────────────────────────────────────────────────────────────┘ ``` -## toIPv4(string) +## toIPv4(string) {#toipv4string} Псевдоним функции `IPv4StringToNum()` которая принимает строку с адресом IPv4 и возвращает значение типа [IPv4](../../data_types/domains/ipv4.md), которое равно значению, возвращаемому функцией `IPv4StringToNum()`. -```sql +``` sql WITH '171.225.130.45' as IPv4_string SELECT @@ -184,13 +186,13 @@ SELECT toTypeName(toIPv4(IPv4_string)) ``` -```text +``` text ┌─toTypeName(IPv4StringToNum(IPv4_string))─┬─toTypeName(toIPv4(IPv4_string))─┐ │ UInt32 │ IPv4 │ └──────────────────────────────────────────┴─────────────────────────────────┘ ``` -```sql +``` sql WITH '171.225.130.45' as IPv4_string SELECT @@ -198,17 +200,17 @@ SELECT hex(toIPv4(IPv4_string)) ``` -```text +``` text ┌─hex(IPv4StringToNum(IPv4_string))─┬─hex(toIPv4(IPv4_string))─┐ │ ABE1822D │ ABE1822D │ └───────────────────────────────────┴──────────────────────────┘ ``` -## toIPv6(string) +## toIPv6(string) {#toipv6string} Псевдоним функции `IPv6StringToNum()` которая принимает строку с адресом IPv6 и возвращает значение типа [IPv6](../../data_types/domains/ipv6.md), которое равно значению, возвращаемому функцией `IPv6StringToNum()`. -```sql +``` sql WITH '2001:438:ffff::407d:1bc1' as IPv6_string SELECT @@ -216,13 +218,13 @@ SELECT toTypeName(toIPv6(IPv6_string)) ``` -```text +``` text ┌─toTypeName(IPv6StringToNum(IPv6_string))─┬─toTypeName(toIPv6(IPv6_string))─┐ │ FixedString(16) │ IPv6 │ └──────────────────────────────────────────┴─────────────────────────────────┘ ``` -```sql +``` sql WITH '2001:438:ffff::407d:1bc1' as IPv6_string SELECT @@ -230,7 +232,7 @@ SELECT hex(toIPv6(IPv6_string)) ``` -```text +``` text ┌─hex(IPv6StringToNum(IPv6_string))─┬─hex(toIPv6(IPv6_string))─────────┐ │ 20010438FFFF000000000000407D1BC1 │ 20010438FFFF000000000000407D1BC1 │ └───────────────────────────────────┴──────────────────────────────────┘ diff --git a/docs/ru/query_language/functions/json_functions.md b/docs/ru/query_language/functions/json_functions.md index d17135e8ed4..290fa911e6c 100644 --- a/docs/ru/query_language/functions/json_functions.md +++ b/docs/ru/query_language/functions/json_functions.md @@ -1,52 +1,52 @@ -# Функции для работы с JSON. +# Функции для работы с JSON. {#funktsii-dlia-raboty-s-json} В Яндекс.Метрике пользователями передаётся JSON в качестве параметров визитов. Для работы с таким JSON-ом, реализованы некоторые функции. (Хотя в большинстве случаев, JSON-ы дополнительно обрабатываются заранее, и полученные значения кладутся в отдельные столбцы в уже обработанном виде.) Все эти функции исходят из сильных допущений о том, каким может быть JSON, и при этом стараются почти ничего не делать. Делаются следующие допущения: -1. Имя поля (аргумент функции) должно быть константой; -2. Считается, что имя поля в JSON-е закодировано некоторым каноническим образом. Например, `visitParamHas('{"abc":"def"}', 'abc') = 1`, но `visitParamHas('{"\\u0061\\u0062\\u0063":"def"}', 'abc') = 0` -3. Поля ищутся на любом уровне вложенности, без разбора. Если есть несколько подходящих полей - берётся первое. -4. В JSON-е нет пробельных символов вне строковых литералов. +1. Имя поля (аргумент функции) должно быть константой; +2. 
Считается, что имя поля в JSON-е закодировано некоторым каноническим образом. Например, `visitParamHas('{"abc":"def"}', 'abc') = 1`, но `visitParamHas('{"\\u0061\\u0062\\u0063":"def"}', 'abc') = 0` +3. Поля ищутся на любом уровне вложенности, без разбора. Если есть несколько подходящих полей - берётся первое. +4. В JSON-е нет пробельных символов вне строковых литералов. -## visitParamHas(params, name) +## visitParamHas(params, name) {#visitparamhasparams-name} Проверить наличие поля с именем name. -## visitParamExtractUInt(params, name) +## visitParamExtractUInt(params, name) {#visitparamextractuintparams-name} Распарсить UInt64 из значения поля с именем name. Если поле строковое - попытаться распарсить число из начала строки. Если такого поля нет, или если оно есть, но содержит не число, то вернуть 0. -## visitParamExtractInt(params, name) +## visitParamExtractInt(params, name) {#visitparamextractintparams-name} Аналогично для Int64. -## visitParamExtractFloat(params, name) +## visitParamExtractFloat(params, name) {#visitparamextractfloatparams-name} Аналогично для Float64. -## visitParamExtractBool(params, name) +## visitParamExtractBool(params, name) {#visitparamextractboolparams-name} Распарсить значение true/false. Результат - UInt8. -## visitParamExtractRaw(params, name) +## visitParamExtractRaw(params, name) {#visitparamextractrawparams-name} Вернуть значение поля, включая разделители. Примеры: -```sql +``` sql visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"' visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}' ``` -## visitParamExtractString(params, name) +## visitParamExtractString(params, name) {#visitparamextractstringparams-name} Распарсить строку в двойных кавычках. У значения убирается экранирование. Если убрать экранированные символы не удалось, то возвращается пустая строка. Примеры: -```sql +``` sql visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺' visitParamExtractString('{"abc":"\\u263"}', 'abc') = '' @@ -57,18 +57,18 @@ visitParamExtractString('{"abc":"hello}', 'abc') = '' Следующие функции используют [simdjson](https://github.com/lemire/simdjson) который разработан под более сложные требования для разбора JSON. Упомянутое выше предположение 2 по-прежнему применимо. -## isValidJSON(json) +## isValidJSON(json) {#isvalidjsonjson} Проверяет, является ли переданная строка валидным json значением. Примеры: -```sql +``` sql SELECT isValidJSON('{"a": "hello", "b": [-100, 200.0, 300]}') = 1 SELECT isValidJSON('not a json') = 0 ``` -## JSONHas(json[, indices_or_keys]...) +## JSONHas(json\[, indices\_or\_keys\]…) {#jsonhasjson-indices-or-keys} Если значение существует в документе JSON, то возвращается `1`. @@ -76,16 +76,16 @@ SELECT isValidJSON('not a json') = 0 Примеры: -```sql +``` sql SELECT JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 1 SELECT JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4) = 0 ``` `indices_or_keys` — это список из нуля или более аргументов каждый из них может быть либо строкой либо целым числом. -* Строка — это доступ к объекту по ключу. -* Положительное целое число — это доступ к n-му члену/ключу с начала. -* Отрицательное целое число — это доступ к n-му члену/ключу с конца. +- Строка — это доступ к объекту по ключу. +- Положительное целое число — это доступ к n-му члену/ключу с начала. +- Отрицательное целое число — это доступ к n-му члену/ключу с конца. 
Адресация элементов по индексу начинается с 1, следовательно элемент 0 не существует. @@ -93,7 +93,7 @@ SELECT JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4) = 0 Примеры: -```sql +``` sql SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'a' SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 2) = 'b' SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -1) = 'b' @@ -101,7 +101,7 @@ SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'a' SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' ``` -## JSONLength(json[, indices_or_keys]...) +## JSONLength(json\[, indices\_or\_keys\]…) {#jsonlengthjson-indices-or-keys} Возвращает длину массива JSON или объекта JSON. @@ -109,12 +109,12 @@ SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' Примеры: -```sql +``` sql SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3 SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 ``` -## JSONType(json[, indices_or_keys]...) +## JSONType(json\[, indices\_or\_keys\]…) {#jsontypejson-indices-or-keys} Возвращает тип значения JSON. @@ -122,19 +122,19 @@ SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 Примеры: -```sql +``` sql SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}') = 'Object' SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String' SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' ``` -## JSONExtractUInt(json[, indices_or_keys]...) +## JSONExtractUInt(json\[, indices\_or\_keys\]…) {#jsonextractuintjson-indices-or-keys} -## JSONExtractInt(json[, indices_or_keys]...) +## JSONExtractInt(json\[, indices\_or\_keys\]…) {#jsonextractintjson-indices-or-keys} -## JSONExtractFloat(json[, indices_or_keys]...) +## JSONExtractFloat(json\[, indices\_or\_keys\]…) {#jsonextractfloatjson-indices-or-keys} -## JSONExtractBool(json[, indices_or_keys]...) +## JSONExtractBool(json\[, indices\_or\_keys\]…) {#jsonextractbooljson-indices-or-keys} Парсит JSON и извлекает значение. Эти функции аналогичны функциям `visitParam`. @@ -142,13 +142,13 @@ SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' Примеры: -```sql +``` sql SELECT JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100 SELECT JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200.0 SELECT JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 ``` -## JSONExtractString(json[, indices_or_keys]...) +## JSONExtractString(json\[, indices\_or\_keys\]…) {#jsonextractstringjson-indices-or-keys} Парсит JSON и извлекает строку. Эта функция аналогична функции `visitParamExtractString`. @@ -158,7 +158,7 @@ SELECT JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 Примеры: -```sql +``` sql SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'hello' SELECT JSONExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' SELECT JSONExtractString('{"abc":"\\u263a"}', 'abc') = '☺' @@ -166,7 +166,7 @@ SELECT JSONExtractString('{"abc":"\\u263"}', 'abc') = '' SELECT JSONExtractString('{"abc":"hello}', 'abc') = '' ``` -## JSONExtract(json[, indices_or_keys...], return_type) +## JSONExtract(json\[, indices\_or\_keys…\], return\_type) {#jsonextractjson-indices-or-keys-return-type} Парсит JSON и извлекает значение с заданным типом данных. 
@@ -177,7 +177,7 @@ SELECT JSONExtractString('{"abc":"hello}', 'abc') = '' Примеры: -```sql +``` sql SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(String, Array(Float64))') = ('hello',[-100,200,300]) SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(b Array(Float64), a String)') = ([-100,200,300],'hello') SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 'Array(Nullable(Int8))') = [-100, NULL, NULL] @@ -187,17 +187,17 @@ SELECT JSONExtract('{"day": "Thursday"}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday SELECT JSONExtract('{"day": 5}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Friday' ``` -## JSONExtractKeysAndValues(json[, indices_or_keys...], value_type) +## JSONExtractKeysAndValues(json\[, indices\_or\_keys…\], value\_type) {#jsonextractkeysandvaluesjson-indices-or-keys-value-type} Разбор пар ключ-значение из JSON, где значение имеет тип данных ClickHouse. Пример: -```sql +``` sql SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8') = [('a',5),('b',7),('c',11)]; ``` -## JSONExtractRaw(json[, indices_or_keys]...) +## JSONExtractRaw(json\[, indices\_or\_keys\]…) {#jsonextractrawjson-indices-or-keys} Возвращает часть JSON в виде строки, содержащей неразобранную подстроку. @@ -205,11 +205,11 @@ SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8') Пример: -```sql +``` sql SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]' ``` -## JSONExtractArrayRaw(json[, indices_or_keys]...) +## JSONExtractArrayRaw(json\[, indices\_or\_keys\]…) {#jsonextractarrayrawjson-indices-or-keys} Возвращает массив из элементов JSON массива, каждый из которых представлен в виде строки с неразобранными подстроками из JSON. @@ -217,7 +217,7 @@ SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, Пример: -```sql +``` sql SELECT JSONExtractArrayRaw('{"a": "hello", "b": [-100, 200.0, "hello"]}', 'b') = ['-100', '200.0', '"hello"']' ``` diff --git a/docs/ru/query_language/functions/logical_functions.md b/docs/ru/query_language/functions/logical_functions.md index a5c6d1cdd00..de2a3d51729 100644 --- a/docs/ru/query_language/functions/logical_functions.md +++ b/docs/ru/query_language/functions/logical_functions.md @@ -1,15 +1,15 @@ -# Логические функции +# Логические функции {#logicheskie-funktsii} Логические функции принимают любые числовые типы, а возвращают число типа UInt8, равное 0 или 1. -Ноль в качестве аргумента считается "ложью", а любое ненулевое значение - "истиной". +Ноль в качестве аргумента считается «ложью», а любое ненулевое значение - «истиной». 
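Небольшой поясняющий набросок (гипотетический пример, отсутствующий в оригинальной документации): любое ненулевое значение трактуется как «истина», а результат имеет тип UInt8.

``` sql
SELECT
    and(1, 5) AS a,   -- ожидается 1: оба аргумента ненулевые
    or(0, 0)  AS o,   -- ожидается 0
    not(3)    AS n,   -- ожидается 0: ненулевое значение считается «истиной»
    xor(1, 0) AS x    -- ожидается 1
```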
-## and, оператор AND +## and, оператор AND {#and-operator-and} -## or, оператор OR +## or, оператор OR {#or-operator-or} -## not, оператор NOT +## not, оператор NOT {#not-operator-not} -## xor +## xor {#xor} [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/logical_functions/) diff --git a/docs/ru/query_language/functions/machine_learning_functions.md b/docs/ru/query_language/functions/machine_learning_functions.md index 7df87dc04e4..0662e868178 100644 --- a/docs/ru/query_language/functions/machine_learning_functions.md +++ b/docs/ru/query_language/functions/machine_learning_functions.md @@ -1,13 +1,13 @@ -# Функции машинного обучения +# Функции машинного обучения {#funktsii-mashinnogo-obucheniia} -## evalMLMethod (prediction) {#machine_learning_methods-evalmlmethod} +## evalMLMethod (prediction) {#machine-learning-methods-evalmlmethod} Предсказание с использованием подобранных регрессионных моделей. -### Stochastic Linear Regression +### Stochastic Linear Regression {#stochastic-linear-regression} -Агрегатная функция [stochasticLinearRegression](../agg_functions/reference.md#agg_functions-stochasticlinearregression) реализует стохастический градиентный спуск, использую линейную модель и функцию потерь MSE. +Агрегатная функция [stochasticLinearRegression](../agg_functions/reference.md#agg_functions-stochasticlinearregression) реализует стохастический градиентный спуск, использую линейную модель и функцию потерь MSE. -### Stochastic Logistic Regression +### Stochastic Logistic Regression {#stochastic-logistic-regression} -Агрегатная функция [stochasticLogisticRegression](../agg_functions/reference.md#agg_functions-stochasticlogisticregression) реализует стохастический градиентный спуск для задачи бинарной классификации. +Агрегатная функция [stochasticLogisticRegression](../agg_functions/reference.md#agg_functions-stochasticlogisticregression) реализует стохастический градиентный спуск для задачи бинарной классификации. diff --git a/docs/ru/query_language/functions/math_functions.md b/docs/ru/query_language/functions/math_functions.md index 886a0be48a7..e52b14d26c6 100644 --- a/docs/ru/query_language/functions/math_functions.md +++ b/docs/ru/query_language/functions/math_functions.md @@ -1,82 +1,101 @@ -# Математические функции +# Математические функции {#matematicheskie-funktsii} Все функции возвращают число типа Float64. Точность результата близка к максимально возможной, но результат может не совпадать с наиболее близким к соответствующему вещественному числу машинно представимым числом. -## e() +## e() {#e} Возвращает число типа Float64, близкое к числу e. -## pi() +## pi() {#pi} + Возвращает число типа Float64, близкое к числу π. -## exp(x) +## exp(x) {#expx} + Принимает числовой аргумент, возвращает число типа Float64, близкое к экспоненте от аргумента. -## log(x) +## log(x) {#logx} + Принимает числовой аргумент, возвращает число типа Float64, близкое к натуральному логарифму от аргумента. -## exp2(x) +## exp2(x) {#exp2x} + Принимает числовой аргумент, возвращает число типа Float64, близкое к 2 в степени x. -## log2(x) +## log2(x) {#log2x} + Принимает числовой аргумент, возвращает число типа Float64, близкое к двоичному логарифму от аргумента. -## exp10(x) +## exp10(x) {#exp10x} + Принимает числовой аргумент, возвращает число типа Float64, близкое к 10 в степени x. -## log10(x) +## log10(x) {#log10x} + Принимает числовой аргумент, возвращает число типа Float64, близкое к десятичному логарифму от аргумента. 
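Для иллюстрации перечисленных выше функций ниже приведён примерный запрос (гипотетический, не из оригинальной документации); результаты имеют тип Float64 и являются приближёнными:

``` sql
SELECT
    exp10(3)       AS e10,  -- ожидается значение, близкое к 1000
    log10(1000)    AS l10,  -- ожидается значение, близкое к 3
    log2(exp2(10)) AS l2    -- ожидается значение, близкое к 10
```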
-## sqrt(x) +## sqrt(x) {#sqrtx} + Принимает числовой аргумент, возвращает число типа Float64, близкое к квадратному корню от аргумента. -## cbrt(x) +## cbrt(x) {#cbrtx} + Принимает числовой аргумент, возвращает число типа Float64, близкое к кубическому корню от аргумента. -## erf(x) +## erf(x) {#erfx} Если x неотрицательно, то erf(x / σ√2) - вероятность того, что случайная величина, имеющая нормальное распределение со среднеквадратичным отклонением σ, принимает значение, отстоящее от мат. ожидания больше чем на x. Пример (правило трёх сигм): -```sql +``` sql SELECT erf(3 / sqrt(2)) ``` -```text +``` text ┌─erf(divide(3, sqrt(2)))─┐ │ 0.9973002039367398 │ └─────────────────────────┘ ``` -## erfc(x) +## erfc(x) {#erfcx} + Принимает числовой аргумент, возвращает число типа Float64, близкое к 1 - erf(x), но без потери точности для больших x. -## lgamma(x) +## lgamma(x) {#lgammax} + Логарифм от гамма функции. -## tgamma(x) +## tgamma(x) {#tgammax} + Гамма функция. -## sin(x) +## sin(x) {#sinx} + Синус. -## cos(x) +## cos(x) {#cosx} + Косинус. -## tan(x) +## tan(x) {#tanx} + Тангенс. -## asin(x) +## asin(x) {#asinx} + Арксинус. -## acos(x) +## acos(x) {#acosx} + Арккосинус. -## atan(x) +## atan(x) {#atanx} + Арктангенс. -## pow(x, y) +## pow(x, y) {#powx-y} + Принимает два числовых аргумента x и y. Возвращает число типа Float64, близкое к x в степени y. [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/math_functions/) diff --git a/docs/ru/query_language/functions/other_functions.md b/docs/ru/query_language/functions/other_functions.md index 1b5bdafcc74..e4f7440b7cd 100644 --- a/docs/ru/query_language/functions/other_functions.md +++ b/docs/ru/query_language/functions/other_functions.md @@ -1,6 +1,6 @@ -# Прочие функции +# Прочие функции {#prochie-funktsii} -## hostName() +## hostName() {#hostname} Возвращает строку - имя хоста, на котором эта функция была выполнена. При распределённой обработке запроса, это будет имя хоста удалённого сервера, если функция выполняется на удалённом сервере. @@ -10,7 +10,7 @@ **Синтаксис** -```sql +``` sql fqdn(); ``` @@ -18,7 +18,7 @@ fqdn(); **Возвращаемое значение** -- Полное имя домена. +- Полное имя домена. Тип: `String`. @@ -26,118 +26,124 @@ fqdn(); Запрос: -```sql +``` sql SELECT FQDN(); ``` Ответ: -```text +``` text ┌─FQDN()──────────────────────────┐ │ clickhouse.ru-central1.internal │ └─────────────────────────────────┘ ``` -## basename +## basename {#basename} Извлекает конечную часть строки после последнего слэша или бэкслэша. Функция часто используется для извлечения имени файла из пути. -```sql +``` sql basename( expr ) ``` **Параметры** -- `expr` — Выражение, возвращающее значение типа [String](../../data_types/string.md). В результирующем значении все бэкслэши должны быть экранированы. +- `expr` — Выражение, возвращающее значение типа [String](../../data_types/string.md). В результирующем значении все бэкслэши должны быть экранированы. **Возвращаемое значение** Строка, содержащая: -- Конечную часть строки после последнего слэша или бэкслэша. +- Конечную часть строки после последнего слэша или бэкслэша. - Если входная строка содержит путь, заканчивающийся слэшем или бэкслэшем, например, `/` или `с:\`, функция возвращает пустую строку. + Если входная строка содержит путь, заканчивающийся слэшем или бэкслэшем, например, `/` или `с:\`, функция возвращает пустую строку. -- Исходная строка, если нет слэша или бэкслэша. +- Исходная строка, если нет слэша или бэкслэша. 
**Пример** -```sql +``` sql SELECT 'some/long/path/to/file' AS a, basename(a) ``` -```text +``` text ┌─a──────────────────────┬─basename('some\\long\\path\\to\\file')─┐ │ some\long\path\to\file │ file │ └────────────────────────┴────────────────────────────────────────┘ ``` -```sql +``` sql SELECT 'some\\long\\path\\to\\file' AS a, basename(a) ``` -```text +``` text ┌─a──────────────────────┬─basename('some\\long\\path\\to\\file')─┐ │ some\long\path\to\file │ file │ └────────────────────────┴────────────────────────────────────────┘ ``` -```sql +``` sql SELECT 'some-file-name' AS a, basename(a) ``` -```text +``` text ┌─a──────────────┬─basename('some-file-name')─┐ │ some-file-name │ some-file-name │ └────────────────┴────────────────────────────┘ ``` -## visibleWidth(x) +## visibleWidth(x) {#visiblewidthx} Вычисляет приблизительную ширину при выводе значения в текстовом (tab-separated) виде на консоль. Функция используется системой для реализации Pretty форматов. `NULL` представляется как строка, соответствующая отображению `NULL` в форматах `Pretty`. -```sql +``` sql SELECT visibleWidth(NULL) ``` -```text +``` text ┌─visibleWidth(NULL)─┐ │ 4 │ └────────────────────┘ ``` -## toTypeName(x) +## toTypeName(x) {#totypenamex} + Возвращает строку, содержащую имя типа переданного аргумента. Если на вход функции передать `NULL`, то она вернёт тип `Nullable(Nothing)`, что соответствует внутреннему представлению `NULL` в ClickHouse. ## blockSize() {#function-blocksize} + Получить размер блока. В ClickHouse выполнение запроса всегда идёт по блокам (наборам кусочков столбцов). Функция позволяет получить размер блока, для которого её вызвали. -## materialize(x) +## materialize(x) {#materializex} + Превращает константу в полноценный столбец, содержащий только одно значение. В ClickHouse полноценные столбцы и константы представлены в памяти по-разному. Функции по-разному работают для аргументов-констант и обычных аргументов (выполняется разный код), хотя результат почти всегда должен быть одинаковым. Эта функция предназначена для отладки такого поведения. -## ignore(...) +## ignore(…) {#ignore} + Принимает любые аргументы, в т.ч. `NULL`, всегда возвращает 0. При этом, аргумент всё равно вычисляется. Это может использоваться для бенчмарков. -## sleep(seconds) +## sleep(seconds) {#sleepseconds} + Спит seconds секунд на каждый блок данных. Можно указать как целое число, так и число с плавающей запятой. -## currentDatabase() +## currentDatabase() {#currentdatabase} + Возвращает имя текущей базы данных. Эта функция может использоваться в параметрах движка таблицы в запросе CREATE TABLE там, где нужно указать базу данных. -## currentUser() {#other_function-currentuser} +## currentUser() {#other-function-currentuser} Возвращает логин текущего пользователя. При распределенном запросе, возвращается имя пользователя, инициировавшего запрос. -```sql +``` sql SELECT currentUser(); ``` @@ -145,8 +151,8 @@ SELECT currentUser(); **Возвращаемые значения** -- Логин текущего пользователя. -- При распределенном запросе — логин пользователя, инициировавшего запрос. +- Логин текущего пользователя. +- При распределенном запросе — логин пользователя, инициировавшего запрос. Тип: `String`. @@ -154,66 +160,64 @@ SELECT currentUser(); Запрос: -```sql +``` sql SELECT currentUser(); ``` Ответ: -```text +``` text ┌─currentUser()─┐ │ default │ └───────────────┘ ``` -## isFinite(x) +## isFinite(x) {#isfinitex} + Принимает Float32 или Float64 и возвращает UInt8, равный 1, если аргумент не бесконечный и не NaN, иначе 0. 
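Гипотетический пример (не из оригинальной документации), иллюстрирующий поведение `isFinite` для конечного значения, бесконечности и NaN:

``` sql
SELECT
    isFinite(10)  AS finite,    -- ожидается 1
    isFinite(1/0) AS infinite,  -- ожидается 0: деление даёт бесконечность
    isFinite(0/0) AS nan        -- ожидается 0: результат является NaN
```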
## ifNotFinite {#ifnotfinite} Проверяет, является ли значение дробного числа с плавающей точкой конечным. -**Синтаксис** +**Синтаксис** -``` -ifNotFinite(x,y) -``` + ifNotFinite(x,y) **Параметры** -- `x` — Значение, которое нужно проверить на бесконечность. Тип: [Float*](../../data_types/float.md). -- `y` — Запасное значение. Тип: [Float*](../../data_types/float.md). +- `x` — Значение, которое нужно проверить на бесконечность. Тип: [Float\*](../../data_types/float.md). +- `y` — Запасное значение. Тип: [Float\*](../../data_types/float.md). **Возвращаемые значения** -- `x`, если `x` принимает конечное значение. -- `y`, если`x` принимает не конечное значение. +- `x`, если `x` принимает конечное значение. +- `y`, если`x` принимает не конечное значение. **Пример** Запрос: -``` -SELECT 1/0 as infimum, ifNotFinite(infimum,42) -``` + SELECT 1/0 as infimum, ifNotFinite(infimum,42) Результат: -``` -┌─infimum─┬─ifNotFinite(divide(1, 0), 42)─┐ -│ inf │ 42 │ -└─────────┴───────────────────────────────┘ -``` + ┌─infimum─┬─ifNotFinite(divide(1, 0), 42)─┐ + │ inf │ 42 │ + └─────────┴───────────────────────────────┘ Аналогичный результат можно получить с помощью [тернарного оператора](conditional_functions.md#ternary-operator) `isFinite(x) ? x : y`. -## isInfinite(x) +## isInfinite(x) {#isinfinitex} + Принимает Float32 или Float64 и возвращает UInt8, равный 1, если аргумент бесконечный, иначе 0. Отметим, что в случае NaN возвращается 0. -## isNaN(x) +## isNaN(x) {#isnanx} + Принимает Float32 или Float64 и возвращает UInt8, равный 1, если аргумент является NaN, иначе 0. -## hasColumnInTable(\['hostname'\[, 'username'\[, 'password'\]\],\] 'database', 'table', 'column') +## hasColumnInTable(\[‘hostname’\[, ‘username’\[, ‘password’\]\],\] ‘database’, ‘table’, ‘column’) {#hascolumnintablehostname-username-password-database-table-column} + Принимает константные строки - имя базы данных, имя таблицы и название столбца. Возвращает константное выражение типа UInt8, равное 1, если есть столбец, иначе 0. Если задан параметр hostname, проверка будет выполнена на удалённом сервере. Функция кидает исключение, если таблица не существует. @@ -227,15 +231,15 @@ SELECT 1/0 as infimum, ifNotFinite(infimum,42) Параметры: -- `x` — Величина для отображения. -- `min, max` — Целочисленные константы, значение должно помещаться в `Int64`. -- `width` — Константа, положительное число, может быть дробным. +- `x` — Величина для отображения. +- `min, max` — Целочисленные константы, значение должно помещаться в `Int64`. +- `width` — Константа, положительное число, может быть дробным. Полоса рисуется с точностью до одной восьмой символа. Пример: -```sql +``` sql SELECT toHour(EventTime) AS h, count() AS c, @@ -245,7 +249,7 @@ GROUP BY h ORDER BY h ASC ``` -```text +``` text ┌──h─┬──────c─┬─bar────────────────┐ │ 0 │ 292907 │ █████████▋ │ │ 1 │ 180563 │ ██████ │ @@ -274,12 +278,12 @@ ORDER BY h ASC └────┴────────┴────────────────────┘ ``` +## transform {#transform} -## transform Преобразовать значение согласно явно указанному отображению одних элементов на другие. Имеется два варианта функции: -### transform(x, array_from, array_to, default) +### transform(x, array\_from, array\_to, default) {#transformx-array-from-array-to-default} `x` - что преобразовывать. @@ -299,11 +303,11 @@ ORDER BY h ASC При этом, где обозначена одна и та же буква (T или U), могут быть, в случае числовых типов, не совпадающие типы, а типы, для которых есть общий тип. Например, первый аргумент может иметь тип Int64, а второй - Array(UInt16). 
-Если значение x равно одному из элементов массива array_from, то возвращает соответствующий (такой же по номеру) элемент массива array_to; иначе возвращает default. Если имеется несколько совпадающих элементов в array_from, то возвращает какой-нибудь из соответствующих. +Если значение x равно одному из элементов массива array\_from, то возвращает соответствующий (такой же по номеру) элемент массива array\_to; иначе возвращает default. Если имеется несколько совпадающих элементов в array\_from, то возвращает какой-нибудь из соответствующих. Пример: -```sql +``` sql SELECT transform(SearchEngineID, [2, 3], ['Yandex', 'Google'], 'Other') AS title, count() AS c @@ -313,7 +317,7 @@ GROUP BY title ORDER BY c DESC ``` -```text +``` text ┌─title─────┬──────c─┐ │ Yandex │ 498635 │ │ Google │ 229872 │ @@ -321,10 +325,10 @@ ORDER BY c DESC └───────────┴────────┘ ``` -### transform(x, array_from, array_to) +### transform(x, array\_from, array\_to) {#transformx-array-from-array-to} Отличается от первого варианта отсутствующим аргументом default. -Если значение x равно одному из элементов массива array_from, то возвращает соответствующий (такой же по номеру) элемент массива array_to; иначе возвращает x. +Если значение x равно одному из элементов массива array\_from, то возвращает соответствующий (такой же по номеру) элемент массива array\_to; иначе возвращает x. Типы: @@ -332,7 +336,7 @@ ORDER BY c DESC Пример: -```sql +``` sql SELECT transform(domain(Referer), ['yandex.ru', 'google.ru', 'vk.com'], ['www.yandex', 'example.com']) AS s, count() AS c @@ -342,7 +346,7 @@ ORDER BY count() DESC LIMIT 10 ``` -```text +``` text ┌─s──────────────┬───────c─┐ │ │ 2906259 │ │ www.yandex │ 867767 │ @@ -356,18 +360,19 @@ LIMIT 10 └────────────────┴─────────┘ ``` -## formatReadableSize(x) +## formatReadableSize(x) {#formatreadablesizex} + Принимает размер (число байт). Возвращает округленный размер с суффиксом (KiB, MiB и т.д.) в виде строки. Пример: -```sql +``` sql SELECT arrayJoin([1, 1024, 1024*1024, 192851925]) AS filesize_bytes, formatReadableSize(filesize_bytes) AS filesize ``` -```text +``` text ┌─filesize_bytes─┬─filesize───┐ │ 1 │ 1.00 B │ │ 1024 │ 1.00 KiB │ @@ -376,23 +381,28 @@ SELECT └────────────────┴────────────┘ ``` -## least(a, b) +## least(a, b) {#leasta-b} + Возвращает наименьшее значение из a и b. -## greatest(a, b) +## greatest(a, b) {#greatesta-b} + Возвращает наибольшее значение из a и b. -## uptime() +## uptime() {#uptime} + Возвращает аптайм сервера в секундах. -## version() +## version() {#version} + Возвращает версию сервера в виде строки. ## rowNumberInBlock {#function-rownumberinblock} Возвращает порядковый номер строки в блоке данных. Для каждого блока данных нумерация начинается с 0. -## rowNumberInAllBlocks() +## rowNumberInAllBlocks() {#rownumberinallblocks} + Возвращает порядковый номер строки в блоке данных. Функция учитывает только задействованные блоки данных. ## neighbor {#neighbor} @@ -401,7 +411,7 @@ SELECT **Синтаксис** -```sql +``` sql neighbor(column, offset[, default_value]) ``` @@ -409,14 +419,14 @@ neighbor(column, offset[, default_value]) **Параметры** -- `column` — Имя столбца или скалярное выражение. -- `offset` - Смещение от текущей строки `column`. [Int64](../../data_types/int_uint.md). -- `default_value` - Опциональный параметр. Значение, которое будет возвращено, если смещение выходит за пределы блока данных. +- `column` — Имя столбца или скалярное выражение. +- `offset` - Смещение от текущей строки `column`. [Int64](../../data_types/int_uint.md). 
+- `default_value` - Опциональный параметр. Значение, которое будет возвращено, если смещение выходит за пределы блока данных. **Возвращаемое значение** -- Значение `column` в смещении от текущей строки, если значение `offset` не выходит за пределы блока. -- Значение по умолчанию для `column`, если значение `offset` выходит за пределы блока данных. Если передан параметр `default_value`, то значение берется из него. +- Значение `column` в смещении от текущей строки, если значение `offset` не выходит за пределы блока. +- Значение по умолчанию для `column`, если значение `offset` выходит за пределы блока данных. Если передан параметр `default_value`, то значение берется из него. Тип: зависит от данных в `column` или переданного значения по умолчанию в `default_value`. @@ -424,13 +434,13 @@ neighbor(column, offset[, default_value]) Запрос: -```sql +``` sql SELECT number, neighbor(number, 2) FROM system.numbers LIMIT 10; ``` Ответ: -```text +``` text ┌─number─┬─neighbor(number, 2)─┐ │ 0 │ 2 │ │ 1 │ 3 │ @@ -447,13 +457,13 @@ SELECT number, neighbor(number, 2) FROM system.numbers LIMIT 10; Запрос: -```sql +``` sql SELECT number, neighbor(number, 2, 999) FROM system.numbers LIMIT 10; ``` Ответ: -```text +``` text ┌─number─┬─neighbor(number, 2, 999)─┐ │ 0 │ 2 │ │ 1 │ 3 │ @@ -472,7 +482,7 @@ SELECT number, neighbor(number, 2, 999) FROM system.numbers LIMIT 10; Запрос: -```sql +``` sql WITH toDate('2018-01-01') AS start_date SELECT toStartOfMonth(start_date + (number * 32)) AS month, @@ -484,7 +494,7 @@ FROM numbers(16) Ответ: -```text +``` text ┌──────month─┬─money─┬─prev_year─┬─year_over_year─┐ │ 2018-01-01 │ 32 │ 0 │ 0 │ │ 2018-02-01 │ 63 │ 0 │ 0 │ @@ -505,7 +515,8 @@ FROM numbers(16) └────────────┴───────┴───────────┴────────────────┘ ``` -## runningDifference(x) +## runningDifference(x) {#runningdifferencex} + Считает разницу между последовательными значениями строк в блоке данных. Возвращает 0 для первой строки и разницу с предыдущей строкой для каждой последующей строки. @@ -514,7 +525,7 @@ FROM numbers(16) Пример: -```sql +``` sql SELECT EventID, EventTime, @@ -531,7 +542,7 @@ FROM ) ``` -```text +``` text ┌─EventID─┬───────────EventTime─┬─delta─┐ │ 1106 │ 2016-11-24 00:00:04 │ 0 │ │ 1107 │ 2016-11-24 00:00:05 │ 1 │ @@ -543,14 +554,15 @@ FROM Обратите внимание — размер блока влияет на результат. С каждым новым блоком состояние `runningDifference` сбрасывается. -```sql +``` sql SELECT number, runningDifference(number + 1) AS diff FROM numbers(100000) WHERE diff != 1 ``` -```text + +``` text ┌─number─┬─diff─┐ │ 0 │ 0 │ └────────┴──────┘ @@ -558,95 +570,99 @@ WHERE diff != 1 │ 65536 │ 0 │ └────────┴──────┘ ``` -``` -set max_block_size=100000 -- по умолчанию 65536! -SELECT - number, - runningDifference(number + 1) AS diff -FROM numbers(100000) -WHERE diff != 1 -``` -```text + set max_block_size=100000 -- по умолчанию 65536! + + SELECT + number, + runningDifference(number + 1) AS diff + FROM numbers(100000) + WHERE diff != 1 + +``` text ┌─number─┬─diff─┐ │ 0 │ 0 │ └────────┴──────┘ ``` -## runningDifferenceStartingWithFirstValue -То же, что и [runningDifference] (./other_functions.md # other_functions-runningdifference), но в первой строке возвращается значение первой строки, а не ноль. +## runningDifferenceStartingWithFirstValue {#runningdifferencestartingwithfirstvalue} + +То же, что и \[runningDifference\] (./other\_functions.md \# other\_functions-runningdifference), но в первой строке возвращается значение первой строки, а не ноль. 
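Гипотетический пример (не из оригинальной документации), показывающий разницу между `runningDifference` и `runningDifferenceStartingWithFirstValue`: для первой строки первая функция должна вернуть 0, а вторая возвращает само значение первой строки.

``` sql
SELECT
    value,
    runningDifference(value)                       AS diff,        -- ожидается 0, 2, 3, 4
    runningDifferenceStartingWithFirstValue(value) AS diff_first   -- ожидается 10, 2, 3, 4
FROM
(
    SELECT arrayJoin([10, 12, 15, 19]) AS value
)
```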
+ +## MACNumToString(num) {#macnumtostringnum} -## MACNumToString(num) Принимает число типа UInt64. Интерпретирует его, как MAC-адрес в big endian. Возвращает строку, содержащую соответствующий MAC-адрес в формате AA:BB:CC:DD:EE:FF (числа в шестнадцатеричной форме через двоеточие). -## MACStringToNum(s) +## MACStringToNum(s) {#macstringtonums} + Функция, обратная к MACNumToString. Если MAC адрес в неправильном формате, то возвращает 0. -## MACStringToOUI(s) +## MACStringToOUI(s) {#macstringtoouis} + Принимает MAC адрес в формате AA:BB:CC:DD:EE:FF (числа в шестнадцатеричной форме через двоеточие). Возвращает первые три октета как число в формате UInt64. Если MAC адрес в неправильном формате, то возвращает 0. -## getSizeOfEnumType +## getSizeOfEnumType {#getsizeofenumtype} Возвращает количество полей в [Enum](../../data_types/enum.md). -```sql +``` sql getSizeOfEnumType(value) ``` **Параметры** -- `value` — Значение типа `Enum`. - +- `value` — Значение типа `Enum`. **Возвращаемые значения** -- Количество полей входного значения типа `Enum`. -- Исключение, если тип не `Enum`. +- Количество полей входного значения типа `Enum`. +- Исключение, если тип не `Enum`. **Пример** -```sql +``` sql SELECT getSizeOfEnumType( CAST('a' AS Enum8('a' = 1, 'b' = 2) ) ) AS x ``` -```text +``` text ┌─x─┐ │ 2 │ └───┘ ``` -## toColumnTypeName +## toColumnTypeName {#tocolumntypename} Возвращает имя класса, которым представлен тип данных столбца в оперативной памяти. -```sql +``` sql toColumnTypeName(value) ``` **Параметры** -- `value` — Значение произвольного типа. +- `value` — Значение произвольного типа. **Возвращаемые значения** -- Строка с именем класса, который используется для представления типа данных `value` в оперативной памяти. +- Строка с именем класса, который используется для представления типа данных `value` в оперативной памяти. **Пример разницы между `toTypeName` и `toColumnTypeName`** -```sql +``` sql SELECT toTypeName(CAST('2018-01-01 01:02:03' AS DateTime)) ``` -```text +``` text ┌─toTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ │ DateTime │ └─────────────────────────────────────────────────────┘ ``` -```sql + +``` sql SELECT toColumnTypeName(CAST('2018-01-01 01:02:03' AS DateTime)) ``` -```text +``` text ┌─toColumnTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ │ Const(UInt32) │ └───────────────────────────────────────────────────────────┘ @@ -654,90 +670,90 @@ SELECT toColumnTypeName(CAST('2018-01-01 01:02:03' AS DateTime)) В примере видно, что тип данных `DateTime` хранится в памяти как `Const(UInt32)`. -## dumpColumnStructure +## dumpColumnStructure {#dumpcolumnstructure} Выводит развернутое описание структур данных в оперативной памяти -```sql +``` sql dumpColumnStructure(value) ``` **Параметры** -- `value` — Значение произвольного типа. +- `value` — Значение произвольного типа. **Возвращаемые значения** -- Строка с описанием структуры, которая используется для представления типа данных `value` в оперативной памяти. +- Строка с описанием структуры, которая используется для представления типа данных `value` в оперативной памяти. **Пример** -```sql +``` sql SELECT dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime')) ``` -```text +``` text ┌─dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ │ DateTime, Const(size = 1, UInt32(size = 1)) │ └──────────────────────────────────────────────────────────────┘ ``` -## defaultValueOfArgumentType +## defaultValueOfArgumentType {#defaultvalueofargumenttype} Выводит значение по умолчанию для типа данных. 
Не учитывает значения по умолчанию для столбцов, заданные пользователем. -```sql +``` sql defaultValueOfArgumentType(expression) ``` **Параметры** -- `expression` — Значение произвольного типа или выражение, результатом которого является значение произвольного типа. +- `expression` — Значение произвольного типа или выражение, результатом которого является значение произвольного типа. **Возвращаемые значения** -- `0` для чисел; -- Пустая строка для строк; -- `ᴺᵁᴸᴸ` для [Nullable](../../data_types/nullable.md). +- `0` для чисел; +- Пустая строка для строк; +- `ᴺᵁᴸᴸ` для [Nullable](../../data_types/nullable.md). **Пример** -```sql +``` sql SELECT defaultValueOfArgumentType( CAST(1 AS Int8) ) ``` -```text +``` text ┌─defaultValueOfArgumentType(CAST(1, 'Int8'))─┐ │ 0 │ └─────────────────────────────────────────────┘ ``` -```sql + +``` sql SELECT defaultValueOfArgumentType( CAST(1 AS Nullable(Int8) ) ) ``` -```text +``` text ┌─defaultValueOfArgumentType(CAST(1, 'Nullable(Int8)'))─┐ │ ᴺᵁᴸᴸ │ └───────────────────────────────────────────────────────┘ ``` - -## replicate {#other_functions-replicate} +## replicate {#other-functions-replicate} Создает массив, заполненный одним значением. Используется для внутренней реализации [arrayJoin](array_join.md#functions_arrayjoin). -```sql +``` sql SELECT replicate(x, arr); ``` **Параметры** -- `arr` — Исходный массив. ClickHouse создаёт новый массив такой же длины как исходный и заполняет его значением `x`. -- `x` — Значение, которым будет заполнен результирующий массив. +- `arr` — Исходный массив. ClickHouse создаёт новый массив такой же длины как исходный и заполняет его значением `x`. +- `x` — Значение, которым будет заполнен результирующий массив. **Возвращаемое значение** @@ -749,13 +765,13 @@ SELECT replicate(x, arr); Запрос: -```sql +``` sql SELECT replicate(1, ['a', 'b', 'c']); ``` Ответ: -```text +``` text ┌─replicate(1, ['a', 'b', 'c'])─┐ │ [1,1,1] │ └───────────────────────────────┘ @@ -767,13 +783,13 @@ SELECT replicate(1, ['a', 'b', 'c']); **Синтаксис** -```sql +``` sql filesystemAvailable() ``` **Возвращаемое значение** -- Объём доступного для записи данных места в байтах. +- Объём доступного для записи данных места в байтах. Тип: [UInt64](../../data_types/int_uint.md). @@ -781,13 +797,13 @@ filesystemAvailable() Запрос: -```sql +``` sql SELECT formatReadableSize(filesystemAvailable()) AS "Available space", toTypeName(filesystemAvailable()) AS "Type"; ``` Ответ: -```text +``` text ┌─Available space─┬─Type───┐ │ 30.75 GiB │ UInt64 │ └─────────────────┴────────┘ @@ -799,13 +815,13 @@ SELECT formatReadableSize(filesystemAvailable()) AS "Available space", toTypeNam **Синтаксис** -```sql +``` sql filesystemFree() ``` **Возвращаемое значение** -- Объем свободного места в байтах. +- Объем свободного места в байтах. Тип: [UInt64](../../data_types/int_uint.md). @@ -813,13 +829,13 @@ filesystemFree() Запрос: -```sql +``` sql SELECT formatReadableSize(filesystemFree()) AS "Free space", toTypeName(filesystemFree()) AS "Type"; ``` Ответ: -```text +``` text ┌─Free space─┬─Type───┐ │ 32.39 GiB │ UInt64 │ └────────────┴────────┘ @@ -831,13 +847,13 @@ SELECT formatReadableSize(filesystemFree()) AS "Free space", toTypeName(filesyst **Синтаксис** -```sql +``` sql filesystemCapacity() ``` **Возвращаемое значение** -- Информация о ёмкости файловой системы в байтах. +- Информация о ёмкости файловой системы в байтах. Тип: [UInt64](../../data_types/int_uint.md). 
@@ -845,13 +861,13 @@ filesystemCapacity() Запрос: -```sql +``` sql SELECT formatReadableSize(filesystemCapacity()) AS "Capacity", toTypeName(filesystemCapacity()) AS "Type" ``` Ответ: -```text +``` text ┌─Capacity──┬─Type───┐ │ 39.32 GiB │ UInt64 │ └───────────┴────────┘ @@ -863,7 +879,7 @@ SELECT formatReadableSize(filesystemCapacity()) AS "Capacity", toTypeName(filesy ## runningAccumulate {#function-runningaccumulate} -Принимает на вход состояния агрегатной функции и возвращает столбец со значениями, которые представляют собой результат мёржа этих состояний для выборки строк из блока от первой до текущей строки. Например, принимает состояние агрегатной функции (например, `runningAccumulate(uniqState(UserID))`), и для каждой строки блока возвращает результат агрегатной функции после мёржа состояний функции для всех предыдущих строк и текущей. Таким образом, результат зависит от разбиения данных по блокам и от порядка данных в блоке. +Принимает на вход состояния агрегатной функции и возвращает столбец со значениями, которые представляют собой результат мёржа этих состояний для выборки строк из блока от первой до текущей строки. Например, принимает состояние агрегатной функции (например, `runningAccumulate(uniqState(UserID))`), и для каждой строки блока возвращает результат агрегатной функции после мёржа состояний функции для всех предыдущих строк и текущей. Таким образом, результат зависит от разбиения данных по блокам и от порядка данных в блоке. ## joinGet {#joinget} @@ -875,21 +891,21 @@ SELECT formatReadableSize(filesystemCapacity()) AS "Capacity", toTypeName(filesy **Синтаксис** -```sql +``` sql joinGet(join_storage_table_name, `value_column`, join_keys) ``` **Параметры** -- `join_storage_table_name` — [идентификатор](../syntax.md#syntax-identifiers), который указывает, откуда производится выборка данных. Поиск по идентификатору осуществляется в базе данных по умолчанию (см. конфигурацию `default_database`). Чтобы переопределить базу данных по умолчанию, используйте команду `USE db_name`, или укажите базу данных и таблицу через разделитель `db_name.db_table`, см. пример. -- `value_column` — столбец, из которого нужно произвести выборку данных. -- `join_keys` — список ключей, по которым производится выборка данных. +- `join_storage_table_name` — [идентификатор](../syntax.md#syntax-identifiers), который указывает, откуда производится выборка данных. Поиск по идентификатору осуществляется в базе данных по умолчанию (см. конфигурацию `default_database`). Чтобы переопределить базу данных по умолчанию, используйте команду `USE db_name`, или укажите базу данных и таблицу через разделитель `db_name.db_table`, см. пример. +- `value_column` — столбец, из которого нужно произвести выборку данных. +- `join_keys` — список ключей, по которым производится выборка данных. **Возвращаемое значение** Возвращает значение по списку ключей. -Если значения не существует в исходной таблице, вернется `0` или `null` в соответствии с настройками [join_use_nulls](../../operations/settings/settings.md#join_use_nulls). +Если значения не существует в исходной таблице, вернется `0` или `null` в соответствии с настройками [join\_use\_nulls](../../operations/settings/settings.md#join_use_nulls). Подробнее о настройке `join_use_nulls` в [операциях Join](../../operations/table_engines/join.md). 
@@ -897,13 +913,13 @@ joinGet(join_storage_table_name, `value_column`, join_keys) Входная таблица: -```sql +``` sql CREATE DATABASE db_test CREATE TABLE db_test.id_val(`id` UInt32, `val` UInt32) ENGINE = Join(ANY, LEFT, id) SETTINGS join_use_nulls = 1 INSERT INTO db_test.id_val VALUES (1,11)(2,12)(4,13) ``` -```text +``` text ┌─id─┬─val─┐ │ 4 │ 13 │ │ 2 │ 12 │ @@ -913,13 +929,13 @@ INSERT INTO db_test.id_val VALUES (1,11)(2,12)(4,13) Запрос: -```sql +``` sql SELECT joinGet(db_test.id_val,'val',toUInt32(number)) from numbers(4) SETTINGS join_use_nulls = 1 ``` Результат: -```text +``` text ┌─joinGet(db_test.id_val, 'val', toUInt32(number))─┐ │ 0 │ │ 11 │ @@ -928,33 +944,33 @@ SELECT joinGet(db_test.id_val,'val',toUInt32(number)) from numbers(4) SETTINGS j └──────────────────────────────────────────────────┘ ``` -## modelEvaluate(model_name, ...) {#function-modelevaluate} +## modelEvaluate(model\_name, …) {#function-modelevaluate} Оценивает внешнюю модель. Принимает на вход имя и аргументы модели. Возвращает Float64. - -## throwIf(x\[, custom_message\]) +## throwIf(x\[, custom\_message\]) {#throwifx-custom-message} Бросает исключение, если аргумент не равен нулю. -custom_message - необязательный параметр, константная строка, задает текст сообщения об ошибке. +custom\_message - необязательный параметр, константная строка, задает текст сообщения об ошибке. -```sql +``` sql SELECT throwIf(number = 3, 'Too many') FROM numbers(10); ``` -```text + +``` text ↙ Progress: 0.00 rows, 0.00 B (0.00 rows/s., 0.00 B/s.) Received exception from server (version 19.14.1): Code: 395. DB::Exception: Received from localhost:9000. DB::Exception: Too many. ``` ## identity {#identity} -Возвращает свой аргумент. Используется для отладки и тестирования, позволяет отменить использование индекса, и получить результат и производительность полного сканирования таблицы. Это работает, потому что оптимизатор запросов не может "заглянуть" внутрь функции `identity`. +Возвращает свой аргумент. Используется для отладки и тестирования, позволяет отменить использование индекса, и получить результат и производительность полного сканирования таблицы. Это работает, потому что оптимизатор запросов не может «заглянуть» внутрь функции `identity`. **Синтаксис** -```sql +``` sql identity(x) ``` @@ -962,13 +978,13 @@ identity(x) Query: -```sql +``` sql SELECT identity(42) ``` Результат: -```text +``` text ┌─identity(42)─┐ │ 42 │ └──────────────┘ @@ -980,28 +996,29 @@ SELECT identity(42) **Синтаксис** -```sql +``` sql randomPrintableASCII(length) ``` **Параметры** -- `length` — Длина результирующей строки. Положительное целое число. +- `length` — Длина результирующей строки. Положительное целое число. - Если передать `length < 0`, то поведение функции не определено. + Если передать `length < 0`, то поведение функции не определено. **Возвращаемое значение** - - Строка со случайным набором печатных символов [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters). +- Строка со случайным набором печатных символов [ASCII](https://en.wikipedia.org/wiki/ASCII#Printable_characters). 
Тип: [String](../../data_types/string.md) **Пример** -```sql +``` sql SELECT number, randomPrintableASCII(30) as str, length(str) FROM system.numbers LIMIT 3 ``` -```text + +``` text ┌─number─┬─str────────────────────────────┬─length(randomPrintableASCII(30))─┐ │ 0 │ SuiCOSTvC0csfABSw=UcSzp2.`rv8x │ 30 │ │ 1 │ 1Ag NlJ &RCN:*>HVPG;PE-nO"SUFD │ 30 │ diff --git a/docs/ru/query_language/functions/random_functions.md b/docs/ru/query_language/functions/random_functions.md index 886413b4f24..b6ce2b178ee 100644 --- a/docs/ru/query_language/functions/random_functions.md +++ b/docs/ru/query_language/functions/random_functions.md @@ -1,4 +1,4 @@ -# Функции генерации псевдослучайных чисел +# Функции генерации псевдослучайных чисел {#funktsii-generatsii-psevdosluchainykh-chisel} Используются не криптографические генераторы псевдослучайных чисел. @@ -6,11 +6,13 @@ В случае, если передан аргумент - он может быть любого типа, и его значение никак не используется. Этот аргумент нужен только для того, чтобы предотвратить склейку одинаковых выражений - чтобы две разные записи одной функции возвращали разные столбцы, с разными случайными числами. -## rand +## rand {#rand} + Возвращает псевдослучайное число типа UInt32, равномерно распределённое среди всех чисел типа UInt32. Используется linear congruential generator. -## rand64 +## rand64 {#rand64} + Возвращает псевдослучайное число типа UInt64, равномерно распределённое среди всех чисел типа UInt64. Используется linear congruential generator. diff --git a/docs/ru/query_language/functions/rounding_functions.md b/docs/ru/query_language/functions/rounding_functions.md index 9bbcafa2214..485b5ccb8a3 100644 --- a/docs/ru/query_language/functions/rounding_functions.md +++ b/docs/ru/query_language/functions/rounding_functions.md @@ -1,6 +1,7 @@ -# Функции округления +# Функции округления {#funktsii-okrugleniia} + +## floor(x\[, N\]) {#floorx-n} -## floor(x\[, N\]) Возвращает наибольшее круглое число, которое меньше или равно, чем x. Круглым называется число, кратное 1 / 10N или ближайшее к нему число соответствующего типа данных, если 1 / 10N не представимо точно. N - целочисленная константа, не обязательный параметр. По умолчанию - ноль, что означает - округлять до целого числа. @@ -12,41 +13,42 @@ N может быть отрицательным. Для целочисленных аргументов имеет смысл округление с отрицательным значением N (для неотрицательных N, функция ничего не делает). В случае переполнения при округлении (например, floor(-128, -1)), возвращается implementation specific результат. -## ceil(x\[, N\]) +## ceil(x\[, N\]) {#ceilx-n} + Возвращает наименьшее круглое число, которое больше или равно, чем x. В остальном, аналогично функции floor, см. выше. -## round(x\[, N\]) {#rounding_functions-round} +## round(x\[, N\]) {#rounding-functions-round} Округляет значение до указанного десятичного разряда. Функция возвращает ближайшее значение указанного порядка. В случае, когда заданное число равноудалено от чисел необходимого порядка, функция возвращает то из них, которое имеет ближайшую чётную цифру (банковское округление). -```sql +``` sql round(expression [, decimal_places]) ``` **Параметры:** -- `expression` — Число для округления. Может быть любым [выражением](../syntax.md#syntax-expressions), возвращающим числовой [тип данных](../../data_types/index.md#data_types). -- `decimal-places` — Целое значение. - - Если `decimal-places > 0`, то функция округляет значение справа от запятой. - - Если `decimal-places < 0` то функция округляет значение слева от запятой. 
- - Если `decimal-places = 0`, то функция округляет значение до целого. В этом случае аргумент можно опустить. +- `expression` — Число для округления. Может быть любым [выражением](../syntax.md#syntax-expressions), возвращающим числовой [тип данных](../../data_types/index.md#data_types). +- `decimal-places` — Целое значение. + - Если `decimal-places > 0`, то функция округляет значение справа от запятой. + - Если `decimal-places < 0` то функция округляет значение слева от запятой. + - Если `decimal-places = 0`, то функция округляет значение до целого. В этом случае аргумент можно опустить. **Возвращаемое значение:** Округлённое значение того же типа, что и входящее. - -### Примеры +### Примеры {#primery} **Пример использования** -```sql +``` sql SELECT number / 2 AS x, round(x) FROM system.numbers LIMIT 3 ``` -```text + +``` text ┌───x─┬─round(divide(number, 2))─┐ │ 0 │ 0 │ │ 0.5 │ 0 │ @@ -58,7 +60,7 @@ SELECT number / 2 AS x, round(x) FROM system.numbers LIMIT 3 Округление до ближайшего числа. -```text +``` text round(3.2, 0) = 3 round(4.1267, 2) = 4.13 round(22,-1) = 20 @@ -68,52 +70,50 @@ round(-467,-2) = -500 Банковское округление. -```text +``` text round(3.5) = 4 round(4.5) = 4 round(3.55, 1) = 3.6 round(3.65, 1) = 3.6 ``` -**Смотрите также** +**Смотрите также** -- [roundBankers](#roundbankers) +- [roundBankers](#roundbankers) ## roundBankers {#roundbankers} Округляет число до указанного десятичного разряда. -- Если округляемое число равноудалено от соседних чисел, то используется банковское округление. +- Если округляемое число равноудалено от соседних чисел, то используется банковское округление. - Банковское округление (англ. banker's rounding) — метод округления дробных чисел. Если округляемое число равноудалено от соседних чисел, то оно округляется до ближайшей чётной цифры заданного десятичного разряда. К примеру, 3,5 округляется до 4, а 2,5 до 2. + Банковское округление (англ. banker's rounding) — метод округления дробных чисел. Если округляемое число равноудалено от соседних чисел, то оно округляется до ближайшей чётной цифры заданного десятичного разряда. К примеру, 3,5 округляется до 4, а 2,5 до 2. - Этот метод округления, используемый по умолчанию для чисел с плавающей запятой, определён в стандарте [IEEE 754](https://en.wikipedia.org/wiki/IEEE_754#Roundings_to_nearest). Функция [round](#rounding_functions-round) также округляет числа с плавающей запятой по этому методу. Функция `roundBankers` округляет не только числа с плавающей запятой, но и целые числа методом банковского округления, например, `roundBankers(45, -1) = 40`. + Этот метод округления, используемый по умолчанию для чисел с плавающей запятой, определён в стандарте [IEEE 754](https://en.wikipedia.org/wiki/IEEE_754#Roundings_to_nearest). Функция [round](#rounding_functions-round) также округляет числа с плавающей запятой по этому методу. Функция `roundBankers` округляет не только числа с плавающей запятой, но и целые числа методом банковского округления, например, `roundBankers(45, -1) = 40`. -- В других случаях функция округляет к ближайшему целому. +- В других случаях функция округляет к ближайшему целому. Банковское округление позволяет уменьшить влияние округления чисел на результат суммирования или вычитания этих чисел. Пример суммирования чисел 1.5, 2.5, 3.5 и 4.5 с различным округлением: -- Без округления: 1.5 + 2.5 + 3.5 + 4.5 = 12. -- Банковское округление: 2 + 2 + 4 + 4 = 12. -- Округление до ближайшего целого: 2 + 3 + 4 + 5 = 14. +- Без округления: 1.5 + 2.5 + 3.5 + 4.5 = 12. 
+- Банковское округление: 2 + 2 + 4 + 4 = 12. +- Округление до ближайшего целого: 2 + 3 + 4 + 5 = 14. +**Синтаксис** -**Синтаксис** - -```sql +``` sql roundBankers(expression [, decimal_places]) ``` -**Параметры** - -- `expression` — Число для округления. Может быть любым [выражением](../syntax.md#syntax-expressions), возвращающим числовой [тип данных](../../data_types/index.md#data_types). -- `decimal-places` — Десятичный разряд. Целое число. - - `decimal-places > 0` — Функция округляет значение выражения до ближайшего чётного числа на соответствующей позиции справа от запятой. Например, `roundBankers(3.55, 1) = 3.6`. - - `decimal-places < 0` — Функция округляет значение выражения до ближайшего чётного числа на соответствующей позиции слева от запятой. Например, `roundBankers(24.55, -1) = 20`. - - `decimal-places = 0` — Функция округляет значение до целого. В этом случае аргумент можно не передавать. Например, `roundBankers(2.5) = 2`. +**Параметры** +- `expression` — Число для округления. Может быть любым [выражением](../syntax.md#syntax-expressions), возвращающим числовой [тип данных](../../data_types/index.md#data_types). +- `decimal-places` — Десятичный разряд. Целое число. + - `decimal-places > 0` — Функция округляет значение выражения до ближайшего чётного числа на соответствующей позиции справа от запятой. Например, `roundBankers(3.55, 1) = 3.6`. + - `decimal-places < 0` — Функция округляет значение выражения до ближайшего чётного числа на соответствующей позиции слева от запятой. Например, `roundBankers(24.55, -1) = 20`. + - `decimal-places = 0` — Функция округляет значение до целого. В этом случае аргумент можно не передавать. Например, `roundBankers(2.5) = 2`. **Возвращаемое значение** @@ -123,13 +123,13 @@ roundBankers(expression [, decimal_places]) Запрос: -```sql +``` sql SELECT number / 2 AS x, roundBankers(x, 0) AS b fROM system.numbers limit 10 ``` Результат: -```text +``` text ┌───x─┬─b─┐ │ 0 │ 0 │ │ 0.5 │ 0 │ @@ -144,10 +144,9 @@ roundBankers(expression [, decimal_places]) └─────┴───┘ ``` - **Примеры банковского округления** -```text +``` text roundBankers(0.4) = 0 roundBankers(-3.5) = -4 roundBankers(4.5) = 4 @@ -157,18 +156,20 @@ roundBankers(10.35, 1) = 10.4 roundBankers(10.755, 2) = 11,76 ``` -**Смотрите также** +**Смотрите также** -- [round](#rounding_functions-round) +- [round](#rounding_functions-round) +## roundToExp2(num) {#roundtoexp2num} -## roundToExp2(num) Принимает число. Если число меньше единицы - возвращает 0. Иначе округляет число вниз до ближайшей (целой неотрицательной) степени двух. -## roundDuration(num) +## roundDuration(num) {#rounddurationnum} + Принимает число. Если число меньше единицы - возвращает 0. Иначе округляет число вниз до чисел из набора: 1, 10, 30, 60, 120, 180, 240, 300, 600, 1200, 1800, 3600, 7200, 18000, 36000. Эта функция специфична для Яндекс.Метрики и предназначена для реализации отчёта по длительности визита. -## roundAge(num) +## roundAge(num) {#roundagenum} + Принимает число. Если число меньше 18 - возвращает 0. Иначе округляет число вниз до чисел из набора: 18, 25, 35, 45, 55. Эта функция специфична для Яндекс.Метрики и предназначена для реализации отчёта по возрасту посетителей. 
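Примерная иллюстрация к функциям `roundToExp2`, `roundDuration` и `roundAge` (гипотетический запрос, не из оригинальной документации); все три функции округляют вниз до значения из соответствующего набора:

``` sql
SELECT
    roundToExp2(31)    AS p2,   -- ожидается 16: ближайшая снизу степень двух
    roundDuration(230) AS dur,  -- ожидается 180: ближайшее снизу значение из набора длительностей
    roundAge(38)       AS age   -- ожидается 35
```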
[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/rounding_functions/) diff --git a/docs/ru/query_language/functions/splitting_merging_functions.md b/docs/ru/query_language/functions/splitting_merging_functions.md index a4dcec38d8b..c5a7f880c08 100644 --- a/docs/ru/query_language/functions/splitting_merging_functions.md +++ b/docs/ru/query_language/functions/splitting_merging_functions.md @@ -1,31 +1,36 @@ -# Функции разбиения и слияния строк и массивов +# Функции разбиения и слияния строк и массивов {#funktsii-razbieniia-i-sliianiia-strok-i-massivov} + +## splitByChar(separator, s) {#splitbycharseparator-s} -## splitByChar(separator, s) Разбивает строку на подстроки, используя в качестве разделителя separator. separator должен быть константной строкой из ровно одного символа. Возвращается массив выделенных подстрок. Могут выделяться пустые подстроки, если разделитель идёт в начале или в конце строки, или если идёт более одного разделителя подряд. -## splitByString(separator, s) +## splitByString(separator, s) {#splitbystringseparator-s} + То же самое, но использует строку из нескольких символов в качестве разделителя. Строка должна быть непустой. -## arrayStringConcat(arr\[, separator\]) +## arrayStringConcat(arr\[, separator\]) {#arraystringconcatarr-separator} + Склеивает строки, перечисленные в массиве, с разделителем separator. separator - необязательный параметр, константная строка, по умолчанию равен пустой строке. Возвращается строка. -## alphaTokens(s) +## alphaTokens(s) {#alphatokenss} + Выделяет подстроки из подряд идущих байт из диапазонов a-z и A-Z. Возвращается массив выделенных подстрок. **Пример:** -```sql +``` sql SELECT alphaTokens('abca1abc') ``` -```text +``` text ┌─alphaTokens('abca1abc')─┐ │ ['abca','abc'] │ └─────────────────────────┘ ``` + [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/splitting_merging_functions/) diff --git a/docs/ru/query_language/functions/string_functions.md b/docs/ru/query_language/functions/string_functions.md index 1aaabcdf6b1..ef2793d2c18 100644 --- a/docs/ru/query_language/functions/string_functions.md +++ b/docs/ru/query_language/functions/string_functions.md @@ -1,66 +1,86 @@ -# Функции для работы со строками +# Функции для работы со строками {#funktsii-dlia-raboty-so-strokami} + +## empty {#empty} -## empty {#string_functions-empty} Возвращает 1 для пустой строки, и 0 для непустой строки. -Тип результата - UInt8. +Тип результата — UInt8. Строка считается непустой, если содержит хотя бы один байт, пусть даже это пробел или нулевой байт. Функция также работает для массивов. -## notEmpty +## notEmpty {#notempty} + Возвращает 0 для пустой строки, и 1 для непустой строки. -Тип результата - UInt8. +Тип результата — UInt8. Функция также работает для массивов. -## length +## length {#length} + Возвращает длину строки в байтах (не символах, не кодовых точках). -Тип результата - UInt64. +Тип результата — UInt64. Функция также работает для массивов. -## lengthUTF8 -Возвращает длину строки в кодовых точках Unicode (не символах), при допущении, что строка содержит набор байт, являющийся текстом в кодировке UTF-8. Если допущение не выполнено - то возвращает какой-нибудь результат (не кидает исключение). -Тип результата - UInt64. +## lengthUTF8 {#lengthutf8} + +Возвращает длину строки в кодовых точках Unicode (не символах), при допущении, что строка содержит набор байтов, являющийся текстом в кодировке UTF-8. 
Если допущение не выполнено, то возвращает какой-нибудь результат (не кидает исключение). +Тип результата — UInt64. + +## char\_length, CHAR\_LENGTH {#char-length} + +Возвращает длину строки в кодовых точках Unicode (не символах), при допущении, что строка содержит набор байтов, являющийся текстом в кодировке UTF-8. Если допущение не выполнено, возвращает какой-нибудь результат (не кидает исключение). +Тип результата — UInt64. + +## character\_length, CHARACTER\_LENGTH {#character-length} + +Возвращает длину строки в кодовых точках Unicode (не символах), при допущении, что строка содержит набор байтов, являющийся текстом в кодировке UTF-8. Если допущение не выполнено, возвращает какой-нибудь результат (не кидает исключение). +Тип результата — UInt64. + +## lower, lcase {#lower} -## lower Переводит ASCII-символы латиницы в строке в нижний регистр. -## upper +## upper, ucase {#upper} + Переводит ASCII-символы латиницы в строке в верхний регистр. -## lowerUTF8 -Переводит строку в нижний регистр, при допущении, что строка содержит набор байт, представляющий текст в кодировке UTF-8. +## lowerUTF8 {#lowerutf8} + +Переводит строку в нижний регистр, при допущении, что строка содержит набор байтов, представляющий текст в кодировке UTF-8. Не учитывает язык. То есть, для турецкого языка, результат может быть не совсем верным. -Если длина UTF-8 последовательности байт различна для верхнего и нижнего регистра кодовой точки, то для этой кодовой точки, результат работы может быть некорректным. -Если строка содержит набор байт, не являющийся UTF-8, то поведение не определено. +Если длина UTF-8 последовательности байтов различна для верхнего и нижнего регистра кодовой точки, то для этой кодовой точки результат работы может быть некорректным. +Если строка содержит набор байтов, не являющийся UTF-8, то поведение не определено. -## upperUTF8 -Переводит строку в верхний регистр, при допущении, что строка содержит набор байт, представляющий текст в кодировке UTF-8. +## upperUTF8 {#upperutf8} + +Переводит строку в верхний регистр, при допущении, что строка содержит набор байтов, представляющий текст в кодировке UTF-8. Не учитывает язык. То есть, для турецкого языка, результат может быть не совсем верным. -Если длина UTF-8 последовательности байт различна для верхнего и нижнего регистра кодовой точки, то для этой кодовой точки, результат работы может быть некорректным. -Если строка содержит набор байт, не являющийся UTF-8, то поведение не определено. +Если длина UTF-8 последовательности байтов различна для верхнего и нижнего регистра кодовой точки, то для этой кодовой точки, результат работы может быть некорректным. +Если строка содержит набор байтов, не являющийся UTF-8, то поведение не определено. -## isValidUTF8 -Возвращает 1, если набор байт является корректным в кодировке UTF-8, 0 иначе. +## isValidUTF8 {#isvalidutf8} -## toValidUTF8 +Возвращает 1, если набор байтов является корректным в кодировке UTF-8, 0 иначе. + +## toValidUTF8 {#tovalidutf8} Заменяет некорректные символы UTF-8 на символ `�` (U+FFFD). Все идущие подряд некорректные символы схлопываются в один заменяющий символ. -```sql +``` sql toValidUTF8( input_string ) ``` Параметры: -- input_string — произвольный набор байтов, представленный как объект типа [String](../../data_types/string.md). +- input\_string — произвольный набор байтов, представленный как объект типа [String](../../data_types/string.md). Возвращаемое значение: Корректная строка UTF-8. 
-### Пример +**Пример** -```sql +``` sql SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') ``` -```text + +``` text ┌─toValidUTF8('a����b')─┐ │ a�b │ └───────────────────────┘ @@ -72,18 +92,18 @@ SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') **Синтаксис** -```sql +``` sql repeat(s, n) ``` **Параметры** -- `s` — Строка для повторения. [String](../../data_types/string.md). -- `n` — Количество повторов. [UInt](../../data_types/int_uint.md). +- `s` — Строка для повторения. [String](../../data_types/string.md). +- `n` — Количество повторов. [UInt](../../data_types/int_uint.md). **Возвращаемое значение** -Строка, состоящая из повторений `n` раз исходной строки `s`. Если `n` < 1, то функция вернет пустую строку. +Строка, состоящая из повторений `n` раз исходной строки `s`. Если `n` \< 1, то функция вернет пустую строку. Тип: `String`. @@ -91,53 +111,57 @@ repeat(s, n) Запрос: -```sql +``` sql SELECT repeat('abc', 10) ``` Ответ: -```text +``` text ┌─repeat('abc', 10)──────────────┐ │ abcabcabcabcabcabcabcabcabcabc │ └────────────────────────────────┘ ``` -## reverse -Разворачивает строку (как последовательность байт). +## reverse {#reverse} -## reverseUTF8 -Разворачивает последовательность кодовых точек Unicode, при допущении, что строка содержит набор байт, представляющий текст в кодировке UTF-8. Иначе - что-то делает (не кидает исключение). +Разворачивает строку (как последовательность байтов). +## reverseUTF8 {#reverseutf8} -## format(pattern, s0, s1, ...) +Разворачивает последовательность кодовых точек Unicode, при допущении, что строка содержит набор байтов, представляющий текст в кодировке UTF-8. Иначе — что-то делает (не кидает исключение). -Форматирует константный шаблон со строками, перечисленными в аргументах. `pattern` -- упрощенная версия шаблона в языке Python. Шаблон содержит "заменяющие поля", которые окружены фигурными скобками `{}`. Всё, что не содержится в скобках, интерпретируется как обычный текст и просто копируется. Если нужно использовать символ фигурной скобки, можно экранировать двойной скобкой `{{` или `}}`. Имя полей могут быть числами (нумерация с нуля) или пустыми (тогда они интерпретируются как последовательные числа). +## format(pattern, s0, s1, …) {#format} -```sql +Форматирует константный шаблон со строками, перечисленными в аргументах. `pattern` — упрощенная версия шаблона в языке Python. Шаблон содержит «заменяющие поля», которые окружены фигурными скобками `{}`. Всё, что не содержится в скобках, интерпретируется как обычный текст и просто копируется. Если нужно использовать символ фигурной скобки, можно экранировать двойной скобкой `{{ '{{' }}` или `{{ '}}' }}`. Имя полей могут быть числами (нумерация с нуля) или пустыми (тогда они интерпретируются как последовательные числа). + +``` sql SELECT format('{1} {0} {1}', 'World', 'Hello') ``` -```text + +``` text ┌─format('{1} {0} {1}', 'World', 'Hello')─┐ │ Hello World Hello │ └─────────────────────────────────────────┘ ``` -```sql + +``` sql SELECT format('{} {}', 'Hello', 'World') ``` -```text + +``` text ┌─format('{} {}', 'Hello', 'World')─┐ │ Hello World │ └───────────────────────────────────┘ ``` -## concat +## concat {#concat} -Склеивает строки переданные в аргументы в одну строку без разделителей. +Склеивает строки, переданные в аргументы, в одну строку без разделителей. -**Cинтаксис** +**Cинтаксис** -```sql +``` sql concat(s1, s2, ...) ``` @@ -147,7 +171,7 @@ concat(s1, s2, ...) **Возвращаемое значение** -Возвращает строку, полученную в результате склейки аргументов. 
+Возвращает строку, полученную в результате склейки аргументов. Если любой из аргументов имеет значение `NULL`, `concat` возвращает значение `NULL`. @@ -155,13 +179,13 @@ concat(s1, s2, ...) Запрос: -```sql +``` sql SELECT concat('Hello, ', 'World!') ``` Ответ: -```text +``` text ┌─concat('Hello, ', 'World!')─┐ │ Hello, World! │ └─────────────────────────────┘ @@ -171,11 +195,11 @@ SELECT concat('Hello, ', 'World!') Аналогична [concat](#concat). Разница заключается в том, что вам нужно убедиться, что `concat(s1, s2, ...) → sn` является инъективным, так как это предположение будет использоваться для оптимизации GROUP BY. -Функция называется "инъективной", если она возвращает разные значения для разных аргументов. Или, иными словами, функция никогда не выдаёт одно и то же значение, если аргументы разные. +Функция называется «инъективной», если она возвращает разные значения для разных аргументов. Или, иными словами, функция никогда не выдаёт одно и то же значение, если аргументы разные. -**Синтаксис** +**Синтаксис** -```sql +``` sql concatAssumeInjective(s1, s2, ...) ``` @@ -185,7 +209,7 @@ concatAssumeInjective(s1, s2, ...) **Возвращаемые значения** -Возвращает строку, полученную в результате объединения аргументов. +Возвращает строку, полученную в результате объединения аргументов. Если любой из аргументов имеет значение `NULL`, `concatAssumeInjective` возвращает значение `NULL`. @@ -193,13 +217,13 @@ concatAssumeInjective(s1, s2, ...) Вводная таблица: -```sql +``` sql CREATE TABLE key_val(`key1` String, `key2` String, `value` UInt32) ENGINE = TinyLog INSERT INTO key_val VALUES ('Hello, ','World',1)('Hello, ','World',2)('Hello, ','World!',3)('Hello',', World!',2) SELECT * from key_val ``` -```text +``` text ┌─key1────┬─key2─────┬─value─┐ │ Hello, │ World │ 1 │ │ Hello, │ World │ 2 │ @@ -210,13 +234,13 @@ SELECT * from key_val Запрос: -```sql +``` sql SELECT concat(key1, key2), sum(value) FROM key_val GROUP BY (key1, key2) ``` Ответ: -```text +``` text ┌─concat(key1, key2)─┬─sum(value)─┐ │ Hello, World! │ 3 │ │ Hello, World! │ 2 │ @@ -224,55 +248,62 @@ SELECT concat(key1, key2), sum(value) FROM key_val GROUP BY (key1, key2) └────────────────────┴────────────┘ ``` -## substring(s, offset, length) -Возвращает подстроку, начиная с байта по индексу offset, длины length байт. Индексация символов - начиная с единицы (как в стандартном SQL). Аргументы offset и length должны быть константами. +## substring(s, offset, length), mid(s, offset, length), substr(s, offset, length) {#substring} -## substringUTF8(s, offset, length) -Так же, как substring, но для кодовых точек Unicode. Работает при допущении, что строка содержит набор байт, представляющий текст в кодировке UTF-8. Если допущение не выполнено - то возвращает какой-нибудь результат (не кидает исключение). +Возвращает подстроку, начиная с байта по индексу offset, длины length байт. Индексация символов — начиная с единицы (как в стандартном SQL). Аргументы offset и length должны быть константами. + +## substringUTF8(s, offset, length) {#substringutf8} + +Так же, как substring, но для кодовых точек Unicode. Работает при допущении, что строка содержит набор байтов, представляющий текст в кодировке UTF-8. Если допущение не выполнено, то возвращает какой-нибудь результат (не кидает исключение). + +## appendTrailingCharIfAbsent(s, c) {#appendtrailingcharifabsent} -## appendTrailingCharIfAbsent(s, c) Если строка s непустая и не содержит символ c на конце, то добавляет символ c в конец. 
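Примерный запрос для `substring` и `appendTrailingCharIfAbsent` (значения условные, ожидаемые результаты — в комментариях):

``` sql
SELECT
    substring('Hello, world!', 1, 5) AS prefix,                 -- 'Hello'
    appendTrailingCharIfAbsent('some/path', '/') AS with_slash, -- 'some/path/'
    appendTrailingCharIfAbsent('some/path/', '/') AS unchanged  -- 'some/path/': символ уже есть на конце
```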
-## convertCharset(s, from, to) +## convertCharset(s, from, to) {#convertcharset} + Возвращает сконвертированную из кодировки from в кодировку to строку s. -## base64Encode(s) +## base64Encode(s) {#base64encode} + Производит кодирование строки s в base64-представление. -## base64Decode(s) +## base64Decode(s) {#base64decode} + Декодирует base64-представление s в исходную строку. При невозможности декодирования выбрасывает исключение -## tryBase64Decode(s) +## tryBase64Decode(s) {#trybase64decode} + Функционал аналогичен base64Decode, но при невозможности декодирования возвращает пустую строку. -## endsWith(s, suffix) {#function-endswith} +## endsWith(s, suffix) {#endswith} Возвращает 1, если строка завершается указанным суффиксом, и 0 в противном случае. -## startsWith(str, prefix) {#function-startswith} +## startsWith(str, prefix) {#startswith} Возвращает 1, если строка начинается указанным префиксом, в противном случае 0. -```sql +``` sql SELECT startsWith('Spider-Man', 'Spi'); ``` **Возвращаемые значения** -- 1, если строка начинается указанным префиксом. -- 0, если строка не начинается указанным префиксом. +- 1, если строка начинается указанным префиксом. +- 0, если строка не начинается указанным префиксом. **Пример** Запрос: -```sql +``` sql SELECT startsWith('Hello, world!', 'He'); ``` Ответ: -```text +``` text ┌─startsWith('Hello, world!', 'He')─┐ │ 1 │ └───────────────────────────────────┘ @@ -285,14 +316,14 @@ SELECT startsWith('Hello, world!', 'He'); **Синтаксис** -```sql +``` sql trim([[LEADING|TRAILING|BOTH] trim_character FROM] input_string) ``` **Параметры** -- `trim_character` — один или несколько символов, подлежащие удалению. [String](../../data_types/string.md). -- `input_string` — строка для обрезки. [String](../../data_types/string.md). +- `trim_character` — один или несколько символов, подлежащие удалению. [String](../../data_types/string.md). +- `input_string` — строка для обрезки. [String](../../data_types/string.md). **Возвращаемое значение** @@ -304,13 +335,13 @@ trim([[LEADING|TRAILING|BOTH] trim_character FROM] input_string) Запрос: -```sql +``` sql SELECT trim(BOTH ' ()' FROM '( Hello, world! )') ``` Ответ: -```text +``` text ┌─trim(BOTH ' ()' FROM '( Hello, world! )')─┐ │ Hello, world! │ └───────────────────────────────────────────────┘ @@ -320,9 +351,9 @@ SELECT trim(BOTH ' ()' FROM '( Hello, world! )') Удаляет все последовательные вхождения обычных пробелов (32 символ ASCII) с левого конца строки. Не удаляет другие виды пробелов (табуляция, пробел без разрыва и т. д.). -**Синтаксис** +**Синтаксис** -```sql +``` sql trimLeft(input_string) ``` @@ -330,7 +361,7 @@ trimLeft(input_string) **Параметры** -- `input_string` — строка для обрезки. [String](../../data_types/string.md). +- `input_string` — строка для обрезки. [String](../../data_types/string.md). **Возвращаемое значение** @@ -342,25 +373,25 @@ trimLeft(input_string) Запрос: -```sql +``` sql SELECT trimLeft(' Hello, world! ') ``` Ответ: -```text +``` text ┌─trimLeft(' Hello, world! ')─┐ │ Hello, world! │ └─────────────────────────────────────┘ ``` -## trimRight {#trimright} +## trimRight {#trimright} Удаляет все последовательные вхождения обычных пробелов (32 символ ASCII) с правого конца строки. Не удаляет другие виды пробелов (табуляция, пробел без разрыва и т. д.). -**Синтаксис** +**Синтаксис** -```sql +``` sql trimRight(input_string) ``` @@ -368,7 +399,7 @@ trimRight(input_string) **Параметры** -- `input_string` — строка для обрезки. [String](../../data_types/string.md). 
+- `input_string` — строка для обрезки. [String](../../data_types/string.md). **Возвращаемое значение** @@ -380,13 +411,13 @@ trimRight(input_string) Запрос: -```sql +``` sql SELECT trimRight(' Hello, world! ') ``` Ответ: -```text +``` text ┌─trimRight(' Hello, world! ')─┐ │ Hello, world! │ └──────────────────────────────────────┘ @@ -396,9 +427,9 @@ SELECT trimRight(' Hello, world! ') Удаляет все последовательные вхождения обычных пробелов (32 символ ASCII) с обоих концов строки. Не удаляет другие виды пробелов (табуляция, пробел без разрыва и т. д.). -**Синтаксис** +**Синтаксис** -```sql +``` sql trimBoth(input_string) ``` @@ -406,7 +437,7 @@ trimBoth(input_string) **Параметры** -- `input_string` — строка для обрезки. [String](../../data_types/string.md). +- `input_string` — строка для обрезки. [String](../../data_types/string.md). **Возвращаемое значение** @@ -418,34 +449,34 @@ trimBoth(input_string) Запрос: -```sql +``` sql SELECT trimBoth(' Hello, world! ') ``` Ответ: -```text +``` text ┌─trimBoth(' Hello, world! ')─┐ │ Hello, world! │ └─────────────────────────────────────┘ ``` -## CRC32(s) +## CRC32(s) {#crc32} Возвращает чексумму CRC32 данной строки, используется CRC-32-IEEE 802.3 многочлен и начальным значением `0xffffffff` (т.к. используется реализация из zlib). -Тип результата - UInt32. +Тип результата — UInt32. -## CRC32IEEE(s) +## CRC32IEEE(s) {#crc32ieee} Возвращает чексумму CRC32 данной строки, используется CRC-32-IEEE 802.3 многочлен. -Тип результата - UInt32. +Тип результата — UInt32. -## CRC64(s) +## CRC64(s) {#crc64} Возвращает чексумму CRC64 данной строки, используется CRC-64-ECMA многочлен. -Тип результата - UInt64. +Тип результата — UInt64. [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/string_functions/) diff --git a/docs/ru/query_language/functions/string_replace_functions.md b/docs/ru/query_language/functions/string_replace_functions.md index 7e8a0682316..8a2db9bf472 100644 --- a/docs/ru/query_language/functions/string_replace_functions.md +++ b/docs/ru/query_language/functions/string_replace_functions.md @@ -1,13 +1,16 @@ -# Функции поиска и замены в строках +# Функции поиска и замены в строках {#funktsii-poiska-i-zameny-v-strokakh} + +## replaceOne(haystack, pattern, replacement) {#replaceonehaystack-pattern-replacement} -## replaceOne(haystack, pattern, replacement) Замена первого вхождения, если такое есть, подстроки pattern в haystack на подстроку replacement. Здесь и далее, pattern и replacement должны быть константами. -## replaceAll(haystack, pattern, replacement) +## replaceAll(haystack, pattern, replacement) {#replaceallhaystack-pattern-replacement} + Замена всех вхождений подстроки pattern в haystack на подстроку replacement. -## replaceRegexpOne(haystack, pattern, replacement) +## replaceRegexpOne(haystack, pattern, replacement) {#replaceregexponehaystack-pattern-replacement} + Замена по регулярному выражению pattern. Регулярное выражение re2. Заменяется только первое вхождение, если есть. В качестве replacement может быть указан шаблон для замен. Этот шаблон может включать в себя подстановки `\0-\9`. @@ -17,7 +20,7 @@ Пример 1. Переведём дату в американский формат: -```sql +``` sql SELECT DISTINCT EventDate, replaceRegexpOne(toString(EventDate), '(\\d{4})-(\\d{2})-(\\d{2})', '\\2/\\3/\\1') AS res @@ -26,7 +29,7 @@ LIMIT 7 FORMAT TabSeparated ``` -```text +``` text 2014-03-17 03/17/2014 2014-03-18 03/18/2014 2014-03-19 03/19/2014 @@ -38,24 +41,25 @@ FORMAT TabSeparated Пример 2. 
Размножить строку десять раз: -```sql +``` sql SELECT replaceRegexpOne('Hello, World!', '.*', '\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0') AS res ``` -```text +``` text ┌─res────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World! │ └────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` -## replaceRegexpAll(haystack, pattern, replacement) +## replaceRegexpAll(haystack, pattern, replacement) {#replaceregexpallhaystack-pattern-replacement} + То же самое, но делается замена всех вхождений. Пример: -```sql +``` sql SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res ``` -```text +``` text ┌─res────────────────────────┐ │ HHeelllloo,, WWoorrlldd!! │ └────────────────────────────┘ @@ -64,11 +68,11 @@ SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res В качестве исключения, если регулярное выражение сработало на пустой подстроке, то замена делается не более одного раза. Пример: -```sql +``` sql SELECT replaceRegexpAll('Hello, World!', '^', 'here: ') AS res ``` -```text +``` text ┌─res─────────────────┐ │ here: Hello, World! │ └─────────────────────┘ diff --git a/docs/ru/query_language/functions/string_search_functions.md b/docs/ru/query_language/functions/string_search_functions.md index f94145f23d4..1b84631d553 100644 --- a/docs/ru/query_language/functions/string_search_functions.md +++ b/docs/ru/query_language/functions/string_search_functions.md @@ -1,8 +1,9 @@ -# Функции поиска в строках +# Функции поиска в строках {#funktsii-poiska-v-strokakh} Во всех функциях, поиск регистрозависимый по умолчанию. Существуют варианты функций для регистронезависимого поиска. ## position(haystack, needle) {#position} + Поиск подстроки `needle` в строке `haystack`. Возвращает позицию (в байтах) найденной подстроки в строке, начиная с 1, или 0, если подстрока не найдена. @@ -13,7 +14,7 @@ **Синтаксис** -```sql +``` sql position(haystack, needle) ``` @@ -21,13 +22,13 @@ position(haystack, needle) **Параметры** -- `haystack` — строка, по которой выполняется поиск. [Строка](../syntax.md#syntax-string-literal). -- `needle` — подстрока, которую необходимо найти. [Строка](../syntax.md#syntax-string-literal). +- `haystack` — строка, по которой выполняется поиск. [Строка](../syntax.md#syntax-string-literal). +- `needle` — подстрока, которую необходимо найти. [Строка](../syntax.md#syntax-string-literal). **Возвращаемые значения** -- Начальная позиция в байтах (начиная с 1), если подстрока найдена. -- 0, если подстрока не найдена. +- Начальная позиция в байтах (начиная с 1), если подстрока найдена. +- 0, если подстрока не найдена. Тип: `Integer`. @@ -37,13 +38,13 @@ position(haystack, needle) Запрос: -```sql +``` sql SELECT position('Hello, world!', '!') ``` Ответ: -```text +``` text ┌─position('Hello, world!', '!')─┐ │ 13 │ └────────────────────────────────┘ @@ -53,13 +54,13 @@ SELECT position('Hello, world!', '!') Запрос: -```sql +``` sql SELECT position('Привет, мир!', '!') ``` Ответ: -```text +``` text ┌─position('Привет, мир!', '!')─┐ │ 21 │ └───────────────────────────────┘ @@ -73,19 +74,19 @@ SELECT position('Привет, мир!', '!') **Синтаксис** -```sql +``` sql positionCaseInsensitive(haystack, needle) ``` **Параметры** -- `haystack` — строка, по которой выполняется поиск. [Строка](../syntax.md#syntax-string-literal). 
-- `needle` — подстрока, которую необходимо найти. [Строка](../syntax.md#syntax-string-literal). +- `haystack` — строка, по которой выполняется поиск. [Строка](../syntax.md#syntax-string-literal). +- `needle` — подстрока, которую необходимо найти. [Строка](../syntax.md#syntax-string-literal). **Возвращаемые значения** -- Начальная позиция в байтах (начиная с 1), если подстрока найдена. -- 0, если подстрока не найдена. +- Начальная позиция в байтах (начиная с 1), если подстрока найдена. +- 0, если подстрока не найдена. Тип: `Integer`. @@ -93,13 +94,13 @@ positionCaseInsensitive(haystack, needle) Запрос: -```sql +``` sql SELECT positionCaseInsensitive('Hello, world!', 'hello') ``` Ответ: -```text +``` text ┌─positionCaseInsensitive('Hello, world!', 'hello')─┐ │ 1 │ └───────────────────────────────────────────────────┘ @@ -115,19 +116,19 @@ SELECT positionCaseInsensitive('Hello, world!', 'hello') **Синтаксис** -```sql +``` sql positionUTF8(haystack, needle) ``` **Параметры** -- `haystack` — строка, по которой выполняется поиск. [Строка](../syntax.md#syntax-string-literal). -- `needle` — подстрока, которую необходимо найти. [Строка](../syntax.md#syntax-string-literal). +- `haystack` — строка, по которой выполняется поиск. [Строка](../syntax.md#syntax-string-literal). +- `needle` — подстрока, которую необходимо найти. [Строка](../syntax.md#syntax-string-literal). **Возвращаемые значения** -- Начальная позиция в кодовых точках Unicode (начиная с 1), если подстрока найдена. -- 0, если подстрока не найдена. +- Начальная позиция в кодовых точках Unicode (начиная с 1), если подстрока найдена. +- 0, если подстрока не найдена. Тип: `Integer`. @@ -137,13 +138,13 @@ positionUTF8(haystack, needle) Запрос: -```sql +``` sql SELECT positionUTF8('Привет, мир!', '!') ``` Ответ: -```text +``` text ┌─positionUTF8('Привет, мир!', '!')─┐ │ 12 │ └───────────────────────────────────┘ @@ -153,13 +154,13 @@ SELECT positionUTF8('Привет, мир!', '!') Запрос для символа `é`, который представлен одной кодовой точкой `U+00E9`: -```sql +``` sql SELECT positionUTF8('Salut, étudiante!', '!') ``` Result: -```text +``` text ┌─positionUTF8('Salut, étudiante!', '!')─┐ │ 17 │ └────────────────────────────────────────┘ @@ -167,13 +168,13 @@ Result: Запрос для символа `é`, который представлен двумя кодовыми точками `U+0065U+0301`: -```sql +``` sql SELECT positionUTF8('Salut, étudiante!', '!') ``` Ответ: -```text +``` text ┌─positionUTF8('Salut, étudiante!', '!')─┐ │ 18 │ └────────────────────────────────────────┘ @@ -187,19 +188,19 @@ SELECT positionUTF8('Salut, étudiante!', '!') **Синтаксис** -```sql +``` sql positionCaseInsensitiveUTF8(haystack, needle) ``` **Параметры** -- `haystack` — строка, по которой выполняется поиск. [Строка](../syntax.md#syntax-string-literal). -- `needle` — подстрока, которую необходимо найти. [Строка](../syntax.md#syntax-string-literal). +- `haystack` — строка, по которой выполняется поиск. [Строка](../syntax.md#syntax-string-literal). +- `needle` — подстрока, которую необходимо найти. [Строка](../syntax.md#syntax-string-literal). **Возвращаемые значения** -- Начальная позиция в байтах (начиная с 1), если подстрока найдена. -- 0, если подстрока не найдена. +- Начальная позиция в байтах (начиная с 1), если подстрока найдена. +- 0, если подстрока не найдена. Тип: `Integer`. 
@@ -207,75 +208,77 @@ positionCaseInsensitiveUTF8(haystack, needle) Запрос: -```sql +``` sql SELECT positionCaseInsensitiveUTF8('Привет, мир!', 'Мир') ``` Ответ: -```text +``` text ┌─positionCaseInsensitiveUTF8('Привет, мир!', 'Мир')─┐ │ 9 │ └────────────────────────────────────────────────────┘ ``` -## multiSearchAllPositions {#multiSearchAllPositions} +## multiSearchAllPositions {#multisearchallpositions} The same as [position](#position) but returns `Array` of positions (in bytes) of the found corresponding substrings in the string. Positions are indexed starting from 1. The search is performed on sequences of bytes without respect to string encoding and collation. -- For case-insensitive ASCII search, use the function `multiSearchAllPositionsCaseInsensitive`. -- For search in UTF-8, use the function [multiSearchAllPositionsUTF8](#multiSearchAllPositionsUTF8). -- For case-insensitive UTF-8 search, use the function multiSearchAllPositionsCaseInsensitiveUTF8. +- For case-insensitive ASCII search, use the function `multiSearchAllPositionsCaseInsensitive`. +- For search in UTF-8, use the function [multiSearchAllPositionsUTF8](#multiSearchAllPositionsUTF8). +- For case-insensitive UTF-8 search, use the function multiSearchAllPositionsCaseInsensitiveUTF8. -**Syntax** +**Syntax** -```sql +``` sql multiSearchAllPositions(haystack, [needle1, needle2, ..., needlen]) ``` **Parameters** -- `haystack` — string, in which substring will to be searched. [String](../syntax.md#syntax-string-literal). -- `needle` — substring to be searched. [String](../syntax.md#syntax-string-literal). +- `haystack` — string, in which substring will to be searched. [String](../syntax.md#syntax-string-literal). +- `needle` — substring to be searched. [String](../syntax.md#syntax-string-literal). **Returned values** -- Array of starting positions in bytes (counting from 1), if the corresponding substring was found and 0 if not found. +- Array of starting positions in bytes (counting from 1), if the corresponding substring was found and 0 if not found. **Example** Query: -```sql +``` sql SELECT multiSearchAllPositions('Hello, World!', ['hello', '!', 'world']) ``` Result: -```text +``` text ┌─multiSearchAllPositions('Hello, World!', ['hello', '!', 'world'])─┐ │ [0,13,0] │ └───────────────────────────────────────────────────────────────────┘ ``` -## multiSearchAllPositionsUTF8 {#multiSearchAllPositionsUTF8} +## multiSearchAllPositionsUTF8 {#multisearchallpositionsutf8} Смотрите `multiSearchAllPositions`. -## multiSearchFirstPosition(haystack, [needle1, needle2, ..., needlen]) +## multiSearchFirstPosition(haystack, \[needle1, needle2, …, needlen\]) {#multisearchfirstpositionhaystack-needle1-needle2-needlen} Так же, как и `position`, только возвращает оффсет первого вхождения любого из needles. Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchFirstPositionCaseInsensitive, multiSearchFirstPositionUTF8, multiSearchFirstPositionCaseInsensitiveUTF8`. -## multiSearchFirstIndex(haystack, [needle1, needle2, ..., needlen]) +## multiSearchFirstIndex(haystack, \[needle1, needle2, …, needlen\]) {#multisearchfirstindexhaystack-needle1-needle2-needlen} + Возвращает индекс `i` (нумерация с единицы) первой найденной строки needlei в строке `haystack` и 0 иначе. Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchFirstIndexCaseInsensitive, multiSearchFirstIndexUTF8, multiSearchFirstIndexCaseInsensitiveUTF8`. 
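Условный пример использования `multiSearchFirstPosition` и `multiSearchFirstIndex` (ожидаемые значения указаны в комментариях):

``` sql
SELECT
    multiSearchFirstPosition('Hello, World!', ['World', '!']) AS first_pos, -- 8: подстрока 'World' встречается раньше, чем '!'
    multiSearchFirstIndex('Hello, World!', ['xyz', 'World']) AS first_idx   -- 2: найдена только вторая подстрока
```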
-## multiSearchAny(haystack, [needle1, needle2, ..., needlen]) {#function-multisearchany} +## multiSearchAny(haystack, \[needle1, needle2, …, needlen\]) {#function-multisearchany} + Возвращает 1, если хотя бы одна подстрока needlei нашлась в строке `haystack` и 0 иначе. Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`. @@ -283,7 +286,8 @@ Result: !!! note "Примечание" Во всех функциях `multiSearch*` количество needles должно быть меньше 28 из-за особенностей реализации. -## match(haystack, pattern) +## match(haystack, pattern) {#matchhaystack-pattern} + Проверка строки на соответствие регулярному выражению pattern. Регулярное выражение **re2**. Синтаксис регулярных выражений **re2** является более ограниченным по сравнению с регулярными выражениями **Perl** ([подробнее](https://github.com/google/re2/wiki/Syntax)). Возвращает 0 (если не соответствует) или 1 (если соответствует). @@ -292,30 +296,30 @@ Result: Регулярное выражение работает со строкой как с набором байт. Регулярное выражение не может содержать нулевые байты. Для шаблонов на поиск подстроки в строке, лучше используйте LIKE или position, так как они работают существенно быстрее. -## multiMatchAny(haystack, [pattern1, pattern2, ..., patternn]) +## multiMatchAny(haystack, \[pattern1, pattern2, …, patternn\]) {#multimatchanyhaystack-pattern1-pattern2-patternn} То же, что и `match`, но возвращает ноль, если ни одно регулярное выражение не подошло и один, если хотя бы одно. Используется библиотека [hyperscan](https://github.com/intel/hyperscan) для соответствия регулярных выражений. Для шаблонов на поиск многих подстрок в строке, лучше используйте `multiSearchAny`, так как она работает существенно быстрее. !!! note "Примечание" Длина любой строки из `haystack` должна быть меньше 232 байт, иначе бросается исключение. Это ограничение связано с ограничением hyperscan API. -## multiMatchAnyIndex(haystack, [pattern1, pattern2, ..., patternn]) +## multiMatchAnyIndex(haystack, \[pattern1, pattern2, …, patternn\]) {#multimatchanyindexhaystack-pattern1-pattern2-patternn} То же, что и `multiMatchAny`, только возвращает любой индекс подходящего регулярного выражения. -## multiMatchAllIndices(haystack, [pattern1, pattern2, ..., patternn]) +## multiMatchAllIndices(haystack, \[pattern1, pattern2, …, patternn\]) {#multimatchallindiceshaystack-pattern1-pattern2-patternn} То же, что и `multiMatchAny`, только возвращает массив всех индексов всех подходящих регулярных выражений в любом порядке. -## multiFuzzyMatchAny(haystack, distance, [pattern1, pattern2, ..., patternn]) +## multiFuzzyMatchAny(haystack, distance, \[pattern1, pattern2, …, patternn\]) {#multifuzzymatchanyhaystack-distance-pattern1-pattern2-patternn} То же, что и `multiMatchAny`, но возвращает 1 если любой pattern соответствует haystack в пределах константного [редакционного расстояния](https://en.wikipedia.org/wiki/Edit_distance). Эта функция также находится в экспериментальном режиме и может быть очень медленной. За подробностями обращайтесь к [документации hyperscan](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching). 
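Примерная иллюстрация для семейства `multiMatch*`: в первом вызове совпадает только второе регулярное выражение, во втором — шаблон отличается от подстроки `World` на одну правку (значения условные, точное поведение приближённого поиска зависит от hyperscan):

``` sql
SELECT
    multiMatchAny('Hello, World!', ['^\\d+$', 'World']) AS any_match, -- 1: совпало второе регулярное выражение
    multiFuzzyMatchAny('Hello, World!', 1, ['Wodld']) AS any_fuzzy    -- ожидается 1: редакционное расстояние до 'World' равно 1
```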
-## multiFuzzyMatchAnyIndex(haystack, distance, [pattern1, pattern2, ..., patternn]) +## multiFuzzyMatchAnyIndex(haystack, distance, \[pattern1, pattern2, …, patternn\]) {#multifuzzymatchanyindexhaystack-distance-pattern1-pattern2-patternn} То же, что и `multiFuzzyMatchAny`, только возвращает любой индекс подходящего регулярного выражения в пределах константного редакционного расстояния. -## multiFuzzyMatchAllIndices(haystack, distance, [pattern1, pattern2, ..., patternn]) +## multiFuzzyMatchAllIndices(haystack, distance, \[pattern1, pattern2, …, patternn\]) {#multifuzzymatchallindiceshaystack-distance-pattern1-pattern2-patternn} То же, что и `multiFuzzyMatchAny`, только возвращает массив всех индексов всех подходящих регулярных выражений в любом порядке в пределах константного редакционного расстояния. @@ -325,13 +329,16 @@ Result: !!! note "Примечание" Чтобы выключить все функции, использующие hyperscan, используйте настройку `SET allow_hyperscan = 0;`. -## extract(haystack, pattern) +## extract(haystack, pattern) {#extracthaystack-pattern} + Извлечение фрагмента строки по регулярному выражению. Если haystack не соответствует регулярному выражению pattern, то возвращается пустая строка. Если регулярное выражение не содержит subpattern-ов, то вынимается фрагмент, который подпадает под всё регулярное выражение. Иначе вынимается фрагмент, который подпадает под первый subpattern. -## extractAll(haystack, pattern) +## extractAll(haystack, pattern) {#extractallhaystack-pattern} + Извлечение всех фрагментов строки по регулярному выражению. Если haystack не соответствует регулярному выражению pattern, то возвращается пустая строка. Возвращается массив строк, состоящий из всех соответствий регулярному выражению. В остальном, поведение аналогично функции extract (по прежнему, вынимается первый subpattern, или всё выражение, если subpattern-а нет). ## like(haystack, pattern), оператор haystack LIKE pattern {#function-like} + Проверка строки на соответствие простому регулярному выражению. Регулярное выражение может содержать метасимволы `%` и `_`. @@ -345,22 +352,22 @@ Result: Для остальных регулярных выражений, код аналогичен функции match. ## notLike(haystack, pattern), оператор haystack NOT LIKE pattern {#function-notlike} + То же, что like, но с отрицанием. -## ngramDistance(haystack, needle) +## ngramDistance(haystack, needle) {#ngramdistancehaystack-needle} -Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя мультимножествами 4-грамм и нормализуется на сумму их мощностей. Возвращает число float от 0 до 1 -- чем ближе к нулю, тем больше строки похожи друг на друга. Если константный `needle` или `haystack` больше чем 32КБ, кидается исключение. Если некоторые строки из неконстантного `haystack` или `needle` больше 32КБ, расстояние всегда равно единице. +Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя мультимножествами 4-грамм и нормализуется на сумму их мощностей. Возвращает число float от 0 до 1 – чем ближе к нулю, тем больше строки похожи друг на друга. Если константный `needle` или `haystack` больше чем 32КБ, кидается исключение. Если некоторые строки из неконстантного `haystack` или `needle` больше 32КБ, расстояние всегда равно единице. Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`. 
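Условный пример: для совпадающих строк расстояние равно нулю, для строк без общих 4-грамм результат близок к единице:

``` sql
SELECT
    ngramDistance('ClickHouse', 'ClickHouse') AS same_strings,      -- 0: строки совпадают
    ngramDistance('ClickHouse', 'PostgreSQL') AS different_strings  -- близко к 1: общих 4-грамм практически нет
```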
-## ngramSearch(haystack, needle) +## ngramSearch(haystack, needle) {#ngramsearchhaystack-needle} -То же, что и `ngramDistance`, но вычисляет несимметричную разность между `needle` и `haystack` -- количество n-грамм из `needle` минус количество общих n-грамм, нормированное на количество n-грамм из `needle`. Чем ближе результат к единице, тем вероятнее, что `needle` внутри `haystack`. Может быть использовано для приближенного поиска. +То же, что и `ngramDistance`, но вычисляет несимметричную разность между `needle` и `haystack` – количество n-грамм из `needle` минус количество общих n-грамм, нормированное на количество n-грамм из `needle`. Чем ближе результат к единице, тем вероятнее, что `needle` внутри `haystack`. Может быть использовано для приближенного поиска. Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`. - !!! note "Примечание" - Для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем (не)симметрическую разность между хэш таблицами -- могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` -- мы обнуляем 5-й бит (нумерация с нуля) каждого байта кодовой точки, а также первый бит нулевого байта, если байтов больше 1 -- это работает для латиницы и почти для всех кириллических букв. + Для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем (не)симметрическую разность между хэш таблицами – могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` – мы обнуляем 5-й бит (нумерация с нуля) каждого байта кодовой точки, а также первый бит нулевого байта, если байтов больше 1 – это работает для латиницы и почти для всех кириллических букв. [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/string_search_functions/) diff --git a/docs/ru/query_language/functions/type_conversion_functions.md b/docs/ru/query_language/functions/type_conversion_functions.md index 54913e26f5c..fa7eab9c232 100644 --- a/docs/ru/query_language/functions/type_conversion_functions.md +++ b/docs/ru/query_language/functions/type_conversion_functions.md @@ -1,4 +1,4 @@ -# Функции преобразования типов +# Функции преобразования типов {#funktsii-preobrazovaniia-tipov} ## Общие проблемы преобразования чисел {#numeric-conversion-issues} @@ -6,18 +6,18 @@ Поведение ClickHouse при конвертировании похоже на [поведение C++ программ](https://en.cppreference.com/w/cpp/language/implicit_conversion). -## toInt(8|16|32|64) +## toInt(8\|16\|32\|64) {#toint8163264} Преобразует входное значение к типу [Int](../../data_types/int_uint.md). Семейство функций включает: -* `toInt8(expr)` — возвращает значение типа `Int8`. -* `toInt16(expr)` — возвращает значение типа `Int16`. -* `toInt32(expr)` — возвращает значение типа `Int32`. -* `toInt64(expr)` — возвращает значение типа `Int64`. +- `toInt8(expr)` — возвращает значение типа `Int8`. +- `toInt16(expr)` — возвращает значение типа `Int16`. +- `toInt32(expr)` — возвращает значение типа `Int32`. +- `toInt64(expr)` — возвращает значение типа `Int64`. **Параметры** -- `expr` — [выражение](../syntax.md#syntax-expressions) возвращающее число или строку с десятичным представление числа. 
Бинарное, восьмеричное и шестнадцатеричное представление числа не поддержаны. Ведущие нули обрезаются. +- `expr` — [выражение](../syntax.md#syntax-expressions) возвращающее число или строку с десятичным представление числа. Бинарное, восьмеричное и шестнадцатеричное представление числа не поддержаны. Ведущие нули обрезаются. **Возвращаемое значение** @@ -29,59 +29,60 @@ **Пример** -```sql +``` sql SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) ``` -```text + +``` text ┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ │ -9223372036854775808 │ 32 │ 16 │ 8 │ └──────────────────────┴─────────────┴───────────────┴─────────────┘ ``` -## toInt(8|16|32|64)OrZero +## toInt(8\|16\|32\|64)OrZero {#toint8163264orzero} -Принимает аргумент типа String и пытается его распарсить в Int(8|16|32|64). Если не удалось - возвращает 0. +Принимает аргумент типа String и пытается его распарсить в Int(8\|16\|32\|64). Если не удалось - возвращает 0. **Пример** -```sql +``` sql select toInt64OrZero('123123'), toInt8OrZero('123qwe123') ``` -```text + +``` text ┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ │ 123123 │ 0 │ └─────────────────────────┴───────────────────────────┘ ``` +## toInt(8\|16\|32\|64)OrNull {#toint8163264ornull} -## toInt(8|16|32|64)OrNull - -Принимает аргумент типа String и пытается его распарсить в Int(8|16|32|64). Если не удалось - возвращает NULL. +Принимает аргумент типа String и пытается его распарсить в Int(8\|16\|32\|64). Если не удалось - возвращает NULL. **Пример** -```sql +``` sql select toInt64OrNull('123123'), toInt8OrNull('123qwe123') ``` -```text + +``` text ┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ │ 123123 │ ᴺᵁᴸᴸ │ └─────────────────────────┴───────────────────────────┘ ``` - -## toUInt(8|16|32|64) +## toUInt(8\|16\|32\|64) {#touint8163264} Преобраует входное значение к типу [UInt](../../data_types/int_uint.md). Семейство функций включает: -* `toUInt8(expr)` — возвращает значение типа `UInt8`. -* `toUInt16(expr)` — возвращает значение типа `UInt16`. -* `toUInt32(expr)` — возвращает значение типа `UInt32`. -* `toUInt64(expr)` — возвращает значение типа `UInt64`. +- `toUInt8(expr)` — возвращает значение типа `UInt8`. +- `toUInt16(expr)` — возвращает значение типа `UInt16`. +- `toUInt32(expr)` — возвращает значение типа `UInt32`. +- `toUInt64(expr)` — возвращает значение типа `UInt64`. **Параметры** -- `expr` — [выражение](../syntax.md#syntax-expressions) возвращающее число или строку с десятичным представление числа. Бинарное, восьмеричное и шестнадцатеричное представление числа не поддержаны. Ведущие нули обрезаются. +- `expr` — [выражение](../syntax.md#syntax-expressions) возвращающее число или строку с десятичным представление числа. Бинарное, восьмеричное и шестнадцатеричное представление числа не поддержаны. Ведущие нули обрезаются. 
**Возвращаемое значение** @@ -93,134 +94,135 @@ select toInt64OrNull('123123'), toInt8OrNull('123qwe123') **Пример** -```sql +``` sql SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) ``` -```text + +``` text ┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ │ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ └─────────────────────┴───────────────┴────────────────┴──────────────┘ ``` -## toUInt(8|16|32|64)OrZero +## toUInt(8\|16\|32\|64)OrZero {#touint8163264orzero} -## toUInt(8|16|32|64)OrNull +## toUInt(8\|16\|32\|64)OrNull {#touint8163264ornull} -## toFloat(32|64) +## toFloat(32\|64) {#tofloat3264} -## toFloat(32|64)OrZero +## toFloat(32\|64)OrZero {#tofloat3264orzero} -## toFloat(32|64)OrNull +## toFloat(32\|64)OrNull {#tofloat3264ornull} -## toDate +## toDate {#todate} -## toDateOrZero +## toDateOrZero {#todateorzero} -## toDateOrNull +## toDateOrNull {#todateornull} -## toDateTime +## toDateTime {#todatetime} -## toDateTimeOrZero +## toDateTimeOrZero {#todatetimeorzero} -## toDateTimeOrNull +## toDateTimeOrNull {#todatetimeornull} -## toDecimal(32|64|128) +## toDecimal(32\|64\|128) {#todecimal3264128} Преобразует `value` к типу данных [Decimal](../../data_types/decimal.md) с точностью `S`. `value` может быть числом или строкой. Параметр `S` (scale) задаёт число десятичных знаков. -- `toDecimal32(value, S)` -- `toDecimal64(value, S)` -- `toDecimal128(value, S)` +- `toDecimal32(value, S)` +- `toDecimal64(value, S)` +- `toDecimal128(value, S)` -## toDecimal(32|64|128)OrNull +## toDecimal(32\|64\|128)OrNull {#todecimal3264128ornull} Преобразует входную строку в значение с типом данных [Nullable (Decimal (P, S))](../../data_types/decimal.md). Семейство функций включает в себя: -- `toDecimal32OrNull(expr, S)` — Возвращает значение типа `Nullable(Decimal32(S))`. -- `toDecimal64OrNull(expr, S)` — Возвращает значение типа `Nullable(Decimal64(S))`. -- `toDecimal128OrNull(expr, S)` — Возвращает значение типа `Nullable(Decimal128(S))`. +- `toDecimal32OrNull(expr, S)` — Возвращает значение типа `Nullable(Decimal32(S))`. +- `toDecimal64OrNull(expr, S)` — Возвращает значение типа `Nullable(Decimal64(S))`. +- `toDecimal128OrNull(expr, S)` — Возвращает значение типа `Nullable(Decimal128(S))`. Эти функции следует использовать вместо функций `toDecimal*()`, если при ошибке обработки входного значения вы хотите получать `NULL` вместо исключения. **Параметры** -- `expr` — [выражение](../syntax.md#syntax-expressions), возвращающее значение типа [String](../../data_types/string.md). ClickHouse ожидает текстовое представление десятичного числа. Например, `'1.111'`. -- `S` — количество десятичных знаков в результирующем значении. +- `expr` — [выражение](../syntax.md#syntax-expressions), возвращающее значение типа [String](../../data_types/string.md). ClickHouse ожидает текстовое представление десятичного числа. Например, `'1.111'`. +- `S` — количество десятичных знаков в результирующем значении. **Возвращаемое значение** Значение типа `Nullable(Decimal(P,S))`. Значение содержит: -- Число с `S` десятичными знаками, если ClickHouse распознал число во входной строке. -- `NULL`, если ClickHouse не смог распознать число во входной строке или входное число содержит больше чем `S` десятичных знаков. +- Число с `S` десятичными знаками, если ClickHouse распознал число во входной строке. +- `NULL`, если ClickHouse не смог распознать число во входной строке или входное число содержит больше чем `S` десятичных знаков. 
**Примеры** -```sql +``` sql SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) ``` -```text +``` text ┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ │ -1.11100 │ Nullable(Decimal(9, 5)) │ └──────────┴────────────────────────────────────────────────────┘ ``` -```sql +``` sql SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) ``` -```text +``` text ┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ │ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ └──────┴────────────────────────────────────────────────────┘ ``` -## toDecimal(32|64|128)OrZero +## toDecimal(32\|64\|128)OrZero {#todecimal3264128orzero} Преобразует тип входного значения в [Decimal (P, S)](../../data_types/decimal.md). Семейство функций включает в себя: -- `toDecimal32OrZero( expr, S)` — возвращает значение типа `Decimal32(S)`. -- `toDecimal64OrZero( expr, S)` — возвращает значение типа `Decimal64(S)`. -- `toDecimal128OrZero( expr, S)` — возвращает значение типа `Decimal128(S)`. +- `toDecimal32OrZero( expr, S)` — возвращает значение типа `Decimal32(S)`. +- `toDecimal64OrZero( expr, S)` — возвращает значение типа `Decimal64(S)`. +- `toDecimal128OrZero( expr, S)` — возвращает значение типа `Decimal128(S)`. Эти функции следует использовать вместо функций `toDecimal*()`, если при ошибке обработки входного значения вы хотите получать `0` вместо исключения. **Параметры** -- `expr` — [выражение](../syntax.md#syntax-expressions), возвращающее значение типа [String](../../data_types/string.md). ClickHouse ожидает текстовое представление десятичного числа. Например, `'1.111'`. -- `S` — количество десятичных знаков в результирующем значении. +- `expr` — [выражение](../syntax.md#syntax-expressions), возвращающее значение типа [String](../../data_types/string.md). ClickHouse ожидает текстовое представление десятичного числа. Например, `'1.111'`. +- `S` — количество десятичных знаков в результирующем значении. **Возвращаемое значение** Значение типа `Nullable(Decimal(P,S))`. `P` равно числовой части имени функции. Например, для функции `toDecimal32OrZero`, `P = 32`. Значение содержит: -- Число с `S` десятичными знаками, если ClickHouse распознал число во входной строке. -- 0 c `S` десятичными знаками, если ClickHouse не смог распознать число во входной строке или входное число содержит больше чем `S` десятичных знаков. +- Число с `S` десятичными знаками, если ClickHouse распознал число во входной строке. +- 0 c `S` десятичными знаками, если ClickHouse не смог распознать число во входной строке или входное число содержит больше чем `S` десятичных знаков. **Пример** -```sql +``` sql SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) ``` -```text +``` text ┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ │ -1.11100 │ Decimal(9, 5) │ └──────────┴────────────────────────────────────────────────────┘ ``` -```sql +``` sql SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val) ``` -```text +``` text ┌──val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 2))─┐ │ 0.00 │ Decimal(9, 2) │ └──────┴────────────────────────────────────────────────────┘ ``` -## toString +## toString {#tostring} Функции преобразования между числами, строками (но не фиксированными строками), датами и датами-с-временем. Все эти функции принимают один аргумент. 
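Условный пример таких преобразований (результаты указаны в комментариях):

``` sql
SELECT
    toString(1.5) AS str,                          -- '1.5'
    toDate('2019-01-01') AS date,                  -- 2019-01-01
    toDateTime('2019-01-01 00:00:00') AS datetime  -- 2019-01-01 00:00:00
```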
@@ -232,12 +234,12 @@ SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val) Форматы даты и даты-с-временем для функций toDate/toDateTime определены следующим образом: -```text +``` text YYYY-MM-DD YYYY-MM-DD hh:mm:ss ``` -В качестве исключения, если делается преобразование из числа типа UInt32, Int32, UInt64, Int64 в Date, и если число больше или равно 65536, то число рассматривается как unix timestamp (а не как число дней) и округляется до даты. Это позволяет поддержать распространённый случай, когда пишут toDate(unix_timestamp), что иначе было бы ошибкой и требовало бы написания более громоздкого toDate(toDateTime(unix_timestamp)) +В качестве исключения, если делается преобразование из числа типа UInt32, Int32, UInt64, Int64 в Date, и если число больше или равно 65536, то число рассматривается как unix timestamp (а не как число дней) и округляется до даты. Это позволяет поддержать распространённый случай, когда пишут toDate(unix\_timestamp), что иначе было бы ошибкой и требовало бы написания более громоздкого toDate(toDateTime(unix\_timestamp)) Преобразование между датой и датой-с-временем производится естественным образом: добавлением нулевого времени или отбрасыванием времени. @@ -245,13 +247,13 @@ YYYY-MM-DD hh:mm:ss Дополнительно, функция toString от аргумента типа DateTime может принимать второй аргумент String - имя тайм-зоны. Пример: `Asia/Yekaterinburg` В этом случае, форматирование времени производится согласно указанной тайм-зоне. -```sql +``` sql SELECT now() AS now_local, toString(now(), 'Asia/Yekaterinburg') AS now_yekat ``` -```text +``` text ┌───────────now_local─┬─now_yekat───────────┐ │ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ └─────────────────────┴─────────────────────┘ @@ -259,59 +261,61 @@ SELECT Также смотрите функцию `toUnixTimestamp`. -## toFixedString(s, N) +## toFixedString(s, N) {#tofixedstrings-n} Преобразует аргумент типа String в тип FixedString(N) (строку фиксированной длины N). N должно быть константой. Если строка имеет меньше байт, чем N, то она дополняется нулевыми байтами справа. Если строка имеет больше байт, чем N - кидается исключение. -## toStringCutToZero(s) +## toStringCutToZero(s) {#tostringcuttozeros} Принимает аргумент типа String или FixedString. Возвращает String, вырезая содержимое строки до первого найденного нулевого байта. Пример: -```sql +``` sql SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut ``` -```text +``` text ┌─s─────────────┬─s_cut─┐ │ foo\0\0\0\0\0 │ foo │ └───────────────┴───────┘ ``` -```sql +``` sql SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut ``` -```text +``` text ┌─s──────────┬─s_cut─┐ │ foo\0bar\0 │ foo │ └────────────┴───────┘ ``` -## reinterpretAsUInt(8|16|32|64) +## reinterpretAsUInt(8\|16\|32\|64) {#reinterpretasuint8163264} -## reinterpretAsInt(8|16|32|64) +## reinterpretAsInt(8\|16\|32\|64) {#reinterpretasint8163264} -## reinterpretAsFloat(32|64) +## reinterpretAsFloat(32\|64) {#reinterpretasfloat3264} -## reinterpretAsDate +## reinterpretAsDate {#reinterpretasdate} -## reinterpretAsDateTime +## reinterpretAsDateTime {#reinterpretasdatetime} Функции принимают строку и интерпретируют байты, расположенные в начале строки, как число в host order (little endian). Если строка имеет недостаточную длину, то функции работают так, как будто строка дополнена необходимым количеством нулевых байт. Если строка длиннее, чем нужно, то лишние байты игнорируются. 
Дата интерпретируется, как число дней с начала unix-эпохи, а дата-с-временем - как число секунд с начала unix-эпохи. -## reinterpretAsString {#type_conversion_functions-reinterpretAsString} +## reinterpretAsString {#type-conversion-functions-reinterpretasstring} + Функция принимает число или дату или дату-с-временем и возвращает строку, содержащую байты, представляющие соответствующее значение в host order (little endian). При этом, отбрасываются нулевые байты с конца. Например, значение 255 типа UInt32 будет строкой длины 1 байт. -## CAST(x, t) {#type_conversion_function-cast} +## CAST(x, t) {#type-conversion-function-cast} + Преобразует x в тип данных t. Поддерживается также синтаксис CAST(x AS t). Пример: -```sql +``` sql SELECT '2016-06-15 23:00:00' AS timestamp, CAST(timestamp AS DateTime) AS datetime, @@ -320,7 +324,7 @@ SELECT CAST(timestamp, 'FixedString(22)') AS fixed_string ``` -```text +``` text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ └─────────────────────┴─────────────────────┴────────────┴─────────────────────┴───────────────────────────┘ @@ -330,32 +334,35 @@ SELECT Поддержано преобразование к типу [Nullable](../../data_types/nullable.md) и обратно. Пример: -```sql +``` sql SELECT toTypeName(x) FROM t_null ``` -```text + +``` text ┌─toTypeName(x)─┐ │ Int8 │ │ Int8 │ └───────────────┘ ``` -```sql + +``` sql SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null ``` -```text + +``` text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ │ Nullable(UInt16) │ └─────────────────────────────────────────┘ ``` -## toInterval(Year|Quarter|Month|Week|Day|Hour|Minute|Second) {#function-tointerval} +## toInterval(Year\|Quarter\|Month\|Week\|Day\|Hour\|Minute\|Second) {#function-tointerval} Приводит аргумент из числового типа данных к типу данных [IntervalType](../../data_types/special_data_types/interval.md). **Синтксис** -```sql +``` sql toIntervalSecond(number) toIntervalMinute(number) toIntervalHour(number) @@ -368,15 +375,15 @@ toIntervalYear(number) **Параметры** -- `number` — длительность интервала. Положительное целое число. +- `number` — длительность интервала. Положительное целое число. **Возвращаемые значения** -- Значение с типом данных `Interval`. +- Значение с типом данных `Interval`. **Пример** -```sql +``` sql WITH toDate('2019-01-01') AS date, INTERVAL 1 WEEK AS interval_week, @@ -386,11 +393,10 @@ SELECT date + interval_to_week ``` -```text +``` text ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ │ 2019-01-08 │ 2019-01-08 │ └───────────────────────────┴──────────────────────────────┘ ``` - [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/type_conversion_functions/) diff --git a/docs/ru/query_language/functions/url_functions.md b/docs/ru/query_language/functions/url_functions.md index abf45003ab6..6dc62b9c193 100644 --- a/docs/ru/query_language/functions/url_functions.md +++ b/docs/ru/query_language/functions/url_functions.md @@ -1,29 +1,30 @@ -# Функции для работы с URL +# Функции для работы с URL {#funktsii-dlia-raboty-s-url} Все функции работают не по RFC - то есть, максимально упрощены ради производительности. -## Функции, извлекающие часть URL-а. +## Функции, извлекающие часть URL-а. {#funktsii-izvlekaiushchie-chast-url-a} Если в URL-е нет ничего похожего, то возвращается пустая строка. -### protocol -Возвращает протокол. 
Примеры: http, ftp, mailto, magnet... +### protocol {#protocol} -### domain +Возвращает протокол. Примеры: http, ftp, mailto, magnet… + +### domain {#domain} Извлекает имя хоста из URL. -```sql +``` sql domain(url) ``` **Параметры** -- `url` — URL. Тип — [String](../../data_types/string.md). +- `url` — URL. Тип — [String](../../data_types/string.md). URL может быть указан со схемой или без неё. Примеры: -```text +``` text svn+ssh://some.svn-hosting.com:80/repo/trunk some.svn-hosting.com:80/repo/trunk https://yandex.com/time/ @@ -31,7 +32,7 @@ https://yandex.com/time/ Для указанных примеров функция `domain` возвращает следующие результаты: -```text +``` text some.svn-hosting.com some.svn-hosting.com yandex.com @@ -39,42 +40,42 @@ yandex.com **Возвращаемые значения** -- Имя хоста. Если ClickHouse может распарсить входную строку как URL. -- Пустая строка. Если ClickHouse не может распарсить входную строку как URL. +- Имя хоста. Если ClickHouse может распарсить входную строку как URL. +- Пустая строка. Если ClickHouse не может распарсить входную строку как URL. Тип — `String`. **Пример** -```sql +``` sql SELECT domain('svn+ssh://some.svn-hosting.com:80/repo/trunk') ``` -```text +``` text ┌─domain('svn+ssh://some.svn-hosting.com:80/repo/trunk')─┐ │ some.svn-hosting.com │ └────────────────────────────────────────────────────────┘ ``` -### domainWithoutWWW +### domainWithoutWWW {#domainwithoutwww} -Возвращает домен, удалив префикс 'www.', если он присутствовал. +Возвращает домен, удалив префикс ‘www.’, если он присутствовал. -### topLevelDomain +### topLevelDomain {#topleveldomain} Извлекает домен верхнего уровня из URL. -```sql +``` sql topLevelDomain(url) ``` **Параметры** -- `url` — URL. Тип — [String](../../data_types/string.md). +- `url` — URL. Тип — [String](../../data_types/string.md). URL может быть указан со схемой или без неё. Примеры: -```text +``` text svn+ssh://some.svn-hosting.com:80/repo/trunk some.svn-hosting.com:80/repo/trunk https://yandex.com/time/ @@ -82,63 +83,75 @@ https://yandex.com/time/ **Возвращаемые значения** -- Имя домена. Если ClickHouse может распарсить входную строку как URL. -- Пустая строка. Если ClickHouse не может распарсить входную строку как URL. +- Имя домена. Если ClickHouse может распарсить входную строку как URL. +- Пустая строка. Если ClickHouse не может распарсить входную строку как URL. Тип — `String`. **Пример** -```sql +``` sql SELECT topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk') ``` -```text +``` text ┌─topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk')─┐ │ com │ └────────────────────────────────────────────────────────────────────┘ ``` -### firstSignificantSubdomain -Возвращает "первый существенный поддомен". Это понятие является нестандартным и специфично для Яндекс.Метрики. Первый существенный поддомен - это домен второго уровня, если он не равен одному из com, net, org, co, или домен третьего уровня, иначе. Например, firstSignificantSubdomain('') = 'yandex', firstSignificantSubdomain('') = 'yandex'. Список "несущественных" доменов второго уровня и другие детали реализации могут изменяться в будущем. +### firstSignificantSubdomain {#firstsignificantsubdomain} -### cutToFirstSignificantSubdomain -Возвращает часть домена, включающую поддомены верхнего уровня до "первого существенного поддомена" (см. выше). +Возвращает «первый существенный поддомен». Это понятие является нестандартным и специфично для Яндекс.Метрики. 
Первый существенный поддомен - это домен второго уровня, если он не равен одному из com, net, org, co, или домен третьего уровня, иначе. Например, firstSignificantSubdomain(‘https://news.yandex.ru/’) = ‘yandex’, firstSignificantSubdomain(‘https://news.yandex.com.tr/’) = ‘yandex’. Список «несущественных» доменов второго уровня и другие детали реализации могут изменяться в будущем. + +### cutToFirstSignificantSubdomain {#cuttofirstsignificantsubdomain} + +Возвращает часть домена, включающую поддомены верхнего уровня до «первого существенного поддомена» (см. выше). Например, `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`. -### path +### path {#path} + Возвращает путь. Пример: `/top/news.html` Путь не включает в себя query string. -### pathFull +### pathFull {#pathfull} + То же самое, но включая query string и fragment. Пример: /top/news.html?page=2\#comments -### queryString +### queryString {#querystring} + Возвращает query-string. Пример: page=1&lr=213. query-string не включает в себя начальный знак вопроса, а также \# и всё, что после \#. -### fragment +### fragment {#fragment} + Возвращает fragment identifier. fragment не включает в себя начальный символ решётки. -### queryStringAndFragment +### queryStringAndFragment {#querystringandfragment} + Возвращает query string и fragment identifier. Пример: страница=1\#29390. -### extractURLParameter(URL, name) +### extractURLParameter(URL, name) {#extracturlparameterurl-name} + Возвращает значение параметра name в URL, если такой есть; или пустую строку, иначе; если параметров с таким именем много - вернуть первый попавшийся. Функция работает при допущении, что имя параметра закодировано в URL в точности таким же образом, что и в переданном аргументе. -### extractURLParameters(URL) +### extractURLParameters(URL) {#extracturlparametersurl} + Возвращает массив строк вида name=value, соответствующих параметрам URL. Значения никак не декодируются. -### extractURLParameterNames(URL) +### extractURLParameterNames(URL) {#extracturlparameternamesurl} + Возвращает массив строк вида name, соответствующих именам параметров URL. Значения никак не декодируются. -### URLHierarchy(URL) +### URLHierarchy(URL) {#urlhierarchyurl} + Возвращает массив, содержащий URL, обрезанный с конца по символам /, ? в пути и query-string. Подряд идущие символы-разделители считаются за один. Резка производится в позиции после всех подряд идущих символов-разделителей. Пример: -### URLPathHierarchy(URL) +### URLPathHierarchy(URL) {#urlpathhierarchyurl} + То же самое, но без протокола и хоста в результате. Элемент / (корень) не включается. Пример: Функция используется для реализации древовидных отчётов по URL в Яндекс.Метрике. -```text +``` text URLPathHierarchy('https://example.com/browse/CONV-6788') = [ '/browse/', @@ -146,37 +159,43 @@ URLPathHierarchy('https://example.com/browse/CONV-6788') = ] ``` -### decodeURLComponent(URL) +### decodeURLComponent(URL) {#decodeurlcomponenturl} + Возвращает декодированный URL. Пример: -```sql +``` sql SELECT decodeURLComponent('http://127.0.0.1:8123/?query=SELECT%201%3B') AS DecodedURL; ``` -```text +``` text ┌─DecodedURL─────────────────────────────┐ │ http://127.0.0.1:8123/?query=SELECT 1; │ └────────────────────────────────────────┘ ``` -## Функции, удаляющие часть из URL-а +## Функции, удаляющие часть из URL-а {#funktsii-udaliaiushchie-chast-iz-url-a} Если в URL-е нет ничего похожего, то URL остаётся без изменений. -### cutWWW -Удаляет не более одного 'www.' с начала домена URL-а, если есть. 
+### cutWWW {#cutwww} + +Удаляет не более одного ‘www.’ с начала домена URL-а, если есть. + +### cutQueryString {#cutquerystring} -### cutQueryString Удаляет query string. Знак вопроса тоже удаляется. -### cutFragment +### cutFragment {#cutfragment} + Удаляет fragment identifier. Символ решётки тоже удаляется. -### cutQueryStringAndFragment +### cutQueryStringAndFragment {#cutquerystringandfragment} + Удаляет query string и fragment identifier. Знак вопроса и символ решётки тоже удаляются. -### cutURLParameter(URL, name) +### cutURLParameter(URL, name) {#cuturlparameterurl-name} + Удаляет параметр URL с именем name, если такой есть. Функция работает при допущении, что имя параметра закодировано в URL в точности таким же образом, что и в переданном аргументе. [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/url_functions/) diff --git a/docs/ru/query_language/functions/uuid_functions.md b/docs/ru/query_language/functions/uuid_functions.md index aa8ae451d6b..1f52ba27e31 100644 --- a/docs/ru/query_language/functions/uuid_functions.md +++ b/docs/ru/query_language/functions/uuid_functions.md @@ -1,10 +1,10 @@ -# Функции для работы с UUID +# Функции для работы с UUID {#funktsii-dlia-raboty-s-uuid} -## generateUUIDv4 {#uuid_function-generate} +## generateUUIDv4 {#uuid-function-generate} Генерирует идентификатор [UUID версии 4](https://tools.ietf.org/html/rfc4122#section-4.4). -```sql +``` sql generateUUIDv4() ``` @@ -16,7 +16,7 @@ generateUUIDv4() Этот пример демонстрирует, как создать таблицу с UUID-колонкой и добавить в нее сгенерированный UUID. -```sql +``` sql CREATE TABLE t_uuid (x UUID) ENGINE=TinyLog INSERT INTO t_uuid SELECT generateUUIDv4() @@ -24,17 +24,17 @@ INSERT INTO t_uuid SELECT generateUUIDv4() SELECT * FROM t_uuid ``` -```text +``` text ┌────────────────────────────────────x─┐ │ f4bf890f-f9dc-4332-ad5c-0c18e73f28e9 │ └──────────────────────────────────────┘ ``` -## toUUID (x) +## toUUID (x) {#touuid-x} Преобразует значение типа String в тип UUID. -```sql +``` sql toUUID(String) ``` @@ -44,21 +44,21 @@ toUUID(String) **Пример использования** -```sql +``` sql SELECT toUUID('61f0c404-5cb3-11e7-907b-a6006ad3dba0') AS uuid ``` -```text +``` text ┌─────────────────────────────────uuid─┐ │ 61f0c404-5cb3-11e7-907b-a6006ad3dba0 │ └──────────────────────────────────────┘ ``` -## UUIDStringToNum +## UUIDStringToNum {#uuidstringtonum} Принимает строку, содержащую 36 символов в формате `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`, и возвращает в виде набора байт в [FixedString(16)](../../data_types/fixedstring.md). -```sql +``` sql UUIDStringToNum(String) ``` @@ -68,23 +68,23 @@ FixedString(16) **Пример использования** -```sql -SELECT - '612f3c40-5d3b-217e-707b-6a546a3d7b29' AS uuid, +``` sql +SELECT + '612f3c40-5d3b-217e-707b-6a546a3d7b29' AS uuid, UUIDStringToNum(uuid) AS bytes ``` -```text +``` text ┌─uuid─────────────────────────────────┬─bytes────────────┐ │ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ a/<@];!~p{jTj={) │ └──────────────────────────────────────┴──────────────────┘ ``` -## UUIDNumToString +## UUIDNumToString {#uuidnumtostring} Принимает значение типа [FixedString(16)](../../data_types/fixedstring.md). Возвращает строку из 36 символов в текстовом виде. 
-```sql +``` sql UUIDNumToString(FixedString(16)) ``` @@ -94,22 +94,21 @@ UUIDNumToString(FixedString(16)) **Пример использования** -```sql -SELECT - 'a/<@];!~p{jTj={)' AS bytes, +``` sql +SELECT + 'a/<@];!~p{jTj={)' AS bytes, UUIDNumToString(toFixedString(bytes, 16)) AS uuid ``` -```text +``` text ┌─bytes────────────┬─uuid─────────────────────────────────┐ │ a/<@];!~p{jTj={) │ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ └──────────────────┴──────────────────────────────────────┘ ``` -## См. также: +## См. также: {#sm-takzhe} -- [dictGetUUID](ext_dict_functions.md) -- [dictGetUUIDOrDefault](ext_dict_functions.md) +- [dictGetUUID](ext_dict_functions.md) +- [dictGetUUIDOrDefault](ext_dict_functions.md) [Original article](https://clickhouse.tech/docs/en/query_language/functions/uuid_function/) - diff --git a/docs/ru/query_language/functions/ym_dict_functions.md b/docs/ru/query_language/functions/ym_dict_functions.md index 4340d6c8e16..0f08831b8dc 100644 --- a/docs/ru/query_language/functions/ym_dict_functions.md +++ b/docs/ru/query_language/functions/ym_dict_functions.md @@ -1,45 +1,47 @@ -# Функции для работы со словарями Яндекс.Метрики {#ym_dict_functions} +# Функции для работы со словарями Яндекс.Метрики {#ym-dict-functions} Чтобы указанные ниже функции работали, в конфиге сервера должны быть указаны пути и адреса для получения всех словарей Яндекс.Метрики. Словари загружаются при первом вызове любой из этих функций. Если справочники не удаётся загрузить - будет выкинуто исключение. -О том, как создать справочники, смотрите в разделе "Словари". +О том, как создать справочники, смотрите в разделе «Словари». -## Множественные геобазы +## Множественные геобазы {#mnozhestvennye-geobazy} ClickHouse поддерживает работу одновременно с несколькими альтернативными геобазами (иерархиями регионов), для того чтобы можно было поддержать разные точки зрения о принадлежности регионов странам. В конфиге clickhouse-server указывается файл с иерархией регионов: `/opt/geo/regions_hierarchy.txt` -Кроме указанного файла, рядом ищутся файлы, к имени которых (до расширения) добавлен символ _ и какой угодно суффикс. +Кроме указанного файла, рядом ищутся файлы, к имени которых (до расширения) добавлен символ \_ и какой угодно суффикс. Например, также найдётся файл `/opt/geo/regions_hierarchy_ua.txt`, если такой есть. `ua` называется ключом словаря. Для словаря без суффикса, ключ является пустой строкой. -Все словари перезагружаются в рантайме (раз в количество секунд, заданное в конфигурационном параметре builtin_dictionaries_reload_interval, по умолчанию - раз в час), но перечень доступных словарей определяется один раз, при старте сервера. +Все словари перезагружаются в рантайме (раз в количество секунд, заданное в конфигурационном параметре builtin\_dictionaries\_reload\_interval, по умолчанию - раз в час), но перечень доступных словарей определяется один раз, при старте сервера. Во все функции по работе с регионами, в конце добавлен один необязательный аргумент - ключ словаря. Далее он обозначен как geobase. Пример: -```text +``` text regionToCountry(RegionID) - использует словарь по умолчанию: /opt/geo/regions_hierarchy.txt; regionToCountry(RegionID, '') - использует словарь по умолчанию: /opt/geo/regions_hierarchy.txt; regionToCountry(RegionID, 'ua') - использует словарь для ключа ua: /opt/geo/regions_hierarchy_ua.txt; ``` -### regionToCity(id\[, geobase\]) +### regionToCity(id\[, geobase\]) {#regiontocityid-geobase} + Принимает число типа UInt32 - идентификатор региона из геобазы Яндекса. 
Если регион является городом или входит в некоторый город, то возвращает идентификатор региона - соответствующего города. Иначе возвращает 0. -### regionToArea(id\[, geobase\]) +### regionToArea(id\[, geobase\]) {#regiontoareaid-geobase} + Переводит регион в область (тип в геобазе - 5). В остальном, аналогично функции regionToCity. -```sql +``` sql SELECT DISTINCT regionToName(regionToArea(toUInt32(number), 'ua')) FROM system.numbers LIMIT 15 ``` -```text +``` text ┌─regionToName(regionToArea(toUInt32(number), \'ua\'))─┐ │ │ │ Москва и Московская область │ @@ -59,16 +61,17 @@ LIMIT 15 └──────────────────────────────────────────────────────┘ ``` -### regionToDistrict(id\[, geobase\]) +### regionToDistrict(id\[, geobase\]) {#regiontodistrictid-geobase} + Переводит регион в федеральный округ (тип в геобазе - 4). В остальном, аналогично функции regionToCity. -```sql +``` sql SELECT DISTINCT regionToName(regionToDistrict(toUInt32(number), 'ua')) FROM system.numbers LIMIT 15 ``` -```text +``` text ┌─regionToName(regionToDistrict(toUInt32(number), \'ua\'))─┐ │ │ │ Центральный федеральный округ │ @@ -88,29 +91,35 @@ LIMIT 15 └──────────────────────────────────────────────────────────┘ ``` -### regionToCountry(id\[, geobase\]) +### regionToCountry(id\[, geobase\]) {#regiontocountryid-geobase} + Переводит регион в страну. В остальном, аналогично функции regionToCity. Пример: `regionToCountry(toUInt32(213)) = 225` - преобразовали Москву (213) в Россию (225). -### regionToContinent(id\[, geobase\]) +### regionToContinent(id\[, geobase\]) {#regiontocontinentid-geobase} + Переводит регион в континент. В остальном, аналогично функции regionToCity. Пример: `regionToContinent(toUInt32(213)) = 10001` - преобразовали Москву (213) в Евразию (10001). -### regionToPopulation(id\[, geobase\]) +### regionToPopulation(id\[, geobase\]) {#regiontopopulationid-geobase} + Получает население для региона. -Население может быть прописано в файлах с геобазой. Смотрите в разделе "Встроенные словари". +Население может быть прописано в файлах с геобазой. Смотрите в разделе «Встроенные словари». Если для региона не прописано население, возвращается 0. В геобазе Яндекса, население может быть прописано для дочерних регионов, но не прописано для родительских. -### regionIn(lhs, rhs\[, geobase\]) +### regionIn(lhs, rhs\[, geobase\]) {#regioninlhs-rhs-geobase} + Проверяет принадлежность региона lhs региону rhs. Возвращает число типа UInt8, равное 1, если принадлежит и 0, если не принадлежит. Отношение рефлексивное - любой регион принадлежит также самому себе. -### regionHierarchy(id\[, geobase\]) +### regionHierarchy(id\[, geobase\]) {#regionhierarchyid-geobase} + Принимает число типа UInt32 - идентификатор региона из геобазы Яндекса. Возвращает массив идентификаторов регионов, состоящий из переданного региона и всех родителей по цепочке. Пример: `regionHierarchy(toUInt32(213)) = [213,1,3,225,10001,10000]`. -### regionToName(id\[, lang\]) +### regionToName(id\[, lang\]) {#regiontonameid-lang} + Принимает число типа UInt32 - идентификатор региона из геобазы Яндекса. Вторым аргументом может быть передана строка - название языка. Поддерживаются языки ru, en, ua, uk, by, kz, tr. Если второй аргумент отсутствует - используется язык ru. Если язык не поддерживается - кидается исключение. Возвращает строку - название региона на соответствующем языке. Если региона с указанным идентификатором не существует - возвращается пустая строка. `ua` и `uk` обозначают одно и то же - украинский язык. 
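These region functions compose naturally; below is a minimal sketch, assuming the default Yandex geobase (`/opt/geo/regions_hierarchy.txt`) described above is loaded. The commented values follow the examples given in this section and depend on the geobase contents.

``` sql
SELECT
    regionToName(toUInt32(213))                  AS city,      -- 'Москва' with the default geobase
    regionToName(regionToCountry(toUInt32(213))) AS country,   -- regionToCountry(213) = 225 -> 'Россия'
    regionIn(toUInt32(213), toUInt32(225))       AS in_russia, -- 1: Moscow belongs to Russia
    regionHierarchy(toUInt32(213))               AS chain      -- [213,1,3,225,10001,10000]
```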
diff --git a/docs/ru/query_language/index.md b/docs/ru/query_language/index.md index 20b8ecdabec..d57d8971a7d 100644 --- a/docs/ru/query_language/index.md +++ b/docs/ru/query_language/index.md @@ -1,9 +1,9 @@ -# Справка по SQL +# Справка по SQL {#spravka-po-sql} -* [SELECT](select.md) -* [INSERT INTO](insert_into.md) -* [CREATE](create.md) -* [ALTER](alter.md#query_language_queries_alter) -* [Прочие виды запросов](misc.md) +- [SELECT](select.md) +- [INSERT INTO](insert_into.md) +- [CREATE](create.md) +- [ALTER](alter.md#query_language_queries_alter) +- [Прочие виды запросов](misc.md) [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/) diff --git a/docs/ru/query_language/insert_into.md b/docs/ru/query_language/insert_into.md index 9919984bffb..9bc3dd24b6e 100644 --- a/docs/ru/query_language/insert_into.md +++ b/docs/ru/query_language/insert_into.md @@ -1,30 +1,29 @@ - -## INSERT +## INSERT {#insert} Добавление данных. Базовый формат запроса: -```sql +``` sql INSERT INTO [db.]table [(c1, c2, c3)] VALUES (v11, v12, v13), (v21, v22, v23), ... ``` В запросе можно указать список столбцов для вставки `[(c1, c2, c3)]`. В этом случае, в остальные столбцы записываются: -- Значения, вычисляемые из `DEFAULT` выражений, указанных в определении таблицы. -- Нули и пустые строки, если `DEFAULT` не определены. +- Значения, вычисляемые из `DEFAULT` выражений, указанных в определении таблицы. +- Нули и пустые строки, если `DEFAULT` не определены. -Если [strict_insert_defaults=1](../operations/settings/settings.md), то столбцы, для которых не определены `DEFAULT`, необходимо перечислить в запросе. +Если [strict\_insert\_defaults=1](../operations/settings/settings.md), то столбцы, для которых не определены `DEFAULT`, необходимо перечислить в запросе. В INSERT можно передавать данные любого [формата](../interfaces/formats.md#formats), который поддерживает ClickHouse. Для этого формат необходимо указать в запросе в явном виде: -```sql +``` sql INSERT INTO [db.]table [(c1, c2, c3)] FORMAT format_name data_set ``` -Например, следующий формат запроса идентичен базовому варианту INSERT ... VALUES: +Например, следующий формат запроса идентичен базовому варианту INSERT … VALUES: -```sql +``` sql INSERT INTO [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23), ... ``` @@ -32,21 +31,21 @@ ClickHouse отсекает все пробелы и один перенос с Пример: -```sql +``` sql INSERT INTO t FORMAT TabSeparated 11 Hello, world! 22 Qwerty ``` -С помощью консольного клиента или HTTP интерфейса можно вставлять данные отдельно от запроса. Как это сделать, читайте в разделе "[Интерфейсы](../interfaces/index.md#interfaces)". +С помощью консольного клиента или HTTP интерфейса можно вставлять данные отдельно от запроса. Как это сделать, читайте в разделе «[Интерфейсы](../interfaces/index.md#interfaces)». -### Ограничения (constraints) +### Ограничения (constraints) {#ogranicheniia-constraints} Если в таблице объявлены [ограничения](create.md#constraints), то их выполнимость будет проверена для каждой вставляемой строки. Если для хотя бы одной строки ограничения не будут выполнены, запрос будет остановлен. -### Вставка результатов `SELECT` {#insert_query_insert-select} +### Вставка результатов `SELECT` {#insert-query-insert-select} -```sql +``` sql INSERT INTO [db.]table [(c1, c2, c3)] SELECT ... ``` @@ -60,16 +59,16 @@ INSERT INTO [db.]table [(c1, c2, c3)] SELECT ... Для табличной функции [input()](table_functions/input.md) после секции `SELECT` должна следовать секция `FORMAT`. 
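A hedged sketch of the two `INSERT ... SELECT` forms mentioned above; the table names `dst_table` and `src_table` are assumptions, not part of the original documentation.

``` sql
-- copy a filtered subset of rows from one table into another
INSERT INTO dst_table SELECT * FROM src_table WHERE EventDate = today()

-- with the input() table function the FORMAT clause follows the SELECT,
-- and the data itself is sent together with the query (e.g. via clickhouse-client or HTTP)
INSERT INTO dst_table SELECT lower(col1), col2 * 2 FROM input('col1 String, col2 UInt32') FORMAT CSV
```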
-### Замечания о производительности +### Замечания о производительности {#zamechaniia-o-proizvoditelnosti} `INSERT` сортирует входящие данные по первичному ключу и разбивает их на партиции по ключу партиционирования. Если вы вставляете данные в несколько партиций одновременно, то это может значительно снизить производительность запроса `INSERT`. Чтобы избежать этого: -- Добавляйте данные достаточно большими пачками. Например, по 100 000 строк. -- Группируйте данные по ключу партиционирования самостоятельно перед загрузкой в ClickHouse. +- Добавляйте данные достаточно большими пачками. Например, по 100 000 строк. +- Группируйте данные по ключу партиционирования самостоятельно перед загрузкой в ClickHouse. Снижения производительности не будет, если: -- Данные поступают в режиме реального времени. -- Вы загружаете данные, которые как правило отсортированы по времени. +- Данные поступают в режиме реального времени. +- Вы загружаете данные, которые как правило отсортированы по времени. [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/insert_into/) diff --git a/docs/ru/query_language/misc.md b/docs/ru/query_language/misc.md index d75258c8e80..2304b4921f8 100644 --- a/docs/ru/query_language/misc.md +++ b/docs/ru/query_language/misc.md @@ -1,25 +1,25 @@ -# Прочие виды запросов +# Прочие виды запросов {#prochie-vidy-zaprosov} -## ATTACH +## ATTACH {#attach} Запрос полностью аналогичен запросу `CREATE`, но: -- вместо слова `CREATE` используется слово `ATTACH`; -- запрос не создаёт данные на диске, а предполагает, что данные уже лежат в соответствующих местах, и всего лишь добавляет информацию о таблице на сервер. После выполнения запроса `ATTACH` сервер будет знать о существовании таблицы. +- вместо слова `CREATE` используется слово `ATTACH`; +- запрос не создаёт данные на диске, а предполагает, что данные уже лежат в соответствующих местах, и всего лишь добавляет информацию о таблице на сервер. После выполнения запроса `ATTACH` сервер будет знать о существовании таблицы. Если таблица перед этим была отсоединена (`DETACH`), т.е. её структура известна, можно использовать сокращенную форму записи без определения структуры. -```sql +``` sql ATTACH TABLE [IF NOT EXISTS] [db.]name [ON CLUSTER cluster] ``` Этот запрос используется при старте сервера. Сервер хранит метаданные таблиц в виде файлов с запросами `ATTACH`, которые он просто исполняет при запуске (за исключением системных таблиц, которые явно создаются на сервере). -## CHECK TABLE +## CHECK TABLE {#check-table} Проверяет таблицу на повреждение данных. -```sql +``` sql CHECK TABLE [db.]name ``` @@ -27,15 +27,15 @@ CHECK TABLE [db.]name Ответ содержит колонку `result`, содержащую одну строку с типом [Boolean](../data_types/boolean.md). Допустимые значения: -- 0 - данные в таблице повреждены; -- 1 - данные не повреждены. +- 0 - данные в таблице повреждены; +- 1 - данные не повреждены. Запрос `CHECK TABLE` поддерживает следующие движки таблиц: -- [Log](../operations/table_engines/log.md) -- [TinyLog](../operations/table_engines/tinylog.md) -- [StripeLog](../operations/table_engines/stripelog.md) -- [Семейство MergeTree](../operations/table_engines/mergetree.md) +- [Log](../operations/table_engines/log.md) +- [TinyLog](../operations/table_engines/tinylog.md) +- [StripeLog](../operations/table_engines/stripelog.md) +- [Семейство MergeTree](../operations/table_engines/mergetree.md) При попытке выполнить запрос с таблицами с другими табличными движками, ClickHouse генерирует исключение. 
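A minimal sketch of the query form (the table name is hypothetical); for the supported engines listed above the response is a single row with a `result` column.

``` sql
-- result = 1 means the data is intact, result = 0 means corruption was detected
CHECK TABLE my_log_table
```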
@@ -47,70 +47,71 @@ CHECK TABLE [db.]name В этом случае можно скопировать оставшиеся неповрежденные данные в другую таблицу. Для этого: -1. Создайте новую таблицу с такой же структурой, как у поврежденной таблицы. Для этого выполните запрос `CREATE TABLE AS `. -2. Установите значение параметра [max_threads](../operations/settings/settings.md#settings-max_threads) в 1. Это нужно для того, чтобы выполнить следующий запрос в одном потоке. Установить значение параметра можно через запрос: `SET max_threads = 1`. -3. Выполните запрос `INSERT INTO SELECT * FROM `. В результате неповрежденные данные будут скопированы в другую таблицу. Обратите внимание, будут скопированы только те данные, которые следуют до поврежденного участка. -4. Перезапустите `clickhouse-client`, чтобы вернуть предыдущее значение параметра `max_threads`. +1. Создайте новую таблицу с такой же структурой, как у поврежденной таблицы. Для этого выполните запрос `CREATE TABLE AS `. +2. Установите значение параметра [max\_threads](../operations/settings/settings.md#settings-max_threads) в 1. Это нужно для того, чтобы выполнить следующий запрос в одном потоке. Установить значение параметра можно через запрос: `SET max_threads = 1`. +3. Выполните запрос `INSERT INTO SELECT * FROM `. В результате неповрежденные данные будут скопированы в другую таблицу. Обратите внимание, будут скопированы только те данные, которые следуют до поврежденного участка. +4. Перезапустите `clickhouse-client`, чтобы вернуть предыдущее значение параметра `max_threads`. ## DESCRIBE TABLE {#misc-describe-table} -```sql +``` sql DESC|DESCRIBE TABLE [db.]table [INTO OUTFILE filename] [FORMAT format] ``` + Возвращает описание столбцов таблицы. Результат запроса содержит столбцы (все столбцы имеют тип String): -- `name` — имя столбца таблицы; -- `type`— тип столбца; -- `default_type` — в каком виде задано [выражение для значения по умолчанию](create.md#create-default-values): `DEFAULT`, `MATERIALIZED` или `ALIAS`. Столбец содержит пустую строку, если значение по умолчанию не задано. -- `default_expression` — значение, заданное в секции `DEFAULT`; -- `comment_expression` — комментарий к столбцу. +- `name` — имя столбца таблицы; +- `type`— тип столбца; +- `default_type` — в каком виде задано [выражение для значения по умолчанию](create.md#create-default-values): `DEFAULT`, `MATERIALIZED` или `ALIAS`. Столбец содержит пустую строку, если значение по умолчанию не задано. +- `default_expression` — значение, заданное в секции `DEFAULT`; +- `comment_expression` — комментарий к столбцу. -Вложенные структуры данных выводятся в "развёрнутом" виде. То есть, каждый столбец - по отдельности, с именем через точку. +Вложенные структуры данных выводятся в «развёрнутом» виде. То есть, каждый столбец - по отдельности, с именем через точку. -## DETACH +## DETACH {#detach} Удаляет из сервера информацию о таблице name. Сервер перестаёт знать о существовании таблицы. -```sql +``` sql DETACH TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] ``` Но ни данные, ни метаданные таблицы не удаляются. При следующем запуске сервера, сервер прочитает метаданные и снова узнает о таблице. -Также, "отцепленную" таблицу можно прицепить заново запросом `ATTACH` (за исключением системных таблиц, для которых метаданные не хранятся). +Также, «отцепленную» таблицу можно прицепить заново запросом `ATTACH` (за исключением системных таблиц, для которых метаданные не хранятся). Запроса `DETACH DATABASE` нет. -## DROP +## DROP {#drop} Запрос имеет два вида: `DROP DATABASE` и `DROP TABLE`. 
-```sql +``` sql DROP DATABASE [IF EXISTS] db [ON CLUSTER cluster] ``` Удаляет все таблицы внутри базы данных db, а затем саму базу данных db. Если указано `IF EXISTS` - не выдавать ошибку, если база данных не существует. -```sql +``` sql DROP [TEMPORARY] TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] ``` Удаляет таблицу. Если указано `IF EXISTS` - не выдавать ошибку, если таблица не существует или база данных не существует. -## EXISTS +## EXISTS {#exists} -```sql +``` sql EXISTS [TEMPORARY] TABLE [db.]name [INTO OUTFILE filename] [FORMAT format] ``` Возвращает один столбец типа `UInt8`, содержащий одно значение - `0`, если таблицы или БД не существует и `1`, если таблица в указанной БД существует. -## KILL QUERY +## KILL QUERY {#kill-query} -```sql +``` sql KILL QUERY [ON CLUSTER cluster] WHERE [SYNC|ASYNC|TEST] @@ -122,7 +123,7 @@ KILL QUERY [ON CLUSTER cluster] Примеры -```sql +``` sql -- Принудительно останавливает все запросы с указанным query_id: KILL QUERY WHERE query_id='2-857d-4a57-9ee0-327da5d60a90' @@ -137,15 +138,15 @@ Readonly-пользователи могут останавливать толь Синхронный вариант (`SYNC`) ожидает остановки всех запросов и построчно выводит информацию о процессах по ходу их остановки. Ответ содержит колонку `kill_status`, которая может принимать следующие значения: -1. 'finished' - запрос был успешно остановлен; -2. 'waiting' - запросу отправлен сигнал завершения, ожидается его остановка; -3. остальные значения описывают причину невозможности остановки запроса. +1. ‘finished’ - запрос был успешно остановлен; +2. ‘waiting’ - запросу отправлен сигнал завершения, ожидается его остановка; +3. остальные значения описывают причину невозможности остановки запроса. Тестовый вариант запроса (`TEST`) только проверяет права пользователя и выводит список запросов для остановки. ## KILL MUTATION {#kill-mutation} -```sql +``` sql KILL MUTATION [ON CLUSTER cluster] WHERE [TEST] @@ -158,7 +159,7 @@ KILL MUTATION [ON CLUSTER cluster] Примеры: -```sql +``` sql -- Останавливает все мутации одной таблицы: KILL MUTATION WHERE database = 'default' AND table = 'table' @@ -170,9 +171,9 @@ KILL MUTATION WHERE database = 'default' AND table = 'table' AND mutation_id = ' Данные, уже изменённые мутацией, остаются в таблице (отката на старую версию данных не происходит). -## OPTIMIZE {#misc_operations-optimize} +## OPTIMIZE {#misc-operations-optimize} -```sql +``` sql OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE] ``` @@ -180,19 +181,19 @@ OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION I Если `OPTIMIZE` применяется к таблицам семейства [ReplicatedMergeTree](../operations/table_engines/replication.md), ClickHouse создаёт задачу на мёрж и ожидает её исполнения на всех узлах (если активирована настройка `replication_alter_partitions_sync`). -- Если `OPTIMIZE` не выполняет мёрж по любой причине, ClickHouse не оповещает об этом клиента. Чтобы включить оповещения, используйте настройку [optimize_throw_if_noop](../operations/settings/settings.md#setting-optimize_throw_if_noop). -- Если указать `PARTITION`, то оптимизация выполняется только для указанной партиции. [Как задавать имя партиции в запросах](alter.md#alter-how-to-specify-part-expr). -- Если указать `FINAL`, то оптимизация выполняется даже в том случае, если все данные уже лежат в одном куске. 
-- Если указать `DEDUPLICATE`, то произойдет схлопывание полностью одинаковых строк (сравниваются значения во всех колонках), имеет смысл только для движка MergeTree. +- Если `OPTIMIZE` не выполняет мёрж по любой причине, ClickHouse не оповещает об этом клиента. Чтобы включить оповещения, используйте настройку [optimize\_throw\_if\_noop](../operations/settings/settings.md#setting-optimize_throw_if_noop). +- Если указать `PARTITION`, то оптимизация выполняется только для указанной партиции. [Как задавать имя партиции в запросах](alter.md#alter-how-to-specify-part-expr). +- Если указать `FINAL`, то оптимизация выполняется даже в том случае, если все данные уже лежат в одном куске. +- Если указать `DEDUPLICATE`, то произойдет схлопывание полностью одинаковых строк (сравниваются значения во всех колонках), имеет смысл только для движка MergeTree. !!! warning "Внимание" - Запрос `OPTIMIZE` не может устранить причину появления ошибки "Too many parts". + Запрос `OPTIMIZE` не может устранить причину появления ошибки «Too many parts». -## RENAME {#misc_operations-rename} +## RENAME {#misc-operations-rename} Переименовывает одну или несколько таблиц. -```sql +``` sql RENAME TABLE [db11.]name11 TO [db12.]name12, [db21.]name21 TO [db22.]name22, ... [ON CLUSTER cluster] ``` @@ -200,7 +201,7 @@ RENAME TABLE [db11.]name11 TO [db12.]name12, [db21.]name21 TO [db22.]name22, ... ## SET {#query-set} -```sql +``` sql SET param = value ``` @@ -208,15 +209,15 @@ SET param = value Можно одним запросом установить все настройки из заданного профиля настроек. -```sql +``` sql SET profile = 'profile-name-from-the-settings-file' ``` Подробности смотрите в разделе [Настройки](../operations/settings/settings.md). -## TRUNCATE +## TRUNCATE {#truncate} -```sql +``` sql TRUNCATE TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] ``` @@ -224,9 +225,9 @@ TRUNCATE TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] Запрос `TRUNCATE` не поддерживается для следующих движков: [View](../operations/table_engines/view.md), [File](../operations/table_engines/file.md), [URL](../operations/table_engines/url.md) и [Null](../operations/table_engines/null.md). -## USE +## USE {#use} -```sql +``` sql USE db ``` diff --git a/docs/ru/query_language/operators.md b/docs/ru/query_language/operators.md index 1bc1fa62719..e88a6606aa5 100644 --- a/docs/ru/query_language/operators.md +++ b/docs/ru/query_language/operators.md @@ -1,19 +1,19 @@ -# Операторы +# Операторы {#operatory} Все операторы преобразуются в соответствующие функции на этапе парсинга запроса, с учётом их приоритетов и ассоциативности. Далее будут перечислены группы операторов в порядке их приоритета (чем выше, тем раньше оператор связывается со своими аргументами). -## Операторы доступа +## Операторы доступа {#operatory-dostupa} `a[N]` - доступ к элементу массива, функция `arrayElement(a, N)`. `a.N` - доступ к элементу кортежа, функция `tupleElement(a, N)`. -## Оператор числового отрицания +## Оператор числового отрицания {#operator-chislovogo-otritsaniia} `-a` - функция `negate(a)`. 
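A small sketch showing how the access and negation operators above are rewritten into the corresponding functions:

``` sql
SELECT
    [10, 20, 30][2]   AS array_elem, -- arrayElement([10, 20, 30], 2) = 20
    tuple(1, 'two').2 AS tuple_elem, -- tupleElement(tuple(1, 'two'), 2) = 'two'
    -(1 + 2)          AS negated     -- negate(plus(1, 2)) = -3
```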
-## Операторы умножения и деления +## Операторы умножения и деления {#operatory-umnozheniia-i-deleniia} `a * b` - функция `multiply(a, b)` @@ -21,13 +21,13 @@ `a % b` - функция `modulo(a, b)` -## Операторы сложения и вычитания +## Операторы сложения и вычитания {#operatory-slozheniia-i-vychitaniia} `a + b` - функция `plus(a, b)` `a - b` - функция `minus(a, b)` -## Операторы сравнения +## Операторы сравнения {#operatory-sravneniia} `a = b` - функция `equals(a, b)` @@ -53,7 +53,7 @@ `a NOT BETWEEN b AND c` - равнозначно `a < b OR a > c` -## Операторы для работы с множествами +## Операторы для работы с множествами {#operatory-dlia-raboty-s-mnozhestvami} *Смотрите раздел [Операторы IN](select.md#select-in-operators).* @@ -67,9 +67,9 @@ ## Оператор для работы с датами и временем {#operators-datetime} -### EXTRACT +### EXTRACT {#extract} -```sql +``` sql EXTRACT(part FROM date); ``` @@ -77,12 +77,12 @@ EXTRACT(part FROM date); В параметре `part` указывается, какой фрагмент даты нужно получить. Доступные значения: -- `DAY` — День. Возможные значения: 1–31. -- `MONTH` — Номер месяца. Возможные значения: 1–12. -- `YEAR` — Год. -- `SECOND` — Секунда. Возможные значения: 0–59. -- `MINUTE` — Минута. Возможные значения: 0–59. -- `HOUR` — Час. Возможные значения: 0–23. +- `DAY` — День. Возможные значения: 1–31. +- `MONTH` — Номер месяца. Возможные значения: 1–12. +- `YEAR` — Год. +- `SECOND` — Секунда. Возможные значения: 0–59. +- `MINUTE` — Минута. Возможные значения: 0–59. +- `HOUR` — Час. Возможные значения: 0–23. Эти значения могут быть указаны также в нижнем регистре (`day`, `month`). @@ -90,7 +90,7 @@ EXTRACT(part FROM date); Примеры: -```sql +``` sql SELECT EXTRACT(DAY FROM toDate('2017-06-15')); SELECT EXTRACT(MONTH FROM toDate('2017-06-15')); SELECT EXTRACT(YEAR FROM toDate('2017-06-15')); @@ -98,7 +98,7 @@ SELECT EXTRACT(YEAR FROM toDate('2017-06-15')); В следующем примере создадим таблицу и добавим в неё значение с типом `DateTime`. 
-```sql +``` sql CREATE TABLE test.Orders ( OrderId UInt64, @@ -108,10 +108,11 @@ CREATE TABLE test.Orders ENGINE = Log; ``` -```sql +``` sql INSERT INTO test.Orders VALUES (1, 'Jarlsberg Cheese', toDateTime('2008-10-11 13:23:44')); ``` -```sql + +``` sql SELECT toYear(OrderDate) AS OrderYear, toMonth(OrderDate) AS OrderMonth, @@ -122,7 +123,7 @@ SELECT FROM test.Orders; ``` -```text +``` text ┌─OrderYear─┬─OrderMonth─┬─OrderDay─┬─OrderHour─┬─OrderMinute─┬─OrderSecond─┐ │ 2008 │ 10 │ 11 │ 13 │ 23 │ 44 │ └───────────┴────────────┴──────────┴───────────┴─────────────┴─────────────┘ @@ -149,10 +150,11 @@ FROM test.Orders; Пример: -```sql +``` sql SELECT now() AS current_date_time, current_date_time + INTERVAL 4 DAY + INTERVAL 3 HOUR ``` -```text + +``` text ┌───current_date_time─┬─plus(plus(now(), toIntervalDay(4)), toIntervalHour(3))─┐ │ 2019-10-23 11:16:28 │ 2019-10-27 14:16:28 │ └─────────────────────┴────────────────────────────────────────────────────────┘ @@ -160,22 +162,22 @@ SELECT now() AS current_date_time, current_date_time + INTERVAL 4 DAY + INTERVAL **Смотрите также** -- Тип данных [Interval](../data_types/special_data_types/interval.md) -- Функции преобразования типов [toInterval](functions/type_conversion_functions.md#function-tointerval) +- Тип данных [Interval](../data_types/special_data_types/interval.md) +- Функции преобразования типов [toInterval](functions/type_conversion_functions.md#function-tointerval) -## Оператор логического отрицания +## Оператор логического отрицания {#operator-logicheskogo-otritsaniia} `NOT a` - функция `not(a)` -## Оператор логического 'И' +## Оператор логического ‘И’ {#operator-logicheskogo-i} `a AND b` - функция `and(a, b)` -## Оператор логического 'ИЛИ' +## Оператор логического ‘ИЛИ’ {#operator-logicheskogo-ili} `a OR b` - функция `or(a, b)` -## Условный оператор +## Условный оператор {#uslovnyi-operator} `a ? b : c` - функция `if(a, b, c)` @@ -183,9 +185,9 @@ SELECT now() AS current_date_time, current_date_time + INTERVAL 4 DAY + INTERVAL Условный оператор сначала вычисляет значения b и c, затем проверяет выполнение условия a, и только после этого возвращает соответствующее значение. Если в качестве b или с выступает функция [arrayJoin()](functions/array_join.md#functions_arrayjoin), то размножение каждой строки произойдет вне зависимости от условия а. -## Условное выражение {#operator_case} +## Условное выражение {#operator-case} -```sql +``` sql CASE [x] WHEN a THEN b [WHEN ... THEN ...] @@ -199,63 +201,68 @@ END !!! note "Примечание" Функция `transform` не умеет работать с `NULL`. -## Оператор склеивания строк +## Оператор склеивания строк {#operator-skleivaniia-strok} `s1 || s2` - функция `concat(s1, s2)` -## Оператор создания лямбда-выражения +## Оператор создания лямбда-выражения {#operator-sozdaniia-liambda-vyrazheniia} `x -> expr` - функция `lambda(x, expr)` Следующие операторы не имеют приоритета, так как представляют собой скобки: -## Оператор создания массива +## Оператор создания массива {#operator-sozdaniia-massiva} `[x1, ...]` - функция `array(x1, ...)` -## Оператор создания кортежа +## Оператор создания кортежа {#operator-sozdaniia-kortezha} `(x1, x2, ...)` - функция `tuple(x2, x2, ...)` -## Ассоциативность +## Ассоциативность {#assotsiativnost} Все бинарные операторы имеют левую ассоциативность. Например, `1 + 2 + 3` преобразуется в `plus(plus(1, 2), 3)`. Иногда это работает не так, как ожидается. Например, `SELECT 4 > 3 > 2` выдаст 0. 
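To make the associativity note concrete, here is a sketch of how the chained comparison is parsed:

``` sql
SELECT
    4 > 3 > 2           AS chained, -- greater(greater(4, 3), 2) = greater(1, 2) = 0
    (4 > 3) AND (3 > 2) AS explicit -- the usually intended meaning; returns 1
```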
Для эффективности, реализованы функции `and` и `or`, принимающие произвольное количество аргументов. Соответствующие цепочки операторов `AND` и `OR`, преобразуются в один вызов этих функций. -## Проверка на `NULL` +## Проверка на `NULL` {#proverka-na-null} ClickHouse поддерживает операторы `IS NULL` и `IS NOT NULL`. ### IS NULL {#operator-is-null} -- Для значений типа [Nullable](../data_types/nullable.md) оператор `IS NULL` возвращает: - - `1`, если значение — `NULL`. - - `0` в обратном случае. -- Для прочих значений оператор `IS NULL` всегда возвращает `0`. +- Для значений типа [Nullable](../data_types/nullable.md) оператор `IS NULL` возвращает: + - `1`, если значение — `NULL`. + - `0` в обратном случае. +- Для прочих значений оператор `IS NULL` всегда возвращает `0`. -```sql + + +``` sql SELECT x+100 FROM t_null WHERE y IS NULL ``` -```text + +``` text ┌─plus(x, 100)─┐ │ 101 │ └──────────────┘ ``` +### IS NOT NULL {#is-not-null} -### IS NOT NULL +- Для значений типа [Nullable](../data_types/nullable.md) оператор `IS NOT NULL` возвращает: + - `0`, если значение — `NULL`. + - `1`, в обратном случае. +- Для прочих значений оператор `IS NOT NULL` всегда возвращает `1`. -- Для значений типа [Nullable](../data_types/nullable.md) оператор `IS NOT NULL` возвращает: - - `0`, если значение — `NULL`. - - `1`, в обратном случае. -- Для прочих значений оператор `IS NOT NULL` всегда возвращает `1`. + -```sql +``` sql SELECT * FROM t_null WHERE y IS NOT NULL ``` -```text + +``` text ┌─x─┬─y─┐ │ 2 │ 3 │ └───┴───┘ diff --git a/docs/ru/query_language/select.md b/docs/ru/query_language/select.md index 70a2fc32810..759a65f155a 100644 --- a/docs/ru/query_language/select.md +++ b/docs/ru/query_language/select.md @@ -1,8 +1,8 @@ -# Синтаксис запросов SELECT +# Синтаксис запросов SELECT {#sintaksis-zaprosov-select} `SELECT` осуществляет выборку данных. -```sql +``` sql [WITH expr_list|(subquery)] SELECT [DISTINCT] expr_list [FROM [db.]table | (subquery) | table_function] [FINAL] @@ -25,17 +25,19 @@ SELECT [DISTINCT] expr_list Ниже секции будут описаны в порядке, почти соответствующем конвейеру выполнения запроса. Если в запросе отсутствуют секции `DISTINCT`, `GROUP BY`, `ORDER BY`, подзапросы в `IN` и `JOIN`, то запрос будет обработан полностью потоково, с использованием O(1) количества оперативки. -Иначе запрос может съесть много оперативки, если не указаны подходящие ограничения `max_memory_usage`, `max_rows_to_group_by`, `max_rows_to_sort`, `max_rows_in_distinct`, `max_bytes_in_distinct`, `max_rows_in_set`, `max_bytes_in_set`, `max_rows_in_join`, `max_bytes_in_join`, `max_bytes_before_external_sort`, `max_bytes_before_external_group_by`. Подробнее смотрите в разделе "Настройки". Присутствует возможность использовать внешнюю сортировку (с сохранением временных данных на диск) и внешнюю агрегацию. `Merge join` в системе нет. +Иначе запрос может съесть много оперативки, если не указаны подходящие ограничения `max_memory_usage`, `max_rows_to_group_by`, `max_rows_to_sort`, `max_rows_in_distinct`, `max_bytes_in_distinct`, `max_rows_in_set`, `max_bytes_in_set`, `max_rows_in_join`, `max_bytes_in_join`, `max_bytes_before_external_sort`, `max_bytes_before_external_group_by`. Подробнее смотрите в разделе «Настройки». Присутствует возможность использовать внешнюю сортировку (с сохранением временных данных на диск) и внешнюю агрегацию. `Merge join` в системе нет. 
+ +### Секция WITH {#sektsiia-with} -### Секция WITH Данная секция представляет собой [CTE](https://ru.wikipedia.org/wiki/Иерархические_и_рекурсивные_запросы_в_SQL), с рядом ограничений: 1. Рекурсивные запросы не поддерживаются 2. Если в качестве выражения используется подзапрос, то результат должен содержать ровно одну строку 3. Результаты выражений нельзя переиспользовать во вложенных запросах В дальнейшем, результаты выражений можно использовать в секции SELECT. -Пример 1: Использование константного выражения как "переменной" -```sql +Пример 1: Использование константного выражения как «переменной» + +``` sql WITH '2019-08-01 15:23:00' as ts_upper_bound SELECT * FROM hits @@ -45,7 +47,8 @@ WHERE ``` Пример 2: Выкидывание выражения sum(bytes) из списка колонок в SELECT -```sql + +``` sql WITH sum(bytes) as s SELECT formatReadableSize(s), @@ -56,7 +59,8 @@ ORDER BY s ``` Пример 3: Использование результатов скалярного подзапроса -```sql + +``` sql /* запрос покажет TOP 10 самых больших таблиц */ WITH ( @@ -75,7 +79,8 @@ LIMIT 10 Пример 4: Переиспользование выражения В настоящий момент, переиспользование выражения из секции WITH внутри подзапроса возможно только через дублирование. -```sql + +``` sql WITH ['hello'] AS hello SELECT hello, @@ -86,7 +91,8 @@ FROM SELECT hello ) ``` -```text + +``` text ┌─hello─────┬─hello─────┐ │ ['hello'] │ ['hello'] │ └───────────┴───────────┘ @@ -99,9 +105,9 @@ FROM Cекция `FROM` определяет источник данных: -- Таблица -- Подзапрос -- [Табличная функция](table_functions/index.md) +- Таблица +- Подзапрос +- [Табличная функция](table_functions/index.md) Также могут присутствовать `ARRAY JOIN` и обычный `JOIN` (смотрите ниже). @@ -111,7 +117,7 @@ Cекция `FROM` определяет источник данных: Для выполнения запроса, из соответствующей таблицы, вынимаются все столбцы, перечисленные в запросе. Из подзапросов выкидываются столбцы, не нужные для внешнего запроса. Если в запросе не перечислено ни одного столбца (например, `SELECT count() FROM t`), то из таблицы всё равно вынимается один какой-нибудь столбец (предпочитается самый маленький), для того, чтобы можно было посчитать количество строк. -Модификатор `FINAL` может быть использован в запросе `SELECT` из таблиц семейства [MergeTree](../operations/table_engines/mergetree.md). При указании `FINAL`, данные будут выбираться полностью "домерженными". Стоит учитывать, что использование `FINAL` приводит к чтению также столбцов, относящихся к первичному ключу. Также, запрос будет выполняться в один поток, и при выполнении запроса будет выполняться слияние данных. Это приводит к тому, что при использовании `FINAL`, запрос выполняется медленнее. В большинстве случаев, следует избегать использования `FINAL`. +Модификатор `FINAL` может быть использован в запросе `SELECT` из таблиц семейства [MergeTree](../operations/table_engines/mergetree.md). При указании `FINAL`, данные будут выбираться полностью «домерженными». Стоит учитывать, что использование `FINAL` приводит к чтению также столбцов, относящихся к первичному ключу. Также, запрос будет выполняться в один поток, и при выполнении запроса будет выполняться слияние данных. Это приводит к тому, что при использовании `FINAL`, запрос выполняется медленнее. В большинстве случаев, следует избегать использования `FINAL`. Модификатор `FINAL` может быть использован для всех таблиц семейства `MergeTree`, которые производят преобразования данных в процессе фоновых слияний (кроме GraphiteMergeTree). 
#### FINAL Modifier {#select-from-final} @@ -120,45 +126,44 @@ Cекция `FROM` определяет источник данных: Также поддержан для движков: -- [Replicated](../operations/table_engines/replication.md)-версий `MergeTree`. -- [View](../operations/table_engines/view.md), [Buffer](../operations/table_engines/buffer.md), [Distributed](../operations/table_engines/distributed.md), и [MaterializedView](../operations/table_engines/materializedview.md), которые работают поверх других движков, если они созданы для таблиц с движками семейства `MergeTree`. +- [Replicated](../operations/table_engines/replication.md)-версий `MergeTree`. +- [View](../operations/table_engines/view.md), [Buffer](../operations/table_engines/buffer.md), [Distributed](../operations/table_engines/distributed.md), и [MaterializedView](../operations/table_engines/materializedview.md), которые работают поверх других движков, если они созданы для таблиц с движками семейства `MergeTree`. Запросы, использующие `FINAL` исполняются медленнее аналогичных запросов без `FINAL`, поскольку: -- Запрос исполняется в один поток и данные мёржатся в процессе выполнения. -- Запросы с модификатором `FINAL` дополнительно к столбцам, указанным в запросе, читают столбцы первичного ключа. +- Запрос исполняется в один поток и данные мёржатся в процессе выполнения. +- Запросы с модификатором `FINAL` дополнительно к столбцам, указанным в запросе, читают столбцы первичного ключа. По возможности не используйте модификатор `FINAL`. - ### Секция SAMPLE {#select-sample-clause} Секция `SAMPLE` позволяет выполнять запросы приближённо. Например, чтобы посчитать статистику по всем визитам, можно обработать 1/10 всех визитов и результат домножить на 10. Сэмплирование имеет смысл, когда: -1. Точность результата не важна, например, для оценочных расчетов. -2. Возможности аппаратной части не позволяют соответствовать строгим критериям. Например, время ответа должно быть <100 мс. При этом точность расчета имеет более низкий приоритет. -3. Точность результата участвует в бизнес-модели сервиса. Например, пользователи с бесплатной подпиской на сервис могут получать отчеты с меньшей точностью, чем пользователи с премиум подпиской. +1. Точность результата не важна, например, для оценочных расчетов. +2. Возможности аппаратной части не позволяют соответствовать строгим критериям. Например, время ответа должно быть \<100 мс. При этом точность расчета имеет более низкий приоритет. +3. Точность результата участвует в бизнес-модели сервиса. Например, пользователи с бесплатной подпиской на сервис могут получать отчеты с меньшей точностью, чем пользователи с премиум подпиской. !!! note "Внимание" Не стоит использовать сэмплирование в тех задачах, где важна точность расчетов. Например, при работе с финансовыми отчетами. Свойства сэмплирования: -- Сэмплирование работает детерминированно. При многократном выполнении одного и того же запроса `SELECT .. SAMPLE`, результат всегда будет одинаковым. -- Сэмплирование поддерживает консистентность для разных таблиц. Имеется в виду, что для таблиц с одним и тем же ключом сэмплирования, подмножество данных в выборках будет одинаковым (выборки при этом должны быть сформированы для одинаковой доли данных). Например, выборка по идентификаторам посетителей выберет из разных таблиц строки с одинаковым подмножеством всех возможных идентификаторов. Это свойство позволяет использовать выборки в подзапросах в секции [IN](#select-in-operators), а также объединять выборки с помощью [JOIN](#select-join). -- Сэмплирование позволяет читать меньше данных с диска. 
Обратите внимание, для этого необходимо корректно указать ключ сэмплирования. Подробнее см. в разделе [Создание таблицы MergeTree](../operations/table_engines/mergetree.md#table_engine-mergetree-creating-a-table). +- Сэмплирование работает детерминированно. При многократном выполнении одного и того же запроса `SELECT .. SAMPLE`, результат всегда будет одинаковым. +- Сэмплирование поддерживает консистентность для разных таблиц. Имеется в виду, что для таблиц с одним и тем же ключом сэмплирования, подмножество данных в выборках будет одинаковым (выборки при этом должны быть сформированы для одинаковой доли данных). Например, выборка по идентификаторам посетителей выберет из разных таблиц строки с одинаковым подмножеством всех возможных идентификаторов. Это свойство позволяет использовать выборки в подзапросах в секции [IN](#select-in-operators), а также объединять выборки с помощью [JOIN](#select-join). +- Сэмплирование позволяет читать меньше данных с диска. Обратите внимание, для этого необходимо корректно указать ключ сэмплирования. Подробнее см. в разделе [Создание таблицы MergeTree](../operations/table_engines/mergetree.md#table_engine-mergetree-creating-a-table). Сэмплирование поддерживается только таблицами семейства [MergeTree](../operations/table_engines/mergetree.md) и только в том случае, если для таблиц был указан ключ сэмплирования (выражение, на основе которого должна производиться выборка). Подробнее см. в разделе [Создание таблиц MergeTree](../operations/table_engines/mergetree.md#table_engine-mergetree-creating-a-table). Выражение `SAMPLE` в запросе можно задать следующими способами: -| Способ задания SAMPLE| Описание | -| ---------------- | --------- | -| `SAMPLE k` | Здесь `k` – это дробное число в интервале от 0 до 1.
Запрос будет выполнен по `k` доле данных. Например, если указано `SAMPLE 1/10`, то запрос будет выполнен для выборки из 1/10 данных. [Подробнее](#select-sample-k)| -| `SAMPLE n` | Здесь `n` – это достаточно большое целое число.
Запрос будет выполнен для выборки, состоящей из не менее чем `n` строк. Например, если указано `SAMPLE 10000000`, то запрос будет выполнен для не менее чем 10,000,000 строк. [Подробнее](#select-sample-n) | -| `SAMPLE k OFFSET m` | Здесь `k` и `m` – числа от 0 до 1.
Запрос будет выполнен по `k` доле данных. При этом выборка будет сформирована со смещением на `m` долю. [Подробнее](#select-sample-offset) | +| Способ задания SAMPLE | Описание | +|-----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `SAMPLE k` | Здесь `k` – это дробное число в интервале от 0 до 1.
Запрос будет выполнен по `k` доле данных. Например, если указано `SAMPLE 1/10`, то запрос будет выполнен для выборки из 1/10 данных. [Подробнее](#select-sample-k) | +| `SAMPLE n` | Здесь `n` – это достаточно большое целое число.
Запрос будет выполнен для выборки, состоящей из не менее чем `n` строк. Например, если указано `SAMPLE 10000000`, то запрос будет выполнен для не менее чем 10,000,000 строк. [Подробнее](#select-sample-n) | +| `SAMPLE k OFFSET m` | Здесь `k` и `m` – числа от 0 до 1.
Запрос будет выполнен по `k` доле данных. При этом выборка будет сформирована со смещением на `m` долю. [Подробнее](#select-sample-offset) | #### SAMPLE k {#select-sample-k} @@ -166,7 +171,7 @@ Cекция `FROM` определяет источник данных: Если задано выражение `SAMPLE k`, запрос будет выполнен для `k` доли данных. Рассмотрим пример: -```sql +``` sql SELECT Title, count() * 10 AS PageViews @@ -180,7 +185,7 @@ ORDER BY PageViews DESC LIMIT 1000 В этом примере запрос выполняется по выборке из 0.1 (10%) данных. Значения агрегатных функций не корректируются автоматически, поэтому чтобы получить приближённый результат, значение `count()` нужно вручную умножить на 10. -Выборка с указанием относительного коэффициента является "согласованной": для таблиц с одним и тем же ключом сэмплирования, выборка с одинаковой относительной долей всегда будет составлять одно и то же подмножество данных. То есть выборка из разных таблиц, на разных серверах, в разное время, формируется одинаковым образом. +Выборка с указанием относительного коэффициента является «согласованной»: для таблиц с одним и тем же ключом сэмплирования, выборка с одинаковой относительной долей всегда будет составлять одно и то же подмножество данных. То есть выборка из разных таблиц, на разных серверах, в разное время, формируется одинаковым образом. #### SAMPLE n {#select-sample-n} @@ -197,7 +202,7 @@ ORDER BY PageViews DESC LIMIT 1000 Предположим, у нас есть таблица, в которой ведется статистика посещений сайта. Пример ниже показывает, как рассчитать суммарное число просмотров: -```sql +``` sql SELECT sum(PageViews * _sample_factor) FROM visits SAMPLE 10000000 @@ -205,7 +210,7 @@ SAMPLE 10000000 Следующий пример показывает, как посчитать общее число визитов: -```sql +``` sql SELECT sum(_sample_factor) FROM visits SAMPLE 10000000 @@ -213,7 +218,7 @@ SAMPLE 10000000 В примере ниже рассчитывается среднее время на сайте. Обратите внимание, при расчете средних значений, умножать результат на коэффициент сэмплирования не нужно. -```sql +``` sql SELECT avg(Duration) FROM visits SAMPLE 10000000 @@ -227,7 +232,7 @@ SAMPLE 10000000 **Пример 1** -```sql +``` sql SAMPLE 1/10 ``` @@ -237,7 +242,7 @@ SAMPLE 1/10 **Пример 2** -```sql +``` sql SAMPLE 1/10 OFFSET 1/2 ``` @@ -249,7 +254,7 @@ SAMPLE 1/10 OFFSET 1/2 Позволяет выполнить `JOIN` с массивом или вложенной структурой данных. Смысл похож на функцию [arrayJoin](functions/array_join.md#functions_arrayjoin), но функциональность более широкая. -```sql +``` sql SELECT FROM [LEFT] ARRAY JOIN @@ -263,12 +268,12 @@ FROM Секция `ARRAY JOIN` поддерживает следующие формы записи: -- `ARRAY JOIN` — в этом случае результат `JOIN` не будет содержать пустые массивы; -- `LEFT ARRAY JOIN` — пустые массивы попадут в результат выполнения `JOIN`. В качестве значения для пустых массивов устанавливается значение по умолчанию. Обычно это 0, пустая строка или NULL, в зависимости от типа элементов массива. +- `ARRAY JOIN` — в этом случае результат `JOIN` не будет содержать пустые массивы; +- `LEFT ARRAY JOIN` — пустые массивы попадут в результат выполнения `JOIN`. В качестве значения для пустых массивов устанавливается значение по умолчанию. Обычно это 0, пустая строка или NULL, в зависимости от типа элементов массива. Рассмотрим примеры использования `ARRAY JOIN` и `LEFT ARRAY JOIN`. 
Для начала создадим таблицу, содержащую столбец с типом [Array](../data_types/array.md), и добавим в него значение: -```sql +``` sql CREATE TABLE arrays_test ( s String, @@ -278,7 +283,8 @@ CREATE TABLE arrays_test INSERT INTO arrays_test VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []); ``` -```text + +``` text ┌─s───────────┬─arr─────┐ │ Hello │ [1,2] │ │ World │ [3,4,5] │ @@ -288,12 +294,13 @@ VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []); В примере ниже используется `ARRAY JOIN`: -```sql +``` sql SELECT s, arr FROM arrays_test ARRAY JOIN arr; ``` -```text + +``` text ┌─s─────┬─arr─┐ │ Hello │ 1 │ │ Hello │ 2 │ @@ -305,12 +312,13 @@ ARRAY JOIN arr; Следующий пример использует `LEFT ARRAY JOIN`: -```sql +``` sql SELECT s, arr FROM arrays_test LEFT ARRAY JOIN arr; ``` -```text + +``` text ┌─s───────────┬─arr─┐ │ Hello │ 1 │ │ Hello │ 2 │ @@ -321,17 +329,17 @@ LEFT ARRAY JOIN arr; └─────────────┴─────┘ ``` -#### Использование алиасов +#### Использование алиасов {#ispolzovanie-aliasov} Для массива в секции `ARRAY JOIN` может быть указан алиас. В этом случае, элемент массива будет доступен под этим алиасом, а сам массив — под исходным именем. Пример: -```sql +``` sql SELECT s, arr, a FROM arrays_test ARRAY JOIN arr AS a; ``` -```text +``` text ┌─s─────┬─arr─────┬─a─┐ │ Hello │ [1,2] │ 1 │ │ Hello │ [1,2] │ 2 │ @@ -343,13 +351,13 @@ ARRAY JOIN arr AS a; Используя алиасы, можно выполнять `JOIN` с внешними массивами: -```sql +``` sql SELECT s, arr_external FROM arrays_test ARRAY JOIN [1, 2, 3] AS arr_external; ``` -```text +``` text ┌─s───────────┬─arr_external─┐ │ Hello │ 1 │ │ Hello │ 2 │ @@ -365,13 +373,13 @@ ARRAY JOIN [1, 2, 3] AS arr_external; В секции `ARRAY JOIN` можно указать через запятую сразу несколько массивов. В этом случае, `JOIN` делается с ними одновременно (прямая сумма, а не прямое произведение). Обратите внимание, массивы должны быть одинаковых размеров. Примеры: -```sql +``` sql SELECT s, arr, a, num, mapped FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS mapped; ``` -```text +``` text ┌─s─────┬─arr─────┬─a─┬─num─┬─mapped─┐ │ Hello │ [1,2] │ 1 │ 1 │ 2 │ │ Hello │ [1,2] │ 2 │ 2 │ 3 │ @@ -383,13 +391,13 @@ ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS ma В примере ниже используется функция [arrayEnumerate](functions/array_functions.md#array_functions-arrayenumerate): -```sql +``` sql SELECT s, arr, a, num, arrayEnumerate(arr) FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num; ``` -```text +``` text ┌─s─────┬─arr─────┬─a─┬─num─┬─arrayEnumerate(arr)─┐ │ Hello │ [1,2] │ 1 │ 1 │ [1,2] │ │ Hello │ [1,2] │ 2 │ 2 │ [1,2] │ @@ -399,11 +407,11 @@ ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num; └───────┴─────────┴───┴─────┴─────────────────────┘ ``` -#### ARRAY JOIN с вложенными структурами данных +#### ARRAY JOIN с вложенными структурами данных {#array-join-s-vlozhennymi-strukturami-dannykh} `ARRAY JOIN` также работает с [вложенными структурами данных](../data_types/nested_data_structures/nested.md). 
Пример: -```sql +``` sql CREATE TABLE nested_test ( s String, @@ -416,7 +424,7 @@ INSERT INTO nested_test VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [], []); ``` -```text +``` text ┌─s───────┬─nest.x──┬─nest.y─────┐ │ Hello │ [1,2] │ [10,20] │ │ World │ [3,4,5] │ [30,40,50] │ @@ -424,13 +432,13 @@ VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [] └─────────┴─────────┴────────────┘ ``` -```sql +``` sql SELECT s, `nest.x`, `nest.y` FROM nested_test ARRAY JOIN nest; ``` -```text +``` text ┌─s─────┬─nest.x─┬─nest.y─┐ │ Hello │ 1 │ 10 │ │ Hello │ 2 │ 20 │ @@ -442,13 +450,13 @@ ARRAY JOIN nest; При указании имени вложенной структуры данных в `ARRAY JOIN`, смысл такой же, как `ARRAY JOIN` со всеми элементами-массивами, из которых она состоит. Пример: -```sql +``` sql SELECT s, `nest.x`, `nest.y` FROM nested_test ARRAY JOIN `nest.x`, `nest.y`; ``` -```text +``` text ┌─s─────┬─nest.x─┬─nest.y─┐ │ Hello │ 1 │ 10 │ │ Hello │ 2 │ 20 │ @@ -460,13 +468,13 @@ ARRAY JOIN `nest.x`, `nest.y`; Такой вариант тоже имеет смысл: -```sql +``` sql SELECT s, `nest.x`, `nest.y` FROM nested_test ARRAY JOIN `nest.x`; ``` -```text +``` text ┌─s─────┬─nest.x─┬─nest.y─────┐ │ Hello │ 1 │ [10,20] │ │ Hello │ 2 │ [10,20] │ @@ -478,13 +486,13 @@ ARRAY JOIN `nest.x`; Алиас для вложенной структуры данных можно использовать, чтобы выбрать как результат `JOIN`-а, так и исходный массив. Пример: -```sql +``` sql SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y` FROM nested_test ARRAY JOIN nest AS n; ``` -```text +``` text ┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┐ │ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ │ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ @@ -496,13 +504,13 @@ ARRAY JOIN nest AS n; Пример использования функции [arrayEnumerate](functions/array_functions.md#array_functions-arrayenumerate): -```sql +``` sql SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`, num FROM nested_test ARRAY JOIN nest AS n, arrayEnumerate(`nest.x`) AS num; ``` -```text +``` text ┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┬─num─┐ │ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ 1 │ │ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ 2 │ @@ -519,7 +527,7 @@ ARRAY JOIN nest AS n, arrayEnumerate(`nest.x`) AS num; !!! info "Примечание" Не связана с функциональностью [ARRAY JOIN](#select-array-join-clause). -```sql +``` sql SELECT FROM [GLOBAL] [ANY|ALL] [INNER|LEFT|RIGHT|FULL|CROSS] [OUTER] JOIN @@ -530,15 +538,15 @@ FROM #### Поддерживаемые типы `JOIN` {#select-join-types} -- `INNER JOIN` (or `JOIN`) -- `LEFT JOIN` (or `LEFT OUTER JOIN`) -- `RIGHT JOIN` (or `RIGHT OUTER JOIN`) -- `FULL JOIN` (or `FULL OUTER JOIN`) -- `CROSS JOIN` (or `,` ) +- `INNER JOIN` (or `JOIN`) +- `LEFT JOIN` (or `LEFT OUTER JOIN`) +- `RIGHT JOIN` (or `RIGHT OUTER JOIN`) +- `FULL JOIN` (or `FULL OUTER JOIN`) +- `CROSS JOIN` (or `,` ) Смотрите описание стандартного [SQL JOIN](https://en.wikipedia.org/wiki/Join_(SQL)). -#### Множественный JOIN +#### Множественный JOIN {#mnozhestvennyi-join} При выполнении запросов, ClickHouse перезаписывает множественный `JOIN` как комбинацию двух-табличных объединений и обрабатывает их последовательно. Например, если необходимо объединить четыре таблицы, ClickHouse объединяет первую и вторую таблицы, затем соединяет результат с третьей, а затем с четвертой. @@ -546,13 +554,13 @@ FROM Для создания запросов мы рекомендуем использовать синтаксис `JOIN ON` или `JOIN USING`. 
Например: -```sql +``` sql SELECT * FROM t1 JOIN t2 ON t1.a = t2.a JOIN t3 ON t1.a = t3.a ``` В секции `FROM` вы можете использовать разделенные запятыми списки таблиц для объединения. Например: -```sql +``` sql SELECT * FROM t1, t2, t3 WHERE t1.a = t2.a AND t1.a = t3.a ``` @@ -562,9 +570,9 @@ ClickHouse не поддерживает синтаксис с запятыми #### Строгость {#select-join-strictness} -- `ALL` — если правая таблица содержит несколько подходящих строк, то ClickHouse выполняет их [декартово произведение](https://ru.wikipedia.org/wiki/Прямое_произведение). Это стандартное поведение `JOIN` в SQL. -- `ANY` — если в правой таблице несколько соответствующих строк, то присоединяется только первая найденная. Если в правой таблице есть только одна подходящая строка, то результаты `ANY` и `ALL` совпадают. -- `ASOF` — для объединения последовательностей с нечётким совпадением. `ASOF JOIN` описан ниже по тексту. +- `ALL` — если правая таблица содержит несколько подходящих строк, то ClickHouse выполняет их [декартово произведение](https://ru.wikipedia.org/wiki/Прямое_произведение). Это стандартное поведение `JOIN` в SQL. +- `ANY` — если в правой таблице несколько соответствующих строк, то присоединяется только первая найденная. Если в правой таблице есть только одна подходящая строка, то результаты `ANY` и `ALL` совпадают. +- `ASOF` — для объединения последовательностей с нечётким совпадением. `ASOF JOIN` описан ниже по тексту. **Использование ASOF JOIN** @@ -574,20 +582,20 @@ ClickHouse не поддерживает синтаксис с запятыми Синтаксис `ASOF JOIN ... ON`: -```sql +``` sql SELECT expressions_list FROM table_1 ASOF LEFT JOIN table_2 ON equi_cond AND closest_match_cond ``` -Можно использовать произвольное количество условий равенства и одно условие на ближайшее совпадение. Например, `SELECT count() FROM table_1 ASOF LEFT JOIN table_2 ON table_1.a == table_2.b AND table_2.t <= table_1.t`. +Можно использовать произвольное количество условий равенства и одно условие на ближайшее совпадение. Например, `SELECT count() FROM table_1 ASOF LEFT JOIN table_2 ON table_1.a == table_2.b AND table_2.t <= table_1.t`. Условия, поддержанные для проверки на ближайшее совпадение: `>`, `>=`, `<`, `<=`. Синтаксис `ASOF JOIN ... USING`: -```sql +``` sql SELECT expressions_list FROM table_1 ASOF JOIN table_2 @@ -598,27 +606,25 @@ USING (equi_column1, ... equi_columnN, asof_column) Например, рассмотрим следующие таблицы: -``` - table_1 table_2 - event | ev_time | user_id event | ev_time | user_id -----------|---------|---------- ----------|---------|---------- - ... ... -event_1_1 | 12:00 | 42 event_2_1 | 11:59 | 42 - ... event_2_2 | 12:30 | 42 -event_1_2 | 13:00 | 42 event_2_3 | 13:00 | 42 - ... ... -``` + table_1 table_2 + event | ev_time | user_id event | ev_time | user_id + ----------|---------|---------- ----------|---------|---------- + ... ... + event_1_1 | 12:00 | 42 event_2_1 | 11:59 | 42 + ... event_2_2 | 12:30 | 42 + event_1_2 | 13:00 | 42 event_2_3 | 13:00 | 42 + ... ... `ASOF JOIN` принимает метку времени пользовательского события из `table_1` и находит такое событие в `table_2` метка времени которого наиболее близка к метке времени события из `table_1` в соответствии с условием на ближайшее совпадение. При этом столбец `user_id` используется для объединения по равенству, а столбец `ev_time` для объединения по ближайшему совпадению. В нашем примере `event_1_1` может быть объединено с `event_2_1`, `event_1_2` может быть объединено с `event_2_3`, а `event_2_2` не объединяется. !!! 
note "Примечание" `ASOF JOIN` не поддержан для движка таблиц [Join](../operations/table_engines/join.md). -Чтобы задать значение строгости по умолчанию, используйте сессионный параметр [join_default_strictness](../operations/settings/settings.md#settings-join_default_strictness). +Чтобы задать значение строгости по умолчанию, используйте сессионный параметр [join\_default\_strictness](../operations/settings/settings.md#settings-join_default_strictness). -#### GLOBAL JOIN +#### GLOBAL JOIN {#global-join} -При использовании обычного `JOIN` , запрос отправляется на удалённые серверы. На каждом из них выполняются подзапросы для формирования "правой" таблицы, и с этой таблицей выполняется соединение. То есть, "правая" таблица формируется на каждом сервере отдельно. +При использовании обычного `JOIN` , запрос отправляется на удалённые серверы. На каждом из них выполняются подзапросы для формирования «правой» таблицы, и с этой таблицей выполняется соединение. То есть, «правая» таблица формируется на каждом сервере отдельно. При использовании `GLOBAL ... JOIN`, сначала сервер-инициатор запроса запускает подзапрос для вычисления правой таблицы. Эта временная таблица передаётся на каждый удалённый сервер, и на них выполняются запросы с использованием переданных временных данных. @@ -628,11 +634,11 @@ event_1_2 | 13:00 | 42 event_2_3 | 13:00 | 42 Из подзапроса удаляются все столбцы, ненужные для `JOIN`. -При запуске `JOIN`, отсутствует оптимизация порядка выполнения по отношению к другим стадиям запроса. Соединение (поиск в "правой" таблице) выполняется до фильтрации в `WHERE` и до агрегации. Чтобы явно задать порядок вычислений, рекомендуется выполнять `JOIN` подзапроса с подзапросом. +При запуске `JOIN`, отсутствует оптимизация порядка выполнения по отношению к другим стадиям запроса. Соединение (поиск в «правой» таблице) выполняется до фильтрации в `WHERE` и до агрегации. Чтобы явно задать порядок вычислений, рекомендуется выполнять `JOIN` подзапроса с подзапросом. Пример: -```sql +``` sql SELECT CounterID, hits, @@ -656,7 +662,7 @@ ORDER BY hits DESC LIMIT 10 ``` -```text +``` text ┌─CounterID─┬───hits─┬─visits─┐ │ 1143050 │ 523264 │ 13665 │ │ 731962 │ 475698 │ 102716 │ @@ -676,40 +682,40 @@ LIMIT 10 В секции `USING` указывается один или несколько столбцов для соединения, что обозначает условие на равенство этих столбцов. Список столбцов задаётся без скобок. Более сложные условия соединения не поддерживаются. -"Правая" таблица (результат подзапроса) располагается в оперативной памяти. Если её не хватает, вы не сможете выполнить `JOIN`. +«Правая» таблица (результат подзапроса) располагается в оперативной памяти. Если её не хватает, вы не сможете выполнить `JOIN`. Каждый раз для выполнения запроса с одинаковым `JOIN`, подзапрос выполняется заново — результат не кэшируется. Это можно избежать, используя специальный движок таблиц [Join](../operations/table_engines/join.md), представляющий собой подготовленное множество для соединения, которое всегда находится в оперативке. В некоторых случаях более эффективно использовать `IN` вместо `JOIN`. Среди разных типов `JOIN`, наиболее эффективен `ANY LEFT JOIN`, следующий по эффективности `ANY INNER JOIN`. Наименее эффективны `ALL LEFT JOIN` и `ALL INNER JOIN`. 
-Если `JOIN` необходим для соединения с таблицами измерений (dimension tables - сравнительно небольшие таблицы, которые содержат свойства измерений - например, имена для рекламных кампаний), то использование `JOIN` может быть не очень удобным из-за громоздкости синтаксиса, а также из-за того, что правая таблица читается заново при каждом запросе. Специально для таких случаев существует функциональность "Внешние словари", которую следует использовать вместо `JOIN`. Дополнительные сведения смотрите в разделе [Внешние словари](dicts/external_dicts.md). +Если `JOIN` необходим для соединения с таблицами измерений (dimension tables - сравнительно небольшие таблицы, которые содержат свойства измерений - например, имена для рекламных кампаний), то использование `JOIN` может быть не очень удобным из-за громоздкости синтаксиса, а также из-за того, что правая таблица читается заново при каждом запросе. Специально для таких случаев существует функциональность «Внешние словари», которую следует использовать вместо `JOIN`. Дополнительные сведения смотрите в разделе [Внешние словари](dicts/external_dicts.md). **Ограничения по памяти** ClickHouse использует алгоритм [hash join](https://en.wikipedia.org/wiki/Hash_join). ClickHouse принимает `` и создает для него хэш-таблицу в RAM. Чтобы ограничить потребление памяти операцией `JOIN`, используйте следующие параметры: -- [max_rows_in_join](../operations/settings/query_complexity.md#settings-max_rows_in_join) — ограничивает количество строк в хэш-таблице. -- [max_bytes_in_join](../operations/settings/query_complexity.md#settings-max_bytes_in_join) — ограничивает размер хэш-таблицы. +- [max\_rows\_in\_join](../operations/settings/query_complexity.md#settings-max_rows_in_join) — ограничивает количество строк в хэш-таблице. +- [max\_bytes\_in\_join](../operations/settings/query_complexity.md#settings-max_bytes_in_join) — ограничивает размер хэш-таблицы. -По достижении любого из этих ограничений, ClickHouse действует в соответствии с настройкой [join_overflow_mode](../operations/settings/query_complexity.md#settings-join_overflow_mode). +По достижении любого из этих ограничений, ClickHouse действует в соответствии с настройкой [join\_overflow\_mode](../operations/settings/query_complexity.md#settings-join_overflow_mode). -#### Обработка пустых ячеек и NULL +#### Обработка пустых ячеек и NULL {#obrabotka-pustykh-iacheek-i-null} -При слиянии таблиц могут появляться пустые ячейки. То, каким образом ClickHouse заполняет эти ячейки, определяется настройкой [join_use_nulls](../operations/settings/settings.md#join_use_nulls). +При слиянии таблиц могут появляться пустые ячейки. То, каким образом ClickHouse заполняет эти ячейки, определяется настройкой [join\_use\_nulls](../operations/settings/settings.md#join_use_nulls). Если ключами `JOIN` выступают поля типа [Nullable](../data_types/nullable.md), то строки, где хотя бы один из ключей имеет значение [NULL](syntax.md#null-literal), не соединяются. -#### Ограничения синтаксиса +#### Ограничения синтаксиса {#ogranicheniia-sintaksisa} Для множественных секций `JOIN` в одном запросе `SELECT`: -- Получение всех столбцов через `*` возможно только при объединении таблиц, но не подзапросов. -- Секция `PREWHERE` недоступна. +- Получение всех столбцов через `*` возможно только при объединении таблиц, но не подзапросов. +- Секция `PREWHERE` недоступна. 
Для секций `ON`, `WHERE` и `GROUP BY`: -- Нельзя использовать произвольные выражения в секциях `ON`, `WHERE`, и `GROUP BY`, однако можно определить выражение в секции `SELECT` и затем использовать его через алиас в других секциях. +- Нельзя использовать произвольные выражения в секциях `ON`, `WHERE`, и `GROUP BY`, однако можно определить выражение в секции `SELECT` и затем использовать его через алиас в других секциях. ### Секция WHERE {#select-where} @@ -723,17 +729,16 @@ ClickHouse использует в выражении индексы, если Пример проверки на `NULL`: -```sql +``` sql SELECT * FROM t_null WHERE y IS NULL ``` -```text +``` text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ └───┴──────┘ ``` - ### Секция PREWHERE {#select-prewhere} Имеет такой же смысл, как и секция [WHERE](#select-where). Отличие состоит в том, какие данные читаются из таблицы. @@ -753,14 +758,14 @@ SELECT * FROM t_null WHERE y IS NULL Это одна из наиболее важных частей СУБД. -Секция GROUP BY, если есть, должна содержать список выражений. Каждое выражение далее будем называть "ключом". +Секция GROUP BY, если есть, должна содержать список выражений. Каждое выражение далее будем называть «ключом». При этом, все выражения в секциях SELECT, HAVING, ORDER BY, должны вычисляться из ключей или из агрегатных функций. То есть, каждый выбираемый из таблицы столбец, должен использоваться либо в ключах, либо внутри агрегатных функций. Если запрос содержит столбцы таблицы только внутри агрегатных функций, то секция GROUP BY может не указываться, и подразумевается агрегация по пустому набору ключей. Пример: -```sql +``` sql SELECT count(), median(FetchTiming > 60 ? 60 : FetchTiming), @@ -768,13 +773,13 @@ SELECT FROM hits ``` -Но, в отличие от стандартного SQL, если в таблице нет строк (вообще нет или после фильтрации с помощью WHERE), в качестве результата возвращается пустой результат, а не результат из одной строки, содержащий "начальные" значения агрегатных функций. +Но, в отличие от стандартного SQL, если в таблице нет строк (вообще нет или после фильтрации с помощью WHERE), в качестве результата возвращается пустой результат, а не результат из одной строки, содержащий «начальные» значения агрегатных функций. В отличие от MySQL (и в соответствии со стандартом SQL), вы не можете получить какое-нибудь значение некоторого столбца, не входящего в ключ или агрегатную функцию (за исключением константных выражений). Для обхода этого вы можете воспользоваться агрегатной функцией any (получить первое попавшееся значение) или min/max. Пример: -```sql +``` sql SELECT domainWithoutWWW(URL) AS domain, count(), @@ -789,7 +794,7 @@ GROUP BY вычисляет для каждого встретившегося Не поддерживается указание констант в качестве аргументов агрегатных функций. Пример: sum(1). Вместо этого, вы можете избавиться от констант. Пример: `count()`. -#### Обработка NULL +#### Обработка NULL {#obrabotka-null} При группировке, ClickHouse рассматривает [NULL](syntax.md) как значение, причём `NULL=NULL`. @@ -797,7 +802,7 @@ GROUP BY вычисляет для каждого встретившегося Пусть есть таблица: -```text +``` text ┌─x─┬────y─┐ │ 1 │ 2 │ │ 2 │ ᴺᵁᴸᴸ │ @@ -809,7 +814,7 @@ GROUP BY вычисляет для каждого встретившегося В результате запроса `SELECT sum(x), y FROM t_null_big GROUP BY y` мы получим: -```text +``` text ┌─sum(x)─┬────y─┐ │ 4 │ 2 │ │ 3 │ 3 │ @@ -821,24 +826,24 @@ GROUP BY вычисляет для каждого встретившегося Если в `GROUP BY` передать несколько ключей, то в результате мы получим все комбинации выборки, как если бы `NULL` был конкретным значением. 
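A minimal sketch of the last statement — several `GROUP BY` keys with `NULL` treated as a regular key value — reusing the `t_null_big` table from the example above (the exact result depends on the full contents of the table and is not shown here):

``` sql
-- Both x and y are grouping keys; rows where y IS NULL form their own (x, NULL)
-- combinations, exactly as if NULL were an ordinary key value.
SELECT count(), x, y
FROM t_null_big
GROUP BY x, y
```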
-#### Модификатор WITH TOTALS +#### Модификатор WITH TOTALS {#modifikator-with-totals} -Если указан модификатор WITH TOTALS, то будет посчитана ещё одна строчка, в которой в столбцах-ключах будут содержаться значения по умолчанию (нули, пустые строки), а в столбцах агрегатных функций - значения, посчитанные по всем строкам ("тотальные" значения). +Если указан модификатор WITH TOTALS, то будет посчитана ещё одна строчка, в которой в столбцах-ключах будут содержаться значения по умолчанию (нули, пустые строки), а в столбцах агрегатных функций - значения, посчитанные по всем строкам («тотальные» значения). Эта дополнительная строчка выводится в форматах JSON\*, TabSeparated\*, Pretty\* отдельно от остальных строчек. В остальных форматах эта строчка не выводится. В форматах JSON\* строчка выводится отдельным полем totals. В форматах TabSeparated\* строчка выводится после основного результата, и перед ней (после остальных данных) вставляется пустая строка. В форматах Pretty\* строчка выводится отдельной табличкой после основного результата. -`WITH TOTALS` может выполняться по-разному при наличии HAVING. Поведение зависит от настройки totals_mode. -По умолчанию `totals_mode = 'before_having'`. В этом случае totals считается по всем строчкам, включая непрошедших через HAVING и max_rows_to_group_by. +`WITH TOTALS` может выполняться по-разному при наличии HAVING. Поведение зависит от настройки totals\_mode. +По умолчанию `totals_mode = 'before_having'`. В этом случае totals считается по всем строчкам, включая непрошедших через HAVING и max\_rows\_to\_group\_by. Остальные варианты учитывают в totals только строчки, прошедшие через HAVING, и имеют разное поведение при наличии настройки `max_rows_to_group_by` и `group_by_overflow_mode = 'any'`. `after_having_exclusive` - не учитывать строчки, не прошедшие `max_rows_to_group_by`. То есть в totals попадёт меньше или столько же строчек, чем если бы `max_rows_to_group_by` не было. -`after_having_inclusive` - учитывать в totals все строчки, не прошедшие max_rows_to_group_by. То есть в totals попадёт больше или столько же строчек, чем если бы `max_rows_to_group_by` не было. +`after_having_inclusive` - учитывать в totals все строчки, не прошедшие max\_rows\_to\_group\_by. То есть в totals попадёт больше или столько же строчек, чем если бы `max_rows_to_group_by` не было. -`after_having_auto` - считать долю строчек, прошедших через HAVING. Если она больше некоторого значения (по умолчанию - 50%), то включить все строчки, не прошедшие max_rows_to_group_by в totals, иначе - не включить. +`after_having_auto` - считать долю строчек, прошедших через HAVING. Если она больше некоторого значения (по умолчанию - 50%), то включить все строчки, не прошедшие max\_rows\_to\_group\_by в totals, иначе - не включить. `totals_auto_threshold` - по умолчанию 0.5. Коэффициент для работы `after_having_auto`. @@ -849,11 +854,11 @@ GROUP BY вычисляет для каждого встретившегося #### GROUP BY во внешней памяти {#select-group-by-in-external-memory} Можно включить сброс временных данных на диск, чтобы ограничить потребление оперативной памяти при выполнении `GROUP BY`. -Настройка [max_bytes_before_external_group_by](../operations/settings/settings.md#settings-max_bytes_before_external_group_by) определяет пороговое значение потребления RAM, по достижении которого временные данные `GROUP BY` сбрасываются в файловую систему. Если равно 0 (по умолчанию) - значит выключено. 
+Настройка [max\_bytes\_before\_external\_group\_by](../operations/settings/settings.md#settings-max_bytes_before_external_group_by) определяет пороговое значение потребления RAM, по достижении которого временные данные `GROUP BY` сбрасываются в файловую систему. Если равно 0 (по умолчанию) - значит выключено. При использовании `max_bytes_before_external_group_by`, рекомендуем выставить `max_memory_usage` приблизительно в два раза больше. Это следует сделать, потому что агрегация выполняется в две стадии: чтение и формирование промежуточных данных (1) и слияние промежуточных данных (2). Сброс данных на файловую систему может производиться только на стадии 1. Если сброса временных данных не было, то на стадии 2 может потребляться до такого же объёма памяти, как на стадии 1. -Например, если [max_memory_usage](../operations/settings/settings.md#settings_max_memory_usage) было выставлено в 10000000000, и вы хотите использовать внешнюю агрегацию, то имеет смысл выставить `max_bytes_before_external_group_by` в 10000000000, а max_memory_usage в 20000000000. При срабатывании внешней агрегации (если был хотя бы один сброс временных данных в файловую систему) максимальное потребление оперативки будет лишь чуть-чуть больше `max_bytes_before_external_group_by`. +Например, если [max\_memory\_usage](../operations/settings/settings.md#settings_max_memory_usage) было выставлено в 10000000000, и вы хотите использовать внешнюю агрегацию, то имеет смысл выставить `max_bytes_before_external_group_by` в 10000000000, а max\_memory\_usage в 20000000000. При срабатывании внешней агрегации (если был хотя бы один сброс временных данных в файловую систему) максимальное потребление оперативки будет лишь чуть-чуть больше `max_bytes_before_external_group_by`. При распределённой обработке запроса внешняя агрегация производится на удалённых серверах. Для того чтобы на сервере-инициаторе запроса использовалось немного оперативки, нужно выставить настройку `distributed_aggregation_memory_efficient` в 1. @@ -863,14 +868,14 @@ GROUP BY вычисляет для каждого встретившегося Если есть `ORDER BY` с `LIMIT` после `GROUP BY`, то объём потребляемой RAM будет зависеть от объёма данных в `LIMIT`, а не во всей таблице. Однако, если `ORDER BY` используется без `LIMIT`, не забудьте включить внешнюю сортировку (`max_bytes_before_external_sort`). -### Секция LIMIT BY +### Секция LIMIT BY {#sektsiia-limit-by} Запрос с секцией `LIMIT n BY expressions` выбирает первые `n` строк для каждого отличного значения `expressions`. Ключ `LIMIT BY` может содержать любое количество [выражений](syntax.md#syntax-expressions). ClickHouse поддерживает следующий синтаксис: -- `LIMIT [offset_value, ]n BY expressions` -- `LIMIT n OFFSET offset_value BY expressions` +- `LIMIT [offset_value, ]n BY expressions` +- `LIMIT n OFFSET offset_value BY expressions` Во время обработки запроса, ClickHouse выбирает данные, упорядоченные по ключу сортировки. Ключ сортировки задаётся явно в секции [ORDER BY](#select-order-by) или неявно в свойствах движка таблицы. Затем ClickHouse применяет `LIMIT n BY expressions` и возвращает первые `n` для каждой отличной комбинации `expressions`. Если указан `OFFSET`, то для каждого блока данных, который принадлежит отдельной комбинации `expressions`, ClickHouse отступает `offset_value` строк от начала блока и возвращает не более `n`. Если `offset_value` больше, чем количество строк в блоке данных, ClickHouse не возвращает ни одной строки. 
@@ -880,18 +885,18 @@ ClickHouse поддерживает следующий синтаксис: Образец таблицы: -```sql +``` sql CREATE TABLE limit_by(id Int, val Int) ENGINE = Memory; INSERT INTO limit_by values(1, 10), (1, 11), (1, 12), (2, 20), (2, 21); ``` Запросы: -```sql +``` sql SELECT * FROM limit_by ORDER BY id, val LIMIT 2 BY id ``` -```text +``` text ┌─id─┬─val─┐ │ 1 │ 10 │ │ 1 │ 11 │ @@ -900,11 +905,11 @@ SELECT * FROM limit_by ORDER BY id, val LIMIT 2 BY id └────┴─────┘ ``` -```sql +``` sql SELECT * FROM limit_by ORDER BY id, val LIMIT 1, 2 BY id ``` -```text +``` text ┌─id─┬─val─┐ │ 1 │ 11 │ │ 1 │ 12 │ @@ -916,7 +921,7 @@ SELECT * FROM limit_by ORDER BY id, val LIMIT 1, 2 BY id Следующий запрос выбирает топ 5 рефереров для каждой пары `domain, device_type`, но не более 100 строк (`LIMIT n BY + LIMIT`). -```sql +``` sql SELECT domainWithoutWWW(URL) AS domain, domainWithoutWWW(REFERRER_URL) AS referrer, @@ -933,13 +938,12 @@ LIMIT 100 `LIMIT n BY` работает с [NULL](syntax.md) как если бы это было конкретное значение. Т.е. в результате запроса пользователь получит все комбинации полей, указанных в `BY`. -### Секция HAVING +### Секция HAVING {#sektsiia-having} Позволяет отфильтровать результат, полученный после GROUP BY, аналогично секции WHERE. WHERE и HAVING отличаются тем, что WHERE выполняется до агрегации (GROUP BY), а HAVING - после. Если агрегации не производится, то HAVING использовать нельзя. - ### Секция ORDER BY {#select-order-by} Секция ORDER BY содержит список выражений, к каждому из которых также может быть приписано DESC или ASC (направление сортировки). Если ничего не приписано - это аналогично приписыванию ASC. ASC - сортировка по возрастанию, DESC - сортировка по убыванию. Обозначение направления сортировки действует на одно выражение, а не на весь список. Пример: `ORDER BY Visits DESC, SearchPhrase` @@ -953,15 +957,15 @@ WHERE и HAVING отличаются тем, что WHERE выполняется Порядок сортировки `NaN` и `NULL`: -- С модификатором `NULLS FIRST` — Сначала `NULL`, затем `NaN`, затем остальные значения. -- С модификатором `NULLS LAST` — Сначала значения, затем `NaN`, затем `NULL`. -- По умолчанию — Как с модификатором `NULLS LAST`. +- С модификатором `NULLS FIRST` — Сначала `NULL`, затем `NaN`, затем остальные значения. +- С модификатором `NULLS LAST` — Сначала значения, затем `NaN`, затем `NULL`. +- По умолчанию — Как с модификатором `NULLS LAST`. Пример: Для таблицы -```text +``` text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 2 │ @@ -978,7 +982,7 @@ WHERE и HAVING отличаются тем, что WHERE выполняется Выполним запрос `SELECT * FROM t_null_nan ORDER BY y NULLS FIRST`, получим: -```text +``` text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 7 │ ᴺᵁᴸᴸ │ @@ -995,9 +999,9 @@ WHERE и HAVING отличаются тем, что WHERE выполняется Если кроме ORDER BY указан также не слишком большой LIMIT, то расходуется меньше оперативки. Иначе расходуется количество памяти, пропорциональное количеству данных для сортировки. При распределённой обработке запроса, если отсутствует GROUP BY, сортировка частично делается на удалённых серверах, а на сервере-инициаторе запроса производится слияние результатов. Таким образом, при распределённой сортировке, может сортироваться объём данных, превышающий размер памяти на одном сервере. -Существует возможность выполнять сортировку во внешней памяти (с созданием временных файлов на диске), если оперативной памяти не хватает. Для этого предназначена настройка `max_bytes_before_external_sort`. Если она выставлена в 0 (по умолчанию), то внешняя сортировка выключена. 
Если она включена, то при достижении объёмом данных для сортировки указанного количества байт, накопленные данные будут отсортированы и сброшены во временный файл. После того, как все данные будут прочитаны, будет произведено слияние всех сортированных файлов и выдача результата. Файлы записываются в директорию /var/lib/clickhouse/tmp/ (по умолчанию, может быть изменено с помощью параметра tmp_path) в конфиге. +Существует возможность выполнять сортировку во внешней памяти (с созданием временных файлов на диске), если оперативной памяти не хватает. Для этого предназначена настройка `max_bytes_before_external_sort`. Если она выставлена в 0 (по умолчанию), то внешняя сортировка выключена. Если она включена, то при достижении объёмом данных для сортировки указанного количества байт, накопленные данные будут отсортированы и сброшены во временный файл. После того, как все данные будут прочитаны, будет произведено слияние всех сортированных файлов и выдача результата. Файлы записываются в директорию /var/lib/clickhouse/tmp/ (по умолчанию, может быть изменено с помощью параметра tmp\_path) в конфиге. -На выполнение запроса может расходоваться больше памяти, чем max_bytes_before_external_sort. Поэтому, значение этой настройки должно быть существенно меньше, чем max_memory_usage. Для примера, если на вашем сервере 128 GB оперативки, и вам нужно выполнить один запрос, то выставите max_memory_usage в 100 GB, а max_bytes_before_external_sort в 80 GB. +На выполнение запроса может расходоваться больше памяти, чем max\_bytes\_before\_external\_sort. Поэтому, значение этой настройки должно быть существенно меньше, чем max\_memory\_usage. Для примера, если на вашем сервере 128 GB оперативки, и вам нужно выполнить один запрос, то выставите max\_memory\_usage в 100 GB, а max\_bytes\_before\_external\_sort в 80 GB. Внешняя сортировка работает существенно менее эффективно, чем сортировка в оперативке. @@ -1010,22 +1014,23 @@ WHERE и HAVING отличаются тем, что WHERE выполняется Чтобы включить в результат несколько столбцов, выбрав их имена с помощью регулярных выражений [re2](https://en.wikipedia.org/wiki/RE2_(software)), используйте выражение `COLUMNS`. -```sql +``` sql COLUMNS('regexp') ``` Например, рассмотрим таблицу: -```sql +``` sql CREATE TABLE default.col_names (aa Int8, ab Int8, bc Int8) ENGINE = TinyLog ``` Следующий запрос выбирает данные из всех столбцов, содержащих в имени символ `a`. -```sql +``` sql SELECT COLUMNS('a') FROM col_names ``` -```text + +``` text ┌─aa─┬─ab─┐ │ 1 │ 1 │ └────┴────┘ @@ -1037,10 +1042,11 @@ SELECT COLUMNS('a') FROM col_names Например: -```sql +``` sql SELECT COLUMNS('a'), COLUMNS('c'), toTypeName(COLUMNS('c')) FROM col_names ``` -```text + +``` text ┌─aa─┬─ab─┬─bc─┬─toTypeName(bc)─┐ │ 1 │ 1 │ 1 │ Int8 │ └────┴────┴────┴────────────────┘ @@ -1050,12 +1056,13 @@ SELECT COLUMNS('a'), COLUMNS('c'), toTypeName(COLUMNS('c')) FROM col_names Например: -```sql +``` sql SELECT COLUMNS('a') + COLUMNS('c') FROM col_names ``` -```text + +``` text Received exception from server (version 19.14.1): -Code: 42. DB::Exception: Received from localhost:9000. DB::Exception: Number of arguments for function plus doesn't match: passed 3, should be 2. +Code: 42. DB::Exception: Received from localhost:9000. DB::Exception: Number of arguments for function plus doesn't match: passed 3, should be 2. ``` В этом примере, `COLUMNS('a')` возвращает два столбца: `aa` и `ab`. `COLUMNS('c')` возвращает столбец `bc`. 
Оператор `+` не работает с тремя аргументами, поэтому ClickHouse генерирует исключение с соответствущим сообщением. @@ -1067,9 +1074,9 @@ Code: 42. DB::Exception: Received from localhost:9000. DB::Exception: Number of Если указано `DISTINCT`, то из всех множеств полностью совпадающих строк результата, будет оставляться только одна строка. Результат выполнения будет таким же, как если указано `GROUP BY` по всем указанным полям в `SELECT` и не указаны агрегатные функции. Но имеется несколько отличий от `GROUP BY`: -- `DISTINCT` может применяться совместно с `GROUP BY`; -- при отсутствии `ORDER BY` и наличии `LIMIT`, запрос прекратит выполнение сразу после того, как будет прочитано необходимое количество различных строк - в этом случае использование DISTINCT существенно более оптимально; -- блоки данных будут выдаваться по мере их обработки, не дожидаясь выполнения всего запроса. +- `DISTINCT` может применяться совместно с `GROUP BY`; +- при отсутствии `ORDER BY` и наличии `LIMIT`, запрос прекратит выполнение сразу после того, как будет прочитано необходимое количество различных строк - в этом случае использование DISTINCT существенно более оптимально; +- блоки данных будут выдаваться по мере их обработки, не дожидаясь выполнения всего запроса. `DISTINCT` не поддерживается, если в `SELECT` присутствует хотя бы один столбец типа массив. @@ -1079,7 +1086,7 @@ ClickHouse поддерживает использование в одном з Таблица для примера: -```text +``` text ┌─a─┬─b─┐ │ 2 │ 1 │ │ 1 │ 2 │ @@ -1090,7 +1097,7 @@ ClickHouse поддерживает использование в одном з При выборке данных запросом `SELECT DISTINCT a FROM t1 ORDER BY b ASC`, мы получаем следующий результат: -```text +``` text ┌─a─┐ │ 2 │ │ 1 │ @@ -1100,7 +1107,7 @@ ClickHouse поддерживает использование в одном з Если изменить направление сортировки `SELECT DISTINCT a FROM t1 ORDER BY b DESC`, то результат получается следующий: -```text +``` text ┌─a─┐ │ 3 │ │ 1 │ @@ -1112,7 +1119,7 @@ ClickHouse поддерживает использование в одном з Учитывайте эту особенность реализации при программировании запросов. -### Секция LIMIT +### Секция LIMIT {#sektsiia-limit} `LIMIT m` позволяет выбрать из результата первые `m` строк. @@ -1122,11 +1129,11 @@ ClickHouse поддерживает использование в одном з При отсутствии секции `ORDER BY`, однозначно сортирующей результат, результат может быть произвольным и может являться недетерминированным. -### Секция UNION ALL +### Секция UNION ALL {#sektsiia-union-all} Произвольное количество запросов может быть объединено с помощью `UNION ALL`. Пример: -```sql +``` sql SELECT CounterID, 1 AS table, toInt64(count()) AS c FROM test.hits GROUP BY CounterID @@ -1159,12 +1166,11 @@ SELECT CounterID, 2 AS table, sum(Sign) AS c При указании FORMAT format вы можете получить данные в любом указанном формате. Это может использоваться для удобства или для создания дампов. -Подробнее смотрите раздел "Форматы". +Подробнее смотрите раздел «Форматы». Если секция FORMAT отсутствует, то используется формат по умолчанию, который зависит от используемого интерфейса для доступа к БД и от настроек. Для HTTP интерфейса, а также для клиента командной строки, используемого в batch-режиме, по умолчанию используется формат TabSeparated. Для клиента командной строки, используемого в интерактивном режиме, по умолчанию используется формат PrettyCompact (прикольные таблички, компактные). При использовании клиента командной строки данные на клиент передаются во внутреннем эффективном формате. 
При этом клиент самостоятельно интерпретирует секцию FORMAT запроса и форматирует данные на своей стороне (снимая нагрузку на сеть и сервер). - ### Операторы IN {#select-in-operators} Операторы `IN`, `NOT IN`, `GLOBAL IN`, `GLOBAL NOT IN` рассматриваются отдельно, так как их функциональность достаточно богатая. @@ -1173,18 +1179,17 @@ SELECT CounterID, 2 AS table, sum(Sign) AS c Примеры: -```sql +``` sql SELECT UserID IN (123, 456) FROM ... SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... ``` Если слева стоит один столбец, входящий в индекс, а справа - множество констант, то при выполнении запроса, система воспользуется индексом. -Не перечисляйте слишком большое количество значений (миллионы) явно. Если множество большое - лучше загрузить его во временную таблицу (например, смотрите раздел "Внешние данные для обработки запроса"), и затем воспользоваться подзапросом. +Не перечисляйте слишком большое количество значений (миллионы) явно. Если множество большое - лучше загрузить его во временную таблицу (например, смотрите раздел «Внешние данные для обработки запроса»), и затем воспользоваться подзапросом. В качестве правой части оператора может быть множество константных выражений, множество кортежей с константными выражениями (показано в примерах выше), а также имя таблицы или подзапрос SELECT в скобках. - Если в качестве правой части оператора указано имя таблицы (например, `UserID IN users`), то это эквивалентно подзапросу `UserID IN (SELECT * FROM users)`. Это используется при работе с внешними данными, отправляемым вместе с запросом. Например, вместе с запросом может быть отправлено множество идентификаторов посетителей, загруженное во временную таблицу users, по которому следует выполнить фильтрацию. Если в качестве правой части оператора, указано имя таблицы, имеющий движок Set (подготовленное множество, постоянно находящееся в оперативке), то множество не будет создаваться заново при каждом запросе. @@ -1192,7 +1197,7 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... В подзапросе может быть указано более одного столбца для фильтрации кортежей. Пример: -```sql +``` sql SELECT (CounterID, UserID) IN (SELECT CounterID, UserID FROM ...) FROM ... ``` @@ -1201,7 +1206,7 @@ SELECT (CounterID, UserID) IN (SELECT CounterID, UserID FROM ...) FROM ... Оператор IN и подзапрос могут встречаться в любой части запроса, в том числе в агрегатных и лямбда функциях. Пример: -```sql +``` sql SELECT EventDate, avg(UserID IN @@ -1215,7 +1220,7 @@ GROUP BY EventDate ORDER BY EventDate ASC ``` -```text +``` text ┌──EventDate─┬────ratio─┐ │ 2014-03-17 │ 1 │ │ 2014-03-18 │ 0.807696 │ @@ -1230,14 +1235,13 @@ ORDER BY EventDate ASC за каждый день после 17 марта считаем долю хитов, сделанных посетителями, которые заходили на сайт 17 марта. Подзапрос в секции IN на одном сервере всегда выполняется только один раз. Зависимых подзапросов не существует. - -#### Обработка NULL +#### Обработка NULL {#obrabotka-null-1} При обработке запроса оператор IN будет считать, что результат операции с [NULL](syntax.md) всегда равен `0`, независимо от того, находится `NULL` в правой или левой части оператора. Значения `NULL` не входят ни в какое множество, не соответствуют друг другу и не могут сравниваться. 
Рассмотрим для примера таблицу `t_null`: -```text +``` text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -1246,7 +1250,7 @@ ORDER BY EventDate ASC При выполнении запроса `SELECT x FROM t_null WHERE y IN (NULL,3)` получим следующий результат: -```text +``` text ┌─x─┐ │ 2 │ └───┘ @@ -1254,23 +1258,23 @@ ORDER BY EventDate ASC Видно, что строка, в которой `y = NULL`, выброшена из результатов запроса. Это произошло потому, что ClickHouse не может решить входит ли `NULL` в множество `(NULL,3)`, возвращает результат операции `0`, а `SELECT` выбрасывает эту строку из финальной выдачи. -```sql +``` sql SELECT y IN (NULL, 3) FROM t_null ``` -```text + +``` text ┌─in(y, tuple(NULL, 3))─┐ │ 0 │ │ 1 │ └───────────────────────┘ ``` - #### Распределённые подзапросы {#select-distributed-subqueries} Существует два варианта IN-ов с подзапросами (аналогично для JOIN-ов): обычный `IN` / `JOIN` и `GLOBAL IN` / `GLOBAL JOIN`. Они отличаются способом выполнения при распределённой обработке запроса. -!!! attention +!!! attention "Attention" Помните, что алгоритмы, описанные ниже, могут работать иначе в зависимости от [настройки](../operations/settings/settings.md) `distributed_product_mode`. При использовании обычного IN-а, запрос отправляется на удалённые серверы, и на каждом из них выполняются подзапросы в секциях `IN` / `JOIN`. @@ -1281,19 +1285,19 @@ FROM t_null Следует быть внимательным при использовании подзапросов в секции `IN` / `JOIN` в случае распределённой обработки запроса. -Рассмотрим это на примерах. Пусть на каждом сервере кластера есть обычная таблица **local_table**. Пусть также есть таблица **distributed_table** типа **Distributed**, которая смотрит на все серверы кластера. +Рассмотрим это на примерах. Пусть на каждом сервере кластера есть обычная таблица **local\_table**. Пусть также есть таблица **distributed\_table** типа **Distributed**, которая смотрит на все серверы кластера. -При запросе к распределённой таблице **distributed_table**, запрос будет отправлен на все удалённые серверы, и на них будет выполнен с использованием таблицы **local_table**. +При запросе к распределённой таблице **distributed\_table**, запрос будет отправлен на все удалённые серверы, и на них будет выполнен с использованием таблицы **local\_table**. Например, запрос -```sql +``` sql SELECT uniq(UserID) FROM distributed_table ``` будет отправлен на все удалённые серверы в виде -```sql +``` sql SELECT uniq(UserID) FROM local_table ``` @@ -1301,37 +1305,37 @@ SELECT uniq(UserID) FROM local_table Теперь рассмотрим запрос с IN-ом: -```sql +``` sql SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34) ``` -- расчёт пересечения аудиторий двух сайтов. +- расчёт пересечения аудиторий двух сайтов. Этот запрос будет отправлен на все удалённые серверы в виде -```sql +``` sql SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34) ``` То есть, множество в секции IN будет собрано на каждом сервере независимо, только по тем данным, которые есть локально на каждом из серверов. -Это будет работать правильно и оптимально, если вы предусмотрели такой случай, и раскладываете данные по серверам кластера таким образом, чтобы данные одного UserID-а лежали только на одном сервере. В таком случае все необходимые данные будут присутствовать на каждом сервере локально. В противном случае результат будет посчитан неточно. Назовём этот вариант запроса "локальный IN". 
+Это будет работать правильно и оптимально, если вы предусмотрели такой случай, и раскладываете данные по серверам кластера таким образом, чтобы данные одного UserID-а лежали только на одном сервере. В таком случае все необходимые данные будут присутствовать на каждом сервере локально. В противном случае результат будет посчитан неточно. Назовём этот вариант запроса «локальный IN». -Чтобы исправить работу запроса, когда данные размазаны по серверам кластера произвольным образом, можно было бы указать **distributed_table** внутри подзапроса. Запрос будет выглядеть так: +Чтобы исправить работу запроса, когда данные размазаны по серверам кластера произвольным образом, можно было бы указать **distributed\_table** внутри подзапроса. Запрос будет выглядеть так: -```sql +``` sql SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) ``` Этот запрос будет отправлен на все удалённые серверы в виде -```sql +``` sql SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) ``` На каждом удалённом сервере начнёт выполняться подзапрос. Так как в подзапросе используется распределённая таблица, то подзапрос будет, на каждом удалённом сервере, снова отправлен на каждый удалённый сервер, в виде -```sql +``` sql SELECT UserID FROM local_table WHERE CounterID = 34 ``` @@ -1339,19 +1343,19 @@ SELECT UserID FROM local_table WHERE CounterID = 34 В таких случаях всегда следует использовать GLOBAL IN вместо IN. Рассмотрим его работу для запроса -```sql +``` sql SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID GLOBAL IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) ``` На сервере-инициаторе запроса будет выполнен подзапрос -```sql +``` sql SELECT UserID FROM distributed_table WHERE CounterID = 34 ``` , и результат будет сложен во временную таблицу в оперативке. Затем запрос будет отправлен на каждый удалённый сервер в виде -```sql +``` sql SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID GLOBAL IN _data1 ``` @@ -1359,25 +1363,25 @@ SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID GLOBAL Это гораздо более оптимально, чем при использовании обычного IN. Но при этом, следует помнить о нескольких вещах: -1. При создании временной таблицы данные не уникализируются. Чтобы уменьшить объём передаваемых по сети данных, укажите в подзапросе DISTINCT (для обычного IN-а этого делать не нужно). -2. Временная таблица будет передана на все удалённые серверы. Передача не учитывает топологию сети. Например, если 10 удалённых серверов расположены в удалённом относительно сервера-инициатора запроса дата-центре, то по каналу в удалённый дата-центр данные будет переданы 10 раз. Старайтесь не использовать большие множества при использовании GLOBAL IN. -3. При передаче данных на удалённые серверы не настраивается ограничение использования сетевой полосы. Вы можете перегрузить сеть. -4. Старайтесь распределять данные по серверам так, чтобы в GLOBAL IN-ах не было частой необходимости. -5. Если в GLOBAL IN есть частая необходимость, то спланируйте размещение кластера ClickHouse таким образом, чтобы в каждом дата-центре была хотя бы одна реплика каждого шарда, и среди них была быстрая сеть - чтобы запрос целиком можно было бы выполнить, передавая данные в пределах одного дата-центра. +1. При создании временной таблицы данные не уникализируются. 
Чтобы уменьшить объём передаваемых по сети данных, укажите в подзапросе DISTINCT (для обычного IN-а этого делать не нужно). +2. Временная таблица будет передана на все удалённые серверы. Передача не учитывает топологию сети. Например, если 10 удалённых серверов расположены в удалённом относительно сервера-инициатора запроса дата-центре, то по каналу в удалённый дата-центр данные будет переданы 10 раз. Старайтесь не использовать большие множества при использовании GLOBAL IN. +3. При передаче данных на удалённые серверы не настраивается ограничение использования сетевой полосы. Вы можете перегрузить сеть. +4. Старайтесь распределять данные по серверам так, чтобы в GLOBAL IN-ах не было частой необходимости. +5. Если в GLOBAL IN есть частая необходимость, то спланируйте размещение кластера ClickHouse таким образом, чтобы в каждом дата-центре была хотя бы одна реплика каждого шарда, и среди них была быстрая сеть - чтобы запрос целиком можно было бы выполнить, передавая данные в пределах одного дата-центра. В секции `GLOBAL IN` также имеет смысл указывать локальную таблицу - в случае, если эта локальная таблица есть только на сервере-инициаторе запроса, и вы хотите воспользоваться данными из неё на удалённых серверах. -### Экстремальные значения +### Экстремальные значения {#ekstremalnye-znacheniia} Вы можете получить в дополнение к результату также минимальные и максимальные значения по столбцам результата. Для этого выставите настройку **extremes** в 1. Минимумы и максимумы считаются для числовых типов, дат, дат-с-временем. Для остальных столбцов будут выведены значения по умолчанию. Вычисляются дополнительные две строчки - минимумы и максимумы, соответственно. Эти две дополнительные строки выводятся в [форматах](../interfaces/formats.md) `JSON*`, `TabSeparated*`, и `Pretty*` отдельно от остальных строчек. В остальных форматах они не выводится. -Во форматах `JSON*`, экстремальные значения выводятся отдельным полем 'extremes'. В форматах `TabSeparated*`, строка выводится после основного результата и после 'totals' если есть. Перед ней (после остальных данных) вставляется пустая строка. В форматах `Pretty*`, строка выводится отдельной таблицей после основного результата и после `totals` если есть. +Во форматах `JSON*`, экстремальные значения выводятся отдельным полем ‘extremes’. В форматах `TabSeparated*`, строка выводится после основного результата и после ‘totals’ если есть. Перед ней (после остальных данных) вставляется пустая строка. В форматах `Pretty*`, строка выводится отдельной таблицей после основного результата и после `totals` если есть. -Экстремальные значения вычисляются для строк перед `LIMIT`, но после `LIMIT BY`. Однако при использовании `LIMIT offset, size`, строки перед `offset` включаются в `extremes`. В потоковых запросах, в результате может учитываться также небольшое количество строчек, прошедших `LIMIT`. +Экстремальные значения вычисляются для строк перед `LIMIT`, но после `LIMIT BY`. Однако при использовании `LIMIT offset, size`, строки перед `offset` включаются в `extremes`. В потоковых запросах, в результате может учитываться также небольшое количество строчек, прошедших `LIMIT`. -### Замечания +### Замечания {#zamechaniia} В секциях `GROUP BY`, `ORDER BY`, в отличие от диалекта MySQL, и в соответствии со стандартным SQL, не поддерживаются позиционные аргументы. Например, если вы напишите `GROUP BY 1, 2` - то это будет воспринято, как группировка по константам (то есть, агрегация всех строк в одну). 
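Returning to the extreme values described a few paragraphs above, here is a minimal sketch of enabling them for the `t_null_nan` table used in the `ORDER BY` example; the extra `extremes` rows are shown only in the `JSON*`, `TabSeparated*` and `Pretty*` formats:

``` sql
SET extremes = 1;

-- In Pretty* formats the minimum and maximum of the numeric columns are printed
-- as a separate small table after the main result (and after totals, if any).
SELECT x, y FROM t_null_nan ORDER BY y NULLS LAST FORMAT PrettyCompact;
```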
@@ -1386,11 +1390,11 @@ SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID GLOBAL В любом месте запроса, вместо выражения, может стоять звёздочка. При анализе запроса звёздочка раскрывается в список всех столбцов таблицы (за исключением `MATERIALIZED` и `ALIAS` столбцов). Есть лишь немного случаев, когда оправдано использовать звёздочку: -- при создании дампа таблицы; -- для таблиц, содержащих всего несколько столбцов - например, системных таблиц; -- для получения информации о том, какие столбцы есть в таблице; в этом случае, укажите `LIMIT 1`. Но лучше используйте запрос `DESC TABLE`; -- при наличии сильной фильтрации по небольшому количеству столбцов с помощью `PREWHERE`; -- в подзапросах (так как из подзапросов выкидываются столбцы, не нужные для внешнего запроса). +- при создании дампа таблицы; +- для таблиц, содержащих всего несколько столбцов - например, системных таблиц; +- для получения информации о том, какие столбцы есть в таблице; в этом случае, укажите `LIMIT 1`. Но лучше используйте запрос `DESC TABLE`; +- при наличии сильной фильтрации по небольшому количеству столбцов с помощью `PREWHERE`; +- в подзапросах (так как из подзапросов выкидываются столбцы, не нужные для внешнего запроса). В других случаях использование звёздочки является издевательством над системой, так как вместо преимуществ столбцовой СУБД вы получаете недостатки. То есть использовать звёздочку не рекомендуется. diff --git a/docs/ru/query_language/show.md b/docs/ru/query_language/show.md index 680b42809c5..03f99378ccc 100644 --- a/docs/ru/query_language/show.md +++ b/docs/ru/query_language/show.md @@ -1,8 +1,8 @@ -# SHOW Queries +# SHOW Queries {#show-queries} -## SHOW CREATE TABLE +## SHOW CREATE TABLE {#show-create-table} -```sql +``` sql SHOW CREATE [TEMPORARY] [TABLE|DICTIONARY] [db.]table [INTO OUTFILE filename] [FORMAT format] ``` @@ -10,16 +10,16 @@ SHOW CREATE [TEMPORARY] [TABLE|DICTIONARY] [db.]table [INTO OUTFILE filename] [F ## SHOW DATABASES {#show-databases} -```sql +``` sql SHOW DATABASES [INTO OUTFILE filename] [FORMAT format] ``` Выводит список всех баз данных. Запрос полностью аналогичен запросу `SELECT name FROM system.databases [INTO OUTFILE filename] [FORMAT format]`. -## SHOW PROCESSLIST +## SHOW PROCESSLIST {#show-processlist} -```sql +``` sql SHOW PROCESSLIST [INTO OUTFILE filename] [FORMAT format] ``` @@ -29,15 +29,15 @@ SHOW PROCESSLIST [INTO OUTFILE filename] [FORMAT format] Полезный совет (выполните в консоли): -```bash +``` bash $ watch -n1 "clickhouse-client --query='SHOW PROCESSLIST'" ``` -## SHOW TABLES +## SHOW TABLES {#show-tables} Выводит список таблиц. -```sql +``` sql SHOW [TEMPORARY] TABLES [FROM ] [LIKE ''] [LIMIT ] [INTO OUTFILE ] [FORMAT ] ``` @@ -45,7 +45,7 @@ SHOW [TEMPORARY] TABLES [FROM ] [LIKE ''] [LIMIT ] [INTO OUTFILE Результат, идентичный тому, что выдаёт запрос `SHOW TABLES` можно получить также запросом следующего вида: -```sql +``` sql SELECT name FROM system.tables WHERE database = [AND name LIKE ] [LIMIT ] [INTO OUTFILE ] [FORMAT ] ``` @@ -53,21 +53,22 @@ SELECT name FROM system.tables WHERE database = [AND name LIKE ] [ Следующий запрос выбирает первые две строки из списка таблиц в базе данных `system`, чьи имена содержат `co`. 
-```sql +``` sql SHOW TABLES FROM system LIKE '%co%' LIMIT 2 ``` -```text + +``` text ┌─name───────────────────────────┐ │ aggregate_function_combinators │ │ collations │ └────────────────────────────────┘ ``` -## SHOW DICTIONARIES +## SHOW DICTIONARIES {#show-dictionaries} Выводит список [внешних словарей](dicts/external_dicts.md). -```sql +``` sql SHOW DICTIONARIES [FROM ] [LIKE ''] [LIMIT ] [INTO OUTFILE ] [FORMAT ] ``` @@ -75,7 +76,7 @@ SHOW DICTIONARIES [FROM ] [LIKE ''] [LIMIT ] [INTO OUTFILE [AND name LIKE ] [LIMIT ] [INTO OUTFILE ] [FORMAT ] ``` @@ -83,10 +84,11 @@ SELECT name FROM system.dictionaries WHERE database = [AND name LIKE ``` -### FLUSH DISTRIBUTED {#query_language-system-flush-distributed} +### FLUSH DISTRIBUTED {#query-language-system-flush-distributed} В синхронном режиме отправляет все данные на узлы кластера. Если какие-либо узлы недоступны, ClickHouse генерирует исключение и останавливает выполнение запроса. Такой запрос можно повторять до успешного завершения, что будет означать возвращение связанности с остальными узлами кластера. -```sql +``` sql SYSTEM FLUSH DISTRIBUTED [db.] ``` -### START DISTRIBUTED SENDS {#query_language-system-start-distributed-sends} +### START DISTRIBUTED SENDS {#query-language-system-start-distributed-sends} Включает фоновую отправку при вставке данных в распределенные таблицы. -```sql +``` sql SYSTEM START DISTRIBUTED SENDS [db.] ``` -### STOP MERGES {#query_language-system-stop-merges} +### STOP MERGES {#query-language-system-stop-merges} Позволяет остановить фоновые мержи для таблиц семейства MergeTree: -```sql +``` sql SYSTEM STOP MERGES [[db.]merge_tree_family_table_name] ``` + !!! note "Note" `DETACH / ATTACH` таблицы восстанавливает фоновые мержи для этой таблицы (даже в случае отключения фоновых мержей для всех таблиц семейства MergeTree до `DETACH`). - -### START MERGES {#query_language-system-start-merges} +### START MERGES {#query-language-system-start-merges} Включает фоновые мержи для таблиц семейства MergeTree: -```sql +``` sql SYSTEM START MERGES [[db.]merge_tree_family_table_name] ``` [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/system/) - diff --git a/docs/ru/query_language/table_functions/file.md b/docs/ru/query_language/table_functions/file.md index 0c51024dd33..e61fddadd2d 100644 --- a/docs/ru/query_language/table_functions/file.md +++ b/docs/ru/query_language/table_functions/file.md @@ -1,17 +1,16 @@ - -# file +# file {#file} Создаёт таблицу из файла. Данная табличная функция похожа на табличные функции [file](file.md) и [hdfs](hdfs.md). -```sql +``` sql file(path, format, structure) ``` **Входные параметры** -- `path` — относительный путь до файла от [user_files_path](../../operations/server_settings/settings.md#server_settings-user_files_path). Путь к файлу поддерживает следующие шаблоны в режиме доступа только для чтения `*`, `?`, `{abc,def}` и `{N..M}`, где `N`, `M` — числа, ``'abc', 'def'` — строки. -- `format` — [формат](../../interfaces/formats.md#formats) файла. -- `structure` — структура таблицы. Формат `'colunmn1_name column1_ype, column2_name column2_type, ...'`. +- `path` — относительный путь до файла от [user\_files\_path](../../operations/server_settings/settings.md#server_settings-user_files_path). Путь к файлу поддерживает следующие шаблоны в режиме доступа только для чтения `*`, `?`, `{abc,def}` и `{N..M}`, где `N`, `M` — числа, \``'abc', 'def'` — строки. +- `format` — [формат](../../interfaces/formats.md#formats) файла. +- `structure` — структура таблицы. 
Формат `'colunmn1_name column1_ype, column2_name column2_type, ...'`. **Возвращаемое значение** @@ -21,7 +20,7 @@ file(path, format, structure) Настройка `user_files_path` и содержимое файла `test.csv`: -```bash +``` bash $ grep user_files_path /etc/clickhouse-server/config.xml /var/lib/clickhouse/user_files/ @@ -33,12 +32,13 @@ $ cat /var/lib/clickhouse/user_files/test.csv Таблица из `test.csv` и выборка первых двух строк из неё: -```sql +``` sql SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 2 ``` -```text + +``` text ┌─column1─┬─column2─┬─column3─┐ │ 1 │ 2 │ 3 │ │ 3 │ 2 │ 1 │ @@ -47,57 +47,61 @@ LIMIT 2 Шаблоны могут содержаться в нескольких компонентах пути. Обрабатываются только существующие файлы, название которых целиком удовлетворяет шаблону (не только суффиксом или префиксом). -- `*` — Заменяет любое количество любых символов кроме `/`, включая отсутствие символов. -- `?` — Заменяет ровно один любой символ. -- `{some_string,another_string,yet_another_one}` — Заменяет любую из строк `'some_string', 'another_string', 'yet_another_one'`. -- `{N..M}` — Заменяет любое число в интервале от `N` до `M` включительно. +- `*` — Заменяет любое количество любых символов кроме `/`, включая отсутствие символов. +- `?` — Заменяет ровно один любой символ. +- `{some_string,another_string,yet_another_one}` — Заменяет любую из строк `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Заменяет любое число в интервале от `N` до `M` включительно. Конструкция с `{}` аналогична табличной функции [remote](remote.md). **Пример** -1. Предположим у нас есть несколько файлов со следующими относительными путями: +1. Предположим у нас есть несколько файлов со следующими относительными путями: -- 'some_dir/some_file_1' -- 'some_dir/some_file_2' -- 'some_dir/some_file_3' -- 'another_dir/some_file_1' -- 'another_dir/some_file_2' -- 'another_dir/some_file_3' +- ‘some\_dir/some\_file\_1’ +- ‘some\_dir/some\_file\_2’ +- ‘some\_dir/some\_file\_3’ +- ‘another\_dir/some\_file\_1’ +- ‘another\_dir/some\_file\_2’ +- ‘another\_dir/some\_file\_3’ -2. Запросим количество строк в этих файлах: +1. Запросим количество строк в этих файлах: -```sql + + +``` sql SELECT count(*) FROM file('{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32') ``` -3. Запросим количество строк во всех файлах этих двух директорий: +1. Запросим количество строк во всех файлах этих двух директорий: -```sql + + +``` sql SELECT count(*) FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32') ``` -!!! warning +!!! warning "Warning" Если ваш список файлов содержит интервал с ведущими нулями, используйте конструкцию с фигурными скобками для каждой цифры по отдельности или используйте `?`. **Пример** -Запрос данных из файлов с именами `file000`, `file001`, ... , `file999`: +Запрос данных из файлов с именами `file000`, `file001`, … , `file999`: -```sql +``` sql SELECT count(*) FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32') ``` -## Виртуальные столбцы +## Виртуальные столбцы {#virtualnye-stolbtsy} -- `_path` — Путь к файлу. -- `_file` — Имя файла. +- `_path` — Путь к файлу. +- `_file` — Имя файла. 
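A small sketch of using these virtual columns together with the glob patterns described above; the `some_dir` layout is the hypothetical one from the earlier example:

``` sql
-- Count the rows contributed by each source file; _path and _file are filled in
-- by the file() table function itself.
SELECT _path, _file, count() AS cnt
FROM file('some_dir/*', 'TSV', 'name String, value UInt32')
GROUP BY _path, _file
```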
**Смотрите также** -- [Виртуальные столбцы](index.md#table_engines-virtual_columns) +- [Виртуальные столбцы](index.md#table_engines-virtual_columns) [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/file/) diff --git a/docs/ru/query_language/table_functions/generate.md b/docs/ru/query_language/table_functions/generate.md index 53544d16e7d..636bd850737 100644 --- a/docs/ru/query_language/table_functions/generate.md +++ b/docs/ru/query_language/table_functions/generate.md @@ -1,10 +1,10 @@ -# generate +# generate {#generate} Генерирует случайные данные с заданной схемой. Позволяет заполнять тестовые таблицы данными. Поддерживает все типы данных, которые могут храниться в таблице, за исключением `LowCardinality` и `AggregateFunction`. -```sql +``` sql generate('name TypeName[, name TypeName]...', 'limit'[, 'max_array_length'[, 'max_string_length'[, 'random_seed']]]); ``` @@ -20,13 +20,13 @@ generate('name TypeName[, name TypeName]...', 'limit'[, 'max_array_length'[, 'ma Объект таблицы с запрошенной схемой. -## Пример +## Пример {#primer} - -```sql +``` sql SELECT * FROM generate('a Array(Int8), d Decimal32(4), c Tuple(DateTime64(3), UUID)', 3, 2, 10, 1); ``` -```text + +``` text ┌─a────────┬────────────d─┬─c──────────────────────────────────────────────────────────────────┐ │ [77] │ -124167.6723 │ ('2061-04-17 21:59:44.573','3f72f405-ec3e-13c8-44ca-66ef335f7835') │ │ [32,110] │ -141397.7312 │ ('1979-02-09 03:43:48.526','982486d1-5a5d-a308-e525-7bd8b80ffa73') │ diff --git a/docs/ru/query_language/table_functions/hdfs.md b/docs/ru/query_language/table_functions/hdfs.md index 9196f56b948..ab88545c36a 100644 --- a/docs/ru/query_language/table_functions/hdfs.md +++ b/docs/ru/query_language/table_functions/hdfs.md @@ -1,17 +1,16 @@ - -# hdfs +# hdfs {#hdfs} Создаёт таблицу из файла в HDFS. Данная табличная функция похожа на табличные функции [url](url.md) и [file](file.md). -```sql +``` sql hdfs(URI, format, structure) ``` **Входные параметры** -- `URI` — URI файла в HDFS. Путь к файлу поддерживает следующие шаблоны в режиме доступа только для чтения `*`, `?`, `{abc,def}` и `{N..M}`, где `N`, `M` — числа, ``'abc', 'def'` — строки. -- `format` — [формат](../../interfaces/formats.md#formats) файла. -- `structure` — структура таблицы. Формат `'column1_name column1_type, column2_name column2_type, ...'`. +- `URI` — URI файла в HDFS. Путь к файлу поддерживает следующие шаблоны в режиме доступа только для чтения `*`, `?`, `{abc,def}` и `{N..M}`, где `N`, `M` — числа, \``'abc', 'def'` — строки. +- `format` — [формат](../../interfaces/formats.md#formats) файла. +- `structure` — структура таблицы. Формат `'column1_name column1_type, column2_name column2_type, ...'`. **Возвращаемое значение** @@ -21,12 +20,13 @@ hdfs(URI, format, structure) Таблица из `hdfs://hdfs1:9000/test` и выборка первых двух строк из неё: -```sql +``` sql SELECT * FROM hdfs('hdfs://hdfs1:9000/test', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 2 ``` -```text + +``` text ┌─column1─┬─column2─┬─column3─┐ │ 1 │ 2 │ 3 │ │ 3 │ 2 │ 1 │ @@ -35,25 +35,25 @@ LIMIT 2 **Шаблоны в пути** -- `*` — Заменяет любое количество любых символов кроме `/`, включая отсутствие символов. -- `?` — Заменяет ровно один любой символ. -- `{some_string,another_string,yet_another_one}` — Заменяет любую из строк `'some_string', 'another_string', 'yet_another_one'`. -- `{N..M}` — Заменяет любое число в интервале от `N` до `M` включительно. 
+- `*` — Заменяет любое количество любых символов кроме `/`, включая отсутствие символов. +- `?` — Заменяет ровно один любой символ. +- `{some_string,another_string,yet_another_one}` — Заменяет любую из строк `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Заменяет любое число в интервале от `N` до `M` включительно. Конструкция с `{}` аналогична табличной функции [remote](remote.md). -!!! warning +!!! warning "Warning" Если ваш список файлов содержит интервал с ведущими нулями, используйте конструкцию с фигурными скобками для каждой цифры по отдельности или используйте `?`. Шаблоны могут содержаться в разных частях пути. Обрабатываться будут ровно те файлы, которые и удовлетворяют всему шаблону пути, и существуют в файловой системе. -## Виртуальные столбцы +## Виртуальные столбцы {#virtualnye-stolbtsy} -- `_path` — Путь к файлу. -- `_file` — Имя файла. +- `_path` — Путь к файлу. +- `_file` — Имя файла. **Смотрите также** -- [Виртуальные столбцы](index.md#table_engines-virtual_columns) +- [Виртуальные столбцы](index.md#table_engines-virtual_columns) [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/hdfs/) diff --git a/docs/ru/query_language/table_functions/index.md b/docs/ru/query_language/table_functions/index.md index 59e5df3a645..446aa554243 100644 --- a/docs/ru/query_language/table_functions/index.md +++ b/docs/ru/query_language/table_functions/index.md @@ -1,30 +1,30 @@ -# Табличные функции +# Табличные функции {#tablichnye-funktsii} Табличные функции — это метод создания таблиц. Табличные функции можно использовать в: -* Секции [FROM](../select.md#select-from) запроса `SELECT`. +- Секции [FROM](../select.md#select-from) запроса `SELECT`. - Это способ создания временной таблицы, которая доступна только в текущем запросе. + Это способ создания временной таблицы, которая доступна только в текущем запросе. -* Запросе [CREATE TABLE AS ](../create.md#create-table-query). +- Запросе [CREATE TABLE AS \](../create.md#create-table-query). - Это один из методов создания таблицы. + Это один из методов создания таблицы. !!! warning "Предупреждение" - Если настройка [allow_ddl](../../operations/settings/permissions_for_queries.md#settings_allow_ddl) выключена, то использовать табличные функции невозможно. + Если настройка [allow\_ddl](../../operations/settings/permissions_for_queries.md#settings_allow_ddl) выключена, то использовать табличные функции невозможно. -Функция | Описание ----------|------------ -[file](file.md) | Создаёт таблицу с движком [File](../../operations/table_engines/file.md). -[merge](merge.md) | Создаёт таблицу с движком [Merge](../../operations/table_engines/merge.md). -[numbers](numbers.md) | Создаёт таблицу с единственным столбцом, заполненным целыми числами. -[remote](remote.md) | Предоставляет доступ к удалённым серверам, не создавая таблицу с движком [Distributed](../../operations/table_engines/distributed.md). -[url](url.md) | Создаёт таблицу с движком [Url](../../operations/table_engines/url.md). -[mysql](mysql.md) | Создаёт таблицу с движком [MySQL](../../operations/table_engines/mysql.md). -[jdbc](jdbc.md) | Создаёт таблицу с дижком [JDBC](../../operations/table_engines/jdbc.md). -[odbc](odbc.md) | Создаёт таблицу с движком [ODBC](../../operations/table_engines/odbc.md). -[hdfs](hdfs.md) | Создаёт таблицу с движком [HDFS](../../operations/table_engines/hdfs.md). 
+| Функция | Описание | +|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------| +| [file](file.md) | Создаёт таблицу с движком [File](../../operations/table_engines/file.md). | +| [merge](merge.md) | Создаёт таблицу с движком [Merge](../../operations/table_engines/merge.md). | +| [numbers](numbers.md) | Создаёт таблицу с единственным столбцом, заполненным целыми числами. | +| [remote](remote.md) | Предоставляет доступ к удалённым серверам, не создавая таблицу с движком [Distributed](../../operations/table_engines/distributed.md). | +| [url](url.md) | Создаёт таблицу с движком [Url](../../operations/table_engines/url.md). | +| [mysql](mysql.md) | Создаёт таблицу с движком [MySQL](../../operations/table_engines/mysql.md). | +| [jdbc](jdbc.md) | Создаёт таблицу с дижком [JDBC](../../operations/table_engines/jdbc.md). | +| [odbc](odbc.md) | Создаёт таблицу с движком [ODBC](../../operations/table_engines/odbc.md). | +| [hdfs](hdfs.md) | Создаёт таблицу с движком [HDFS](../../operations/table_engines/hdfs.md). | [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/) diff --git a/docs/ru/query_language/table_functions/input.md b/docs/ru/query_language/table_functions/input.md index e08fc934268..399268f9af6 100644 --- a/docs/ru/query_language/table_functions/input.md +++ b/docs/ru/query_language/table_functions/input.md @@ -1,4 +1,4 @@ -# input +# input {#input} `input(structure)` - табличная функция, позволяющая эффективно преобразовывать и вставлять отправленные на сервер данные, имеющие структуру `structure`, в таблицу другой структуры. @@ -19,15 +19,21 @@ **Примеры** -- Пусть у таблицы `test` следующая структура `(a String, b String)`, -а в файле `data.csv` данные имеют другую структуру `(col1 String, col2 Date, col3 Int32)`. Запрос для вставки -данных из файла `data.csv` в таблицу `test` с одновременным преобразованием и использованием функций выглядит так: -```bash +- Пусть у таблицы `test` следующая структура `(a String, b String)`, + а в файле `data.csv` данные имеют другую структуру `(col1 String, col2 Date, col3 Int32)`. Запрос для вставки + данных из файла `data.csv` в таблицу `test` с одновременным преобразованием и использованием функций выглядит так: + + + +``` bash $ cat data.csv | clickhouse-client --query="INSERT INTO test SELECT lower(col1), col3 * col3 FROM input('col1 String, col2 Date, col3 Int32') FORMAT CSV"; ``` -- Если в `data.csv` лежат данные той же структуры `test_structure`, что и у таблицы `test`, то следующие два запроса эквивалентны: -```bash +- Если в `data.csv` лежат данные той же структуры `test_structure`, что и у таблицы `test`, то следующие два запроса эквивалентны: + + + +``` bash $ cat data.csv | clickhouse-client --query="INSERT INTO test FORMAT CSV" $ cat data.csv | clickhouse-client --query="INSERT INTO test SELECT * FROM input('test_structure') FORMAT CSV" ``` diff --git a/docs/ru/query_language/table_functions/jdbc.md b/docs/ru/query_language/table_functions/jdbc.md index 807a054ffc0..20ef0d1f107 100644 --- a/docs/ru/query_language/table_functions/jdbc.md +++ b/docs/ru/query_language/table_functions/jdbc.md @@ -1,23 +1,21 @@ - -# jdbc +# jdbc {#jdbc} `jdbc(jdbc_connection_uri, schema, table)` - возвращает таблицу, соединение с которой происходит через JDBC-драйвер. Для работы этой табличной функции требуется отдельно запускать приложение clickhouse-jdbc-bridge. 
Данная функция поддерживает Nullable типы (на основании DDL таблицы к которой происходит запрос). - **Пример** -```sql +``` sql SELECT * FROM jdbc('jdbc:mysql://localhost:3306/?user=root&password=root', 'schema', 'table') ``` -```sql +``` sql SELECT * FROM jdbc('mysql://localhost:3306/?user=root&password=root', 'schema', 'table') ``` -```sql +``` sql SELECT * FROM jdbc('datasource://mysql-local', 'schema', 'table') ``` diff --git a/docs/ru/query_language/table_functions/merge.md b/docs/ru/query_language/table_functions/merge.md index 0cc7d7c9943..d4e66391382 100644 --- a/docs/ru/query_language/table_functions/merge.md +++ b/docs/ru/query_language/table_functions/merge.md @@ -1,6 +1,6 @@ -# merge +# merge {#merge} -`merge(db_name, 'tables_regexp')` - создаёт временную таблицу типа Merge. Подробнее смотрите раздел "Движки таблиц, Merge". +`merge(db_name, 'tables_regexp')` - создаёт временную таблицу типа Merge. Подробнее смотрите раздел «Движки таблиц, Merge». Структура таблицы берётся из первой попавшейся таблицы, подходящей под регулярное выражение. diff --git a/docs/ru/query_language/table_functions/mysql.md b/docs/ru/query_language/table_functions/mysql.md index c5bebbbf740..228b0bbf38e 100644 --- a/docs/ru/query_language/table_functions/mysql.md +++ b/docs/ru/query_language/table_functions/mysql.md @@ -1,26 +1,32 @@ -# mysql +# mysql {#mysql} Позволяет выполнять запросы `SELECT` над данными, хранящимися на удалённом MySQL сервере. -```sql +``` sql mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']); ``` **Параметры** -- `host:port` — адрес сервера MySQL. -- `database` — имя базы данных на удалённом сервере. -- `table` — имя таблицы на удалённом сервере. -- `user` — пользователь MySQL. -- `password` — пароль пользователя. -- `replace_query` — флаг, отвечающий за преобразование запросов `INSERT INTO` в `REPLACE INTO`. Если `replace_query=1`, то запрос заменяется. -- `on_duplicate_clause` — выражение `ON DUPLICATE KEY on_duplicate_clause`, добавляемое в запрос `INSERT`. +- `host:port` — адрес сервера MySQL. - Пример: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, где `on_duplicate_clause` это `UPDATE c2 = c2 + 1`. Чтобы узнать какие `on_duplicate_clause` можно использовать с секцией `ON DUPLICATE KEY` обратитесь к документации MySQL. +- `database` — имя базы данных на удалённом сервере. - Чтобы указать `'on_duplicate_clause'` необходимо передать `0` в параметр `replace_query`. Если одновременно передать `replace_query = 1` и `'on_duplicate_clause'`, то ClickHouse сгенерирует исключение. +- `table` — имя таблицы на удалённом сервере. -Простые условия `WHERE` такие как ` =, !=, >, >=, <, =` выполняются на стороне сервера MySQL. +- `user` — пользователь MySQL. + +- `password` — пароль пользователя. + +- `replace_query` — флаг, отвечающий за преобразование запросов `INSERT INTO` в `REPLACE INTO`. Если `replace_query=1`, то запрос заменяется. + +- `on_duplicate_clause` — выражение `ON DUPLICATE KEY on_duplicate_clause`, добавляемое в запрос `INSERT`. + + Пример: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, где `on_duplicate_clause` это `UPDATE c2 = c2 + 1`. Чтобы узнать какие `on_duplicate_clause` можно использовать с секцией `ON DUPLICATE KEY` обратитесь к документации MySQL. + + Чтобы указать `'on_duplicate_clause'` необходимо передать `0` в параметр `replace_query`. Если одновременно передать `replace_query = 1` и `'on_duplicate_clause'`, то ClickHouse сгенерирует исключение. 
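A hedged sketch of how the optional `replace_query` parameter might be combined with the required ones. The address, credentials and column layout mirror the `test.test` example shown below, and whether your ClickHouse version accepts `INSERT INTO FUNCTION mysql(...)` should be verified before relying on it.

``` sql
-- Assumption: writing through the mysql() table function is supported in this version.
-- With replace_query = 1 the statement sent to MySQL becomes REPLACE INTO instead of INSERT INTO.
INSERT INTO FUNCTION mysql('localhost:3306', 'test', 'test', 'bayonet', '123', 1)
VALUES (4, NULL, 3, NULL)
```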
+ +Простые условия `WHERE` такие как `=, !=, >, >=, <, =` выполняются на стороне сервера MySQL. Остальные условия и ограничение выборки `LIMIT` будут выполнены в ClickHouse только после выполнения запроса к MySQL. @@ -28,11 +34,11 @@ mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_ Объект таблицы с теми же столбцами, что и в исходной таблице MySQL. -## Пример использования +## Пример использования {#primer-ispolzovaniia} Таблица в MySQL: -```text +``` text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, @@ -55,19 +61,19 @@ mysql> select * from test; Получение данных в ClickHouse: -```sql +``` sql SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123') ``` -```text +``` text ┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ │ 1 │ ᴺᵁᴸᴸ │ 2 │ ᴺᵁᴸᴸ │ └────────┴──────────────┴───────┴────────────────┘ ``` -## Смотрите также +## Смотрите также {#smotrite-takzhe} -- [Движок таблиц 'MySQL'](../../operations/table_engines/mysql.md) -- [Использование MySQL как источника данных для внешнего словаря](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-mysql) +- [Движок таблиц ‘MySQL’](../../operations/table_engines/mysql.md) +- [Использование MySQL как источника данных для внешнего словаря](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-mysql) [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/mysql/) diff --git a/docs/ru/query_language/table_functions/numbers.md b/docs/ru/query_language/table_functions/numbers.md index 1f2710c3eae..79d01dd0b92 100644 --- a/docs/ru/query_language/table_functions/numbers.md +++ b/docs/ru/query_language/table_functions/numbers.md @@ -1,4 +1,4 @@ -# numbers +# numbers {#numbers} `numbers(N)` - возвращает таблицу с единственным столбцом `number` (UInt64), содержащим натуральные числа от `0` до `N-1`. `numbers(N, M)` - возвращает таблицу с единственным столбцом `number` (UInt64), содержащим натуральные числа от `N` to `(N + M - 1)`. @@ -7,13 +7,15 @@ Следующие запросы эквивалентны: -```sql +``` sql SELECT * FROM numbers(10); SELECT * FROM numbers(0,10); SELECT * FROM system.numbers LIMIT 10; ``` + Примеры: -```sql + +``` sql -- генерация последовательности всех дат от 2010-01-01 до 2010-12-31 select toDate('2010-01-01') + number as d FROM numbers(365); ``` diff --git a/docs/ru/query_language/table_functions/odbc.md b/docs/ru/query_language/table_functions/odbc.md index 86a6a018b6a..bff2c23cf47 100644 --- a/docs/ru/query_language/table_functions/odbc.md +++ b/docs/ru/query_language/table_functions/odbc.md @@ -1,22 +1,22 @@ -# odbc {#table_functions-odbc} +# odbc {#table-functions-odbc} Возвращает таблицу, подключенную через [ODBC](https://en.wikipedia.org/wiki/Open_Database_Connectivity). -```sql +``` sql odbc(connection_settings, external_database, external_table) ``` Параметры: -- `connection_settings` — название секции с настройками соединения в файле `odbc.ini`. -- `external_database` — имя базы данных во внешней СУБД. -- `external_table` — имя таблицы в `external_database`. +- `connection_settings` — название секции с настройками соединения в файле `odbc.ini`. +- `external_database` — имя базы данных во внешней СУБД. +- `external_table` — имя таблицы в `external_database`. -Чтобы использование ODBC было безопасным, ClickHouse использует отдельную программу `clickhouse-odbc-bridge`. 
Если драйвер ODBC подгружать непосредственно из `clickhouse-server`, то проблемы с драйвером могут привести к аварийной остановке сервера ClickHouse. ClickHouse автоматически запускает `clickhouse-odbc-bridge` по мере необходимости. Программа устанавливается из того же пакета, что и `clickhouse-server`. +Чтобы использование ODBC было безопасным, ClickHouse использует отдельную программу `clickhouse-odbc-bridge`. Если драйвер ODBC подгружать непосредственно из `clickhouse-server`, то проблемы с драйвером могут привести к аварийной остановке сервера ClickHouse. ClickHouse автоматически запускает `clickhouse-odbc-bridge` по мере необходимости. Программа устанавливается из того же пакета, что и `clickhouse-server`. -Поля из внешней таблицы со значениями `NULL` получают значение по умолчанию для базового типа данных. Например, если поле в удалённой таблице MySQL имеет тип `INT NULL` оно сконвертируется в 0 (значение по умолчанию для типа данных ClickHouse `Int32`). +Поля из внешней таблицы со значениями `NULL` получают значение по умолчанию для базового типа данных. Например, если поле в удалённой таблице MySQL имеет тип `INT NULL` оно сконвертируется в 0 (значение по умолчанию для типа данных ClickHouse `Int32`). -## Пример использования +## Пример использования {#primer-ispolzovaniia} **Получение данных из локальной установки MySQL через ODBC** @@ -26,17 +26,18 @@ odbc(connection_settings, external_database, external_table) По умолчанию (если установлен из пакетов) ClickHouse запускается от имени пользователя `clickhouse`. Таким образом, вам нужно создать и настроить этого пользователя на сервере MySQL. -```bash +``` bash $ sudo mysql ``` -```sql + +``` sql mysql> CREATE USER 'clickhouse'@'localhost' IDENTIFIED BY 'clickhouse'; mysql> GRANT ALL PRIVILEGES ON *.* TO 'clickhouse'@'clickhouse' WITH GRANT OPTION; ``` Теперь настроим соединение в `/etc/odbc.ini`. -```bash +``` bash $ cat /etc/odbc.ini [mysqlconn] DRIVER = /usr/local/lib/libmyodbc5w.so @@ -49,7 +50,7 @@ PASSWORD = clickhouse Вы можете проверить соединение с помощью утилиты `isql` из установки unixODBC. -```bash +``` bash $ isql -v mysqlconn +---------------------------------------+ | Connected! | @@ -59,7 +60,7 @@ $ isql -v mysqlconn Таблица в MySQL: -```text +``` text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, @@ -82,19 +83,19 @@ mysql> select * from test; Получение данных из таблицы MySQL в ClickHouse: -```sql +``` sql SELECT * FROM odbc('DSN=mysqlconn', 'test', 'test') ``` -```text +``` text ┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ │ 1 │ 0 │ 2 │ 0 │ └────────┴──────────────┴───────┴────────────────┘ ``` -## Смотрите также +## Смотрите также {#smotrite-takzhe} -- [Внешние словари ODBC](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-odbc) -- [Движок таблиц ODBC](../../operations/table_engines/odbc.md). +- [Внешние словари ODBC](../../query_language/dicts/external_dicts_dict_sources.md#dicts-external_dicts_dict_sources-odbc) +- [Движок таблиц ODBC](../../operations/table_engines/odbc.md). 
[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/jdbc/) diff --git a/docs/ru/query_language/table_functions/remote.md b/docs/ru/query_language/table_functions/remote.md index 89a423fa6f2..9c504f36690 100644 --- a/docs/ru/query_language/table_functions/remote.md +++ b/docs/ru/query_language/table_functions/remote.md @@ -1,11 +1,10 @@ - -# remote, remoteSecure +# remote, remoteSecure {#remote-remotesecure} Позволяет обратиться к удалённым серверам без создания таблицы типа `Distributed`. Сигнатуры: -```sql +``` sql remote('addresses_expr', db, table[, 'user'[, 'password']]) remote('addresses_expr', db.table[, 'user'[, 'password']]) ``` @@ -17,7 +16,7 @@ remote('addresses_expr', db.table[, 'user'[, 'password']]) Примеры: -```text +``` text example01-01-1 example01-01-1:9000 localhost @@ -30,29 +29,29 @@ localhost Пример: -```text +``` text example01-01-1,example01-02-1 ``` Часть выражения может быть указана в фигурных скобках. Предыдущий пример может быть записан следующим образом: -```text +``` text example01-0{1,2}-1 ``` В фигурных скобках может быть указан диапазон (неотрицательных целых) чисел через две точки. В этом случае, диапазон раскрывается в множество значений, генерирующих адреса шардов. Если запись первого числа начинается с нуля, то значения формируются с таким же выравниванием нулями. Предыдущий пример может быть записан следующим образом: -```text +``` text example01-{01..02}-1 ``` При наличии нескольких пар фигурных скобок, генерируется прямое произведение соответствующих множеств. -Адреса или их фрагменты в фигурных скобках можно указать через символ |. В этом случае, соответствующие множества адресов понимаются как реплики - запрос будет отправлен на первую живую реплику. При этом, реплики перебираются в порядке, согласно текущей настройке [load_balancing](../../operations/settings/settings.md). +Адреса или их фрагменты в фигурных скобках можно указать через символ \|. В этом случае, соответствующие множества адресов понимаются как реплики - запрос будет отправлен на первую живую реплику. При этом, реплики перебираются в порядке, согласно текущей настройке [load\_balancing](../../operations/settings/settings.md). Пример: -```text +``` text example01-{01..02}-{1|2} ``` @@ -64,10 +63,10 @@ example01-{01..02}-{1|2} Табличная функция `remote` может быть полезна для следующих случаях: -- обращение на конкретный сервер в целях сравнения данных, отладки и тестирования; -- запросы между разными кластерами ClickHouse в целях исследований; -- нечастых распределённых запросов, задаваемых вручную; -- распределённых запросов, где набор серверов определяется каждый раз заново. +- обращение на конкретный сервер в целях сравнения данных, отладки и тестирования; +- запросы между разными кластерами ClickHouse в целях исследований; +- нечастых распределённых запросов, задаваемых вручную; +- распределённых запросов, где набор серверов определяется каждый раз заново. Если пользователь не задан,то используется `default`. Если пароль не задан, то используется пустой пароль. diff --git a/docs/ru/query_language/table_functions/url.md b/docs/ru/query_language/table_functions/url.md index c34cd4aa53f..4c49bc76751 100644 --- a/docs/ru/query_language/table_functions/url.md +++ b/docs/ru/query_language/table_functions/url.md @@ -1,5 +1,4 @@ - -# url +# url {#url} `url(URL, format, structure)` - возвращает таблицу со столбцами, указанными в `structure`, созданную из данных находящихся по `URL` в формате `format`. 
@@ -13,7 +12,7 @@ structure - структура таблицы в форме `'UserID UInt64, Nam **Пример** -```sql +``` sql -- получение 3-х строк таблицы, состоящей из двух колонк типа String и UInt32 от сервера, отдающего данные в формате CSV SELECT * FROM url('http://127.0.0.1:12345/', CSV, 'column1 String, column2 UInt32') LIMIT 3 ``` diff --git a/docs/ru/security_changelog.md b/docs/ru/security_changelog.md index 6bb8f892b11..d0becdbde9d 100644 --- a/docs/ru/security_changelog.md +++ b/docs/ru/security_changelog.md @@ -1,63 +1,68 @@ -## Исправлено в релизе 19.14.3.3, 2019-09-10 +## Исправлено в релизе 19.14.3.3, 2019-09-10 {#ispravleno-v-relize-19-14-3-3-2019-09-10} -### CVE-2019-15024 +### CVE-2019-15024 {#cve-2019-15024} Злоумышленник с доступом на запись к ZooKeeper и возможностью запустить собственный сервер в сети доступной ClickHouse может создать вредоносный сервер, который будет вести себя как реплика ClickHouse и зарегистрируется в ZooKeeper. В процессе репликации вредоносный сервер может указать любой путь на файловой системе в который будут записаны данные. Обнаружено благодаря: Эльдару Заитову из Службы Информационной Безопасности Яндекса -### CVE-2019-16535 +### CVE-2019-16535 {#cve-2019-16535} Интерфейс декомпрессии позволял совершать OOB чтения и записи данных в памяти, а также переполнение целочисленных переменных, что могло приводить к отказу в обслуживании. Также потенциально могло использоваьтся для удаленного выполнения кода. Обнаружено благодаря: Эльдару Заитову из Службы Информационной Безопасности Яндекса -### CVE-2019-16536 +### CVE-2019-16536 {#cve-2019-16536} Аутентифицированный клиент злоумышленника имел возможность вызвать переполнение стека, что могло привести к отказу в обслуживании. Обнаружено благодаря: Эльдару Заитову из Службы Информационной Безопасности Яндекса -## Исправлено в релизе 19.13.6.1 от 20 сентября 2019 +## Исправлено в релизе 19.13.6.1 от 20 сентября 2019 {#ispravleno-v-relize-19-13-6-1-ot-20-sentiabria-2019} + +### CVE-2019-18657 {#cve-2019-18657} -### CVE-2019-18657 Уязвимость в табличной функции `url` позволяла злоумышленнику добавлять произвольные HTTP-заголовки в запрос. Обнаружено благодаря: [Никите Тихомирову](https://github.com/NSTikhomirov) -## Исправлено в релизе 18.12.13 от 10 сентября 2018 +## Исправлено в релизе 18.12.13 от 10 сентября 2018 {#ispravleno-v-relize-18-12-13-ot-10-sentiabria-2018} + +### CVE-2018-14672 {#cve-2018-14672} -### CVE-2018-14672 Функция для загрузки CatBoost моделей некорректно обрабатывала пути к файлам, что позволяло читать произвольные локальные файлы на сервере Clickhouse через сообщения об ошибках. Обнаружено благодаря: Андрею Красичкову из Службы Информационной Безопасности Яндекса -## Исправлено в релизе 18.10.3 от 13 августа 2018 +## Исправлено в релизе 18.10.3 от 13 августа 2018 {#ispravleno-v-relize-18-10-3-ot-13-avgusta-2018} + +### CVE-2018-14671 {#cve-2018-14671} -### CVE-2018-14671 unixODBC позволял указать путь для подключения произвольного shared object в качестве драйвера базы данных, что приводило к возможности выполнить произвольный код на сервере ClickHouse. 
Обнаружено благодаря: Андрею Красичкову и Евгению Сидорову из Службы Информационной Безопасности Яндекса -## Исправлено в релизе 1.1.54388 от 28 июня 2018 +## Исправлено в релизе 1.1.54388 от 28 июня 2018 {#ispravleno-v-relize-1-1-54388-ot-28-iiunia-2018} -### CVE-2018-14668 -Табличная функция "remote" допускала произвольные символы в полях "user", "password" и "default_database", что позволяло производить атаки класса Cross Protocol Request Forgery. +### CVE-2018-14668 {#cve-2018-14668} + +Табличная функция «remote» допускала произвольные символы в полях «user», «password» и «default\_database», что позволяло производить атаки класса Cross Protocol Request Forgery. Обнаружено благодаря: Андрею Красичкову из Службы Информационной Безопасности Яндекса -## Исправлено в релизе 1.1.54390 от 6 июля 2018 +## Исправлено в релизе 1.1.54390 от 6 июля 2018 {#ispravleno-v-relize-1-1-54390-ot-6-iiulia-2018} -### CVE-2018-14669 -В ClickHouse MySQL клиенте была включена функциональность "LOAD DATA LOCAL INFILE", что позволяло получать доступ на чтение к произвольным файлам на сервере, где запущен ClickHouse. +### CVE-2018-14669 {#cve-2018-14669} + +В ClickHouse MySQL клиенте была включена функциональность «LOAD DATA LOCAL INFILE», что позволяло получать доступ на чтение к произвольным файлам на сервере, где запущен ClickHouse. Обнаружено благодаря: Андрею Красичкову и Евгению Сидорову из Службы Информационной Безопасности Яндекса -## Исправлено в релизе 1.1.54131 от 10 января 2017 +## Исправлено в релизе 1.1.54131 от 10 января 2017 {#ispravleno-v-relize-1-1-54131-ot-10-ianvaria-2017} -### CVE-2018-14670 +### CVE-2018-14670 {#cve-2018-14670} Некорректная конфигурация в deb пакете могла привести к неавторизованному доступу к базе данных. -Обнаружено благодаря: the UK's National Cyber Security Centre (NCSC) +Обнаружено благодаря: the UK’s National Cyber Security Centre (NCSC) [Оригинальная статья](https://clickhouse.tech/docs/ru/security_changelog/) diff --git a/docs/toc_es.yml b/docs/toc_es.yml new file mode 100644 index 00000000000..8f322919ebf --- /dev/null +++ b/docs/toc_es.yml @@ -0,0 +1,242 @@ +nav: +- "Implantaci\xF3n": + - "Descripci\xF3n": index.md + - "Caracter\xEDsticas distintivas de ClickHouse": introduction/distinctive_features.md + - "Caracter\xEDsticas de ClickHouse que pueden considerarse desventajas": introduction/features_considered_disadvantages.md + - Rendimiento: introduction/performance.md + - Historia: introduction/history.md + - Adoptante: introduction/adopters.md +- Primeros pasos: + - oculto: getting_started/index.md + - "Instalaci\xF3n": getting_started/install.md + - Tutorial: getting_started/tutorial.md + - Datos De Ejemplo: + - A tiempo: getting_started/example_datasets/ontime.md + - Datos de taxis de Nueva York: getting_started/example_datasets/nyc_taxi.md + - Referencia de Big Data de AMPLab: getting_started/example_datasets/amplab_benchmark.md + - "Nombre de la red inal\xE1mbrica (SSID)": getting_started/example_datasets/wikistat.md + - Registros de clics de Terabyte de Criteo: getting_started/example_datasets/criteo.md + - Estrella Schema Benchmark: getting_started/example_datasets/star_schema.md + - El Yandex.Metrica Datos: getting_started/example_datasets/metrica.md +- Interfaz: + - "Implantaci\xF3n": interfaces/index.md + - "Cliente de l\xEDnea de comandos": interfaces/cli.md + - Interfaz nativa (TCP): interfaces/tcp.md + - Interfaz HTTP: interfaces/http.md + - Interfaz MySQL: interfaces/mysql.md + - Formatos de entrada y salida: interfaces/formats.md + - Controlador 
JDBC: interfaces/jdbc.md + - Conductor ODBC: interfaces/odbc.md + - Biblioteca de clientes de C++: interfaces/cpp.md + - tercero: + - Bibliotecas de clientes: interfaces/third-party/client_libraries.md + - "Integraci\xF3n": interfaces/third-party/integrations.md + - Interfaces Visuales: interfaces/third-party/gui.md + - Proxy: interfaces/third-party/proxy.md +- Motor: + - Motores de mesa: + - "Implantaci\xF3n": operations/table_engines/index.md + - Familia MergeTree: + - "M\xE9todo de codificaci\xF3n de datos": operations/table_engines/mergetree.md + - "Replicaci\xF3n de datos": operations/table_engines/replication.md + - "Clave de partici\xF3n personalizada": operations/table_engines/custom_partitioning_key.md + - ReplacingMergeTree: operations/table_engines/replacingmergetree.md + - SummingMergeTree: operations/table_engines/summingmergetree.md + - "Agregaci\xF3nMergeTree": operations/table_engines/aggregatingmergetree.md + - ColapsarMergeTree: operations/table_engines/collapsingmergetree.md + - VersionedCollapsingMergeTree: operations/table_engines/versionedcollapsingmergetree.md + - GraphiteMergeTree: operations/table_engines/graphitemergetree.md + - Familia de registro: + - "Implantaci\xF3n": operations/table_engines/log_family.md + - StripeLog: operations/table_engines/stripelog.md + - Registro: operations/table_engines/log.md + - TinyLog: operations/table_engines/tinylog.md + - "Integraci\xF3n": + - Kafka: operations/table_engines/kafka.md + - MySQL: operations/table_engines/mysql.md + - JDBC: operations/table_engines/jdbc.md + - ODBC: operations/table_engines/odbc.md + - HDFS: operations/table_engines/hdfs.md + - Especial: + - Distribuido: operations/table_engines/distributed.md + - Datos externos: operations/table_engines/external_data.md + - Diccionario: operations/table_engines/dictionary.md + - Fusionar: operations/table_engines/merge.md + - File: operations/table_engines/file.md + - Nulo: operations/table_engines/null.md + - Establecer: operations/table_engines/set.md + - Unir: operations/table_engines/join.md + - URL: operations/table_engines/url.md + - Vista: operations/table_engines/view.md + - "M\xE9todo de codificaci\xF3n de datos": operations/table_engines/materializedview.md + - Memoria: operations/table_engines/memory.md + - "B\xFAfer": operations/table_engines/buffer.md + - GenerateRandom: operations/table_engines/generate.md + - Motores de base de datos: + - "Implantaci\xF3n": database_engines/index.md + - MySQL: database_engines/mysql.md + - Perezoso: database_engines/lazy.md +- Referencia SQL: + - oculto: query_language/index.md + - Sintaxis: query_language/syntax.md + - "Instrucci\xF3n": + - SELECCIONAR: query_language/select.md + - INSERTAR EN: query_language/insert_into.md + - CREAR: query_language/create.md + - ALTERAR: query_language/alter.md + - SISTEMA: query_language/system.md + - MOSTRAR: query_language/show.md + - Otro: query_language/misc.md + - "Funci\xF3n": + - "Implantaci\xF3n": query_language/functions/index.md + - "Aritm\xE9tica": query_language/functions/arithmetic_functions.md + - "Comparaci\xF3n": query_language/functions/comparison_functions.md + - "L\xF3gico": query_language/functions/logical_functions.md + - "Conversi\xF3n de tipo": query_language/functions/type_conversion_functions.md + - Trabajar con fechas y horas: query_language/functions/date_time_functions.md + - Trabajar con cadenas: query_language/functions/string_functions.md + - Para buscar cadenas: query_language/functions/string_search_functions.md + - Para reemplazar en cadenas: 
query_language/functions/string_replace_functions.md + - 'Condicional ': query_language/functions/conditional_functions.md + - "Matem\xE1tica": query_language/functions/math_functions.md + - Redondeo: query_language/functions/rounding_functions.md + - Trabajar con matrices: query_language/functions/array_functions.md + - "Divisi\xF3n y fusi\xF3n de cuerdas y matrices": query_language/functions/splitting_merging_functions.md + - Trozo: query_language/functions/bit_functions.md + - Bits: query_language/functions/bitmap_functions.md + - Hash: query_language/functions/hash_functions.md + - "Generaci\xF3n de n\xFAmeros pseudo-aleatorios": query_language/functions/random_functions.md + - "Codificaci\xF3n": query_language/functions/encoding_functions.md + - Trabajando con UUID: query_language/functions/uuid_functions.md + - Trabajar con URL: query_language/functions/url_functions.md + - Trabajar con direcciones IP: query_language/functions/ip_address_functions.md + - Trabajando con JSON.: query_language/functions/json_functions.md + - Orden superior: query_language/functions/higher_order_functions.md + - Trabajar con diccionarios externos: query_language/functions/ext_dict_functions.md + - Trabajando con Yandex.Diccionarios de Metrica: query_language/functions/ym_dict_functions.md + - "Implementaci\xF3n del operador IN": query_language/functions/in_functions.md + - arrayJoin: query_language/functions/array_join.md + - "Trabajar con coordenadas geogr\xE1ficas": query_language/functions/geo.md + - Trabajar con argumentos Nullable: query_language/functions/functions_for_nulls.md + - "Funciones de aprendizaje autom\xE1tico": query_language/functions/machine_learning_functions.md + - "Introspecci\xF3n": query_language/functions/introspection.md + - Otro: query_language/functions/other_functions.md + - Funciones agregadas: + - "Implantaci\xF3n": query_language/agg_functions/index.md + - Referencia: query_language/agg_functions/reference.md + - Combinadores de funciones agregadas: query_language/agg_functions/combinators.md + - "Funciones agregadas param\xE9tricas": query_language/agg_functions/parametric_functions.md + - Funciones de tabla: + - "Implantaci\xF3n": query_language/table_functions/index.md + - file: query_language/table_functions/file.md + - fusionar: query_language/table_functions/merge.md + - numero: query_language/table_functions/numbers.md + - remoto: query_language/table_functions/remote.md + - URL: query_language/table_functions/url.md + - mysql: query_language/table_functions/mysql.md + - jdbc: query_language/table_functions/jdbc.md + - Nosotros: query_language/table_functions/odbc.md + - Hdfs: query_language/table_functions/hdfs.md + - entrada: query_language/table_functions/input.md + - generateRandom: query_language/table_functions/generate.md + - Diccionario: + - "Implantaci\xF3n": query_language/dicts/index.md + - Diccionarios externos: + - "Descripci\xF3n General": query_language/dicts/external_dicts.md + - "Configuraci\xF3n de un diccionario externo": query_language/dicts/external_dicts_dict.md + - Almacenamiento de diccionarios en la memoria: query_language/dicts/external_dicts_dict_layout.md + - Actualizaciones del diccionario: query_language/dicts/external_dicts_dict_lifetime.md + - Fuentes de diccionarios externos: query_language/dicts/external_dicts_dict_sources.md + - Clave y campos del diccionario: query_language/dicts/external_dicts_dict_structure.md + - "Diccionarios jer\xE1rquicos": query_language/dicts/external_dicts_dict_hierarchical.md + - Diccionarios internos: 
query_language/dicts/internal_dicts.md + - Operador: query_language/operators.md + - Tipos de datos: + - "Implantaci\xF3n": data_types/index.md + - UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64: data_types/int_uint.md + - "Descripci\xF3n del producto": data_types/float.md + - Decimal: data_types/decimal.md + - Booleana: data_types/boolean.md + - Cadena: data_types/string.md + - Cadena fija (N): data_types/fixedstring.md + - UUID: data_types/uuid.md + - Fecha: data_types/date.md + - FechaHora: data_types/datetime.md + - DateTime64: data_types/datetime64.md + - Enum: data_types/enum.md + - Matriz (T): data_types/array.md + - "Agregar funci\xF3n (nombre, types_of_arguments)...)": data_types/nested_data_structures/aggregatefunction.md + - Tuple (T1, T2, ...): data_types/tuple.md + - 'NULL': data_types/nullable.md + - Estructuras de datos anidados: + - oculto: data_types/nested_data_structures/index.md + - Anidado (Nombre1 Tipo1, Nombre2 Tipo2, ...): data_types/nested_data_structures/nested.md + - Tipos de datos especiales: + - oculto: data_types/special_data_types/index.md + - Expresion: data_types/special_data_types/expression.md + - Establecer: data_types/special_data_types/set.md + - Nada: data_types/special_data_types/nothing.md + - Intervalo: data_types/special_data_types/interval.md + - Dominio: + - "Descripci\xF3n": data_types/domains/overview.md + - IPv4: data_types/domains/ipv4.md + - IPv6: data_types/domains/ipv6.md +- Guiar: + - "Descripci\xF3n": guides/index.md + - "Aplicaci\xF3n de modelos CatBoost": guides/apply_catboost_model.md +- "Operaci\xF3n": + - "Implantaci\xF3n": operations/index.md + - Requisito: operations/requirements.md + - Monitoreo: operations/monitoring.md + - "Soluci\xF3n de problemas": operations/troubleshooting.md + - "Actualizaci\xF3n de ClickHouse": operations/update.md + - Derechos de acceso: operations/access_rights.md + - Copia de seguridad de datos: operations/backup.md + - "Archivos de configuraci\xF3n": operations/configuration_files.md + - Cuota: operations/quotas.md + - Tablas del sistema: operations/system_tables.md + - "Optimizaci\xF3n del rendimiento": + - "Generaci\xF3n de perfiles de consultas": operations/performance/sampling_query_profiler.md + - Prueba de hardware: operations/performance_test.md + - "Par\xE1metros de configuraci\xF3n del servidor": + - "Implantaci\xF3n": operations/server_settings/index.md + - "Configuraci\xF3n del servidor": operations/server_settings/settings.md + - "Configuraci\xF3n": + - "Implantaci\xF3n": operations/settings/index.md + - Permisos para consultas: operations/settings/permissions_for_queries.md + - Restricciones en la complejidad de consultas: operations/settings/query_complexity.md + - "Configuraci\xF3n": operations/settings/settings.md + - "Perfiles de configuraci\xF3n": operations/settings/settings_profiles.md + - "Restricciones en la configuraci\xF3n": operations/settings/constraints_on_settings.md + - "Configuraci\xF3n del usuario": operations/settings/settings_users.md + - Utilidad: + - "Descripci\xF3n": operations/utils/index.md + - "M\xE9todo de codificaci\xF3n de datos": operations/utils/clickhouse-copier.md + - Sistema abierto.: operations/utils/clickhouse-local.md + - Sistema abierto.: operations/utils/clickhouse-benchmark.md + - Recomendaciones de uso: operations/tips.md +- Desarrollo: + - oculto: development/index.md + - "La instrucci\xF3n para desarrolladores de ClickHouse para principiantes": development/developer_instruction.md + - "Descripci\xF3n general de la arquitectura 
ClickHouse": development/architecture.md + - "Examinar el c\xF3digo fuente de ClickHouse": development/browse_code.md + - "C\xF3mo crear ClickHouse en Linux": development/build.md + - "C\xF3mo crear ClickHouse en Mac OS X": development/build_osx.md + - "C\xF3mo construir ClickHouse en Linux para Mac OS X": development/build_cross_osx.md + - "C\xF3mo construir ClickHouse en Linux para AARCH64 (ARM64)": development/build_cross_arm.md + - "C\xF3mo escribir c\xF3digo C ++": development/style.md + - "C\xF3mo ejecutar pruebas de ClickHouse": development/tests.md + - Bibliotecas de terceros utilizadas: development/contrib.md +- Comercial: + - Proveedores de servicios en la nube: commercial/cloud.md +- "Qu\xE9 hay de Nuevo": + - Hoja de ruta: roadmap.md + - Changelog: + - Sistema abierto.: changelog/index.md + - Nuestros servicios: changelog/2019.md + - 2018 Nueva York: changelog/2018.md + - "M\xE1s informaci\xF3n": changelog/2017.md + - Seguridad Changelog: security_changelog.md +- Nivel de Cifrado WEP: + - Preguntas generales: faq/general.md + diff --git a/docs/tools/README.md b/docs/tools/README.md index 57a2e5a99ca..985577dfe36 100644 --- a/docs/tools/README.md +++ b/docs/tools/README.md @@ -1,22 +1,22 @@ -## How ClickHouse documentation is generated? +## How ClickHouse documentation is generated? {#how-clickhouse-documentation-is-generated} -ClickHouse documentation is built using [build.py](build.py) script that uses [mkdocs](https://www.mkdocs.org) library and it's dependencies to separately build all version of documentations (all languages in either single and multi page mode) as static HTMLs. The results are then put in correct directory structure. It can also generate PDF version. +ClickHouse documentation is built using build.py script that uses [mkdocs](https://www.mkdocs.org) library and it’s dependencies to separately build all version of documentations (all languages in either single and multi page mode) as static HTMLs. The results are then put in correct directory structure. It can also generate PDF version. -[release.sh](release.sh) also pulls static files needed for [official ClickHouse website](https://clickhouse.tech) from [../../website](../../website) folder, packs them alongside docs into Docker container and tries to deploy it (possible only from Yandex private network). +release.sh also pulls static files needed for [official ClickHouse website](https://clickhouse.tech) from ../../website folder, packs them alongside docs into Docker container and tries to deploy it (possible only from Yandex private network). -## How to check if the documentation will look fine? +## How to check if the documentation will look fine? {#how-to-check-if-the-documentation-will-look-fine} There are few options that are all useful depending on how large or complex your edits are. -### Install Markdown editor or plugin for your IDE +### Install Markdown editor or plugin for your IDE {#install-markdown-editor-or-plugin-for-your-ide} Usually those have some way to preview how Markdown will look like, which allows to catch basic errors like unclosed tags very early. -### Use build.py +### Use build.py {#use-build-py} -It'll take some effort to go through, but the result will be very close to production documentation. +It’ll take some effort to go through, but the result will be very close to production documentation. 
-For the first time you'll need to install [wkhtmltopdf](https://wkhtmltopdf.org/) and set up virtualenv: +For the first time you’ll need to install [wkhtmltopdf](https://wkhtmltopdf.org/) and set up virtualenv: ``` bash $ cd ClickHouse/docs/tools @@ -28,18 +28,15 @@ $ pip install -r requirements.txt Then running `build.py` without args (there are some, check `build.py --help`) will generate `ClickHouse/docs/build` folder with complete static html website. -You can just directly open those HTML files in browser, but usually it is more convenient to have some sort of HTTP server hosting them. For example, you can launch one by running `cd ClickHouse/docs/build && python -m SimpleHTTPServer 8888` and then go to in browser. +You can just directly open those HTML files in browser, but usually it is more convenient to have some sort of HTTP server hosting them. For example, you can launch one by running `cd ClickHouse/docs/build && python -m SimpleHTTPServer 8888` and then go to http://localhost:8888 in browser. +### Commit blindly {#commit-blindly} -### Commit blindly +Then push to GitHub so you can use it’s preview. It’s better to use previous methods too though. -Then push to GitHub so you can use it's preview. It's better to use previous methods too though. - -## How to subscribe on documentation changes? - -At the moment there's no easy way to do just that, but you can consider: - -* Hit the "Watch" button on top of GitHub web interface to know as early as possible, even during pull request. -* Some search engines allow to subscribe on specific website changes via email and you can opt-in for that for . +## How to subscribe on documentation changes? {#how-to-subscribe-on-documentation-changes} +At the moment there’s no easy way to do just that, but you can consider: +- Hit the “Watch” button on top of GitHub web interface to know as early as possible, even during pull request. +- Some search engines allow to subscribe on specific website changes via email and you can opt-in for that for https://clickhouse.tech. 
diff --git a/docs/tools/build.py b/docs/tools/build.py index 64c1b0e99c9..4e0ad21083e 100755 --- a/docs/tools/build.py +++ b/docs/tools/build.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- from __future__ import unicode_literals @@ -21,6 +21,7 @@ from mkdocs.commands import build as mkdocs_build from concatenate import concatenate from website import build_website, minify_website + import mdx_clickhouse import test import util @@ -75,10 +76,11 @@ def build_for_lang(lang, args): site_names = { 'en': 'ClickHouse %s Documentation', + 'es': 'Documentación de ClickHouse %s', 'ru': 'Документация ClickHouse %s', 'zh': 'ClickHouse文档 %s', 'ja': 'ClickHouseドキュメント %s', - 'fa': 'مستندات %sClickHouse' + 'fa': 'مستندات %sClickHouse' } if args.version_prefix: @@ -86,6 +88,12 @@ def build_for_lang(lang, args): else: site_dir = os.path.join(args.docs_output_dir, lang) + plugins = ['search'] + if not args.no_docs_macros: + plugins.append('macros') + if args.htmlproofer: + plugins.append('htmlproofer') + cfg = config.load_config( config_file=config_path, site_name=site_names.get(lang, site_names['en']) % args.version_prefix, @@ -101,10 +109,14 @@ def build_for_lang(lang, args): edit_uri='edit/master/docs/%s' % lang, extra_css=['assets/stylesheets/custom.css?%s' % args.rev_short], markdown_extensions=[ - 'clickhouse', + 'mdx_clickhouse', 'admonition', 'attr_list', 'codehilite', + 'nl2br', + 'sane_lists', + 'pymdownx.magiclink', + 'pymdownx.superfences', 'extra', { 'toc': { @@ -113,7 +125,7 @@ def build_for_lang(lang, args): } } ], - plugins=[], + plugins=plugins, extra={ 'stable_releases': args.stable_releases, 'version_prefix': args.version_prefix, @@ -196,28 +208,28 @@ def build_single_page_version(lang, args, cfg): ] }) mkdocs_build.build(cfg) - if not args.version_prefix: # maybe enable in future - test.test_single_page(os.path.join(test_dir, 'single', 'index.html'), lang) if args.save_raw_single_page: shutil.copytree(test_dir, args.save_raw_single_page) + if not args.version_prefix: # maybe enable in future + test.test_single_page(os.path.join(test_dir, 'single', 'index.html'), lang) def write_redirect_html(out_path, to_url): with open(out_path, 'w') as f: - f.write(''' + f.write(f''' - + Page Redirection - If you are not redirected automatically, follow this link. + If you are not redirected automatically, follow this link. 
-''' % (to_url, to_url, to_url)) +''') def build_redirect_html(args, from_path, to_path): @@ -290,7 +302,7 @@ if __name__ == '__main__': os.chdir(os.path.join(os.path.dirname(__file__), '..')) arg_parser = argparse.ArgumentParser() - arg_parser.add_argument('--lang', default='en,ru,zh,ja,fa') + arg_parser.add_argument('--lang', default='en,es,ru,zh,ja,fa') arg_parser.add_argument('--docs-dir', default='.') arg_parser.add_argument('--theme-dir', default='mkdocs-material-theme') arg_parser.add_argument('--website-dir', default=os.path.join('..', 'website')) @@ -302,19 +314,13 @@ if __name__ == '__main__': arg_parser.add_argument('--skip-pdf', action='store_true') arg_parser.add_argument('--skip-website', action='store_true') arg_parser.add_argument('--minify', action='store_true') + arg_parser.add_argument('--htmlproofer', action='store_true') + arg_parser.add_argument('--no-docs-macros', action='store_true') arg_parser.add_argument('--save-raw-single-page', type=str) arg_parser.add_argument('--livereload', type=int, default='0') arg_parser.add_argument('--verbose', action='store_true') args = arg_parser.parse_args() - args.docs_output_dir = os.path.join(os.path.abspath(args.output_dir), 'docs') - - from github import choose_latest_releases, get_events - args.stable_releases = choose_latest_releases() if args.enable_stable_releases else [] - args.rev = subprocess.check_output('git rev-parse HEAD', shell=True).strip() - args.rev_short = subprocess.check_output('git rev-parse --short HEAD', shell=True).strip() - args.rev_url = 'https://github.com/ClickHouse/ClickHouse/commit/%s' % args.rev - args.events = get_events(args) logging.basicConfig( level=logging.DEBUG if args.verbose else logging.INFO, @@ -323,6 +329,15 @@ if __name__ == '__main__': logging.getLogger('MARKDOWN').setLevel(logging.INFO) + args.docs_output_dir = os.path.join(os.path.abspath(args.output_dir), 'docs') + + from github import choose_latest_releases, get_events + args.stable_releases = choose_latest_releases() if args.enable_stable_releases else [] + args.rev = subprocess.check_output('git rev-parse HEAD', shell=True).decode('utf-8').strip() + args.rev_short = subprocess.check_output('git rev-parse --short HEAD', shell=True).decode('utf-8').strip() + args.rev_url = 'https://github.com/ClickHouse/ClickHouse/commit/%s' % args.rev + args.events = get_events(args) + from build import build build(args) diff --git a/docs/tools/github.py b/docs/tools/github.py index 7c4cf3fd741..4a02981b4b3 100644 --- a/docs/tools/github.py +++ b/docs/tools/github.py @@ -6,17 +6,20 @@ import os import sys import tarfile +import jinja2 import requests import util def choose_latest_releases(): + logging.info('Collecting release candidates') seen = collections.OrderedDict() candidates = [] for page in range(1, 10): url = 'https://api.github.com/repos/ClickHouse/ClickHouse/tags?per_page=100&page=%d' % page candidates += requests.get(url).json() + logging.info('Collected all release candidates') for tag in candidates: if isinstance(tag, dict): @@ -45,11 +48,15 @@ def process_release(args, callback, release): tar = tarfile.open(mode='r:gz', fileobj=buf) with util.temp_dir() as base_dir: tar.extractall(base_dir) - args = copy.deepcopy(args) + args = copy.copy(args) args.version_prefix = name args.is_stable_release = True args.docs_dir = os.path.join(base_dir, os.listdir(base_dir)[0], 'docs') - callback(args) + try: + callback(args) + except jinja2.exceptions.TemplateSyntaxError: + args.no_docs_macros = True + callback(args) def build_releases(args, 
callback): diff --git a/docs/tools/make_links.sh b/docs/tools/make_links.sh index 4c111315a7b..87b3f56a166 100755 --- a/docs/tools/make_links.sh +++ b/docs/tools/make_links.sh @@ -6,7 +6,7 @@ function do_make_links() { - langs=(en ru zh ja fa) + langs=(en es ru zh ja fa) src_file="$1" for lang in "${langs[@]}" do diff --git a/docs/tools/mdx_clickhouse.py b/docs/tools/mdx_clickhouse.py index a2d71b0ea58..bfe5ebbd1ae 100755 --- a/docs/tools/mdx_clickhouse.py +++ b/docs/tools/mdx_clickhouse.py @@ -7,15 +7,17 @@ import os import markdown.inlinepatterns import markdown.extensions import markdown.util +import macros.plugin import slugify as slugify_impl + class ClickHouseLinkMixin(object): - def handleMatch(self, m): + def handleMatch(self, m, data): single_page = (os.environ.get('SINGLE_PAGE') == '1') try: - el = super(ClickHouseLinkMixin, self).handleMatch(m) + el, start, end = super(ClickHouseLinkMixin, self).handleMatch(m, data) except IndexError: return @@ -23,21 +25,21 @@ class ClickHouseLinkMixin(object): href = el.get('href') or '' is_external = href.startswith('http:') or href.startswith('https:') if is_external: - if not href.startswith('https://clickhouse.yandex'): + if not href.startswith('https://clickhouse.tech'): el.set('rel', 'external nofollow') elif single_page: if '#' in href: el.set('href', '#' + href.split('#', 1)[1]) else: el.set('href', '#' + href.replace('/index.md', '/').replace('.md', '/')) - return el + return el, start, end -class ClickHouseAutolinkPattern(ClickHouseLinkMixin, markdown.inlinepatterns.AutolinkPattern): +class ClickHouseAutolinkPattern(ClickHouseLinkMixin, markdown.inlinepatterns.AutolinkInlineProcessor): pass -class ClickHouseLinkPattern(ClickHouseLinkMixin, markdown.inlinepatterns.LinkPattern): +class ClickHouseLinkPattern(ClickHouseLinkMixin, markdown.inlinepatterns.LinkInlineProcessor): pass @@ -59,8 +61,20 @@ class ClickHouseMarkdown(markdown.extensions.Extension): md.inlinePatterns['link'] = ClickHouseLinkPattern(markdown.inlinepatterns.LINK_RE, md) md.inlinePatterns['autolink'] = ClickHouseAutolinkPattern(markdown.inlinepatterns.AUTOLINK_RE, md) + def makeExtension(**kwargs): return ClickHouseMarkdown(**kwargs) + def slugify(value, separator): return slugify_impl.slugify(value, separator=separator, word_boundary=True, save_order=True) + + +class PatchedMacrosPlugin(macros.plugin.MacrosPlugin): + def on_config(self, config): + super(PatchedMacrosPlugin, self).on_config(config) + self.env.comment_start_string = '{##' + self.env.comment_end_string = '##}' + + +macros.plugin.MacrosPlugin = PatchedMacrosPlugin diff --git a/docs/tools/mkdocs-material-theme/partials/social.html b/docs/tools/mkdocs-material-theme/partials/social.html index 86fc636584c..89e6f0fec87 100644 --- a/docs/tools/mkdocs-material-theme/partials/social.html +++ b/docs/tools/mkdocs-material-theme/partials/social.html @@ -1,3 +1,3 @@ diff --git a/docs/tools/output.md b/docs/tools/output.md index ec5674004cf..58d6ad60955 100644 --- a/docs/tools/output.md +++ b/docs/tools/output.md @@ -1,17 +1,16 @@ -What is ClickHouse? -=================== +# What is ClickHouse? {#what-is-clickhouse} ClickHouse is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP). 
-In a "normal" row-oriented DBMS, data is stored in this order: +In a “normal” row-oriented DBMS, data is stored in this order: - Row WatchID JavaEnable Title GoodEvent EventTime - ----- ------------- ------------ -------------------- ----------- --------------------- - \#0 89354350662 1 Investor Relations 1 2016-05-18 05:19:20 - \#1 90329509958 0 Contact us 1 2016-05-18 08:10:20 - \#2 89953706054 1 Mission 1 2016-05-18 07:38:00 - \#N ... ... ... ... ... + Row WatchID JavaEnable Title GoodEvent EventTime + ----- ------------- ------------ -------------------- ----------- --------------------- + \#0 89354350662 1 Investor Relations 1 2016-05-18 05:19:20 + \#1 90329509958 0 Contact us 1 2016-05-18 08:10:20 + \#2 89953706054 1 Mission 1 2016-05-18 07:38:00 + \#N ... ... ... ... ... In other words, all the values related to a row are physically stored next to each other. @@ -21,13 +20,13 @@ Examples of a row-oriented DBMS are MySQL, Postgres, and MS SQL Server. In a column-oriented DBMS, data is stored like this: - Row: \#0 \#1 \#2 \#N - ------------- --------------------- --------------------- --------------------- ----- - WatchID: 89354350662 90329509958 89953706054 ... - JavaEnable: 1 0 1 ... - Title: Investor Relations Contact us Mission ... - GoodEvent: 1 1 1 ... - EventTime: 2016-05-18 05:19:20 2016-05-18 08:10:20 2016-05-18 07:38:00 ... + Row: \#0 \#1 \#2 \#N + ------------- --------------------- --------------------- --------------------- ----- + WatchID: 89354350662 90329509958 89953706054 ... + JavaEnable: 1 0 1 ... + Title: Investor Relations Contact us Mission ... + GoodEvent: 1 1 1 ... + EventTime: 2016-05-18 05:19:20 2016-05-18 08:10:20 2016-05-18 07:38:00 ... These examples only show the order that data is arranged in. The values from different columns are stored separately, and data from the same @@ -41,7 +40,7 @@ PowerDrill, Druid, and kdb+. {: .grey } Different orders for storing data are better suited to different scenarios. The data access scenario refers to what queries are made, how often, and in what proportion; how much data is read for each type of -query -- rows, columns, and bytes; the relationship between reading and +query – rows, columns, and bytes; the relationship between reading and updating data; the working size of the data and how locally it is used; whether transactions are used, and how isolated they are; requirements for data replication and logical integrity; requirements for latency and @@ -55,8 +54,7 @@ system is adaptable to a wide set of scenarios, under a high load, the system will handle all the scenarios equally poorly, or will work well for just one or few of possible scenarios. -Key Properties of the OLAP scenario ------------------------------------ +## Key Properties of the OLAP scenario {#key-properties-of-the-olap-scenario} - The vast majority of requests are for read access. - Data is updated in fairly large batches (\> 1000 rows), not by @@ -64,7 +62,7 @@ Key Properties of the OLAP scenario - Data is added to the DB but is not modified. - For reads, quite a large number of rows are extracted from the DB, but only a small subset of columns. -- Tables are "wide," meaning they contain a large number of columns. +- Tables are “wide,” meaning they contain a large number of columns. - Queries are relatively rare (usually hundreds of queries per server or less per second). - For simple queries, latencies around 50 ms are allowed. @@ -78,17 +76,16 @@ Key Properties of the OLAP scenario one. 
- A query result is significantly smaller than the source data. In other words, data is filtered or aggregated, so the result fits in a - single server's RAM. + single server’s RAM. It is easy to see that the OLAP scenario is very different from other -popular scenarios (such as OLTP or Key-Value access). So it doesn't make +popular scenarios (such as OLTP or Key-Value access). So it doesn’t make sense to try to use OLTP or a Key-Value DB for processing analytical queries if you want to get decent performance. For example, if you try to use MongoDB or Redis for analytics, you will get very poor performance compared to OLAP databases. -Why Column-Oriented Databases Work Better in the OLAP Scenario --------------------------------------------------------------- +## Why Column-Oriented Databases Work Better in the OLAP Scenario {#why-column-oriented-databases-work-better-in-the-olap-scenario} Column-oriented databases are better suited to OLAP scenarios: they are at least 100 times faster in processing most queries. The reasons are @@ -105,7 +102,7 @@ visually: See the difference? -### Input/output +### Input/output {#inputoutput} 1. For an analytical query, only a small number of table columns need to be read. In a column-oriented database, you can read just the @@ -116,8 +113,8 @@ See the difference? volume. 3. Due to the reduced I/O, more data fits in the system cache. -For example, the query "count the number of records for each advertising -platform" requires reading one "advertising platform ID" column, which +For example, the query “count the number of records for each advertising +platform” requires reading one “advertising platform ID” column, which takes up 1 byte uncompressed. If most of the traffic was not from advertising platforms, you can expect at least 10-fold compression of this column. When using a quick compression algorithm, data @@ -126,60 +123,60 @@ uncompressed data per second. In other words, this query can be processed at a speed of approximately several billion rows per second on a single server. This speed is actually achieved in practice. -
+
Example - $ clickhouse-client - ClickHouse client version 0.0.52053. - Connecting to localhost:9000. - Connected to ClickHouse server version 0.0.52053. + $ clickhouse-client + ClickHouse client version 0.0.52053. + Connecting to localhost:9000. + Connected to ClickHouse server version 0.0.52053. - :) SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 + :) SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 - SELECT - CounterID, - count() - FROM hits - GROUP BY CounterID - ORDER BY count() DESC - LIMIT 20 + SELECT + CounterID, + count() + FROM hits + GROUP BY CounterID + ORDER BY count() DESC + LIMIT 20 - ┌─CounterID─┬──count()─┐ - │ 114208 │ 56057344 │ - │ 115080 │ 51619590 │ - │ 3228 │ 44658301 │ - │ 38230 │ 42045932 │ - │ 145263 │ 42042158 │ - │ 91244 │ 38297270 │ - │ 154139 │ 26647572 │ - │ 150748 │ 24112755 │ - │ 242232 │ 21302571 │ - │ 338158 │ 13507087 │ - │ 62180 │ 12229491 │ - │ 82264 │ 12187441 │ - │ 232261 │ 12148031 │ - │ 146272 │ 11438516 │ - │ 168777 │ 11403636 │ - │ 4120072 │ 11227824 │ - │ 10938808 │ 10519739 │ - │ 74088 │ 9047015 │ - │ 115079 │ 8837972 │ - │ 337234 │ 8205961 │ - └───────────┴──────────┘ + ┌─CounterID─┬──count()─┐ + │ 114208 │ 56057344 │ + │ 115080 │ 51619590 │ + │ 3228 │ 44658301 │ + │ 38230 │ 42045932 │ + │ 145263 │ 42042158 │ + │ 91244 │ 38297270 │ + │ 154139 │ 26647572 │ + │ 150748 │ 24112755 │ + │ 242232 │ 21302571 │ + │ 338158 │ 13507087 │ + │ 62180 │ 12229491 │ + │ 82264 │ 12187441 │ + │ 232261 │ 12148031 │ + │ 146272 │ 11438516 │ + │ 168777 │ 11403636 │ + │ 4120072 │ 11227824 │ + │ 10938808 │ 10519739 │ + │ 74088 │ 9047015 │ + │ 115079 │ 8837972 │ + │ 337234 │ 8205961 │ + └───────────┴──────────┘ - 20 rows in set. Elapsed: 0.153 sec. Processed 1.00 billion rows, 4.00 GB (6.53 billion rows/s., 26.10 GB/s.) + 20 rows in set. Elapsed: 0.153 sec. Processed 1.00 billion rows, 4.00 GB (6.53 billion rows/s., 26.10 GB/s.) - :) + :)
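The figures quoted above (a 1-byte column, at least 10-fold compression, decompression of several gigabytes of uncompressed data per second) can be sanity-checked with simple arithmetic. The sketch below only multiplies those illustrative numbers together; none of them are measurements.

``` python
# Back-of-envelope check of the figures quoted above (illustrative values, not benchmarks).
rows = 1_000_000_000          # rows scanned by the example query
bytes_per_value = 1           # uncompressed size of the "advertising platform ID" column
compression_ratio = 10        # ">= 10-fold compression" assumed in the text
decompress_gb_per_s = 4       # "several gigabytes of uncompressed data per second"

compressed_bytes = rows * bytes_per_value / compression_ratio     # data actually read from disk
seconds = rows * bytes_per_value / (decompress_gb_per_s * 1e9)    # time dominated by decompression

print(f"compressed data read: {compressed_bytes / 1e6:.0f} MB")
print(f"time to decompress the column: {seconds:.2f} s "
      f"-> {rows / seconds / 1e9:.1f} billion rows/s")
```

The 6.53 billion rows/s reported in the session above is in the same ballpark, which is exactly the point the section makes.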
-### CPU +### CPU {#cpu} Since executing a query requires processing a large number of rows, it helps to dispatch all operations for entire vectors instead of for separate rows, or to implement the query engine so that there is almost -no dispatching cost. If you don't do this, with any half-decent disk +no dispatching cost. If you don’t do this, with any half-decent disk subsystem, the query interpreter inevitably stalls the CPU. It makes sense to both store data in columns and process it, when possible, by columns. @@ -187,14 +184,14 @@ columns. There are two ways to do this: 1. A vector engine. All operations are written for vectors, instead of - for separate values. This means you don't need to call operations + for separate values. This means you don’t need to call operations very often, and dispatching costs are negligible. Operation code contains an optimized internal cycle. 2. Code generation. The code generated for the query has all the indirect calls in it. -This is not done in "normal" databases, because it doesn't make sense +This is not done in “normal” databases, because it doesn’t make sense when running simple queries. However, there are exceptions. For example, MemSQL uses code generation to reduce latency when processing SQL queries. (For comparison, analytical DBMSs require optimization of diff --git a/docs/tools/release.sh b/docs/tools/release.sh index 37d39c01448..bf318f17252 100755 --- a/docs/tools/release.sh +++ b/docs/tools/release.sh @@ -20,7 +20,7 @@ DOCKER_HASH="$2" if [[ -z "$1" ]] then source "${BASE_DIR}/venv/bin/activate" - python "${BASE_DIR}/build.py" ${EXTRA_BUILD_ARGS} + python3 "${BASE_DIR}/build.py" ${EXTRA_BUILD_ARGS} rm -rf "${PUBLISH_DIR}" || true git clone "${GIT_TEST_URI}" "${PUBLISH_DIR}" cd "${PUBLISH_DIR}" @@ -44,7 +44,7 @@ then if [[ ! 
-z "${CLOUDFLARE_TOKEN}" ]] then sleep 1m - git diff --stat="9999,9999" --diff-filter=M HEAD~1 | grep '|' | awk '$1 ~ /\.html$/ { if ($3>6) { url="https://'${BASE_DOMAIN}'/"$1; sub(/\/index.html/, "/", url); print "\""url"\""; }}' | split -l 25 /dev/stdin PURGE + git diff --stat="9999,9999" --diff-filter=M HEAD~1 | grep '|' | awk '$1 ~ /\.html$/ { if ($3>4) { url="https://'${BASE_DOMAIN}'/"$1; sub(/\/index.html/, "/", url); print "\""url"\""; }}' | split -l 25 /dev/stdin PURGE for FILENAME in $(ls PURGE*) do POST_DATA=$(cat "${FILENAME}" | sed -n -e 'H;${x;s/\n/,/g;s/^,//;p;}' | awk '{print "{\"files\":["$0"]}";}') diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt index ed4a64a3c2a..a86e15cec19 100644 --- a/docs/tools/requirements.txt +++ b/docs/tools/requirements.txt @@ -1,35 +1,37 @@ -alabaster==0.7.12 -Babel==2.8.0 backports-abc==0.5 +backports.functools-lru-cache==1.6.1 beautifulsoup4==4.8.2 certifi==2019.11.28 chardet==3.0.4 click==7.1.1 -CommonMark==0.9.1 cssmin==0.2.0 -docutils==0.16 -futures==3.1.1 +future==0.18.2 htmlmin==0.1.12 idna==2.9 -imagesize==1.2.0 Jinja2==2.11.1 jinja2-highlight==0.6.1 jsmin==2.2.2 livereload==2.6.1 -Markdown==2.6.11 +lunr==0.5.6 +Markdown==3.2.1 MarkupSafe==1.1.1 -mkdocs==1.0.4 +mkdocs==1.1 +mkdocs-htmlproofer-plugin==0.0.3 +mkdocs-macros-plugin==0.4.4 +nltk==3.4.5 +nose==1.3.7 +protobuf==3.11.3 +numpy==1.18.2 +Pygments==2.5.2 +pymdown-extensions==6.3 python-slugify==1.2.6 -pytz==2019.3 -PyYAML==5.3 -recommonmark==0.4.0 +PyYAML==5.3.1 +repackage==0.7.3 requests==2.23.0 singledispatch==3.4.0.3 six==1.14.0 -snowballstemmer==1.2.1 -Sphinx==1.6.5 -sphinxcontrib-websupport==1.0.1 -tornado==5.1 -typing==3.7.4.1 +soupsieve==2.0 +termcolor==1.1.0 +tornado==5.1.1 Unidecode==1.1.1 urllib3==1.25.8 diff --git a/docs/tools/test.py b/docs/tools/test.py index f02d8fe0cfb..f4a1fb46ace 100755 --- a/docs/tools/test.py +++ b/docs/tools/test.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- from __future__ import unicode_literals import logging @@ -6,6 +6,7 @@ import sys import bs4 + def test_single_page(input_path, lang): with open(input_path) as f: soup = bs4.BeautifulSoup( @@ -33,10 +34,9 @@ def test_single_page(input_path, lang): if duplicate_anchor_points: logging.warning('Found %d duplicate anchor points' % duplicate_anchor_points) - if links_to_nowhere: - logging.error('Found %d links to nowhere' % links_to_nowhere) - sys.exit(10) + if lang == 'en': + assert not links_to_nowhere, 'Found %d links to nowhere' % links_to_nowhere assert len(anchor_points) > 10, 'Html parsing is probably broken' diff --git a/docs/tools/translate.py b/docs/tools/translate.py deleted file mode 100755 index 621fc37af19..00000000000 --- a/docs/tools/translate.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -from __future__ import print_function -import sys -import pprint - -import googletrans -import pandocfilters - -translator = googletrans.Translator() - -def translate(key, value, format, _): - if key == 'Str': - print(value.encode('utf8'), file=sys.stderr) - return - [meta, contents] = value - cls = getattr(pandocfilters, key) - return cls(meta, translator.translate(contents, dest='es')) - -if __name__ == "__main__": - pandocfilters.toJSONFilter(translate) diff --git a/docs/tools/translate/filter.py b/docs/tools/translate/filter.py new file mode 100755 index 00000000000..3e737587df9 --- /dev/null +++ b/docs/tools/translate/filter.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 + +import os +import sys +import json.decoder 
+ +import pandocfilters +import slugify + +import translate + + +is_debug = os.environ.get('DEBUG') is not None + + +def debug(*args): + if is_debug: + print(*args, file=sys.stderr) + + +def process_buffer(buffer, new_value, item=None): + if buffer: + text = ''.join(buffer) + + try: + translated_text = translate.translate(text) + except TypeError: + translated_text = text + except json.decoder.JSONDecodeError as e: + print('Failed to translate', str(e), file=sys.stderr) + sys.exit(1) + + debug('Translate', text, ' -> ', translated_text) + + if text and text[0].isupper() and not translated_text[0].isupper(): + translated_text = translated_text[0].upper() + translated_text[1:] + + if text.startswith(' ') and not translated_text.startswith(' '): + translated_text = ' ' + translated_text + + if text.endswith(' ') and not translated_text.endswith(' '): + translated_text = translated_text + ' ' + + for token in translated_text.split(' '): + new_value.append(pandocfilters.Str(token)) + new_value.append(pandocfilters.Space()) + + if item is None and len(new_value): + new_value.pop(len(new_value) - 1) + else: + new_value[-1] = item + elif item: + new_value.append(item) + + +def process_sentence(value): + new_value = [] + buffer = [] + for item in value: + if isinstance(item, list): + new_value.append([process_sentence(subitem) for subitem in item]) + continue + elif isinstance(item, dict): + t = item.get('t') + c = item.get('c') + if t == 'Str': + buffer.append(c) + elif t == 'Space': + buffer.append(' ') + elif t == 'DoubleQuote': + buffer.append('"') + else: + process_buffer(buffer, new_value, item) + buffer = [] + else: + new_value.append(item) + process_buffer(buffer, new_value) + return new_value + + +def translate_filter(key, value, _format, _): + if key not in ['Space', 'Str']: + debug(key, value) + try: + cls = getattr(pandocfilters, key) + except AttributeError: + return + + if key == 'Para' and value: + marker = value[0].get('c') + if isinstance(marker, str) and marker.startswith('!!!') and len(value) > 2: + # Admonition case + if marker != '!!!': + # Lost space after !!! case + value.insert(1, pandocfilters.Str(marker[3:])) + value.insert(1, pandocfilters.Space()) + value[0]['c'] = '!!!' 
+ admonition_value = [] + remaining_para_value = [] + in_admonition = True + for item in value: + if in_admonition: + if item.get('t') == 'SoftBreak': + in_admonition = False + else: + admonition_value.append(item) + else: + remaining_para_value.append(item) + + break_value = [pandocfilters.LineBreak(),pandocfilters.Str(' ' * 4)] + if admonition_value[-1].get('t') == 'Quoted': + text = process_sentence(admonition_value[-1]['c'][-1]) + text[0]['c'] = '"' + text[0]['c'] + text[-1]['c'] = text[-1]['c'] + '"' + admonition_value.pop(-1) + admonition_value += text + else: + debug('>>>', ) + text = admonition_value[-1].get('c') + if text: + text = translate(text[0].upper() + text[1:]) + admonition_value.append(pandocfilters.Space()) + admonition_value.append(pandocfilters.Str(f'"{text}"')) + + return cls(admonition_value + break_value + process_sentence(remaining_para_value)) + else: + return cls(process_sentence(value)) + elif key == 'Plain' or key == 'Strong' or key == 'Emph': + return cls(process_sentence(value)) + elif key == 'Link': + try: + # Plain links case + if value[2][0] == value[1][0].get('c'): + return pandocfilters.Str(value[2][0]) + except IndexError: + pass + value[1] = process_sentence(value[1]) + return cls(*value) + elif key == 'Header': + # TODO: title case header in en + value[1][0] = slugify.slugify(value[1][0], separator='-', word_boundary=True, save_order=True) + # TODO: title case header in en + value[2] = process_sentence(value[2]) + return cls(*value) + elif key == 'SoftBreak': + return pandocfilters.LineBreak() + + return + + +if __name__ == "__main__": + pandocfilters.toJSONFilter(translate_filter) diff --git a/docs/tools/translate/normalize-markdown.sh b/docs/tools/translate/normalize-markdown.sh new file mode 100755 index 00000000000..d25c3ee65b2 --- /dev/null +++ b/docs/tools/translate/normalize-markdown.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +# Usage: normalize-en-markdown.sh +set -e +BASE_DIR=$(dirname $(readlink -f $0)) +TEMP_FILE=$(mktemp) +trap 'rm -f -- "${TEMP_FILE}"' INT TERM HUP EXIT +INPUT="$1" +if [[ ! -L "${INPUT}" ]] +then + cat "${INPUT}" > "${TEMP_FILE}" + "${BASE_DIR}/translate.sh" "en" "${TEMP_FILE}" "${INPUT}" +fi diff --git a/docs/tools/translate/replace-with-translation.sh b/docs/tools/translate/replace-with-translation.sh new file mode 100755 index 00000000000..6106b1e1e06 --- /dev/null +++ b/docs/tools/translate/replace-with-translation.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# Usage: replace-with-translation.sh +set -e +BASE_DIR=$(dirname $(readlink -f $0)) +TEMP_FILE=$(mktemp) +trap 'rm -f -- "${TEMP_FILE}"' INT TERM HUP EXIT +TARGET_LANGUAGE="$1" +INPUT="$2" +cat "${INPUT}" > "${TEMP_FILE}" +if [[ ! 
-z $SLEEP ]] +then + sleep $[ ( $RANDOM % 20 ) + 1 ]s +fi +git rm -f "${INPUT}" +mkdir -p $(dirname "${INPUT}") || true +YANDEX=1 "${BASE_DIR}/translate.sh" "${TARGET_LANGUAGE}" "${TEMP_FILE}" "${INPUT}" +git add "${INPUT}" diff --git a/docs/tools/translate/requirements.txt b/docs/tools/translate/requirements.txt new file mode 100644 index 00000000000..975498ac012 --- /dev/null +++ b/docs/tools/translate/requirements.txt @@ -0,0 +1,10 @@ +certifi==2019.11.28 +chardet==3.0.4 +googletrans==2.4.0 +idna==2.9 +pandocfilters==1.4.2 +python-slugify==4.0.0 +PyYAML==5.3.1 +requests==2.23.0 +text-unidecode==1.3 +urllib3==1.25.8 diff --git a/docs/tools/translate/translate.py b/docs/tools/translate/translate.py new file mode 100755 index 00000000000..03445defcc6 --- /dev/null +++ b/docs/tools/translate/translate.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 + +import os +import random +import sys +import time +import urllib.parse + +import googletrans +import requests +import yaml + +import typograph_ru + + +translator = googletrans.Translator() +target_language = os.environ.get('TARGET_LANGUAGE', 'ru') + +is_yandex = os.environ.get('YANDEX') is not None + + +def translate(text): + if target_language == 'en': + return text + elif target_language == 'typograph_ru': + return typograph_ru.typograph(text) + elif is_yandex: + text = text.replace('‘', '\'') + text = text.replace('’', '\'') + if text.isascii() and not text.isupper(): + text = urllib.parse.quote(text) + url = f'http://translate.yandex.net/api/v1/tr.json/translate?srv=docs&lang=en-{target_language}&text={text}' + result = requests.get(url).json() + if result.get('code') == 200: + return result['text'][0] + else: + print('Failed to translate', str(result), file=sys.stderr) + sys.exit(1) + else: + return text + else: + time.sleep(random.random()) + return translator.translate(text, target_language).text + + +def translate_toc(root): + if isinstance(root, dict): + result = [] + for key, value in root.items(): + key = translate(key) if key != 'hidden' and not key.isupper() else key + result.append((key, translate_toc(value),)) + return dict(result) + elif isinstance(root, list): + return [translate_toc(item) for item in root] + elif isinstance(root, str): + return root + + +if __name__ == '__main__': + target_language = sys.argv[1] + is_yandex = True + result = translate_toc(yaml.full_load(sys.stdin.read())['nav']) + print(yaml.dump({'nav': result})) diff --git a/docs/tools/translate/translate.sh b/docs/tools/translate/translate.sh new file mode 100755 index 00000000000..89225e0cfcd --- /dev/null +++ b/docs/tools/translate/translate.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# Usage: translate.sh +set -e +BASE_DIR=$(dirname $(readlink -f $0)) +OUTPUT=${3:-/dev/stdout} +export TARGET_LANGUAGE="$1" +export DEBUG +TEMP_FILE=$(mktemp) +trap 'rm -f -- "${TEMP_FILE}"' INT TERM HUP EXIT +source "${BASE_DIR}/venv/bin/activate" +pandoc "$2" --filter "${BASE_DIR}/filter.py" -o "${TEMP_FILE}" \ + -f "markdown-space_in_atx_header" -t "markdown_strict+pipe_tables+markdown_attribute+all_symbols_escapable+backtick_code_blocks+autolink_bare_uris-link_attributes+markdown_attribute+mmd_link_attributes-raw_attribute+header_attributes-grid_tables" \ + --atx-headers --wrap=none --columns=99999 --tab-stop=4 +perl -pi -e 's/{\\#\\#/{##/g' "${TEMP_FILE}" +perl -pi -e 's/\\#\\#}/##}/g' "${TEMP_FILE}" +perl -pi -e 's/ *$//gg' "${TEMP_FILE}" +if [[ "${TARGET_LANGUAGE}" == "ru" ]] +then + perl -pi -e 's/“/«/gg' "${TEMP_FILE}" + perl -pi -e 's/”/»/gg' "${TEMP_FILE}" +fi +cat 
"${TEMP_FILE}" > "${OUTPUT}" diff --git a/docs/tools/translate/typograph_ru.py b/docs/tools/translate/typograph_ru.py new file mode 100644 index 00000000000..db20109efff --- /dev/null +++ b/docs/tools/translate/typograph_ru.py @@ -0,0 +1,45 @@ +import requests + +class TypographError(Exception): + pass + + +def typograph(text): + text = text.replace('&', '&') + text = text.replace('<', '<') + text = text.replace('>', '>') + template = f''' + + + + {text} + 3 + 0 + 0 + 0 + + + + ''' + result = requests.post( + url='http://typograf.artlebedev.ru/webservices/typograf.asmx', + data=template.encode('utf-8'), + headers={ + 'Content-Type': 'text/xml', + 'SOAPAction': 'http://typograf.artlebedev.ru/webservices/ProcessText' + } + ) + if result.ok and 'ProcessTextResult' in result.text: + result_text = result.text.split('')[1].split('')[0].rstrip() + result_text = result_text.replace('&', '&') + result_text = result_text.replace('<', '<') + result_text = result_text.replace('>', '>') + return result_text + else: + raise TypographError(result.text) + + +if __name__ == '__main__': + import sys + print(typograph(sys.stdin.read())) diff --git a/docs/tools/website.py b/docs/tools/website.py index d7ba55745c8..82bd0d2510f 100644 --- a/docs/tools/website.py +++ b/docs/tools/website.py @@ -49,7 +49,7 @@ def build_website(args): for root, _, filenames in os.walk(args.output_dir): for filename in filenames: path = os.path.join(root, filename) - if not (filename.endswith('.html') or filename.endswith('.css')): + if not (filename.endswith('.html') or filename.endswith('.css') or filename.endswith('.js')): continue logging.info('Processing %s', path) with open(path, 'rb') as f: diff --git a/docs/zh/commercial/cloud.md b/docs/zh/commercial/cloud.md new file mode 120000 index 00000000000..eb58e4a90be --- /dev/null +++ b/docs/zh/commercial/cloud.md @@ -0,0 +1 @@ +../../en/commercial/cloud.md \ No newline at end of file diff --git a/docs/zh/data_types/array.md b/docs/zh/data_types/array.md index e9e59f2ed6f..774210b0d29 100644 --- a/docs/zh/data_types/array.md +++ b/docs/zh/data_types/array.md @@ -1,52 +1,46 @@ -# Array(T) {#data_type-array} +# Array(T) {#data-type-array} 由 `T` 类型元素组成的数组。 `T` 可以是任意类型,包含数组类型。 但不推荐使用多维数组,ClickHouse 对多维数组的支持有限。例如,不能存储在 `MergeTree` 表中存储多维数组。 -## 创建数组 +## 创建数组 {#chuang-jian-shu-zu} 您可以使用array函数来创建数组: -``` -array(T) -``` + array(T) 您也可以使用方括号: -``` -[] -``` + [] 创建数组示例: -``` -:) SELECT array(1, 2) AS x, toTypeName(x) + :) SELECT array(1, 2) AS x, toTypeName(x) -SELECT - [1, 2] AS x, - toTypeName(x) + SELECT + [1, 2] AS x, + toTypeName(x) -┌─x─────┬─toTypeName(array(1, 2))─┐ -│ [1,2] │ Array(UInt8) │ -└───────┴─────────────────────────┘ + ┌─x─────┬─toTypeName(array(1, 2))─┐ + │ [1,2] │ Array(UInt8) │ + └───────┴─────────────────────────┘ -1 rows in set. Elapsed: 0.002 sec. + 1 rows in set. Elapsed: 0.002 sec. -:) SELECT [1, 2] AS x, toTypeName(x) + :) SELECT [1, 2] AS x, toTypeName(x) -SELECT - [1, 2] AS x, - toTypeName(x) + SELECT + [1, 2] AS x, + toTypeName(x) -┌─x─────┬─toTypeName([1, 2])─┐ -│ [1,2] │ Array(UInt8) │ -└───────┴────────────────────┘ + ┌─x─────┬─toTypeName([1, 2])─┐ + │ [1,2] │ Array(UInt8) │ + └───────┴────────────────────┘ -1 rows in set. Elapsed: 0.002 sec. -``` + 1 rows in set. Elapsed: 0.002 sec. 
-## 使用数据类型 +## 使用数据类型 {#shi-yong-shu-ju-lei-xing} ClickHouse会自动检测数组元素,并根据元素计算出存储这些元素最小的数据类型。如果在元素中存在 [NULL](../query_language/syntax.md#null-literal) 或存在 [Nullable](nullable.md#data_type-nullable) 类型元素,那么数组的元素类型将会变成 [Nullable](nullable.md)。 @@ -54,29 +48,25 @@ ClickHouse会自动检测数组元素,并根据元素计算出存储这些元素 自动数据类型检测示例: -``` -:) SELECT array(1, 2, NULL) AS x, toTypeName(x) + :) SELECT array(1, 2, NULL) AS x, toTypeName(x) -SELECT - [1, 2, NULL] AS x, - toTypeName(x) + SELECT + [1, 2, NULL] AS x, + toTypeName(x) -┌─x──────────┬─toTypeName(array(1, 2, NULL))─┐ -│ [1,2,NULL] │ Array(Nullable(UInt8)) │ -└────────────┴───────────────────────────────┘ + ┌─x──────────┬─toTypeName(array(1, 2, NULL))─┐ + │ [1,2,NULL] │ Array(Nullable(UInt8)) │ + └────────────┴───────────────────────────────┘ -1 rows in set. Elapsed: 0.002 sec. -``` + 1 rows in set. Elapsed: 0.002 sec. 如果您尝试创建不兼容的数据类型数组,ClickHouse 将引发异常: -``` -:) SELECT array(1, 'a') + :) SELECT array(1, 'a') -SELECT [1, 'a'] + SELECT [1, 'a'] -Received exception from server (version 1.1.54388): -Code: 386. DB::Exception: Received from localhost:9000, 127.0.0.1. DB::Exception: There is no supertype for types UInt8, String because some of them are String/FixedString and some of them are not. + Received exception from server (version 1.1.54388): + Code: 386. DB::Exception: Received from localhost:9000, 127.0.0.1. DB::Exception: There is no supertype for types UInt8, String because some of them are String/FixedString and some of them are not. -0 rows in set. Elapsed: 0.246 sec. -``` + 0 rows in set. Elapsed: 0.246 sec. diff --git a/docs/zh/data_types/boolean.md b/docs/zh/data_types/boolean.md index 3a71691cdf4..1918bb1c56b 100644 --- a/docs/zh/data_types/boolean.md +++ b/docs/zh/data_types/boolean.md @@ -1,3 +1,3 @@ -# Boolean Values +# Boolean Values {#boolean-values} 没有单独的类型来存储布尔值。可以使用 UInt8 类型,取值限制为 0 或 1。 diff --git a/docs/zh/data_types/date.md b/docs/zh/data_types/date.md index 42fb124af9c..96ee60d53a8 100644 --- a/docs/zh/data_types/date.md +++ b/docs/zh/data_types/date.md @@ -1,4 +1,4 @@ -# Date +# Date {#date} 日期类型,用两个字节存储,表示从 1970-01-01 (无符号) 到当前的日期值。允许存储从 Unix 纪元开始到编译阶段定义的上限阈值常量(目前上限是2106年,但最终完全支持的年份为2105)。最小值输出为0000-00-00。 diff --git a/docs/zh/data_types/datetime.md b/docs/zh/data_types/datetime.md index 0bdd942bd72..e2b2b304ac2 100644 --- a/docs/zh/data_types/datetime.md +++ b/docs/zh/data_types/datetime.md @@ -1,8 +1,8 @@ -# DateTime {#data_type-datetime} +# DateTime {#data-type-datetime} 时间戳类型。用四个字节(无符号的)存储 Unix 时间戳)。允许存储与日期类型相同的范围内的值。最小值为 0000-00-00 00:00:00。时间戳类型值精确到秒(没有闰秒)。 -## 时区 +## 时区 {#shi-qu} 使用启动客户端或服务器时的系统时区,时间戳是从文本(分解为组件)转换为二进制并返回。在文本格式中,有关夏令时的信息会丢失。 diff --git a/docs/zh/data_types/decimal.md b/docs/zh/data_types/decimal.md index 8fc4db33981..1503da90d89 100644 --- a/docs/zh/data_types/decimal.md +++ b/docs/zh/data_types/decimal.md @@ -1,45 +1,44 @@ - -# Decimal(P, S), Decimal32(S), Decimal64(S), Decimal128(S) +# Decimal(P, S), Decimal32(S), Decimal64(S), Decimal128(S) {#decimalp-s-decimal32s-decimal64s-decimal128s} 有符号的定点数,可在加、减和乘法运算过程中保持精度。对于除法,最低有效数字会被丢弃(不舍入)。 -## 参数 +## 参数 {#can-shu} -- P - 精度。有效范围:[1:38],决定可以有多少个十进制数字(包括分数)。 -- S - 规模。有效范围:[0:P],决定数字的小数部分中包含的小数位数。 +- P - 精度。有效范围:\[1:38\],决定可以有多少个十进制数字(包括分数)。 +- S - 规模。有效范围:\[0:P\],决定数字的小数部分中包含的小数位数。 对于不同的 P 参数值 Decimal 表示,以下例子都是同义的: -- P from [ 1 : 9 ] - for Decimal32(S) -- P from [ 10 : 18 ] - for Decimal64(S) -- P from [ 19 : 38 ] - for Decimal128(S) +- P from \[ 1 : 9 \] - for Decimal32(S) +- P from \[ 10 : 18 \] - for Decimal64(S) +- P from \[ 19 : 38 \] - for Decimal128(S) -## 
十进制值范围 +## 十进制值范围 {#shi-jin-zhi-zhi-fan-wei} -- Decimal32(S) - ( -1 * 10^(9 - S), 1 * 10^(9 - S) ) -- Decimal64(S) - ( -1 * 10^(18 - S), 1 * 10^(18 - S) ) -- Decimal128(S) - ( -1 * 10^(38 - S), 1 * 10^(38 - S) ) +- Decimal32(S) - ( -1 \* 10^(9 - S), 1 \* 10^(9 - S) ) +- Decimal64(S) - ( -1 \* 10^(18 - S), 1 \* 10^(18 - S) ) +- Decimal128(S) - ( -1 \* 10^(38 - S), 1 \* 10^(38 - S) ) 例如,Decimal32(4) 可以表示 -99999.9999 至 99999.9999 的数值,步长为0.0001。 -## 内部表示方式 +## 内部表示方式 {#nei-bu-biao-shi-fang-shi} 数据采用与自身位宽相同的有符号整数存储。这个数在内存中实际范围会高于上述范围,从 String 转换到十进制数的时候会做对应的检查。 由于现代CPU不支持128位数字,因此 Decimal128 上的操作由软件模拟。所以 Decimal128 的运算速度明显慢于 Decimal32/Decimal64。 -## 运算和结果类型 +## 运算和结果类型 {#yun-suan-he-jie-guo-lei-xing} 对Decimal的二进制运算导致更宽的结果类型(无论参数的顺序如何)。 -- Decimal64(S1) Decimal32(S2) -> Decimal64(S) -- Decimal128(S1) Decimal32(S2) -> Decimal128(S) -- Decimal128(S1) Decimal64(S2) -> Decimal128(S) +- Decimal64(S1) Decimal32(S2) -\> Decimal64(S) +- Decimal128(S1) Decimal32(S2) -\> Decimal128(S) +- Decimal128(S1) Decimal64(S2) -\> Decimal128(S) 精度变化的规则: -- 加法,减法:S = max(S1, S2)。 -- 乘法:S = S1 + S2。 -- 除法:S = S1。 +- 加法,减法:S = max(S1, S2)。 +- 乘法:S = S1 + S2。 +- 除法:S = S1。 对于 Decimal 和整数之间的类似操作,结果是与参数大小相同的十进制。 @@ -47,50 +46,35 @@ Decimal上的一些函数返回结果为Float64(例如,var或stddev)。对于其中一些,中间计算发生在Decimal中。对于此类函数,尽管结果类型相同,但Float64和Decimal中相同数据的结果可能不同。 -## 溢出检查 +## 溢出检查 {#yi-chu-jian-cha} 在对 Decimal 类型执行操作时,数值可能会发生溢出。分数中的过多数字被丢弃(不是舍入的)。整数中的过多数字将导致异常。 -``` -SELECT toDecimal32(2, 4) AS x, x / 3 -``` -``` -┌──────x─┬─divide(toDecimal32(2, 4), 3)─┐ -│ 2.0000 │ 0.6666 │ -└────────┴──────────────────────────────┘ -``` + SELECT toDecimal32(2, 4) AS x, x / 3 -``` -SELECT toDecimal32(4.2, 8) AS x, x * x -``` -``` -DB::Exception: Scale is out of bounds. -``` + ┌──────x─┬─divide(toDecimal32(2, 4), 3)─┐ + │ 2.0000 │ 0.6666 │ + └────────┴──────────────────────────────┘ -``` -SELECT toDecimal32(4.2, 8) AS x, 6 * x -``` -``` -DB::Exception: Decimal math overflow. -``` + SELECT toDecimal32(4.2, 8) AS x, x * x + + DB::Exception: Scale is out of bounds. + + SELECT toDecimal32(4.2, 8) AS x, 6 * x + + DB::Exception: Decimal math overflow. 检查溢出会导致计算变慢。如果已知溢出不可能,则可以通过设置`decimal_check_overflow`来禁用溢出检查,在这种情况下,溢出将导致结果不正确: -``` -SET decimal_check_overflow = 0; -SELECT toDecimal32(4.2, 8) AS x, 6 * x -``` -``` -┌──────────x─┬─multiply(6, toDecimal32(4.2, 8))─┐ -│ 4.20000000 │ -17.74967296 │ -└────────────┴──────────────────────────────────┘ -``` + SET decimal_check_overflow = 0; + SELECT toDecimal32(4.2, 8) AS x, 6 * x + + ┌──────────x─┬─multiply(6, toDecimal32(4.2, 8))─┐ + │ 4.20000000 │ -17.74967296 │ + └────────────┴──────────────────────────────────┘ 溢出检查不仅发生在算术运算上,还发生在比较运算上: -``` -SELECT toDecimal32(1, 8) < 100 -``` -``` -DB::Exception: Can't compare. -``` + SELECT toDecimal32(1, 8) < 100 + + DB::Exception: Can't compare. 
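The width and scale rules listed in the Decimal section above can be condensed into a few lines of code. The sketch below only re-states the documented behaviour (the wider operand type wins regardless of order; S = max(S1, S2) for addition and subtraction, S1 + S2 for multiplication, S1 for division); the helper name and the tuple representation are invented for this example, and nothing here talks to a ClickHouse server.

``` python
# Models the documented result types of binary operations on Decimal columns.
WIDTHS = {'Decimal32': 9, 'Decimal64': 18, 'Decimal128': 38}


def result_type(op, left, right):
    """left/right are (type_name, scale) pairs, op is '+', '-', '*' or '/'."""
    (lt, ls), (rt, rs) = left, right
    # The result is always the wider of the two operand types, regardless of argument order.
    wider = lt if WIDTHS[lt] >= WIDTHS[rt] else rt
    if op in ('+', '-'):
        scale = max(ls, rs)   # addition/subtraction: S = max(S1, S2)
    elif op == '*':
        scale = ls + rs       # multiplication: S = S1 + S2
    else:
        scale = ls            # division: S = S1, least significant digits are dropped
    return f'{wider}({scale})'


print(result_type('+', ('Decimal64', 4), ('Decimal32', 2)))    # Decimal64(4)
print(result_type('*', ('Decimal32', 4), ('Decimal32', 3)))    # Decimal32(7)
print(result_type('/', ('Decimal128', 10), ('Decimal64', 2)))  # Decimal128(10)
```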
diff --git a/docs/zh/data_types/domains/ipv4.md b/docs/zh/data_types/domains/ipv4.md index aab11f30c27..65c066fb487 100644 --- a/docs/zh/data_types/domains/ipv4.md +++ b/docs/zh/data_types/domains/ipv4.md @@ -1,8 +1,8 @@ -## IPv4 +## IPv4 {#ipv4} `IPv4`是与`UInt32`类型保持二进制兼容的Domain类型,其用于存储IPv4地址的值。它提供了更为紧凑的二进制存储的同时支持识别可读性更加友好的输入输出格式。 -### 基本使用 +### 基本使用 {#ji-ben-shi-yong} ``` sql CREATE TABLE hits (url String, from IPv4) ENGINE = MergeTree() ORDER BY url; @@ -10,12 +10,10 @@ CREATE TABLE hits (url String, from IPv4) ENGINE = MergeTree() ORDER BY url; DESCRIBE TABLE hits; ``` -``` -┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐ -│ url │ String │ │ │ │ │ -│ from │ IPv4 │ │ │ │ │ -└──────┴────────┴──────────────┴────────────────────┴─────────┴──────────────────┘ -``` + ┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐ + │ url │ String │ │ │ │ │ + │ from │ IPv4 │ │ │ │ │ + └──────┴────────┴──────────────┴────────────────────┴─────────┴──────────────────┘ 同时您也可以使用`IPv4`类型的列作为主键: @@ -31,13 +29,11 @@ INSERT INTO hits (url, from) VALUES ('https://wikipedia.org', '116.253.40.133')( SELECT * FROM hits; ``` -``` -┌─url────────────────────────────────┬───────────from─┐ -│ https://clickhouse.tech/docs/en/ │ 116.106.34.242 │ -│ https://wikipedia.org │ 116.253.40.133 │ -│ https://clickhouse.tech │ 183.247.232.58 │ -└────────────────────────────────────┴────────────────┘ -``` + ┌─url────────────────────────────────┬───────────from─┐ + │ https://clickhouse.tech/docs/en/ │ 116.106.34.242 │ + │ https://wikipedia.org │ 116.253.40.133 │ + │ https://clickhouse.tech │ 183.247.232.58 │ + └────────────────────────────────────┴────────────────┘ 同时它提供更为紧凑的二进制存储格式: @@ -45,11 +41,9 @@ SELECT * FROM hits; SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; ``` -``` -┌─toTypeName(from)─┬─hex(from)─┐ -│ IPv4 │ B7F7E83A │ -└──────────────────┴───────────┘ -``` + ┌─toTypeName(from)─┬─hex(from)─┐ + │ IPv4 │ B7F7E83A │ + └──────────────────┴───────────┘ 不可隐式转换为除`UInt32`以外的其他类型类型。如果要将`IPv4`类型的值转换成字符串,你可以使用`IPv4NumToString()`显示的进行转换: @@ -57,11 +51,9 @@ SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; SELECT toTypeName(s), IPv4NumToString(from) as s FROM hits LIMIT 1; ``` -``` -┌─toTypeName(IPv4NumToString(from))─┬─s──────────────┐ -│ String │ 183.247.232.58 │ -└───────────────────────────────────┴────────────────┘ -``` + ┌─toTypeName(IPv4NumToString(from))─┬─s──────────────┐ + │ String │ 183.247.232.58 │ + └───────────────────────────────────┴────────────────┘ 或可以使用`CAST`将它转换为`UInt32`类型: @@ -69,10 +61,8 @@ SELECT toTypeName(s), IPv4NumToString(from) as s FROM hits LIMIT 1; SELECT toTypeName(i), CAST(from as UInt32) as i FROM hits LIMIT 1; ``` -``` -┌─toTypeName(CAST(from, 'UInt32'))─┬──────────i─┐ -│ UInt32 │ 3086477370 │ -└──────────────────────────────────┴────────────┘ -``` + ┌─toTypeName(CAST(from, 'UInt32'))─┬──────────i─┐ + │ UInt32 │ 3086477370 │ + └──────────────────────────────────┴────────────┘ [来源文章](https://clickhouse.tech/docs/en/data_types/domains/ipv4) diff --git a/docs/zh/data_types/domains/ipv6.md b/docs/zh/data_types/domains/ipv6.md index 1f89bd93047..bc0f95932aa 100644 --- a/docs/zh/data_types/domains/ipv6.md +++ b/docs/zh/data_types/domains/ipv6.md @@ -1,8 +1,8 @@ -## IPv6 +## IPv6 {#ipv6} `IPv6`是与`FixedString(16)`类型保持二进制兼容的Domain类型,其用于存储IPv6地址的值。它提供了更为紧凑的二进制存储的同时支持识别可读性更加友好的输入输出格式。 -### 基本用法 +### 基本用法 {#ji-ben-yong-fa} ``` sql CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY url; @@ -10,12 +10,10 @@ CREATE TABLE 
hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY url; DESCRIBE TABLE hits; ``` -``` -┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐ -│ url │ String │ │ │ │ │ -│ from │ IPv6 │ │ │ │ │ -└──────┴────────┴──────────────┴────────────────────┴─────────┴──────────────────┘ -``` + ┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐ + │ url │ String │ │ │ │ │ + │ from │ IPv6 │ │ │ │ │ + └──────┴────────┴──────────────┴────────────────────┴─────────┴──────────────────┘ 同时您也可以使用`IPv6`类型的列作为主键: @@ -31,13 +29,11 @@ INSERT INTO hits (url, from) VALUES ('https://wikipedia.org', '2a02:aa08:e000:31 SELECT * FROM hits; ``` -``` -┌─url────────────────────────────────┬─from──────────────────────────┐ -│ https://clickhouse.tech │ 2001:44c8:129:2632:33:0:252:2 │ -│ https://clickhouse.tech/docs/en/ │ 2a02:e980:1e::1 │ -│ https://wikipedia.org │ 2a02:aa08:e000:3100::2 │ -└────────────────────────────────────┴───────────────────────────────┘ -``` + ┌─url────────────────────────────────┬─from──────────────────────────┐ + │ https://clickhouse.tech │ 2001:44c8:129:2632:33:0:252:2 │ + │ https://clickhouse.tech/docs/en/ │ 2a02:e980:1e::1 │ + │ https://wikipedia.org │ 2a02:aa08:e000:3100::2 │ + └────────────────────────────────────┴───────────────────────────────┘ 同时它提供更为紧凑的二进制存储格式: @@ -45,11 +41,9 @@ SELECT * FROM hits; SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; ``` -``` -┌─toTypeName(from)─┬─hex(from)────────────────────────┐ -│ IPv6 │ 200144C8012926320033000002520002 │ -└──────────────────┴──────────────────────────────────┘ -``` + ┌─toTypeName(from)─┬─hex(from)────────────────────────┐ + │ IPv6 │ 200144C8012926320033000002520002 │ + └──────────────────┴──────────────────────────────────┘ 不可隐式转换为除`FixedString(16)`以外的其他类型类型。如果要将`IPv6`类型的值转换成字符串,你可以使用`IPv6NumToString()`显示的进行转换: @@ -57,11 +51,9 @@ SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; SELECT toTypeName(s), IPv6NumToString(from) as s FROM hits LIMIT 1; ``` -``` -┌─toTypeName(IPv6NumToString(from))─┬─s─────────────────────────────┐ -│ String │ 2001:44c8:129:2632:33:0:252:2 │ -└───────────────────────────────────┴───────────────────────────────┘ -``` + ┌─toTypeName(IPv6NumToString(from))─┬─s─────────────────────────────┐ + │ String │ 2001:44c8:129:2632:33:0:252:2 │ + └───────────────────────────────────┴───────────────────────────────┘ 或使用`CAST`将其转换为`FixedString(16)`: @@ -69,10 +61,8 @@ SELECT toTypeName(s), IPv6NumToString(from) as s FROM hits LIMIT 1; SELECT toTypeName(i), CAST(from as FixedString(16)) as i FROM hits LIMIT 1; ``` -``` -┌─toTypeName(CAST(from, 'FixedString(16)'))─┬─i───────┐ -│ FixedString(16) │ ��� │ -└───────────────────────────────────────────┴─────────┘ -``` + ┌─toTypeName(CAST(from, 'FixedString(16)'))─┬─i───────┐ + │ FixedString(16) │ ��� │ + └───────────────────────────────────────────┴─────────┘ [来源文章](https://clickhouse.tech/docs/en/data_types/domains/ipv6) diff --git a/docs/zh/data_types/domains/overview.md b/docs/zh/data_types/domains/overview.md index 2a56baa4fda..6c59860132e 100644 --- a/docs/zh/data_types/domains/overview.md +++ b/docs/zh/data_types/domains/overview.md @@ -1,26 +1,26 @@ -# Domains +# Domains {#domains} Domain类型是特定实现的类型,它总是与某个现存的基础类型保持二进制兼容的同时添加一些额外的特性,以能够在维持磁盘数据不变的情况下使用这些额外的特性。目前ClickHouse暂不支持自定义domain类型。 如果你可以在一个地方使用与Domain类型二进制兼容的基础类型,那么在相同的地方您也可以使用Domain类型,例如: -* 使用Domain类型作为表中列的类型 -* 对Domain类型的列进行读/写数据 -* 如果与Domain二进制兼容的基础类型可以作为索引,那么Domain类型也可以作为索引 -* 将Domain类型作为参数传递给函数使用 -* 其他 +- 使用Domain类型作为表中列的类型 +- 
对Domain类型的列进行读/写数据 +- 如果与Domain二进制兼容的基础类型可以作为索引,那么Domain类型也可以作为索引 +- 将Domain类型作为参数传递给函数使用 +- 其他 -### Domains的额外特性 +### Domains的额外特性 {#domainsde-e-wai-te-xing} -* 在执行SHOW CREATE TABLE 或 DESCRIBE TABLE时,其对应的列总是展示为Domain类型的名称 -* 在INSERT INTO domain_table(domain_column) VALUES(...)中输入数据总是以更人性化的格式进行输入 -* 在SELECT domain_column FROM domain_table中数据总是以更人性化的格式输出 -* 在INSERT INTO domain_table FORMAT CSV ...中,实现外部源数据以更人性化的格式载入 +- 在执行SHOW CREATE TABLE 或 DESCRIBE TABLE时,其对应的列总是展示为Domain类型的名称 +- 在INSERT INTO domain\_table(domain\_column) VALUES(…)中输入数据总是以更人性化的格式进行输入 +- 在SELECT domain\_column FROM domain\_table中数据总是以更人性化的格式输出 +- 在INSERT INTO domain\_table FORMAT CSV …中,实现外部源数据以更人性化的格式载入 -### Domains类型的限制 +### Domains类型的限制 {#domainslei-xing-de-xian-zhi} -* 无法通过`ALTER TABLE`将基础类型的索引转换为Domain类型的索引。 -* 当从其他列或表插入数据时,无法将string类型的值隐式地转换为Domain类型的值。 -* 无法对存储为Domain类型的值添加约束。 +- 无法通过`ALTER TABLE`将基础类型的索引转换为Domain类型的索引。 +- 当从其他列或表插入数据时,无法将string类型的值隐式地转换为Domain类型的值。 +- 无法对存储为Domain类型的值添加约束。 [来源文章](https://clickhouse.tech/docs/en/data_types/domains/overview) diff --git a/docs/zh/data_types/enum.md b/docs/zh/data_types/enum.md index 41133b56d45..034406a303b 100644 --- a/docs/zh/data_types/enum.md +++ b/docs/zh/data_types/enum.md @@ -1,78 +1,67 @@ - -# Enum8, Enum16 +# Enum8, Enum16 {#enum8-enum16} 包括 `Enum8` 和 `Enum16` 类型。`Enum` 保存 `'string'= integer` 的对应关系。在 ClickHouse 中,尽管用户使用的是字符串常量,但所有含有 `Enum` 数据类型的操作都是按照包含整数的值来执行。这在性能方面比使用 `String` 数据类型更有效。 -- `Enum8` 用 `'String'= Int8` 对描述。 -- `Enum16` 用 `'String'= Int16` 对描述。 +- `Enum8` 用 `'String'= Int8` 对描述。 +- `Enum16` 用 `'String'= Int16` 对描述。 -## 用法示例 +## 用法示例 {#yong-fa-shi-li} 创建一个带有一个枚举 `Enum8('hello' = 1, 'world' = 2)` 类型的列: -``` -CREATE TABLE t_enum -( - x Enum8('hello' = 1, 'world' = 2) -) -ENGINE = TinyLog -``` + CREATE TABLE t_enum + ( + x Enum8('hello' = 1, 'world' = 2) + ) + ENGINE = TinyLog 这个 `x` 列只能存储类型定义中列出的值:`'hello'`或`'world'`。如果您尝试保存任何其他值,ClickHouse 抛出异常。 -``` -:) INSERT INTO t_enum VALUES ('hello'), ('world'), ('hello') + :) INSERT INTO t_enum VALUES ('hello'), ('world'), ('hello') -INSERT INTO t_enum VALUES + INSERT INTO t_enum VALUES -Ok. + Ok. -3 rows in set. Elapsed: 0.002 sec. + 3 rows in set. Elapsed: 0.002 sec. -:) insert into t_enum values('a') + :) insert into t_enum values('a') -INSERT INTO t_enum VALUES + INSERT INTO t_enum VALUES -Exception on client: -Code: 49. DB::Exception: Unknown element 'a' for type Enum8('hello' = 1, 'world' = 2) -``` + Exception on client: + Code: 49. DB::Exception: Unknown element 'a' for type Enum8('hello' = 1, 'world' = 2) 当您从表中查询数据时,ClickHouse 从 `Enum` 中输出字符串值。 -``` -SELECT * FROM t_enum + SELECT * FROM t_enum -┌─x─────┐ -│ hello │ -│ world │ -│ hello │ -└───────┘ -``` + ┌─x─────┐ + │ hello │ + │ world │ + │ hello │ + └───────┘ 如果需要看到对应行的数值,则必须将 `Enum` 值转换为整数类型。 -``` -SELECT CAST(x, 'Int8') FROM t_enum + SELECT CAST(x, 'Int8') FROM t_enum -┌─CAST(x, 'Int8')─┐ -│ 1 │ -│ 2 │ -│ 1 │ -└─────────────────┘ -``` + ┌─CAST(x, 'Int8')─┐ + │ 1 │ + │ 2 │ + │ 1 │ + └─────────────────┘ 在查询中创建枚举值,您还需要使用 `CAST`。 -``` -SELECT toTypeName(CAST('a', 'Enum8(\'a\' = 1, \'b\' = 2)')) + SELECT toTypeName(CAST('a', 'Enum8(\'a\' = 1, \'b\' = 2)')) -┌─toTypeName(CAST('a', 'Enum8(\'a\' = 1, \'b\' = 2)'))─┐ -│ Enum8('a' = 1, 'b' = 2) │ -└──────────────────────────────────────────────────────┘ -``` + ┌─toTypeName(CAST('a', 'Enum8(\'a\' = 1, \'b\' = 2)'))─┐ + │ Enum8('a' = 1, 'b' = 2) │ + └──────────────────────────────────────────────────────┘ -## 规则及用法 +## 规则及用法 {#gui-ze-ji-yong-fa} `Enum8` 类型的每个值范围是 `-128 ... 
127`,`Enum16` 类型的每个值范围是 `-32768 ... 32767`。所有的字符串或者数字都必须是不一样的。允许存在空字符串。如果某个 Enum 类型被指定了(在表定义的时候),数字可以是任意顺序。然而,顺序并不重要。 @@ -80,19 +69,15 @@ SELECT toTypeName(CAST('a', 'Enum8(\'a\' = 1, \'b\' = 2)')) `Enum` 包含在 [Nullable](nullable.md) 类型中。因此,如果您使用此查询创建一个表 -``` -CREATE TABLE t_enum_nullable -( - x Nullable( Enum8('hello' = 1, 'world' = 2) ) -) -ENGINE = TinyLog -``` + CREATE TABLE t_enum_nullable + ( + x Nullable( Enum8('hello' = 1, 'world' = 2) ) + ) + ENGINE = TinyLog 不仅可以存储 `'hello'` 和 `'world'` ,还可以存储 `NULL`。 -``` -INSERT INTO t_enum_nullable Values('hello'),('world'),(NULL) -``` + INSERT INTO t_enum_nullable Values('hello'),('world'),(NULL) 在内存中,`Enum` 列的存储方式与相应数值的 `Int8` 或 `Int16` 相同。 diff --git a/docs/zh/data_types/fixedstring.md b/docs/zh/data_types/fixedstring.md index 44f7bb775a6..27945b74fc8 100644 --- a/docs/zh/data_types/fixedstring.md +++ b/docs/zh/data_types/fixedstring.md @@ -1,10 +1,10 @@ -# FixedString +# FixedString {#fixedstring} 固定长度 N 的字符串(N 必须是严格的正自然数)。 您可以使用下面的语法对列声明为`FixedString`类型: -```sql +``` sql FixedString(N) ``` @@ -14,23 +14,23 @@ 可以有效存储在`FixedString`类型的列中的值的示例: -- 二进制表示的IP地址(IPv6使用`FixedString(16)`) -- 语言代码(ru_RU, en_US ... ) -- 货币代码(USD, RUB ... ) -- 二进制表示的哈希值(MD5使用`FixedString(16)`,SHA256使用`FixedString(32)`) +- 二进制表示的IP地址(IPv6使用`FixedString(16)`) +- 语言代码(ru\_RU, en\_US … ) +- 货币代码(USD, RUB … ) +- 二进制表示的哈希值(MD5使用`FixedString(16)`,SHA256使用`FixedString(32)`) 请使用[UUID](uuid.md)数据类型来存储UUID值,。 当向ClickHouse中插入数据时, -- 如果字符串包含的字节数少于`N',将对字符串末尾进行空字节填充。 -- 如果字符串包含的字节数大于`N`,将抛出`Too large value for FixedString(N)`异常。 +- 如果字符串包含的字节数少于\`N’,将对字符串末尾进行空字节填充。 +- 如果字符串包含的字节数大于`N`,将抛出`Too large value for FixedString(N)`异常。 当做数据查询时,ClickHouse不会删除字符串末尾的空字节。 如果使用`WHERE`子句,则须要手动添加空字节以匹配`FixedString`的值。 以下示例阐明了如何将`WHERE`子句与`FixedString`一起使用。 考虑带有`FixedString(2)`列的表: -```text +``` text ┌─name──┐ │ b │ └───────┘ @@ -38,11 +38,12 @@ 查询语句`SELECT * FROM FixedStringTable WHERE a = 'b'` 不会返回任何结果。请使用空字节来填充筛选条件。 -```sql +``` sql SELECT * FROM FixedStringTable WHERE a = 'b\0' ``` -```text + +``` text ┌─a─┐ │ b │ └───┘ diff --git a/docs/zh/data_types/float.md b/docs/zh/data_types/float.md index 2bada8c2410..ba2ff7098b9 100644 --- a/docs/zh/data_types/float.md +++ b/docs/zh/data_types/float.md @@ -1,72 +1,70 @@ -# Float32, Float64 +# Float32, Float64 {#float32-float64} [浮点数](https://en.wikipedia.org/wiki/IEEE_754)。 类型与以下 C 语言中类型是相同的: -- `Float32` - `float` -- `Float64` - `double` +- `Float32` - `float` +- `Float64` - `double` 我们建议您尽可能以整数形式存储数据。例如,将固定精度的数字转换为整数值,例如货币数量或页面加载时间用毫秒为单位表示 -## 使用浮点数 +## 使用浮点数 {#shi-yong-fu-dian-shu} -- 对浮点数进行计算可能引起四舍五入的误差。 +- 对浮点数进行计算可能引起四舍五入的误差。 -```sql + + +``` sql SELECT 1 - 0.9 ``` -``` -┌───────minus(1, 0.9)─┐ -│ 0.09999999999999998 │ -└─────────────────────┘ -``` + ┌───────minus(1, 0.9)─┐ + │ 0.09999999999999998 │ + └─────────────────────┘ -- 计算的结果取决于计算方法(计算机系统的处理器类型和体系结构) +- 计算的结果取决于计算方法(计算机系统的处理器类型和体系结构) -- 浮点计算结果可能是诸如无穷大(`INF`)和"非数字"(`NaN`)。对浮点数计算的时候应该考虑到这点。 +- 浮点计算结果可能是诸如无穷大(`INF`)和«非数字»(`NaN`)。对浮点数计算的时候应该考虑到这点。 -- 当一行行阅读浮点数的时候,浮点数的结果可能不是机器最近显示的数值。 +- 当一行行阅读浮点数的时候,浮点数的结果可能不是机器最近显示的数值。 -## NaN and Inf {#data_type-float-nan-inf} +## NaN and Inf {#data-type-float-nan-inf} 与标准SQL相比,ClickHouse 支持以下类别的浮点数: -- `Inf` – 正无穷 +- `Inf` – 正无穷 -```sql + + +``` sql SELECT 0.5 / 0 ``` -``` -┌─divide(0.5, 0)─┐ -│ inf │ -└────────────────┘ -``` + ┌─divide(0.5, 0)─┐ + │ inf │ + └────────────────┘ -- `-Inf` – 负无穷 +- `-Inf` – 负无穷 -```sql + + +``` sql SELECT -0.5 / 0 ``` -``` -┌─divide(-0.5, 0)─┐ -│ -inf │ -└─────────────────┘ -``` + ┌─divide(-0.5, 0)─┐ + │ -inf │ + 
└─────────────────┘ -- `NaN` – 非数字 +- `NaN` – 非数字 -``` -SELECT 0 / 0 -``` + -``` -┌─divide(0, 0)─┐ -│ nan │ -└──────────────┘ -``` + SELECT 0 / 0 -可以在 [ORDER BY 子句](../query_language/select.md) 查看更多关于 ` NaN` 排序的规则。 + ┌─divide(0, 0)─┐ + │ nan │ + └──────────────┘ + +可以在 [ORDER BY 子句](../query_language/select.md) 查看更多关于 `NaN` 排序的规则。 diff --git a/docs/zh/data_types/index.md b/docs/zh/data_types/index.md index 70aa976cb11..c1ecf936c26 100644 --- a/docs/zh/data_types/index.md +++ b/docs/zh/data_types/index.md @@ -1,4 +1,4 @@ -# 数据类型 {#data_types} +# 数据类型 {#data-types} ClickHouse 可以在数据表中存储多种数据类型。 diff --git a/docs/zh/data_types/int_uint.md b/docs/zh/data_types/int_uint.md index 9d215b93ba7..4e01ad017ca 100644 --- a/docs/zh/data_types/int_uint.md +++ b/docs/zh/data_types/int_uint.md @@ -1,18 +1,17 @@ - -# UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64 +# UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64 {#uint8-uint16-uint32-uint64-int8-int16-int32-int64} 固定长度的整型,包括有符号整型或无符号整型。 -## 整型范围 +## 整型范围 {#zheng-xing-fan-wei} -- Int8 - [-128 : 127] -- Int16 - [-32768 : 32767] -- Int32 - [-2147483648 : 2147483647] -- Int64 - [-9223372036854775808 : 9223372036854775807] +- Int8 - \[-128 : 127\] +- Int16 - \[-32768 : 32767\] +- Int32 - \[-2147483648 : 2147483647\] +- Int64 - \[-9223372036854775808 : 9223372036854775807\] -## 无符号整型范围 +## 无符号整型范围 {#wu-fu-hao-zheng-xing-fan-wei} -- UInt8 - [0 : 255] -- UInt16 - [0 : 65535] -- UInt32 - [0 : 4294967295] -- UInt64 - [0 : 18446744073709551615] +- UInt8 - \[0 : 255\] +- UInt16 - \[0 : 65535\] +- UInt32 - \[0 : 4294967295\] +- UInt64 - \[0 : 18446744073709551615\] diff --git a/docs/zh/data_types/nested_data_structures/aggregatefunction.md b/docs/zh/data_types/nested_data_structures/aggregatefunction.md index 4d4fadc5ffe..3153150d2bd 100644 --- a/docs/zh/data_types/nested_data_structures/aggregatefunction.md +++ b/docs/zh/data_types/nested_data_structures/aggregatefunction.md @@ -1,4 +1,4 @@ -# AggregateFunction(name, types_of_arguments...) 
{#data_type-aggregatefunction} +# AggregateFunction(name, types\_of\_arguments…) {#data-type-aggregatefunction} 聚合函数的中间状态,可以通过聚合函数名称加`-State`后缀的形式得到它。与此同时,当您需要访问该类型的最终状态数据时,您需要以相同的聚合函数名加`-Merge`后缀的形式来得到最终状态数据。 @@ -6,15 +6,15 @@ **参数** -- 聚合函数名 +- 聚合函数名 - 如果函数具备多个参数列表,请在此处指定其他参数列表中的值。 + 如果函数具备多个参数列表,请在此处指定其他参数列表中的值。 -- 聚合函数参数的类型 +- 聚合函数参数的类型 **示例** -```sql +``` sql CREATE TABLE t ( column1 AggregateFunction(uniq, UInt64), @@ -25,15 +25,15 @@ CREATE TABLE t 上述中的[uniq](../../query_language/agg_functions/reference.md#agg_function-uniq), anyIf ([any](../../query_language/agg_functions/reference.md#agg_function-any)+[If](../../query_language/agg_functions/combinators.md#agg-functions-combinator-if)) 以及 [quantiles](../../query_language/agg_functions/reference.md) 都为ClickHouse中支持的聚合函数。 -## 使用指南 +## 使用指南 {#shi-yong-zhi-nan} -### 数据写入 +### 数据写入 {#shu-ju-xie-ru} 当需要写入数据时,您需要将数据包含在`INSERT SELECT`语句中,同时对于`AggregateFunction`类型的数据,您需要使用对应的以`-State`为后缀的函数进行处理。 **函数使用示例** -```sql +``` sql uniqState(UserID) quantilesState(0.5, 0.9)(SendTiming) ``` @@ -42,7 +42,7 @@ quantilesState(0.5, 0.9)(SendTiming) 对于`SELECT`而言,`AggregateFunction`类型总是以特定的二进制形式展现在所有的输出格式中。例如,您可以使用`SELECT`语句将函数的状态数据转储为`TabSeparated`格式的同时使用`INSERT`语句将数据转储回去。 -### 数据查询 +### 数据查询 {#shu-ju-cha-xun} 当从`AggregatingMergeTree`表中查询数据时,对于`AggregateFunction`类型的字段,您需要使用以`-Merge`为后缀的相同聚合函数来聚合数据。对于非`AggregateFunction`类型的字段,请将它们包含在`GROUP BY`子句中。 @@ -50,15 +50,14 @@ quantilesState(0.5, 0.9)(SendTiming) 例如,如下的两个查询返回的结果总是一致: -```sql +``` sql SELECT uniq(UserID) FROM table SELECT uniqMerge(state) FROM (SELECT uniqState(UserID) AS state FROM table GROUP BY RegionID) ``` -## 使用示例 +## 使用示例 {#shi-yong-shi-li} 请参阅 [AggregatingMergeTree](../../operations/table_engines/aggregatingmergetree.md) 的说明 - [来源文章](https://clickhouse.tech/docs/en/data_types/nested_data_structures/aggregatefunction/) diff --git a/docs/zh/data_types/nested_data_structures/index.md b/docs/zh/data_types/nested_data_structures/index.md index 5f8d9839943..3914064674e 100644 --- a/docs/zh/data_types/nested_data_structures/index.md +++ b/docs/zh/data_types/nested_data_structures/index.md @@ -1 +1 @@ -# 嵌套数据结构 +# 嵌套数据结构 {#qian-tao-shu-ju-jie-gou} diff --git a/docs/zh/data_types/nested_data_structures/nested.md b/docs/zh/data_types/nested_data_structures/nested.md index e1a882ecfe1..d2fd1e3a630 100644 --- a/docs/zh/data_types/nested_data_structures/nested.md +++ b/docs/zh/data_types/nested_data_structures/nested.md @@ -1,10 +1,10 @@ -# Nested(Name1 Type1, Name2 Type2, ...) 
+# Nested(Name1 Type1, Name2 Type2, …) {#nestedname1-type1-name2-type2} 嵌套数据结构类似于嵌套表。嵌套数据结构的参数(列名和类型)与 CREATE 查询类似。每个表可以包含任意多行嵌套数据结构。 示例: -```sql +``` sql CREATE TABLE test.visits ( CounterID UInt32, @@ -27,7 +27,7 @@ CREATE TABLE test.visits ) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign) ``` -上述示例声明了 `Goals` 这种嵌套数据结构,它包含访客转化相关的数据(访客达到的目标)。在 'visits' 表中每一行都可以对应零个或者任意个转化数据。 +上述示例声明了 `Goals` 这种嵌套数据结构,它包含访客转化相关的数据(访客达到的目标)。在 ‘visits’ 表中每一行都可以对应零个或者任意个转化数据。 只支持一级嵌套。嵌套结构的列中,若列的类型是数组类型,那么该列其实和多维数组是相同的,所以目前嵌套层级的支持很局限(MergeTree 引擎中不支持存储这样的列) @@ -35,7 +35,7 @@ CREATE TABLE test.visits 示例: -```sql +``` sql SELECT Goals.ID, Goals.EventTime @@ -44,7 +44,7 @@ WHERE CounterID = 101500 AND length(Goals.ID) < 5 LIMIT 10 ``` -```text +``` text ┌─Goals.ID───────────────────────┬─Goals.EventTime───────────────────────────────────────────────────────────────────────────┐ │ [1073752,591325,591325] │ ['2014-03-17 16:38:10','2014-03-17 16:38:48','2014-03-17 16:42:27'] │ │ [1073752] │ ['2014-03-17 00:28:25'] │ @@ -61,9 +61,9 @@ LIMIT 10 所以可以简单地把嵌套数据结构当做是所有列都是相同长度的多列数组。 -SELECT 查询只有在使用 ARRAY JOIN 的时候才可以指定整个嵌套数据结构的名称。更多信息,参考 "ARRAY JOIN 子句"。示例: +SELECT 查询只有在使用 ARRAY JOIN 的时候才可以指定整个嵌套数据结构的名称。更多信息,参考 «ARRAY JOIN 子句»。示例: -```sql +``` sql SELECT Goal.ID, Goal.EventTime @@ -73,7 +73,7 @@ WHERE CounterID = 101500 AND length(Goals.ID) < 5 LIMIT 10 ``` -```text +``` text ┌─Goal.ID─┬──────Goal.EventTime─┐ │ 1073752 │ 2014-03-17 16:38:10 │ │ 591325 │ 2014-03-17 16:38:48 │ diff --git a/docs/zh/data_types/nullable.md b/docs/zh/data_types/nullable.md index 8bcbc115c20..62aad5541c9 100644 --- a/docs/zh/data_types/nullable.md +++ b/docs/zh/data_types/nullable.md @@ -1,6 +1,6 @@ -# Nullable(TypeName) {#data_type-nullable} +# Nullable(TypeName) {#data-type-nullable} -允许用特殊标记 ([NULL](../query_language/syntax.md)) 表示"缺失值",可以与 `TypeName` 的正常值存放一起。例如,`Nullable(Int8)` 类型的列可以存储 `Int8` 类型值,而没有值的行将存储 `NULL`。 +允许用特殊标记 ([NULL](../query_language/syntax.md)) 表示«缺失值»,可以与 `TypeName` 的正常值存放一起。例如,`Nullable(Int8)` 类型的列可以存储 `Int8` 类型值,而没有值的行将存储 `NULL`。 对于 `TypeName`,不能使用复合数据类型 [Array](array.md) 和 [Tuple](tuple.md)。复合数据类型可以包含 `Nullable` 类型值,例如`Array(Nullable(Int8))`。 @@ -8,32 +8,34 @@ 除非在 ClickHouse 服务器配置中另有说明,否则 `NULL` 是任何 `Nullable` 类型的默认值。 -## 存储特性 +## 存储特性 {#cun-chu-te-xing} 要在表的列中存储 `Nullable` 类型值,ClickHouse 除了使用带有值的普通文件外,还使用带有 `NULL` 掩码的单独文件。 掩码文件中的条目允许 ClickHouse 区分每个表行的 `NULL` 和相应数据类型的默认值。 由于附加了新文件,`Nullable` 列与类似的普通文件相比消耗额外的存储空间。 -!!! 注意点 +!!! 
注意点 "注意点" 使用 `Nullable` 几乎总是对性能产生负面影响,在设计数据库时请记住这一点 -掩码文件中的条目允许ClickHouse区分每个表行的对应数据类型的"NULL"和默认值由于有额外的文件,"Nullable"列比普通列消耗更多的存储空间 +掩码文件中的条目允许ClickHouse区分每个表行的对应数据类型的«NULL»和默认值由于有额外的文件,«Nullable»列比普通列消耗更多的存储空间 -## 用法示例 +## 用法示例 {#yong-fa-shi-li} - -```sql +``` sql CREATE TABLE t_null(x Int8, y Nullable(Int8)) ENGINE TinyLog ``` -```sql + +``` sql INSERT INTO t_null VALUES (1, NULL), (2, 3) ``` -```sql + +``` sql SELECT x + y FROM t_null ``` -```text + +``` text ┌─plus(x, y)─┐ │ ᴺᵁᴸᴸ │ │ 5 │ └────────────┘ ``` -[来源文章](https://clickhouse.tech/docs/en/data_types/nullable/) \ No newline at end of file +[来源文章](https://clickhouse.tech/docs/en/data_types/nullable/) diff --git a/docs/zh/data_types/special_data_types/expression.md b/docs/zh/data_types/special_data_types/expression.md index d135b2478ff..86b4d5591c7 100644 --- a/docs/zh/data_types/special_data_types/expression.md +++ b/docs/zh/data_types/special_data_types/expression.md @@ -1,3 +1,3 @@ -# Expression +# Expression {#expression} -用于表示高阶函数中的Lambd表达式。 \ No newline at end of file +用于表示高阶函数中的Lambd表达式。 diff --git a/docs/zh/data_types/special_data_types/index.md b/docs/zh/data_types/special_data_types/index.md index efcd141e2e1..5963c377f01 100644 --- a/docs/zh/data_types/special_data_types/index.md +++ b/docs/zh/data_types/special_data_types/index.md @@ -1,3 +1,3 @@ -# Special Data Types +# Special Data Types {#special-data-types} 特殊数据类型的值既不能存在表中也不能在结果中输出,但可用于查询的中间结果。 diff --git a/docs/zh/data_types/special_data_types/nothing.md b/docs/zh/data_types/special_data_types/nothing.md index 6d313d7e09b..7a6bf0e035b 100644 --- a/docs/zh/data_types/special_data_types/nothing.md +++ b/docs/zh/data_types/special_data_types/nothing.md @@ -1,5 +1,4 @@ - -# Nothing +# Nothing {#nothing} 此数据类型的唯一目的是表示不是期望值的情况。 所以不能创建一个 `Nothing` 类型的值。 @@ -7,7 +6,7 @@ `Nothing` 类型也可以用来表示空数组: -```bash +``` bash :) SELECT toTypeName(array()) SELECT toTypeName([]) diff --git a/docs/zh/data_types/special_data_types/set.md b/docs/zh/data_types/special_data_types/set.md index 5c8264b0aea..d1f2ad368ee 100644 --- a/docs/zh/data_types/special_data_types/set.md +++ b/docs/zh/data_types/special_data_types/set.md @@ -1,4 +1,3 @@ -# Set +# Set {#set} 可以用在 IN 表达式的右半部分。 - diff --git a/docs/zh/data_types/string.md b/docs/zh/data_types/string.md index 3aef96b8aa3..742452ee0bf 100644 --- a/docs/zh/data_types/string.md +++ b/docs/zh/data_types/string.md @@ -1,12 +1,10 @@ - -# String +# String {#string} 字符串可以任意长度的。它可以包含任意的字节集,包含空字节。因此,字符串类型可以代替其他 DBMSs 中的 VARCHAR、BLOB、CLOB 等类型。 -## 编码 +## 编码 {#bian-ma} ClickHouse 没有编码的概念。字符串可以是任意的字节集,按它们原本的方式进行存储和输出。 若需存储文本,我们建议使用 UTF-8 编码。至少,如果你的终端使用UTF-8(推荐),这样读写就不需要进行任何的转换了。 同样,对不同的编码文本 ClickHouse 会有不同处理字符串的函数。 比如,`length` 函数可以计算字符串包含的字节数组的长度,然而 `lengthUTF8` 函数是假设字符串以 UTF-8 编码,计算的是字符串包含的 Unicode 字符的长度。 - diff --git a/docs/zh/data_types/tuple.md b/docs/zh/data_types/tuple.md index c8231c00aa2..4efeb651e76 100644 --- a/docs/zh/data_types/tuple.md +++ b/docs/zh/data_types/tuple.md @@ -1,5 +1,4 @@ - -# Tuple(T1, T2, ...) +# Tuple(T1, T2, …) {#tuplet1-t2} 元组,其中每个元素都有单独的 [类型](index.md#data_types)。 @@ -7,46 +6,40 @@ 元组可以是查询的结果。在这种情况下,对于JSON以外的文本格式,括号中的值是逗号分隔的。在JSON格式中,元组作为数组输出(在方括号中)。 -## 创建元组 +## 创建元组 {#chuang-jian-yuan-zu} 可以使用函数来创建元组: -``` -tuple(T1, T2, ...) -``` + tuple(T1, T2, ...) 
创建元组的示例: -``` -:) SELECT tuple(1,'a') AS x, toTypeName(x) + :) SELECT tuple(1,'a') AS x, toTypeName(x) -SELECT - (1, 'a') AS x, - toTypeName(x) + SELECT + (1, 'a') AS x, + toTypeName(x) -┌─x───────┬─toTypeName(tuple(1, 'a'))─┐ -│ (1,'a') │ Tuple(UInt8, String) │ -└─────────┴───────────────────────────┘ + ┌─x───────┬─toTypeName(tuple(1, 'a'))─┐ + │ (1,'a') │ Tuple(UInt8, String) │ + └─────────┴───────────────────────────┘ -1 rows in set. Elapsed: 0.021 sec. -``` + 1 rows in set. Elapsed: 0.021 sec. -## 元组中的数据类型 +## 元组中的数据类型 {#yuan-zu-zhong-de-shu-ju-lei-xing} 在动态创建元组时,ClickHouse 会自动为元组的每一个参数赋予最小可表达的类型。如果参数为 [NULL](../query_language/syntax.md#null-literal),那这个元组对应元素是 [Nullable](nullable.md)。 自动数据类型检测示例: -``` -SELECT tuple(1, NULL) AS x, toTypeName(x) + SELECT tuple(1, NULL) AS x, toTypeName(x) -SELECT - (1, NULL) AS x, - toTypeName(x) + SELECT + (1, NULL) AS x, + toTypeName(x) -┌─x────────┬─toTypeName(tuple(1, NULL))──────┐ -│ (1,NULL) │ Tuple(UInt8, Nullable(Nothing)) │ -└──────────┴─────────────────────────────────┘ + ┌─x────────┬─toTypeName(tuple(1, NULL))──────┐ + │ (1,NULL) │ Tuple(UInt8, Nullable(Nothing)) │ + └──────────┴─────────────────────────────────┘ -1 rows in set. Elapsed: 0.002 sec. -``` + 1 rows in set. Elapsed: 0.002 sec. diff --git a/docs/zh/database_engines/index.md b/docs/zh/database_engines/index.md index a3ea151d305..95c7ea2c319 100644 --- a/docs/zh/database_engines/index.md +++ b/docs/zh/database_engines/index.md @@ -1,4 +1,4 @@ -# 数据库引擎 +# 数据库引擎 {#shu-ju-ku-yin-qing} 您使用的所有表都是由数据库引擎所提供的 @@ -6,6 +6,6 @@ 除此之外,您还可以选择使用以下的数据库引擎: -- [MySQL](mysql.md) +- [MySQL](mysql.md) [来源文章](https://clickhouse.tech/docs/en/database_engines/) diff --git a/docs/zh/database_engines/mysql.md b/docs/zh/database_engines/mysql.md index 40afdec5a7f..9467269a2cc 100644 --- a/docs/zh/database_engines/mysql.md +++ b/docs/zh/database_engines/mysql.md @@ -1,4 +1,4 @@ -# MySQL +# MySQL {#mysql} MySQL引擎用于将远程的MySQL服务器中的表映射到ClickHouse中,并允许您对表进行`INSERT`和`SELECT`查询,以方便您在ClickHouse与MySQL之间进行数据交换。 @@ -6,14 +6,13 @@ MySQL引擎用于将远程的MySQL服务器中的表映射到ClickHouse中,并 但您无法对其执行以下操作: -- `ATTACH`/`DETACH` -- `DROP` -- `RENAME` -- `CREATE TABLE` -- `ALTER` +- `ATTACH`/`DETACH` +- `DROP` +- `RENAME` +- `CREATE TABLE` +- `ALTER` - -## CREATE DATABASE +## CREATE DATABASE {#create-database} ``` sql CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] @@ -22,99 +21,104 @@ ENGINE = MySQL('host:port', 'database', 'user', 'password') **MySQL数据库引擎参数** -- `host:port` — 链接的MySQL地址。 -- `database` — 链接的MySQL数据库。 -- `user` — 链接的MySQL用户。 -- `password` — 链接的MySQL用户密码。 +- `host:port` — 链接的MySQL地址。 +- `database` — 链接的MySQL数据库。 +- `user` — 链接的MySQL用户。 +- `password` — 链接的MySQL用户密码。 +## 支持的类型对应 {#zhi-chi-de-lei-xing-dui-ying} -## 支持的类型对应 - -MySQL | ClickHouse -------|------------ -UNSIGNED TINYINT | [UInt8](../data_types/int_uint.md) -TINYINT | [Int8](../data_types/int_uint.md) -UNSIGNED SMALLINT | [UInt16](../data_types/int_uint.md) -SMALLINT | [Int16](../data_types/int_uint.md) -UNSIGNED INT, UNSIGNED MEDIUMINT | [UInt32](../data_types/int_uint.md) -INT, MEDIUMINT | [Int32](../data_types/int_uint.md) -UNSIGNED BIGINT | [UInt64](../data_types/int_uint.md) -BIGINT | [Int64](../data_types/int_uint.md) -FLOAT | [Float32](../data_types/float.md) -DOUBLE | [Float64](../data_types/float.md) -DATE | [Date](../data_types/date.md) -DATETIME, TIMESTAMP | [DateTime](../data_types/datetime.md) -BINARY | [FixedString](../data_types/fixedstring.md) +| MySQL | ClickHouse | 
+|----------------------------------|---------------------------------------------| +| UNSIGNED TINYINT | [UInt8](../data_types/int_uint.md) | +| TINYINT | [Int8](../data_types/int_uint.md) | +| UNSIGNED SMALLINT | [UInt16](../data_types/int_uint.md) | +| SMALLINT | [Int16](../data_types/int_uint.md) | +| UNSIGNED INT, UNSIGNED MEDIUMINT | [UInt32](../data_types/int_uint.md) | +| INT, MEDIUMINT | [Int32](../data_types/int_uint.md) | +| UNSIGNED BIGINT | [UInt64](../data_types/int_uint.md) | +| BIGINT | [Int64](../data_types/int_uint.md) | +| FLOAT | [Float32](../data_types/float.md) | +| DOUBLE | [Float64](../data_types/float.md) | +| DATE | [Date](../data_types/date.md) | +| DATETIME, TIMESTAMP | [DateTime](../data_types/datetime.md) | +| BINARY | [FixedString](../data_types/fixedstring.md) | 其他的MySQL数据类型将全部都转换为[String](../data_types/string.md)。 同时以上的所有类型都支持[Nullable](../data_types/nullable.md)。 - -## 使用示例 +## 使用示例 {#shi-yong-shi-li} 在MySQL中创建表: -``` -mysql> USE test; -Database changed + mysql> USE test; + Database changed -mysql> CREATE TABLE `mysql_table` ( - -> `int_id` INT NOT NULL AUTO_INCREMENT, - -> `float` FLOAT NOT NULL, - -> PRIMARY KEY (`int_id`)); -Query OK, 0 rows affected (0,09 sec) + mysql> CREATE TABLE `mysql_table` ( + -> `int_id` INT NOT NULL AUTO_INCREMENT, + -> `float` FLOAT NOT NULL, + -> PRIMARY KEY (`int_id`)); + Query OK, 0 rows affected (0,09 sec) -mysql> insert into mysql_table (`int_id`, `float`) VALUES (1,2); -Query OK, 1 row affected (0,00 sec) + mysql> insert into mysql_table (`int_id`, `float`) VALUES (1,2); + Query OK, 1 row affected (0,00 sec) -mysql> select * from mysql_table; -+--------+-------+ -| int_id | value | -+--------+-------+ -| 1 | 2 | -+--------+-------+ -1 row in set (0,00 sec) -``` + mysql> select * from mysql_table; + +--------+-------+ + | int_id | value | + +--------+-------+ + | 1 | 2 | + +--------+-------+ + 1 row in set (0,00 sec) 在ClickHouse中创建MySQL类型的数据库,同时与MySQL服务器交换数据: -```sql +``` sql CREATE DATABASE mysql_db ENGINE = MySQL('localhost:3306', 'test', 'my_user', 'user_password') ``` -```sql + +``` sql SHOW DATABASES ``` -```text + +``` text ┌─name─────┐ │ default │ │ mysql_db │ │ system │ └──────────┘ ``` -```sql + +``` sql SHOW TABLES FROM mysql_db ``` -```text + +``` text ┌─name─────────┐ │ mysql_table │ └──────────────┘ ``` -```sql + +``` sql SELECT * FROM mysql_db.mysql_table ``` -```text + +``` text ┌─int_id─┬─value─┐ │ 1 │ 2 │ └────────┴───────┘ ``` -```sql + +``` sql INSERT INTO mysql_db.mysql_table VALUES (3,4) ``` -```sql + +``` sql SELECT * FROM mysql_db.mysql_table ``` -```text + +``` text ┌─int_id─┬─value─┐ │ 1 │ 2 │ │ 3 │ 4 │ diff --git a/docs/zh/development/architecture.md b/docs/zh/development/architecture.md index efb3d7c6716..22eaaf583d6 100644 --- a/docs/zh/development/architecture.md +++ b/docs/zh/development/architecture.md @@ -1,30 +1,30 @@ -# ClickHouse 架构概述 +# ClickHouse 架构概述 {#clickhouse-jia-gou-gai-shu} -ClickHouse 是一个真正的列式数据库管理系统(DBMS)。在 ClickHouse 中,数据始终是按列存储的,包括矢量(向量或列块)执行的过程。只要有可能,操作都是基于矢量进行分派的,而不是单个的值,这被称为“矢量化查询执行”,它有利于降低实际的数据处理开销。 +ClickHouse 是一个真正的列式数据库管理系统(DBMS)。在 ClickHouse 中,数据始终是按列存储的,包括矢量(向量或列块)执行的过程。只要有可能,操作都是基于矢量进行分派的,而不是单个的值,这被称为«矢量化查询执行»,它有利于降低实际的数据处理开销。 > 这个想法并不新鲜,其可以追溯到 `APL` 编程语言及其后代:`A +`、`J`、`K` 和 `Q`。矢量编程被大量用于科学数据处理中。即使在关系型数据库中,这个想法也不是什么新的东西:比如,矢量编程也被大量用于 `Vectorwise` 系统中。 通常有两种不同的加速查询处理的方法:矢量化查询执行和运行时代码生成。在后者中,动态地为每一类查询生成代码,消除了间接分派和动态分派。这两种方法中,并没有哪一种严格地比另一种好。运行时代码生成可以更好地将多个操作融合在一起,从而充分利用 CPU 执行单元和流水线。矢量化查询执行不是特别实用,因为它涉及必须写到缓存并读回的临时向量。如果 L2 缓存容纳不下临时数据,那么这将成为一个问题。但矢量化查询执行更容易利用 CPU 的 SIMD 
功能。朋友写的一篇[研究论文](http://15721.courses.cs.cmu.edu/spring2016/papers/p5-sompolski.pdf)表明,将两种方法结合起来是更好的选择。ClickHouse 使用了矢量化查询执行,同时初步提供了有限的运行时动态代码生成。 -## 列(Columns) +## 列(Columns) {#lie-columns} 要表示内存中的列(实际上是列块),需使用 `IColumn` 接口。该接口提供了用于实现各种关系操作符的辅助方法。几乎所有的操作都是不可变的:这些操作不会更改原始列,但是会创建一个新的修改后的列。比如,`IColumn::filter` 方法接受过滤字节掩码,用于 `WHERE` 和 `HAVING` 关系操作符中。另外的例子:`IColumn::permute` 方法支持 `ORDER BY` 实现,`IColumn::cut` 方法支持 `LIMIT` 实现等等。 不同的 `IColumn` 实现(`ColumnUInt8`、`ColumnString` 等)负责不同的列内存布局。内存布局通常是一个连续的数组。对于数据类型为整型的列,只是一个连续的数组,比如 `std::vector`。对于 `String` 列和 `Array` 列,则由两个向量组成:其中一个向量连续存储所有的 `String` 或数组元素,另一个存储每一个 `String` 或 `Array` 的起始元素在第一个向量中的偏移。而 `ColumnConst` 则仅在内存中存储一个值,但是看起来像一个列。 -## Field +## Field {#field} 尽管如此,有时候也可能需要处理单个值。表示单个值,可以使用 `Field`。`Field` 是 `UInt64`、`Int64`、`Float64`、`String` 和 `Array` 组成的联合。`IColumn` 拥有 `operator[]` 方法来获取第 `n` 个值成为一个 `Field`,同时也拥有 `insert` 方法将一个 `Field` 追加到一个列的末尾。这些方法并不高效,因为它们需要处理表示单一值的临时 `Field` 对象,但是有更高效的方法比如 `insertFrom` 和 `insertRangeFrom` 等。 `Field` 中并没有足够的关于一个表(table)的特定数据类型的信息。比如,`UInt8`、`UInt16`、`UInt32` 和 `UInt64` 在 `Field` 中均表示为 `UInt64`。 -## 抽象漏洞 +## 抽象漏洞 {#chou-xiang-lou-dong} `IColumn` 具有用于数据的常见关系转换的方法,但这些方法并不能够满足所有需求。比如,`ColumnUInt64` 没有用于计算两列和的方法,`ColumnString` 没有用于进行子串搜索的方法。这些无法计算的例程在 `Icolumn` 之外实现。 -列(Columns)上的各种函数可以通过使用 `Icolumn` 的方法来提取 `Field` 值,或根据特定的 `Icolumn` 实现的数据内存布局的知识,以一种通用但不高效的方式实现。为此,函数将会转换为特定的 `IColumn` 类型并直接处理内部表示。比如,`ColumnUInt64` 具有 `getData` 方法,该方法返回一个指向列的内部数组的引用,然后一个单独的例程可以直接读写或填充该数组。实际上,“抽象漏洞(leaky abstractions)”允许我们以更高效的方式来实现各种特定的例程。 +列(Columns)上的各种函数可以通过使用 `Icolumn` 的方法来提取 `Field` 值,或根据特定的 `Icolumn` 实现的数据内存布局的知识,以一种通用但不高效的方式实现。为此,函数将会转换为特定的 `IColumn` 类型并直接处理内部表示。比如,`ColumnUInt64` 具有 `getData` 方法,该方法返回一个指向列的内部数组的引用,然后一个单独的例程可以直接读写或填充该数组。实际上,«抽象漏洞(leaky abstractions)»允许我们以更高效的方式来实现各种特定的例程。 -## 数据类型 +## 数据类型 {#shu-ju-lei-xing} `IDataType` 负责序列化和反序列化:读写二进制或文本形式的列或单个值构成的块。`IDataType` 直接与表的数据类型相对应。比如,有 `DataTypeUInt32`、`DataTypeDateTime`、`DataTypeString` 等数据类型。 @@ -34,7 +34,7 @@ ClickHouse 是一个真正的列式数据库管理系统(DBMS)。在 ClickHous `IDataType` 具有针对各种数据格式的辅助函数。比如如下一些辅助函数:序列化一个值并加上可能的引号;序列化一个值用于 JSON 格式;序列化一个值作为 XML 格式的一部分。辅助函数与数据格式并没有直接的对应。比如,两种不同的数据格式 `Pretty` 和 `TabSeparated` 均可以使用 `IDataType` 接口提供的 `serializeTextEscaped` 这一辅助函数。 -## 块(Block) +## 块(Block) {#kuai-block} `Block` 是表示内存中表的子集(chunk)的容器,是由三元组:`(IColumn, IDataType, 列名)` 构成的集合。在查询执行期间,数据是按 `Block` 进行处理的。如果我们有一个 `Block`,那么就有了数据(在 `IColumn` 对象中),有了数据的类型信息告诉我们如何处理该列,同时也有了列名(来自表的原始列名,或人为指定的用于临时计算结果的名字)。 @@ -42,29 +42,29 @@ ClickHouse 是一个真正的列式数据库管理系统(DBMS)。在 ClickHous `Block` 用于处理数据块。注意,对于相同类型的计算,列名和类型对不同的块保持相同,仅列数据不同。最好把块数据(block data)和块头(block header)分离开来,因为小块大小会因复制共享指针和列名而带来很高的临时字符串开销。 -## 块流(Block Streams) +## 块流(Block Streams) {#kuai-liu-block-streams} 块流用于处理数据。我们可以使用块流从某个地方读取数据,执行数据转换,或将数据写到某个地方。`IBlockInputStream` 具有 `read` 方法,其能够在数据可用时获取下一个块。`IBlockOutputStream` 具有 `write` 方法,其能够将块写到某处。 块流负责: -1. 读或写一个表。表仅返回一个流用于读写块。 -2. 完成数据格式化。比如,如果你打算将数据以 `Pretty` 格式输出到终端,你可以创建一个块输出流,将块写入该流中,然后进行格式化。 -3. 执行数据转换。假设你现在有 `IBlockInputStream` 并且打算创建一个过滤流,那么你可以创建一个 `FilterBlockInputStream` 并用 `IBlockInputStream` 进行初始化。之后,当你从 `FilterBlockInputStream` 中拉取块时,会从你的流中提取一个块,对其进行过滤,然后将过滤后的块返回给你。查询执行流水线就是以这种方式表示的。 +1. 读或写一个表。表仅返回一个流用于读写块。 +2. 完成数据格式化。比如,如果你打算将数据以 `Pretty` 格式输出到终端,你可以创建一个块输出流,将块写入该流中,然后进行格式化。 +3. 
执行数据转换。假设你现在有 `IBlockInputStream` 并且打算创建一个过滤流,那么你可以创建一个 `FilterBlockInputStream` 并用 `IBlockInputStream` 进行初始化。之后,当你从 `FilterBlockInputStream` 中拉取块时,会从你的流中提取一个块,对其进行过滤,然后将过滤后的块返回给你。查询执行流水线就是以这种方式表示的。 还有一些更复杂的转换。比如,当你从 `AggregatingBlockInputStream` 拉取数据时,会从数据源读取全部数据进行聚集,然后将聚集后的数据流返回给你。另一个例子:`UnionBlockInputStream` 的构造函数接受多个输入源和多个线程,其能够启动多线程从多个输入源并行读取数据。 -> 块流使用“pull”方法来控制流:当你从第一个流中拉取块时,它会接着从嵌套的流中拉取所需的块,然后整个执行流水线开始工作。”pull“和“push”都不是最好的方案,因为控制流不是明确的,这限制了各种功能的实现,比如多个查询同步执行(多个流水线合并到一起)。这个限制可以通过协程或直接运行互相等待的线程来解决。如果控制流明确,那么我们会有更多的可能性:如果我们定位了数据从一个计算单元传递到那些外部的计算单元中其中一个计算单元的逻辑。阅读这篇[文章](http://journal.stuffwithstuff.com/2013/01/13/iteration-inside-and-out/)来获取更多的想法。 +> 块流使用«pull»方法来控制流:当你从第一个流中拉取块时,它会接着从嵌套的流中拉取所需的块,然后整个执行流水线开始工作。»pull«和«push»都不是最好的方案,因为控制流不是明确的,这限制了各种功能的实现,比如多个查询同步执行(多个流水线合并到一起)。这个限制可以通过协程或直接运行互相等待的线程来解决。如果控制流明确,那么我们会有更多的可能性:如果我们定位了数据从一个计算单元传递到那些外部的计算单元中其中一个计算单元的逻辑。阅读这篇[文章](http://journal.stuffwithstuff.com/2013/01/13/iteration-inside-and-out/)来获取更多的想法。 我们需要注意,查询执行流水线在每一步都会创建临时数据。我们要尽量使块的大小足够小,从而 CPU 缓存能够容纳下临时数据。在这个假设下,与其他计算相比,读写临时数据几乎是没有任何开销的。我们也可以考虑一种替代方案:将流水线中的多个操作融合在一起,使流水线尽可能短,并删除大量临时数据。这可能是一个优点,但同时也有缺点。比如,拆分流水线使得中间数据缓存、获取同时运行的类似查询的中间数据以及相似查询的流水线合并等功能很容易实现。 -## 格式(Formats) +## 格式(Formats) {#ge-shi-formats} -数据格式同块流一起实现。既有仅用于向客户端输出数据的”展示“格式,如 `IBlockOutputStream` 提供的 `Pretty` 格式,也有其它输入输出格式,比如 `TabSeparated` 或 `JSONEachRow`。 +数据格式同块流一起实现。既有仅用于向客户端输出数据的»展示«格式,如 `IBlockOutputStream` 提供的 `Pretty` 格式,也有其它输入输出格式,比如 `TabSeparated` 或 `JSONEachRow`。 此外还有行流:`IRowInputStream` 和 `IRowOutputStream`。它们允许你按行 pull/push 数据,而不是按块。行流只需要简单地面向行格式实现。包装器 `BlockInputStreamFromRowInputStream` 和 `BlockOutputStreamFromRowOutputStream` 允许你将面向行的流转换为正常的面向块的流。 -## I/O +## I/O {#io} 对于面向字节的输入输出,有 `ReadBuffer` 和 `WriteBuffer` 这两个抽象类。它们用来替代 C++ 的 `iostream`。不用担心:每个成熟的 C++ 项目都会有充分的理由使用某些东西来代替 `iostream`。 @@ -76,7 +76,7 @@ ClickHouse 是一个真正的列式数据库管理系统(DBMS)。在 ClickHous 让我们来看一下,当你把一个结果集以 `JSON` 格式写到标准输出(stdout)时会发生什么。你已经准备好从 `IBlockInputStream` 获取结果集,然后创建 `WriteBufferFromFileDescriptor(STDOUT_FILENO)` 用于写字节到标准输出,创建 `JSONRowOutputStream` 并用 `WriteBuffer` 初始化,用于将行以 `JSON` 格式写到标准输出,你还可以在其上创建 `BlockOutputStreamFromRowOutputStream`,将其表示为 `IBlockOutputStream`。然后调用 `copyData` 将数据从 `IBlockInputStream` 传输到 `IBlockOutputStream`,一切工作正常。在内部,`JSONRowOutputStream` 会写入 JSON 分隔符,并以指向 `IColumn` 的引用和行数作为参数调用 `IDataType::serializeTextJSON` 函数。随后,`IDataType::serializeTextJSON` 将会调用 `WriteHelpers.h` 中的一个方法:比如,`writeText` 用于数值类型,`writeJSONString` 用于 `DataTypeString` 。 -## 表(Tables) +## 表(Tables) {#biao-tables} 表由 `IStorage` 接口表示。该接口的不同实现对应不同的表引擎。比如 `StorageMergeTree`、`StorageMemory` 等。这些类的实例就是表。 @@ -86,8 +86,8 @@ ClickHouse 是一个真正的列式数据库管理系统(DBMS)。在 ClickHous 但是也有值得注意的例外: -- AST 查询被传递给 `read` 方法,表引擎可以使用它来判断是否能够使用索引,从而从表中读取更少的数据。 -- 有时候,表引擎能够将数据处理到一个特定阶段。比如,`StorageDistributed` 可以向远程服务器发送查询,要求它们将来自不同的远程服务器能够合并的数据处理到某个阶段,并返回预处理后的数据,然后查询解释器完成后续的数据处理。 +- AST 查询被传递给 `read` 方法,表引擎可以使用它来判断是否能够使用索引,从而从表中读取更少的数据。 +- 有时候,表引擎能够将数据处理到一个特定阶段。比如,`StorageDistributed` 可以向远程服务器发送查询,要求它们将来自不同的远程服务器能够合并的数据处理到某个阶段,并返回预处理后的数据,然后查询解释器完成后续的数据处理。 表的 `read` 方法能够返回多个 `IBlockInputStream` 对象以允许并行处理数据。多个块输入流能够从一个表中并行读取。然后你可以通过不同的转换对这些流进行装饰(比如表达式求值或过滤),转换过程能够独立计算,并在其上创建一个 `UnionBlockInputStream`,以并行读取多个流。 @@ -95,21 +95,21 @@ ClickHouse 是一个真正的列式数据库管理系统(DBMS)。在 ClickHous 要快速了解如何实现自己的表引擎,可以查看一些简单的表引擎,比如 `StorageMemory` 或 `StorageTinyLog`。 -> 作为 `read` 方法的结果,`IStorage` 返回 `QueryProcessingStage` - 关于 storage 里哪部分查询已经被计算的信息。当前我们仅有非常粗粒度的信息。Storage 无法告诉我们“对于这个范围的数据,我已经处理完了 WHERE 字句里的这部分表达式”。我们需要在这个地方继续努力。 +> 作为 `read` 方法的结果,`IStorage` 返回 `QueryProcessingStage` - 关于 storage 
里哪部分查询已经被计算的信息。当前我们仅有非常粗粒度的信息。Storage 无法告诉我们«对于这个范围的数据,我已经处理完了 WHERE 字句里的这部分表达式»。我们需要在这个地方继续努力。 -## 解析器(Parsers) +## 解析器(Parsers) {#jie-xi-qi-parsers} 查询由一个手写递归下降解析器解析。比如, `ParserSelectQuery` 只是针对查询的不同部分递归地调用下层解析器。解析器创建 `AST`。`AST` 由节点表示,节点是 `IAST` 的实例。 > 由于历史原因,未使用解析器生成器。 -## 解释器(Interpreters) +## 解释器(Interpreters) {#jie-shi-qi-interpreters} 解释器负责从 `AST` 创建查询执行流水线。既有一些简单的解释器,如 `InterpreterExistsQuery` 和 `InterpreterDropQuery`,也有更复杂的解释器,如 `InterpreterSelectQuery`。查询执行流水线由块输入或输出流组成。比如,`SELECT` 查询的解释结果是从 `FROM` 字句的结果集中读取数据的 `IBlockInputStream`;`INSERT` 查询的结果是写入需要插入的数据的 `IBlockOutputStream`;`SELECT INSERT` 查询的解释结果是 `IBlockInputStream`,它在第一次读取时返回一个空结果集,同时将数据从 `SELECT` 复制到 `INSERT`。 `InterpreterSelectQuery` 使用 `ExpressionAnalyzer` 和 `ExpressionActions` 机制来进行查询分析和转换。这是大多数基于规则的查询优化完成的地方。`ExpressionAnalyzer` 非常混乱,应该进行重写:不同的查询转换和优化应该被提取出来并划分成不同的类,从而允许模块化转换或查询。 -## 函数(Functions) +## 函数(Functions) {#han-shu-functions} 函数既有普通函数,也有聚合函数。对于聚合函数,请看下一节。 @@ -121,11 +121,11 @@ ClickHouse 具有强类型,因此隐式类型转换不会发生。如果函数 实现函数可能有些不方便,因为函数的实现需要包含所有支持该操作的数据类型和 `IColumn` 类型。比如,`plus` 函数能够利用 C++ 模板针对不同的数字类型组合、常量以及非常量的左值和右值进行代码生成。 -> 这是一个实现动态代码生成的好地方,从而能够避免模板代码膨胀。同样,运行时代码生成也使得实现融合函数成为可能,比如融合“乘-加”,或者在单层循环迭代中进行多重比较。 +> 这是一个实现动态代码生成的好地方,从而能够避免模板代码膨胀。同样,运行时代码生成也使得实现融合函数成为可能,比如融合«乘-加»,或者在单层循环迭代中进行多重比较。 -由于向量查询执行,函数不会“短路”。比如,如果你写 `WHERE f(x) AND g(y)`,两边都会进行计算,即使是对于 `f(x)` 为 0 的行(除非 `f(x)` 是零常量表达式)。但是如果 `f(x)` 的选择条件很高,并且计算 `f(x)` 比计算 `g(y)` 要划算得多,那么最好进行多遍计算:首先计算 `f(x)`,根据计算结果对列数据进行过滤,然后计算 `g(y)`,之后只需对较小数量的数据进行过滤。 +由于向量查询执行,函数不会«短路»。比如,如果你写 `WHERE f(x) AND g(y)`,两边都会进行计算,即使是对于 `f(x)` 为 0 的行(除非 `f(x)` 是零常量表达式)。但是如果 `f(x)` 的选择条件很高,并且计算 `f(x)` 比计算 `g(y)` 要划算得多,那么最好进行多遍计算:首先计算 `f(x)`,根据计算结果对列数据进行过滤,然后计算 `g(y)`,之后只需对较小数量的数据进行过滤。 -## 聚合函数 +## 聚合函数 {#ju-he-han-shu} 聚合函数是状态函数。它们将传入的值激活到某个状态,并允许你从该状态获取结果。聚合函数使用 `IAggregateFunction` 接口进行管理。状态可以非常简单(`AggregateFunctionCount` 的状态只是一个单一的`UInt64` 值),也可以非常复杂(`AggregateFunctionUniqCombined` 的状态是由一个线性数组、一个散列表和一个 `HyperLogLog` 概率数据结构组合而成的)。 @@ -135,13 +135,13 @@ ClickHouse 具有强类型,因此隐式类型转换不会发生。如果函数 > 聚合函数状态的序列化数据格式目前尚未版本化。如果只是临时存储聚合状态,这样是可以的。但是我们有 `AggregatingMergeTree` 表引擎用于增量聚合,并且人们已经在生产中使用它。这就是为什么在未来当我们更改任何聚合函数的序列化格式时需要增加向后兼容的支持。 -## 服务器(Server) +## 服务器(Server) {#fu-wu-qi-server} 服务器实现了多个不同的接口: -- 一个用于任何外部客户端的 HTTP 接口。 -- 一个用于本机 ClickHouse 客户端以及在分布式查询执行中跨服务器通信的 TCP 接口。 -- 一个用于传输数据以进行拷贝的接口。 +- 一个用于任何外部客户端的 HTTP 接口。 +- 一个用于本机 ClickHouse 客户端以及在分布式查询执行中跨服务器通信的 TCP 接口。 +- 一个用于传输数据以进行拷贝的接口。 在内部,它只是一个没有协程、纤程等的基础多线程服务器。服务器不是为处理高速率的简单查询设计的,而是为处理相对低速率的复杂查询设计的,每一个复杂查询能够对大量的数据进行处理分析。 @@ -151,31 +151,31 @@ ClickHouse 具有强类型,因此隐式类型转换不会发生。如果函数 > 对于所有的外部应用,我们推荐使用 HTTP 接口,因为该接口很简单,容易使用。TCP 接口与内部数据结构的联系更加紧密:它使用内部格式传递数据块,并使用自定义帧来压缩数据。我们没有发布该协议的 C 库,因为它需要链接大部分的 ClickHouse 代码库,这是不切实际的。 -## 分布式查询执行 +## 分布式查询执行 {#fen-bu-shi-cha-xun-zhi-xing} -集群设置中的服务器大多是独立的。你可以在一个集群中的一个或多个服务器上创建一个 `Distributed` 表。`Distributed` 表本身并不存储数据,它只为集群的多个节点上的所有本地表提供一个“视图(view)”。当从 `Distributed` 表中进行 SELECT 时,它会重写该查询,根据负载平衡设置来选择远程节点,并将查询发送给节点。`Distributed` 表请求远程服务器处理查询,直到可以合并来自不同服务器的中间结果的阶段。然后它接收中间结果并进行合并。分布式表会尝试将尽可能多的工作分配给远程服务器,并且不会通过网络发送太多的中间数据。 +集群设置中的服务器大多是独立的。你可以在一个集群中的一个或多个服务器上创建一个 `Distributed` 表。`Distributed` 表本身并不存储数据,它只为集群的多个节点上的所有本地表提供一个«视图(view)»。当从 `Distributed` 表中进行 SELECT 时,它会重写该查询,根据负载平衡设置来选择远程节点,并将查询发送给节点。`Distributed` 表请求远程服务器处理查询,直到可以合并来自不同服务器的中间结果的阶段。然后它接收中间结果并进行合并。分布式表会尝试将尽可能多的工作分配给远程服务器,并且不会通过网络发送太多的中间数据。 > 当 `IN` 或 `JOIN` 子句中包含子查询并且每个子查询都使用分布式表时,事情会变得更加复杂。我们有不同的策略来执行这些查询。 -分布式查询执行没有全局查询计划。每个节点都有针对自己的工作部分的本地查询计划。我们仅有简单的一次性分布式查询执行:将查询发送给远程节点,然后合并结果。但是对于具有高基数的 `GROUP BY` 或具有大量临时数据的 `JOIN` 
这样困难的查询的来说,这是不可行的:在这种情况下,我们需要在服务器之间“改组”数据,这需要额外的协调。ClickHouse 不支持这类查询执行,我们需要在这方面进行努力。 +分布式查询执行没有全局查询计划。每个节点都有针对自己的工作部分的本地查询计划。我们仅有简单的一次性分布式查询执行:将查询发送给远程节点,然后合并结果。但是对于具有高基数的 `GROUP BY` 或具有大量临时数据的 `JOIN` 这样困难的查询的来说,这是不可行的:在这种情况下,我们需要在服务器之间«改组»数据,这需要额外的协调。ClickHouse 不支持这类查询执行,我们需要在这方面进行努力。 -## Merge Tree +## Merge Tree {#merge-tree} -`MergeTree` 是一系列支持按主键索引的存储引擎。主键可以是一个任意的列或表达式的元组。`MergeTree` 表中的数据存储于“分块”中。每一个分块以主键序存储数据(数据按主键元组的字典序排序)。表的所有列都存储在这些“分块”中分离的 `column.bin` 文件中。`column.bin` 文件由压缩块组成,每一个块通常是 64 KB 到 1 MB 大小的未压缩数据,具体取决于平均值大小。这些块由一个接一个连续放置的列值组成。每一列的列值顺序相同(顺序由主键定义),因此当你按多列进行迭代时,你能够得到相应列的值。 +`MergeTree` 是一系列支持按主键索引的存储引擎。主键可以是一个任意的列或表达式的元组。`MergeTree` 表中的数据存储于«分块»中。每一个分块以主键序存储数据(数据按主键元组的字典序排序)。表的所有列都存储在这些«分块»中分离的 `column.bin` 文件中。`column.bin` 文件由压缩块组成,每一个块通常是 64 KB 到 1 MB 大小的未压缩数据,具体取决于平均值大小。这些块由一个接一个连续放置的列值组成。每一列的列值顺序相同(顺序由主键定义),因此当你按多列进行迭代时,你能够得到相应列的值。 -主键本身是“稀疏”的。它并不是索引单一的行,而是索引某个范围内的数据。一个单独的 `primary.idx` 文件具有每个第 N 行的主键值,其中 N 称为 `index_granularity`(通常,N = 8192)。同时,对于每一列,都有带有标记的 `column.mrk` 文件,该文件记录的是每个第 N 行在数据文件中的偏移量。每个标记是一个 pair:文件中的偏移量到压缩块的起始,以及解压缩块中的偏移量到数据的起始。通常,压缩块根据标记对齐,并且解压缩块中的偏移量为 0。`primary.idx` 的数据始终驻留在内存,同时 `column.mrk` 的数据被缓存。 +主键本身是«稀疏»的。它并不是索引单一的行,而是索引某个范围内的数据。一个单独的 `primary.idx` 文件具有每个第 N 行的主键值,其中 N 称为 `index_granularity`(通常,N = 8192)。同时,对于每一列,都有带有标记的 `column.mrk` 文件,该文件记录的是每个第 N 行在数据文件中的偏移量。每个标记是一个 pair:文件中的偏移量到压缩块的起始,以及解压缩块中的偏移量到数据的起始。通常,压缩块根据标记对齐,并且解压缩块中的偏移量为 0。`primary.idx` 的数据始终驻留在内存,同时 `column.mrk` 的数据被缓存。 当我们要从 `MergeTree` 的一个分块中读取部分内容时,我们会查看 `primary.idx` 数据并查找可能包含所请求数据的范围,然后查看 `column.mrk` 并计算偏移量从而得知从哪里开始读取些范围的数据。由于稀疏性,可能会读取额外的数据。ClickHouse 不适用于高负载的简单点查询,因为对于每一个键,整个 `index_granularity` 范围的行的数据都需要读取,并且对于每一列需要解压缩整个压缩块。我们使索引稀疏,是因为每一个单一的服务器需要在索引没有明显内存消耗的情况下,维护数万亿行的数据。另外,由于主键是稀疏的,导致其不是唯一的:无法在 INSERT 时检查一个键在表中是否存在。你可以在一个表中使用同一个键创建多个行。 -当你向 `MergeTree` 中插入一堆数据时,数据按主键排序并形成一个新的分块。为了保证分块的数量相对较少,有后台线程定期选择一些分块并将它们合并成一个有序的分块,这就是 `MergeTree` 的名称来源。当然,合并会导致“写入放大”。所有的分块都是不可变的:它们仅会被创建和删除,不会被修改。当运行 `SELECT` 查询时,`MergeTree` 会保存一个表的快照(分块集合)。合并之后,还会保留旧的分块一段时间,以便发生故障后更容易恢复,因此如果我们发现某些合并后的分块可能已损坏,我们可以将其替换为原分块。 +当你向 `MergeTree` 中插入一堆数据时,数据按主键排序并形成一个新的分块。为了保证分块的数量相对较少,有后台线程定期选择一些分块并将它们合并成一个有序的分块,这就是 `MergeTree` 的名称来源。当然,合并会导致«写入放大»。所有的分块都是不可变的:它们仅会被创建和删除,不会被修改。当运行 `SELECT` 查询时,`MergeTree` 会保存一个表的快照(分块集合)。合并之后,还会保留旧的分块一段时间,以便发生故障后更容易恢复,因此如果我们发现某些合并后的分块可能已损坏,我们可以将其替换为原分块。 -`MergeTree` 不是 LSM 树,因为它不包含”memtable“和”log“:插入的数据直接写入文件系统。这使得它仅适用于批量插入数据,而不适用于非常频繁地一行一行插入 - 大约每秒一次是没问题的,但是每秒一千次就会有问题。我们这样做是为了简单起见,因为我们已经在我们的应用中批量插入数据。 +`MergeTree` 不是 LSM 树,因为它不包含»memtable«和»log«:插入的数据直接写入文件系统。这使得它仅适用于批量插入数据,而不适用于非常频繁地一行一行插入 - 大约每秒一次是没问题的,但是每秒一千次就会有问题。我们这样做是为了简单起见,因为我们已经在我们的应用中批量插入数据。 > `MergeTree` 表只能有一个(主)索引:没有任何辅助索引。在一个逻辑表下,允许有多个物理表示,比如,可以以多个物理顺序存储数据,或者同时表示预聚合数据和原始数据。 有些 `MergeTree` 引擎会在后台合并期间做一些额外工作,比如 `CollapsingMergeTree` 和 `AggregatingMergeTree`。这可以视为对更新的特殊支持。请记住这些不是真正的更新,因为用户通常无法控制后台合并将会执行的时间,并且 `MergeTree` 中的数据几乎总是存储在多个分块中,而不是完全合并的形式。 -## 复制(Replication) +## 复制(Replication) {#fu-zhi-replication} ClickHouse 中的复制是基于表实现的。你可以在同一个服务器上有一些可复制的表和不可复制的表。你也可以以不同的方式进行表的复制,比如一个表进行双因子复制,另一个进行三因子复制。 @@ -183,7 +183,7 @@ ClickHouse 中的复制是基于表实现的。你可以在同一个服务器上 复制使用异步多主机方案。你可以将数据插入到与 `ZooKeeper` 进行会话的任意副本中,并将数据复制到所有其它副本中。由于 ClickHouse 不支持 UPDATEs,因此复制是无冲突的。由于没有对插入的仲裁确认,如果一个节点发生故障,刚刚插入的数据可能会丢失。 -用于复制的元数据存储在 ZooKeeper 中。其中一个复制日志列出了要执行的操作。操作包括:获取分块、合并分块和删除分区等。每一个副本将复制日志复制到其队列中,然后执行队列中的操作。比如,在插入时,在复制日志中创建“获取分块”这一操作,然后每一个副本都会去下载该分块。所有副本之间会协调进行合并以获得相同字节的结果。所有的分块在所有的副本上以相同的方式合并。为实现该目的,其中一个副本被选为领导者,该副本首先进行合并,并把“合并分块”操作写到日志中。 +用于复制的元数据存储在 ZooKeeper 
中。其中一个复制日志列出了要执行的操作。操作包括:获取分块、合并分块和删除分区等。每一个副本将复制日志复制到其队列中,然后执行队列中的操作。比如,在插入时,在复制日志中创建«获取分块»这一操作,然后每一个副本都会去下载该分块。所有副本之间会协调进行合并以获得相同字节的结果。所有的分块在所有的副本上以相同的方式合并。为实现该目的,其中一个副本被选为领导者,该副本首先进行合并,并把«合并分块»操作写到日志中。 复制是物理的:只有压缩的分块会在节点之间传输,查询则不会。为了降低网络成本(避免网络放大),大多数情况下,会在每一个副本上独立地处理合并。只有在存在显著的合并延迟的情况下,才会通过网络发送大块的合并分块。 diff --git a/docs/zh/development/build.md b/docs/zh/development/build.md index e589bcc172f..3a81077fb0b 100644 --- a/docs/zh/development/build.md +++ b/docs/zh/development/build.md @@ -1,76 +1,76 @@ -# 如何构建 ClickHouse 发布包 +# 如何构建 ClickHouse 发布包 {#ru-he-gou-jian-clickhouse-fa-bu-bao} -## 安装 Git 和 Pbuilder +## 安装 Git 和 Pbuilder {#an-zhuang-git-he-pbuilder} -```bash +``` bash sudo apt-get update sudo apt-get install git pbuilder debhelper lsb-release fakeroot sudo debian-archive-keyring debian-keyring ``` -## 拉取 ClickHouse 源码 +## 拉取 ClickHouse 源码 {#la-qu-clickhouse-yuan-ma} -```bash +``` bash git clone --recursive https://github.com/ClickHouse/ClickHouse.git cd ClickHouse ``` -## 运行发布脚本 +## 运行发布脚本 {#yun-xing-fa-bu-jiao-ben} -```bash +``` bash ./release ``` -# 如何在开发过程中编译 ClickHouse +# 如何在开发过程中编译 ClickHouse {#ru-he-zai-kai-fa-guo-cheng-zhong-bian-yi-clickhouse} 以下教程是在 Ubuntu Linux 中进行编译的示例。 通过适当的更改,它应该可以适用于任何其他的 Linux 发行版。 -仅支持具有 x86_64、AArch64。 对 Power9 的支持是实验性的。 +仅支持具有 x86\_64、AArch64。 对 Power9 的支持是实验性的。 -## 安装 Git 和 CMake 和 Ninja +## 安装 Git 和 CMake 和 Ninja {#an-zhuang-git-he-cmake-he-ninja} -```bash +``` bash sudo apt-get install git cmake ninja-build ``` Or cmake3 instead of cmake on older systems. 或者在早期版本的系统中用 cmake3 替代 cmake -## 安装 GCC 9 +## 安装 GCC 9 {#an-zhuang-gcc-9} There are several ways to do this. -### 安装 PPA 包 +### 安装 PPA 包 {#an-zhuang-ppa-bao} -```bash +``` bash sudo apt-get install software-properties-common sudo apt-add-repository ppa:ubuntu-toolchain-r/test sudo apt-get update sudo apt-get install gcc-9 g++-9 ``` -### 源码安装 gcc +### 源码安装 gcc {#yuan-ma-an-zhuang-gcc} 请查看 [utils/ci/build-gcc-from-sources.sh](https://github.com/ClickHouse/ClickHouse/blob/master/utils/ci/build-gcc-from-sources.sh) -## 使用 GCC 9 来编译 +## 使用 GCC 9 来编译 {#shi-yong-gcc-9-lai-bian-yi} -```bash +``` bash export CC=gcc-9 export CXX=g++-9 ``` -## 拉取 ClickHouse 源码 +## 拉取 ClickHouse 源码 {#la-qu-clickhouse-yuan-ma-1} -```bash +``` bash git clone --recursive git@github.com:ClickHouse/ClickHouse.git # or: git clone --recursive https://github.com/ClickHouse/ClickHouse.git cd ClickHouse ``` -## 编译 ClickHouse +## 编译 ClickHouse {#bian-yi-clickhouse} -```bash +``` bash mkdir build cd build cmake .. @@ -81,5 +81,4 @@ cd .. 
若要创建一个执行文件, 执行 `ninja clickhouse`。 这个命令会使得 `dbms/programs/clickhouse` 文件可执行,您可以使用 `client` or `server` 参数运行。 - [来源文章](https://clickhouse.tech/docs/en/development/build/) diff --git a/docs/zh/development/build_cross_osx.md b/docs/zh/development/build_cross_osx.md index 31fee6ca31e..20577d1213a 100644 --- a/docs/zh/development/build_cross_osx.md +++ b/docs/zh/development/build_cross_osx.md @@ -1,24 +1,24 @@ -# 如何在Linux中编译Mac OS X ClickHouse +# 如何在Linux中编译Mac OS X ClickHouse {#ru-he-zai-linuxzhong-bian-yi-mac-os-x-clickhouse} -Linux机器也可以编译运行在OS X系统的`clickhouse`二进制包,这可以用于在Linux上跑持续集成测试。如果要在Mac OS X上直接构建ClickHouse,请参考另外一篇指南: https://clickhouse.tech/docs/zh/development/build_osx/ +Linux机器也可以编译运行在OS X系统的`clickhouse`二进制包,这可以用于在Linux上跑持续集成测试。如果要在Mac OS X上直接构建ClickHouse,请参考另外一篇指南: https://clickhouse.tech/docs/zh/development/build\_osx/ Mac OS X的交叉编译基于以下构建说明,请首先遵循它们。 -# Install Clang-8 +# Install Clang-8 {#install-clang-8} 按照https://apt.llvm.org/中的说明进行Ubuntu或Debian安装。 例如,安装Bionic的命令如下: -```bash +``` bash sudo echo "deb [trusted=yes] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-8 main" >> /etc/apt/sources.list sudo apt-get install clang-8 ``` -# 安装交叉编译工具集 +# 安装交叉编译工具集 {#an-zhuang-jiao-cha-bian-yi-gong-ju-ji} 我们假设安装 `cctools` 在 ${CCTOOLS} 路径下 -```bash +``` bash mkdir ${CCTOOLS} git clone https://github.com/tpoechtrager/apple-libtapi.git @@ -37,9 +37,9 @@ wget https://github.com/phracker/MacOSX-SDKs/releases/download/10.14-beta4/MacOS tar xJf MacOSX10.14.sdk.tar.xz ``` -# 编译 ClickHouse +# 编译 ClickHouse {#bian-yi-clickhouse} -```bash +``` bash cd ClickHouse mkdir build-osx CC=clang-8 CXX=clang++-8 cmake . -Bbuild-osx -DCMAKE_SYSTEM_NAME=Darwin \ @@ -50,4 +50,4 @@ CC=clang-8 CXX=clang++-8 cmake . -Bbuild-osx -DCMAKE_SYSTEM_NAME=Darwin \ ninja -C build-osx ``` -生成的二进制文件将具有Mach-O可执行格式,并且不能在Linux上运行。 \ No newline at end of file +生成的二进制文件将具有Mach-O可执行格式,并且不能在Linux上运行。 diff --git a/docs/zh/development/build_osx.md b/docs/zh/development/build_osx.md index 40b68f31d0f..e471b716a33 100644 --- a/docs/zh/development/build_osx.md +++ b/docs/zh/development/build_osx.md @@ -1,32 +1,32 @@ -# 在 Mac OS X 中编译 ClickHouse +# 在 Mac OS X 中编译 ClickHouse {#zai-mac-os-x-zhong-bian-yi-clickhouse} ClickHouse 支持在 Mac OS X 10.12 版本中编译。若您在用更早的操作系统版本,可以尝试在指令中使用 `Gentoo Prefix` 和 `clang sl`. 通过适当的更改,它应该可以适用于任何其他的 Linux 发行版。 -## 安装 Homebrew +## 安装 Homebrew {#an-zhuang-homebrew} -```bash +``` bash /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" ``` -## 安装编译器,工具库 +## 安装编译器,工具库 {#an-zhuang-bian-yi-qi-gong-ju-ku} -```bash +``` bash brew install cmake ninja gcc icu4c mariadb-connector-c openssl libtool gettext ``` -## 拉取 ClickHouse 源码 +## 拉取 ClickHouse 源码 {#la-qu-clickhouse-yuan-ma} -```bash +``` bash git clone --recursive git@github.com:ClickHouse/ClickHouse.git # or: git clone --recursive https://github.com/ClickHouse/ClickHouse.git cd ClickHouse ``` -## 编译 ClickHouse +## 编译 ClickHouse {#bian-yi-clickhouse} -```bash +``` bash mkdir build cd build cmake .. -DCMAKE_CXX_COMPILER=`which g++-8` -DCMAKE_C_COMPILER=`which gcc-8` @@ -34,16 +34,17 @@ ninja cd .. ``` -## 注意事项 +## 注意事项 {#zhu-yi-shi-xiang} 若你想运行 clickhouse-server,请先确保增加系统的最大文件数配置。 -!!! 注意 +!!! 
注意 "注意" 可能需要用 sudo 为此,请创建以下文件: /Library/LaunchDaemons/limit.maxfiles.plist: + ``` xml diff --git a/docs/zh/development/contrib.md b/docs/zh/development/contrib.md index d157ebd3cdb..5491cc76f6f 100644 --- a/docs/zh/development/contrib.md +++ b/docs/zh/development/contrib.md @@ -1,34 +1,34 @@ -# 使用的三方库 +# 使用的三方库 {#shi-yong-de-san-fang-ku} -| Library | License | -| ------- | ------- | -| base64 | [BSD 2-Clause License](https://github.com/aklomp/base64/blob/a27c565d1b6c676beaf297fe503c4518185666f7/LICENSE) | -| boost | [Boost Software License 1.0](https://github.com/ClickHouse-Extras/boost-extra/blob/6883b40449f378019aec792f9983ce3afc7ff16e/LICENSE_1_0.txt) | -| brotli | [MIT](https://github.com/google/brotli/blob/master/LICENSE) | -| capnproto | [MIT](https://github.com/capnproto/capnproto/blob/master/LICENSE) | -| cctz | [Apache License 2.0](https://github.com/google/cctz/blob/4f9776a310f4952454636363def82c2bf6641d5f/LICENSE.txt) | -| double-conversion | [BSD 3-Clause License](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) | -| FastMemcpy | [MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) | -| googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) | -| hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) | -| libbtrie | [BSD 2-Clause License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) | -| libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | -| libdivide | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | -| libgsasl | [LGPL v2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) -| libhdfs3 | [Apache License 2.0](https://github.com/ClickHouse-Extras/libhdfs3/blob/bd6505cbb0c130b0db695305b9a38546fa880e5a/LICENSE.txt) | -| libmetrohash | [Apache License 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libmetrohash/LICENSE) | -| libpcg-random | [Apache License 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libpcg-random/LICENSE-APACHE.txt) | -| libressl | [OpenSSL License](https://github.com/ClickHouse-Extras/ssl/blob/master/COPYING) | -| librdkafka | [BSD 2-Clause License](https://github.com/edenhill/librdkafka/blob/363dcad5a23dc29381cc626620e68ae418b3af19/LICENSE) | -| libwidechar\_width | [CC0 1.0 Universal](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libwidechar_width/LICENSE) | -| llvm | [BSD 3-Clause License](https://github.com/ClickHouse-Extras/llvm/blob/163def217817c90fb982a6daf384744d8472b92b/llvm/LICENSE.TXT) | -| lz4 | [BSD 2-Clause License](https://github.com/lz4/lz4/blob/c10863b98e1503af90616ae99725ecd120265dfb/LICENSE) | -| mariadb-connector-c | [LGPL v2.1](https://github.com/ClickHouse-Extras/mariadb-connector-c/blob/3.1/COPYING.LIB) | -| murmurhash | [Public Domain](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/murmurhash/LICENSE) -| pdqsort | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/pdqsort/license.txt) | -| poco | [Boost Software License - Version 1.0](https://github.com/ClickHouse-Extras/poco/blob/fe5505e56c27b6ecb0dcbc40c49dc2caf4e9637f/LICENSE) | -| protobuf | [BSD 3-Clause License](https://github.com/ClickHouse-Extras/protobuf/blob/12735370922a35f03999afff478e1c6d7aa917a4/LICENSE) | -| re2 | [BSD 
3-Clause License](https://github.com/google/re2/blob/7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0/LICENSE) | -| UnixODBC | [LGPL v2.1](https://github.com/ClickHouse-Extras/UnixODBC/tree/b0ad30f7f6289c12b76f04bfb9d466374bb32168) | -| zlib-ng | [Zlib License](https://github.com/ClickHouse-Extras/zlib-ng/blob/develop/LICENSE.md) | -| zstd | [BSD 3-Clause License](https://github.com/facebook/zstd/blob/dev/LICENSE) | +| Library | License | +|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------| +| base64 | [BSD 2-Clause License](https://github.com/aklomp/base64/blob/a27c565d1b6c676beaf297fe503c4518185666f7/LICENSE) | +| boost | [Boost Software License 1.0](https://github.com/ClickHouse-Extras/boost-extra/blob/6883b40449f378019aec792f9983ce3afc7ff16e/LICENSE_1_0.txt) | +| brotli | [MIT](https://github.com/google/brotli/blob/master/LICENSE) | +| capnproto | [MIT](https://github.com/capnproto/capnproto/blob/master/LICENSE) | +| cctz | [Apache License 2.0](https://github.com/google/cctz/blob/4f9776a310f4952454636363def82c2bf6641d5f/LICENSE.txt) | +| double-conversion | [BSD 3-Clause License](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) | +| FastMemcpy | [MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) | +| googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) | +| hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) | +| libbtrie | [BSD 2-Clause License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libbtrie/LICENSE) | +| libcxxabi | [BSD + MIT](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | +| libdivide | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | +| libgsasl | [LGPL v2.1](https://github.com/ClickHouse-Extras/libgsasl/blob/3b8948a4042e34fb00b4fb987535dc9e02e39040/LICENSE) | +| libhdfs3 | [Apache License 2.0](https://github.com/ClickHouse-Extras/libhdfs3/blob/bd6505cbb0c130b0db695305b9a38546fa880e5a/LICENSE.txt) | +| libmetrohash | [Apache License 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libmetrohash/LICENSE) | +| libpcg-random | [Apache License 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/libpcg-random/LICENSE-APACHE.txt) | +| libressl | [OpenSSL License](https://github.com/ClickHouse-Extras/ssl/blob/master/COPYING) | +| librdkafka | [BSD 2-Clause License](https://github.com/edenhill/librdkafka/blob/363dcad5a23dc29381cc626620e68ae418b3af19/LICENSE) | +| libwidechar\_width | [CC0 1.0 Universal](https://github.com/ClickHouse/ClickHouse/blob/master/libs/libwidechar_width/LICENSE) | +| llvm | [BSD 3-Clause License](https://github.com/ClickHouse-Extras/llvm/blob/163def217817c90fb982a6daf384744d8472b92b/llvm/LICENSE.TXT) | +| lz4 | [BSD 2-Clause License](https://github.com/lz4/lz4/blob/c10863b98e1503af90616ae99725ecd120265dfb/LICENSE) | +| mariadb-connector-c | [LGPL v2.1](https://github.com/ClickHouse-Extras/mariadb-connector-c/blob/3.1/COPYING.LIB) | +| murmurhash | [Public Domain](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/murmurhash/LICENSE) | +| pdqsort | [Zlib License](https://github.com/ClickHouse/ClickHouse/blob/master/contrib/pdqsort/license.txt) | +| poco | [Boost Software License - Version 
1.0](https://github.com/ClickHouse-Extras/poco/blob/fe5505e56c27b6ecb0dcbc40c49dc2caf4e9637f/LICENSE) | +| protobuf | [BSD 3-Clause License](https://github.com/ClickHouse-Extras/protobuf/blob/12735370922a35f03999afff478e1c6d7aa917a4/LICENSE) | +| re2 | [BSD 3-Clause License](https://github.com/google/re2/blob/7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0/LICENSE) | +| UnixODBC | [LGPL v2.1](https://github.com/ClickHouse-Extras/UnixODBC/tree/b0ad30f7f6289c12b76f04bfb9d466374bb32168) | +| zlib-ng | [Zlib License](https://github.com/ClickHouse-Extras/zlib-ng/blob/develop/LICENSE.md) | +| zstd | [BSD 3-Clause License](https://github.com/facebook/zstd/blob/dev/LICENSE) | diff --git a/docs/zh/development/developer_instruction.md b/docs/zh/development/developer_instruction.md index 7e8f7559e15..27f3c1ad8b2 100644 --- a/docs/zh/development/developer_instruction.md +++ b/docs/zh/development/developer_instruction.md @@ -1,42 +1,38 @@ ClickHose支持Linux,FreeBSD 及 Mac OS X 系统。 -# Windows使用指引 +# Windows使用指引 {#windowsshi-yong-zhi-yin} -如果您的系统是Windows,则需要创建Ubuntu虚拟机。可以安装VirtualBox来构建虚拟机。Ubuntu的下载链接为:https://www.ubuntu.com/#download 。请使用下载好的镜像创建一个虚拟机(请确保虚拟机有至少4GB的内存容量)。在Ubuntu中使用"terminal"程序(gnome-terminal,konsole等)运行命令行终端,或使用快捷键Ctrl+Alt+T。 +如果您的系统是Windows,则需要创建Ubuntu虚拟机。可以安装VirtualBox来构建虚拟机。Ubuntu的下载链接为:https://www.ubuntu.com/\#download 。请使用下载好的镜像创建一个虚拟机(请确保虚拟机有至少4GB的内存容量)。在Ubuntu中使用«terminal»程序(gnome-terminal,konsole等)运行命令行终端,或使用快捷键Ctrl+Alt+T。 - -# 在GitHub上创建源码库 +# 在GitHub上创建源码库 {#zai-githubshang-chuang-jian-yuan-ma-ku} 您需要(申请)一个GitHub账户来使用ClickHouse。 如果没有账户,请在https://github.com上注册一个。如果没有SSH密钥,请在本地创建密钥并将公钥上传到GitHub上。这有助于你提交更新代码。并且在不同的SSH服务端,你也可以使用相同的SSH密钥。 -要创建ClickHouse源码库的分支,请在https://github.com/ClickHouse/ClickHouse页面上点击右上角的"fork"按钮。它会在本账户上创建您个人的ClickHouse/ClickHouse分支。 +要创建ClickHouse源码库的分支,请在https://github.com/ClickHouse/ClickHouse页面上点击右上角的«fork»按钮。它会在本账户上创建您个人的ClickHouse/ClickHouse分支。 -若要参与开发,首先请在ClickHouse的分支中提交您期望的变更,然后创建一个“pull请求”,以便这些变更能够被(ClickHouse/ClickHouse)主库接受。 +若要参与开发,首先请在ClickHouse的分支中提交您期望的变更,然后创建一个«pull请求»,以便这些变更能够被(ClickHouse/ClickHouse)主库接受。 请先安装`git`来使用git源码库。 请在Ubuntu终端上使用下列的指令来安装`git`: -``` -sudo apt update -sudo apt install git -``` + sudo apt update + sudo apt install git 在https://services.github.com/on-demand/downloads/github-git-cheat-sheet.pdf中找到有关使用Git的简易手册。有关Git的详细手册,请参见: https://git-scm.com/book/ru/v2 。 +# 拷贝源码库到开发机 {#kao-bei-yuan-ma-ku-dao-kai-fa-ji} -# 拷贝源码库到开发机 - -接下来,请将源码下载到开发机上。这步操作被称为“拷贝源码库”,是因为它在您的开发机上创建了源码库的本地副本。 +接下来,请将源码下载到开发机上。这步操作被称为«拷贝源码库»,是因为它在您的开发机上创建了源码库的本地副本。 在终端命令行输入下列指令: -``` -git clone --recursive git@guthub.com:your_github_username/ClickHouse.git -cd ClickHouse -``` -请注意,您需要将*your_github_username* 替换成实际使用的账户名! + + git clone --recursive git@guthub.com:your_github_username/ClickHouse.git + cd ClickHouse + +请注意,您需要将*your\_github\_username* 替换成实际使用的账户名! 这个指令将创建一个包含项目副本的`ClickHouse`工作目录。 @@ -44,75 +40,64 @@ cd ClickHouse 请注意,ClickHouse源码库使用了`submodules`。这是对其他库的引用(即项目所依赖的外部库)。即在拷贝源码库时,需要如上述指令中那样指定`--recursive`。如果在拷贝源码库时没有包含子模块,需要执行使用下列的指令: -``` -git submodule init -git submodule update -``` + git submodule init + git submodule update + 可以通过 `git submodule status`来检查子模块的状态。 如果提示下列的错误信息: -``` -Permission denied (publickey). -fatal: Could not read from remote repository. + Permission denied (publickey). + fatal: Could not read from remote repository. + + Please make sure you have the correct access rights + and the repository exists. -Please make sure you have the correct access rights -and the repository exists. 
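在进一步排查之前,也可以先确认本机能否通过 SSH 访问 GitHub。下面的命令只是一个常见的自查方法示意(并非本文档原有步骤):

``` bash
# 测试本机的 SSH 密钥能否用于访问 GitHub:
# 认证成功时 GitHub 会返回类似 "Hi <用户名>! You've successfully authenticated..." 的提示;
# 若仍提示 "Permission denied (publickey)",说明尚未配置可用的 SSH 密钥。
ssh -T git@github.com
```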
-``` 这通常表示缺少用于连接GitHub的SSH密钥。这些密钥一般都在`~/.ssh`中。要接受SSH密钥,请在GitHub UI的设置页面中上传它们。 您还可以通过https协议来拷贝源码库: -``` -git clone https://github.com/ClickHouse/ClickHouse.git -``` + git clone https://github.com/ClickHouse/ClickHouse.git 但是,这无法将变更提交到服务器上。您仍然可以暂时使用,并后续再添加SSH密钥,用`git remote`命令替换源码库的远程地址。 还可以将原始ClickHouse库的地址添加到本地库中,以便从那里获取更新: -``` -git remote add upstream git@github.com:ClickHouse/ClickHouse.git -``` + git remote add upstream git@github.com:ClickHouse/ClickHouse.git 命令执行成功后,可以通过执行`git pull upstream master`,从ClickHouse的主分支中拉去更新。 - -## Working with submodules +## Working with submodules {#working-with-submodules} Working with submodules in git could be painful. Next commands will help to manage it: -``` -# ! each command accepts --recursive -# Update remote URLs for submodules. Barely rare case -git submodule sync -# Add new submodules -git submodule init -# Update existing submodules to the current state -git submodule update -# Two last commands could be merged together -git submodule update --init -``` + # ! each command accepts --recursive + # Update remote URLs for submodules. Barely rare case + git submodule sync + # Add new submodules + git submodule init + # Update existing submodules to the current state + git submodule update + # Two last commands could be merged together + git submodule update --init The next commands would help you to reset all submodules to the initial state (!WARING! - any chenges inside will be deleted): -``` -# Synchronizes submodules' remote URL with .gitmodules -git submodule sync --recursive -# Update the registered submodules with initialize not yet initialized -git submodule update --init --recursive -# Reset all changes done after HEAD -git submodule foreach git reset --hard -# Clean files from .gitignore -git submodule foreach git clean -xfd -# Repeat last 4 commands for all submodule -git submodule foreach git submodule sync --recursive -git submodule foreach git submodule update --init --recursive -git submodule foreach git submodule foreach git reset --hard -git submodule foreach git submodule foreach git clean -xfd -``` + # Synchronizes submodules' remote URL with .gitmodules + git submodule sync --recursive + # Update the registered submodules with initialize not yet initialized + git submodule update --init --recursive + # Reset all changes done after HEAD + git submodule foreach git reset --hard + # Clean files from .gitignore + git submodule foreach git clean -xfd + # Repeat last 4 commands for all submodule + git submodule foreach git submodule sync --recursive + git submodule foreach git submodule update --init --recursive + git submodule foreach git submodule foreach git reset --hard + git submodule foreach git submodule foreach git clean -xfd -# 构建系统 +# 构建系统 {#gou-jian-xi-tong} ClickHouse使用 CMake 和 Ninja 来构建系统。 @@ -127,83 +112,66 @@ Ninja - 一个轻量级的构建系统,专注于速度,用于执行这些cma 若要在Mac OS X上安装CMake和Ninja,请先安装Homebrew,然后再通过brew安装其他内容: -``` -/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" -brew install cmake ninja -``` + /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" + brew install cmake ninja 接下来,检查CMake的版本:`cmake --version`。如果版本低于3.3,则需要从以下网站安装更新版本:https://cmake.org/download/ 。 - -# 可供选择的外部库 +# 可供选择的外部库 {#ke-gong-xuan-ze-de-wai-bu-ku} ClickHouse使用多个外部库进行构建。大多数外部库不需要单独安装,而是和ClickHouse一起在子模块中构建。可以查看`contrib`中罗列的清单。 +# C++ 编译器 {#c-bian-yi-qi} -# C++ 编译器 - -GCC编译器从版本9开始,以及Clang版本>=8都可支持构建ClickHouse。 +GCC编译器从版本9开始,以及Clang版本\>=8都可支持构建ClickHouse。 
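在开始编译之前,可以先确认本机的工具链是否满足上述版本要求。下面是一个简单的自查脚本示意(假设 `gcc`、`clang`、`cmake`、`ninja` 均使用系统默认命令名;该脚本仅为示例,并非官方提供的工具):

``` bash
# 依次打印各构建工具的版本信息,便于确认是否满足要求:
# GCC >= 9 或 Clang >= 8(二者安装其一即可),外加 CMake 和 Ninja。
for tool in gcc clang cmake ninja; do
    if command -v "$tool" >/dev/null 2>&1; then
        printf '%-6s: %s\n' "$tool" "$("$tool" --version | head -n 1)"
    else
        printf '%-6s: 未安装\n' "$tool"
    fi
done
```

若输出显示 GCC 低于 9(或 Clang 低于 8),可按下文的说明安装较新的编译器。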
Yandex官方当前使用GCC构建ClickHouse,因为它生成的机器代码性能较好(根据测评,最多可以相差几个百分点)。Clang通常可以更加便捷的开发。我们的持续集成(CI)平台会运行大约十二种构建组合的检查。 在Ubuntu上安装GCC,请执行:`sudo apt install gcc g++` -请使用`gcc --version`查看gcc的版本。如果gcc版本低于9,请参考此处的指示:https://clickhouse.tech/docs/en/development/build/#install-gcc-9 。 +请使用`gcc --version`查看gcc的版本。如果gcc版本低于9,请参考此处的指示:https://clickhouse.tech/docs/en/development/build/\#install-gcc-9 。 在Mac OS X上安装GCC,请执行:`brew install gcc` 如果您决定使用Clang,还可以同时安装 `libc++`以及`lld`,前提是您也熟悉它们。此外,也推荐使用`ccache`。 - -# 构建的过程 +# 构建的过程 {#gou-jian-de-guo-cheng} 如果当前已经准备好构建ClickHouse,我们建议您在`ClickHouse`中创建一个单独的目录`build`,其中包含所有构建组件: -``` -mkdir build -cd build -``` + mkdir build + cd build -您也可以有多个不同类型的构建目录(例如,build_release, build_debug等等)。 +您也可以有多个不同类型的构建目录(例如,build\_release, build\_debug等等)。 在`build`目录下,通过运行CMake配置构建。 在第一次运行之前,请定义用于指定编译器的环境变量(本示例中为gcc 9 编译器)。 -``` -export CC=gcc-9 CXX=g++-9 -cmake .. -``` + export CC=gcc-9 CXX=g++-9 + cmake .. `CC`变量指代C的编译器(C Compiler的缩写),而`CXX`变量指代要使用哪个C++编译器进行编译。 为了更快的构建,请使用`debug`构建类型-不含优化的构建。为此提供以下的参数`-D CMAKE_BUILD_TYPE=Debug`: -``` -cmake -D CMAKE_BUILD_TYPE=Debug .. -``` + cmake -D CMAKE_BUILD_TYPE=Debug .. 您可以通过在`build`目录中运行此命令来更改构建类型。 运行ninja进行构建: -``` -ninja clickhouse-server clickhouse-client -``` + ninja clickhouse-server clickhouse-client 在此示例中,仅将构建所需的二进制文件。 如果您需要构建所有的二进制文件(utilities和tests),请运行不带参数的ninja: -``` -ninja -``` + ninja 全量构建需要大约30GB的可用磁盘空间或15GB的空间来构建主要的二进制文件。 当构建的机器上有大量内存时,可考虑设置与`-j`参数并行运行的构建任务数量: -``` -ninja -j 1 clickhouse-server clickhouse-client -``` + ninja -j 1 clickhouse-server clickhouse-client 在拥有4GB内存的机器上,建议设置成1,在拥有8GB内存的机器上,建议按`-j 2`设置。 @@ -219,18 +187,13 @@ ninja -j 1 clickhouse-server clickhouse-client 成功构建后,会得到一个可执行文件`ClickHouse//dbms/programs/clickhouse`: -``` -ls -l dbms/programs/clickhouse -``` + ls -l dbms/programs/clickhouse - -# 运行ClickHouse可执行文件 +# 运行ClickHouse可执行文件 {#yun-xing-clickhouseke-zhi-xing-wen-jian} 要以当前的用户身份运行服务,请进入到`ClickHouse/dbms/programs/server/` 目录(在`build`文件夹外)并运行: -``` -../../../build/dbms/programs/clickhouse server -``` + ../../../build/dbms/programs/clickhouse server 在这种情况下,ClickHouse将使用位于当前目录中的配置文件。您可以从任何目录运行`Clickhouse server`,并将配置文件`--config-file`的路径指定为命令行参数。 @@ -238,40 +201,32 @@ ls -l dbms/programs/clickhouse 如果您在Mac OS X 或者 FreeBSD上收到`Connection refused`的消息,请尝试指定主机地址为127.0.0.1: -``` -clickhouse client --host 127.0.0.1 -``` + clickhouse client --host 127.0.0.1 您可以使用自定义构建的ClickHouse二进制文件替换系统中安装的ClickHouse二进制文件的生成版本。为此,请参照官方网站上的说明在计算机上安装ClickHouse。 接下来,运行以下命令: -``` -sudo service clickhouse-server stop -sudo cp ClickHouse/build/dbms/programs/clickhouse /usr/bin/ -sudo service clickhouse-server start -``` + sudo service clickhouse-server stop + sudo cp ClickHouse/build/dbms/programs/clickhouse /usr/bin/ + sudo service clickhouse-server start 请注意,`clickhouse-client`,`clickhouse-server`和其他服务通常共享`clickhouse`二进制文件的符号链接。 您还可以使用系统上安装的ClickHouse软件包中的配置文件运行自定义构建的ClickHouse二进制文件: -``` -sudo service clickhouse-server stop -sudo -u clickhouse ClickHouse/build/dbms/programs/clickhouse server --config-file /etc/clickhouse-server/config.xml -``` + sudo service clickhouse-server stop + sudo -u clickhouse ClickHouse/build/dbms/programs/clickhouse server --config-file /etc/clickhouse-server/config.xml - -# IDE (集成开发环境) +# IDE (集成开发环境) {#ide-ji-cheng-kai-fa-huan-jing} 如果您还不知道使用哪款IDE,我们推荐使用CLion。CLion是一款商业软件,但能够有30天的免费使用时间。它同时也对学生免费。CLion可以在Linux和Mac OS X上使用。 -KDevelop和QTCreator是另外两款适合开发ClickHouse的替代IDE。尽管不太稳定,但KDevelop还是作为一款非常便捷的IDE。如果KDevelop在打开项目后不久崩溃,则您应该在打开项目文件列表后立即单击“全部停止”按钮。按此处理后,KDevelop可以正常使用。 
+KDevelop和QTCreator是另外两款适合开发ClickHouse的替代IDE。尽管不太稳定,但KDevelop还是作为一款非常便捷的IDE。如果KDevelop在打开项目后不久崩溃,则您应该在打开项目文件列表后立即单击«全部停止»按钮。按此处理后,KDevelop可以正常使用。 作为简易的代码编辑器,您可以使用Sublime Text或Visual Studio Code或Kate(在Linux上都可用)。 值得一提的是CLion会创建自己的`build`路径,它还会自行选择`debug`作为构建类型。对于配置,它使用CLion中定义的CMake版本,而不是您安装的版本。最后,CLion会使用`make`而不是`ninja`去构建任务。这属于正常的现象,请记住这一点,以免造成混淆。 - -# 编写代码 +# 编写代码 {#bian-xie-dai-ma} ClickHouse的架构描述可以在此处查看:https://clickhouse.tech/docs/en/development/architecture/ @@ -279,41 +234,37 @@ ClickHouse的架构描述可以在此处查看:https://clickhouse.tech/docs/en 编写测试用例:https://clickhouse.tech/docs/en/development/tests/ -任务列表:https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/instructions/easy_tasks_sorted_en.md +任务列表:https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/instructions/easy\_tasks\_sorted\_en.md - -# 测试数据 +# 测试数据 {#ce-shi-shu-ju} 开发ClickHouse通常需要加载现实的数据集,尤其是在性能测试的场景。我们可以从Yandex.Metrica获取一组特别准备的匿名数据。这些数据需要额外使用3GB的空闲磁盘空间。请注意,完成大多数开发任务并不需要此数据。 -``` -sudo apt install wget xz-utils + sudo apt install wget xz-utils -wget https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz -wget https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz + wget https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz + wget https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz -xz -v -d hits_v1.tsv.xz -xz -v -d visits_v1.tsv.xz + xz -v -d hits_v1.tsv.xz + xz -v -d visits_v1.tsv.xz -clickhouse-client + clickhouse-client -CREATE TABLE test.hits ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, 
FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree PARTITION BY toYYYYMM(EventDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID), EventTime); + CREATE TABLE test.hits ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, 
ShareService String, ShareURL String, ShareTitle String, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree PARTITION BY toYYYYMM(EventDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID), EventTime); -CREATE TABLE test.visits ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), `Goals.ID` Array(UInt32), `Goals.Serial` Array(UInt32), `Goals.EventTime` Array(DateTime), `Goals.Price` Array(Int64), `Goals.OrderID` Array(String), `Goals.CurrencyID` Array(UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, `TraficSource.ID` Array(Int8), `TraficSource.SearchEngineID` Array(UInt16), `TraficSource.AdvEngineID` Array(UInt8), `TraficSource.PlaceID` Array(UInt16), `TraficSource.SocialSourceNetworkID` Array(UInt8), 
`TraficSource.Domain` Array(String), `TraficSource.SearchPhrase` Array(String), `TraficSource.SocialSourcePage` Array(String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), `Market.Type` Array(UInt8), `Market.GoalID` Array(UInt32), `Market.OrderID` Array(String), `Market.OrderPrice` Array(Int64), `Market.PP` Array(UInt32), `Market.DirectPlaceID` Array(UInt32), `Market.DirectOrderID` Array(UInt32), `Market.DirectBannerID` Array(UInt32), `Market.GoodID` Array(String), `Market.GoodName` Array(String), `Market.GoodQuantity` Array(Int32), `Market.GoodPrice` Array(Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID); + CREATE TABLE test.visits ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), `Goals.ID` Array(UInt32), `Goals.Serial` Array(UInt32), `Goals.EventTime` Array(DateTime), `Goals.Price` Array(Int64), `Goals.OrderID` Array(String), `Goals.CurrencyID` Array(UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, 
ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, `TraficSource.ID` Array(Int8), `TraficSource.SearchEngineID` Array(UInt16), `TraficSource.AdvEngineID` Array(UInt8), `TraficSource.PlaceID` Array(UInt16), `TraficSource.SocialSourceNetworkID` Array(UInt8), `TraficSource.Domain` Array(String), `TraficSource.SearchPhrase` Array(String), `TraficSource.SocialSourcePage` Array(String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, `ParsedParams.Key1` Array(String), `ParsedParams.Key2` Array(String), `ParsedParams.Key3` Array(String), `ParsedParams.Key4` Array(String), `ParsedParams.Key5` Array(String), `ParsedParams.ValueDouble` Array(Float64), `Market.Type` Array(UInt8), `Market.GoalID` Array(UInt32), `Market.OrderID` Array(String), `Market.OrderPrice` Array(Int64), `Market.PP` Array(UInt32), `Market.DirectPlaceID` Array(UInt32), `Market.DirectOrderID` Array(UInt32), `Market.DirectBannerID` Array(UInt32), `Market.GoodID` Array(String), `Market.GoodName` Array(String), `Market.GoodQuantity` Array(Int32), `Market.GoodPrice` Array(Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) SAMPLE BY intHash32(UserID) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID); -clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.hits FORMAT TSV" < hits_v1.tsv -clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.visits FORMAT TSV" < visits_v1.tsv -``` + clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.hits FORMAT TSV" < hits_v1.tsv + clickhouse-client --max_insert_block_size 100000 --query "INSERT INTO test.visits FORMAT TSV" < visits_v1.tsv +# 创建拉取请求 {#chuang-jian-la-qu-qing-qiu} -# 创建拉取请求 +进入到GitHub 用户界面中的fork库。如果您已经在某个分支中进行开发,则需要选择该分支。在屏幕中有一个 «拉取请求»的按钮。实际上这等价于«创建一个请求以接受对主库的变更»。 -进入到GitHub 用户界面中的fork库。如果您已经在某个分支中进行开发,则需要选择该分支。在屏幕中有一个 "拉取请求"的按钮。实际上这等价于“创建一个请求以接受对主库的变更”。 +即使工作尚未完成,也可以创建拉取请求。在这种情况下,请在标题的开头加上«WIP»(正在进行中),以便后续更改。这对于协同审查和讨论更改以及运行所有可用测试用例很有用。提供有关变更的简短描述很重要,这将在后续用于生成重新发布变更日志。 -即使工作尚未完成,也可以创建拉取请求。在这种情况下,请在标题的开头加上“WIP”(正在进行中),以便后续更改。这对于协同审查和讨论更改以及运行所有可用测试用例很有用。提供有关变更的简短描述很重要,这将在后续用于生成重新发布变更日志。 +Yandex成员一旦在您的拉取请求上贴上«可以测试»标签,就会开始测试。一些初始检查项(例如,代码类型)的结果会在几分钟内反馈。构建的检查结果将在半小时内完成。而主要的测试用例集结果将在一小时内报告给您。 
-Yandex成员一旦在您的拉取请求上贴上“可以测试”标签,就会开始测试。一些初始检查项(例如,代码类型)的结果会在几分钟内反馈。构建的检查结果将在半小时内完成。而主要的测试用例集结果将在一小时内报告给您。 - -系统将分别为您的拉取请求准备ClickHouse二进制版本。若要检索这些构建信息,请在检查列表中单击“ ClickHouse构建检查”旁边的“详细信息”链接。在这里,您会找到指向ClickHouse的.deb软件包的直接链接,此外,甚至可以将其部署在生产服务器上(如果您不担心)。 +系统将分别为您的拉取请求准备ClickHouse二进制版本。若要检索这些构建信息,请在检查列表中单击« ClickHouse构建检查»旁边的«详细信息»链接。在这里,您会找到指向ClickHouse的.deb软件包的直接链接,此外,甚至可以将其部署在生产服务器上(如果您不担心)。 某些构建项很可能会在首次构建时失败。这是因为我们同时检查了基于gcc和clang的构建,几乎所有现有的被clang启用的警告(总是带有`-Werror`标志)。在同一页面上,您可以找到所有构建的日志,因此不必以所有可能的方式构建ClickHouse。 diff --git a/docs/zh/development/index.md b/docs/zh/development/index.md index d536d93441f..187ee1b3e25 100644 --- a/docs/zh/development/index.md +++ b/docs/zh/development/index.md @@ -1,4 +1,3 @@ -# ClickHouse 开发 - +# ClickHouse 开发 {#clickhouse-kai-fa} [来源文章](https://clickhouse.tech/docs/en/development/) diff --git a/docs/zh/development/style.md b/docs/zh/development/style.md index c0489775fd6..4d374f9b2e8 100644 --- a/docs/zh/development/style.md +++ b/docs/zh/development/style.md @@ -1,6 +1,6 @@ -# 如何编写 C++ 代码 +# 如何编写 C++ 代码 {#ru-he-bian-xie-c-dai-ma} -## 一般建议 +## 一般建议 {#yi-ban-jian-yi} **1.** 以下是建议,而不是要求。 @@ -10,7 +10,7 @@ **4.** 许多规则没有逻辑原因; 它们是由既定的做法决定的。 -## 格式化 +## 格式化 {#ge-shi-hua} **1.** 大多数格式化可以用 `clang-format` 自动完成。 @@ -18,7 +18,7 @@ **3.** 左右花括号需在单独的行。 -```cpp +``` cpp inline void readBoolText(bool & x, ReadBuffer & buf) { char tmp = '0'; @@ -29,30 +29,30 @@ inline void readBoolText(bool & x, ReadBuffer & buf) **4.** 若整个方法体仅有一行 `描述`, 则可以放到单独的行上。 在花括号周围放置空格(除了行尾的空格)。 -```cpp +``` cpp inline size_t mask() const { return buf_size() - 1; } inline size_t place(HashValue x) const { return x & mask(); } ``` **5.** 对于函数。 不要在括号周围放置空格。 -```cpp +``` cpp void reinsert(const Value & x) ``` -```cpp +``` cpp memcpy(&buf[place_value], &x, sizeof(x)); ``` **6.** 在`if`,`for`,`while`和其他表达式中,在开括号前面插入一个空格(与函数声明相反)。 -```cpp +``` cpp for (size_t i = 0; i < rows; i += storage.index_granularity) ``` -**7.** 在二元运算符(`+`,`-`,`*`,`/`,`%`,...)和三元运算符 `?:` 周围添加空格。 +**7.** 在二元运算符(`+`,`-`,`*`,`/`,`%`,…)和三元运算符 `?:` 周围添加空格。 -```cpp +``` cpp UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); UInt8 month = (s[5] - '0') * 10 + (s[6] - '0'); UInt8 day = (s[8] - '0') * 10 + (s[9] - '0'); @@ -60,7 +60,7 @@ UInt8 day = (s[8] - '0') * 10 + (s[9] - '0'); **8.** 若有换行,新行应该以运算符开头,并且增加对应的缩进。 -```cpp +``` cpp if (elapsed_ns) message << " (" << rows_read_on_server * 1000000000 / elapsed_ns << " rows/s., " @@ -69,7 +69,7 @@ if (elapsed_ns) **9.** 如果需要,可以在一行内使用空格来对齐。 -```cpp +``` cpp dst.ClickLogID = click.LogID; dst.ClickEventID = click.EventID; dst.ClickGoodEvent = click.GoodEvent; @@ -79,7 +79,7 @@ dst.ClickGoodEvent = click.GoodEvent; 如有必要,运算符可以包裹到下一行。 在这种情况下,它前面的偏移量增加。 -**11.** 不要使用空格来分开一元运算符 (`--`, `++`, `*`, `&`, ...) 和参数。 +**11.** 不要使用空格来分开一元运算符 (`--`, `++`, `*`, `&`, …) 和参数。 **12.** 在逗号后面加一个空格,而不是在之前。同样的规则也适合 `for` 循环中的分号。 @@ -87,7 +87,7 @@ dst.ClickGoodEvent = click.GoodEvent; **14.** 在 `template <...>` 表达式中,在 `template` 和 `<` 中加入一个空格,在 `<` 后面或在 `>` 前面都不要有空格。 -```cpp +``` cpp template struct AggregatedStatElement {} @@ -95,7 +95,7 @@ struct AggregatedStatElement **15.** 在类和结构体中, `public`, `private` 以及 `protected` 同 `class/struct` 无需缩进,其他代码须缩进。 -```cpp +``` cpp template class MultiVersion { @@ -108,11 +108,11 @@ public: **16.** 如果对整个文件使用相同的 `namespace`,并且没有其他重要的东西,则 `namespace` 中不需要偏移量。 -**17.** 在 `if`, `for`, `while` 中包裹的代码块中,若代码是一个单行的 `statement`,那么大括号是可选的。 可以将 `statement` 放到一行中。这个规则同样适用于嵌套的 `if`, `for`, `while`, ... 
+**17.** 在 `if`, `for`, `while` 中包裹的代码块中,若代码是一个单行的 `statement`,那么大括号是可选的。 可以将 `statement` 放到一行中。这个规则同样适用于嵌套的 `if`, `for`, `while`, … 但是如果内部 `statement` 包含大括号或 `else`,则外部块应该用大括号括起来。 -```cpp +``` cpp /// Finish write. for (auto & stream : streams) stream.second->finalize(); @@ -124,7 +124,7 @@ for (auto & stream : streams) **20.** 非ASCII字符可用于字符串文字。 -```cpp +``` cpp << ", " << (timer.elapsed() / chunks_stats.hits) << " μsec/hit."; ``` @@ -136,7 +136,7 @@ for (auto & stream : streams) **24.** `const` 必须写在类型名称之前。 -```cpp +``` cpp //correct const char * pos const std::string & s @@ -146,7 +146,7 @@ char const * pos **25.** 声明指针或引用时,`*` 和 `&` 符号两边应该都用空格分隔。 -```cpp +``` cpp //correct const char * pos //incorrect @@ -160,7 +160,7 @@ const char *pos `using`可以在本地声明,例如在函数内部。 -```cpp +``` cpp //correct using FileStreams = std::map>; FileStreams streams; @@ -170,14 +170,14 @@ std::map> streams; **27.** 不要在一个语句中声明不同类型的多个变量。 -```cpp +``` cpp //incorrect int x, *y; ``` **28.** 不要使用C风格的类型转换。 -```cpp +``` cpp //incorrect std::cerr << (int)c <<; std::endl; //correct @@ -196,17 +196,17 @@ std::cerr << static_cast(c) << std::endl; **32.** 如果不需要 postfix,请始终使用前缀增量/减量运算符。 -```cpp +``` cpp for (Names::const_iterator it = column_names.begin(); it != column_names.end(); ++it) ``` -## Comments +## Comments {#comments} **1.** 请务必为所有非常重要的代码部分添加注释。 这是非常重要的。 编写注释可能会帮助您意识到代码不是必需的,或者设计错误。 -```cpp +``` cpp /** Part of piece of memory, that can be used. * For example, if internal_buffer is 1MB, and there was only 10 bytes loaded to buffer from file for reading, * then working_buffer will have size of only 10 bytes @@ -218,7 +218,7 @@ for (Names::const_iterator it = column_names.begin(); it != column_names.end(); **3.** 在他们描述的代码之前放置注释。 在极少数情况下,注释可以在代码之后,在同一行上。 -```cpp +``` cpp /** Parses and executes the query. */ void executeQuery( @@ -236,7 +236,7 @@ void executeQuery( **6.** 请勿添加无效的注释。 特别是,不要留下像这样的空注释: -```cpp +``` cpp /* * Procedure Name: * Original procedure name: @@ -257,9 +257,9 @@ void executeQuery( */ ``` -这个示例来源于 [http://home.tamk.fi/~jaalto/course/coding-style/doc/unmaintainable-code/](http://home.tamk.fi/~jaalto/course/coding-style/doc/unmaintainable-code/)。 +这个示例来源于 http://home.tamk.fi/~jaalto/course/coding-style/doc/unmaintainable-code/。 -**7.** 不要在每个文件的开头写入垃圾注释(作者,创建日期...)。 +**7.** 不要在每个文件的开头写入垃圾注释(作者,创建日期…)。 **8.** 单行注释用三个斜杆: `///` ,多行注释以 `/**`开始。 这些注释会当做文档。 @@ -267,7 +267,7 @@ void executeQuery( **9.** 多行注释的开头和结尾不得有空行(关闭多行注释的行除外)。 -**10.** 要注释掉代码,请使用基本注释,而不是“记录”注释。 +**10.** 要注释掉代码,请使用基本注释,而不是«记录»注释。 **11.** 在提交之前删除代码的无效注释部分。 @@ -275,69 +275,69 @@ void executeQuery( **13.** 不要使用大写字母。 不要使用过多的标点符号。 -```cpp +``` cpp /// WHAT THE FAIL??? ``` **14.** 不要使用注释来制作分隔符。 -```cpp +``` cpp ///****************************************************** ``` **15.** 不要在注释中开始讨论。 -```cpp +``` cpp /// Why did you do this stuff? 
``` **16.** 没有必要在块的末尾写一条注释来描述它的含义。 -```cpp +``` cpp /// for ``` -## Names +## Names {#names} **1.** 在变量和类成员的名称中使用带下划线的小写字母。 -```cpp +``` cpp size_t max_block_size; ``` **2.** 对于函数(方法)的名称,请使用以小写字母开头的驼峰标识。 -```cpp +``` cpp std::string getName() const override { return "Memory"; } ``` **3.** 对于类(结构)的名称,使用以大写字母开头的驼峰标识。接口名称用I前缀。 -```cpp +``` cpp class StorageMemory : public IStorage ``` -**4.** `using` 的命名方式与类相同,或者以__t`命名。 +**4.** `using` 的命名方式与类相同,或者以\_\_t\`命名。 **5.** 模板类型参数的名称:在简单的情况下,使用`T`; `T`,`U`; `T1`,`T2`。 对于更复杂的情况,要么遵循类名规则,要么添加前缀`T`。 -```cpp +``` cpp template struct AggregatedStatElement ``` **6.** 模板常量参数的名称:遵循变量名称的规则,或者在简单的情况下使用 `N`。 -```cpp +``` cpp template struct ExtractDomain ``` **7.** 对于抽象类型(接口),用 `I` 前缀。 -```cpp +``` cpp class IBlockInputStream ``` @@ -345,13 +345,13 @@ class IBlockInputStream 在所有其他情况下,请使用能描述含义的名称。 -```cpp +``` cpp bool info_successfully_loaded = false; ``` **9.** `define` 和全局常量的名称使用带下划线的 `ALL_CAPS`。 -```cpp +``` cpp #define MAX_SRC_TABLE_NAMES_TO_STORE 1000 ``` @@ -363,12 +363,12 @@ bool info_successfully_loaded = false; **11.** 如果名称包含缩写,则: -- 对于变量名,缩写应使用小写字母 `mysql_connection`(不是 `mySQL_connection` )。 -- 对于类和函数的名称,请将大写字母保留在缩写 `MySQLConnection`(不是 `MySqlConnection` 。 +- 对于变量名,缩写应使用小写字母 `mysql_connection`(不是 `mySQL_connection` )。 +- 对于类和函数的名称,请将大写字母保留在缩写 `MySQLConnection`(不是 `MySqlConnection` 。 **12.** 仅用于初始化类成员的构造方法参数的命名方式应与类成员相同,但最后使用下划线。 -```cpp +``` cpp FileQueueProcessor( const std::string & path_, const std::string & prefix_, @@ -385,13 +385,13 @@ FileQueueProcessor( **13.** 局部变量和类成员的名称没有区别(不需要前缀)。 -```cpp +``` cpp timer (not m_timer) ``` -**14.** 对于 `enum` 中的常量,请使用带大写字母的驼峰标识。ALL_CAPS 也可以接受。如果 `enum` 是非本地的,请使用 `enum class`。 +**14.** 对于 `enum` 中的常量,请使用带大写字母的驼峰标识。ALL\_CAPS 也可以接受。如果 `enum` 是非本地的,请使用 `enum class`。 -```cpp +``` cpp enum class CompressionMethod { QuickLZ = 0, @@ -401,17 +401,13 @@ enum class CompressionMethod **15.** 所有名字必须是英文。不允许音译俄语单词。 -``` -not Stroka -``` + not Stroka **16.** 缩写须是众所周知的(当您可以在维基百科或搜索引擎中轻松找到缩写的含义时)。 -``` -`AST`, `SQL`. + `AST`, `SQL`. -Not `NVDH` (some random letters) -``` + Not `NVDH` (some random letters) 如果缩短版本是常用的,则可以接受不完整的单词。 @@ -419,7 +415,7 @@ Not `NVDH` (some random letters) **17.** C++ 源码文件名称必须为 `.cpp` 拓展名。 头文件必须为 `.h` 拓展名。 -## 如何编写代码 +## 如何编写代码 {#ru-he-bian-xie-dai-ma} **1.** 内存管理。 @@ -431,9 +427,9 @@ Not `NVDH` (some random letters) 示例: -- 最简单的方法是将对象放在堆栈上,或使其成为另一个类的成员。 -- 对于大量小对象,请使用容器。 -- 对于自动释放少量在堆中的对象,可以用 `shared_ptr/unique_ptr`。 +- 最简单的方法是将对象放在堆栈上,或使其成为另一个类的成员。 +- 对于大量小对象,请使用容器。 +- 对于自动释放少量在堆中的对象,可以用 `shared_ptr/unique_ptr`。 **2.** 资源管理。 @@ -449,7 +445,7 @@ Not `NVDH` (some random letters) 在线程函数中,你应该在 `join` 之后捕获并保留所有异常以在主线程中重新抛出它们。 -```cpp +``` cpp /// If there weren't any calculations yet, calculate the first block synchronously if (!started) { @@ -465,14 +461,14 @@ if (exception) 不处理就不要隐藏异常。 永远不要盲目地把所有异常都记录到日志中。 -```cpp +``` cpp //Not correct catch (...) 
{} ``` 如果您需要忽略某些异常,请仅针对特定异常执行此操作并重新抛出其余异常。 -```cpp +``` cpp catch (const DB::Exception & e) { if (e.code() == ErrorCodes::UNKNOWN_AGGREGATE_FUNCTION) @@ -484,7 +480,7 @@ catch (const DB::Exception & e) 当使用具有返回码或 `errno` 的函数时,请始终检查结果并在出现错误时抛出异常。 -```cpp +``` cpp if (0 != close(fd)) throwFromErrno("Cannot close file " + file_name, ErrorCodes::CANNOT_CLOSE_FILE); ``` @@ -501,16 +497,16 @@ if (0 != close(fd)) 按照以下选项: -- 创建一个函数( `done()` 或 `finalize()` ),它将提前完成所有可能导致异常的工作。 如果调用了该函数,则稍后在析构函数中应该没有异常。 -- 过于复杂的任务(例如通过网络发送消息)可以放在单独的方法中,类用户必须在销毁之前调用它们。 -- 如果析构函数中存在异常,则最好记录它而不是隐藏它(如果 logger 可用)。 -- 在简单的应用程序中,依赖于`std::terminate`(对于C++ 11中默认情况下为 `noexcept` 的情况)来处理异常是可以接受的。 +- 创建一个函数( `done()` 或 `finalize()` ),它将提前完成所有可能导致异常的工作。 如果调用了该函数,则稍后在析构函数中应该没有异常。 +- 过于复杂的任务(例如通过网络发送消息)可以放在单独的方法中,类用户必须在销毁之前调用它们。 +- 如果析构函数中存在异常,则最好记录它而不是隐藏它(如果 logger 可用)。 +- 在简单的应用程序中,依赖于`std::terminate`(对于C++ 11中默认情况下为 `noexcept` 的情况)来处理异常是可以接受的。 **6.** 匿名代码块。 您可以在单个函数内创建单独的代码块,以使某些变量成为局部变量,以便在退出块时调用析构函数。 -```cpp +``` cpp Block block = data.in->read(); { @@ -526,11 +522,11 @@ ready_any.set(); 在离线数据处理程序中: -- 尝试在单个CPU核心上获得最佳性能。 然后,您可以根据需要并行化代码。 +- 尝试在单个CPU核心上获得最佳性能。 然后,您可以根据需要并行化代码。 在服务端应用中: -- 使用线程池来处理请求。 此时,我们还没有任何需要用户空间上下文切换的任务。 +- 使用线程池来处理请求。 此时,我们还没有任何需要用户空间上下文切换的任务。 Fork不用于并行化。 @@ -582,7 +578,7 @@ Fork不用于并行化。 在极少数情况下,您可能需要通过参数返回值。 在这种情况下,参数应该是引用传递的。 -```cpp +``` cpp using AggregateFunctionPtr = std::shared_ptr; /** Allows creating an aggregate function by its name. @@ -614,7 +610,7 @@ public: 如果稍后您需要延迟初始化,则可以添加将创建无效对象的默认构造函数。 或者,对于少量对象,您可以使用 `shared_ptr / unique_ptr`。 -```cpp +``` cpp Loader(DB::Connection * connection_, const std::string & query, size_t max_block_size_); /// For deferred initialization @@ -669,13 +665,13 @@ Loader() {} **24.** 不要使用 `trailing return type` 为必要的功能。 -```cpp +``` cpp [auto f() -> void;]{.strike} ``` **25.** 声明和初始化变量。 -```cpp +``` cpp //right way std::string s = "Hello"; std::string s{"Hello"}; @@ -686,13 +682,13 @@ auto s = std::string{"Hello"}; **26.** 对于虚函数,在基类中编写 `virtual`,但在后代类中写 `override` 而不是`virtual`。 -## 没有用到的 C++ 特性。 +## 没有用到的 C++ 特性。 {#mei-you-yong-dao-de-c-te-xing} **1.** 不使用虚拟继承。 **2.** 不使用 C++03 中的异常标准。 -## 平台 +## 平台 {#ping-tai} **1.** 我们为特定平台编写代码。 @@ -706,7 +702,7 @@ auto s = std::string{"Hello"}; **4.** 操作系统:Linux Ubuntu,不比 Precise 早。 -**5.** 代码是为x86_64 CPU架构编写的。 +**5.** 代码是为x86\_64 CPU架构编写的。 CPU指令集是我们服务器中支持的最小集合。 目前,它是SSE 4.2。 @@ -716,7 +712,7 @@ CPU指令集是我们服务器中支持的最小集合。 目前,它是SSE 4.2 **8.** 使用发布的设置来开发和调试代码。 -## 工具 +## 工具 {#gong-ju} **1.** KDevelop 是一个好的 IDE. 
@@ -744,7 +740,7 @@ CPU指令集是我们服务器中支持的最小集合。 目前,它是SSE 4.2 **10.** 未使用的代码将从 repo 中删除。 -## 库 +## 库 {#ku} **1.** 使用C ++ 14标准库(允许实验性功能),以及 `boost` 和 `Poco` 框架。 @@ -760,7 +756,7 @@ CPU指令集是我们服务器中支持的最小集合。 目前,它是SSE 4.2 **5.** 始终优先考虑已经使用的库。 -## 一般建议 +## 一般建议 {#yi-ban-jian-yi-1} **1.** 尽可能精简代码。 @@ -774,7 +770,7 @@ CPU指令集是我们服务器中支持的最小集合。 目前,它是SSE 4.2 **6.** 鼓励简化代码。 尽可能减小代码的大小。 -## 其他建议 +## 其他建议 {#qi-ta-jian-yi} **1.** 从 `stddef.h` 明确指定 `std ::` 的类型。 @@ -802,32 +798,32 @@ CPU指令集是我们服务器中支持的最小集合。 目前,它是SSE 4.2 允许以下任何包装样式: -```cpp +``` cpp function( T1 x1, T2 x2) ``` -```cpp +``` cpp function( size_t left, size_t right, const & RangesInDataParts ranges, size_t limit) ``` -```cpp +``` cpp function(size_t left, size_t right, const & RangesInDataParts ranges, size_t limit) ``` -```cpp +``` cpp function(size_t left, size_t right, const & RangesInDataParts ranges, size_t limit) ``` -```cpp +``` cpp function( size_t left, size_t right, diff --git a/docs/zh/development/tests.md b/docs/zh/development/tests.md index 787314339c5..0416daf307c 100644 --- a/docs/zh/development/tests.md +++ b/docs/zh/development/tests.md @@ -1,13 +1,12 @@ -# ClickHouse 测试 +# ClickHouse 测试 {#clickhouse-ce-shi} - -## 功能性测试 +## 功能性测试 {#gong-neng-xing-ce-shi} 功能性测试是最简便使用的。绝大部分 ClickHouse 的功能可以通过功能性测试来测试,任何代码的更改都必须通过该测试。 每个功能测试会向正在运行的 ClickHouse服 务器发送一个或多个查询,并将结果与预期结果进行比较。 -测试用例在 `dbms/src/tests/queries` 目录中。这里有两个子目录:`stateless` 和 `stateful`目录。 无状态的测试无需预加载测试数据集 - 通常是在测试运行期间动态创建小量的数据集。有状态测试需要来自 Yandex.Metrica 的预加载测试数据,而不向一般公众提供。 我们倾向于仅使用“无状态”测试并避免添加新的“有状态”测试。 +测试用例在 `dbms/src/tests/queries` 目录中。这里有两个子目录:`stateless` 和 `stateful`目录。 无状态的测试无需预加载测试数据集 - 通常是在测试运行期间动态创建小量的数据集。有状态测试需要来自 Yandex.Metrica 的预加载测试数据,而不向一般公众提供。 我们倾向于仅使用«无状态»测试并避免添加新的«有状态»测试。 每个测试用例可以是两种类型之一:`.sql` 和 `.sh`。`.sql` 测试文件是用于管理`clickhouse-client --multiquery --testmode`的简单SQL脚本。`.sh` 测试文件是一个可以自己运行的脚本。 @@ -23,13 +22,11 @@ 有些测试在名称中标有 `zookeeper`,`shard` 或 `long`。`zookeeper` 用于使用ZooKeeper的测试; `shard` 用于需要服务器监听`127.0.0.*`的测试。`long` 适用于运行时间稍长一秒的测试。 - -## 已知的bug +## 已知的bug {#yi-zhi-de-bug} 如果我们知道一些可以通过功能测试轻松复制的错误,我们将准备好的功能测试放在 `dbms/src/tests/queries/bugs` 目录中。当修复错误时,这些测试将被移动到 `dbms/src/tests/queries/0_stateless` 目录中。 - -## 集成测试 +## 集成测试 {#ji-cheng-ce-shi} 集成测试允许在集群配置中测试 ClickHouse,并与其他服务器(如MySQL,Postgres,MongoDB)进行 ClickHouse 交互。它们可用于模拟网络拆分,数据包丢弃等。这些测试在Docker 下运行,并使用各种软件创建多个容器。 @@ -37,44 +34,37 @@ 请注意,ClickHouse 与第三方驱动程序的集成未经过测试。此外,我们目前还没有与 JDBC 和ODBC 驱动程序进行集成测试。 - -## 单元测试 +## 单元测试 {#dan-yuan-ce-shi} 当您想要测试整个 ClickHouse,而不是单个独立的库或类时,单元测试非常有用。您可以使用`ENABLE_TESTS` CMake 选项启用或禁用测试构建。单元测试(和其他测试程序)位于代码中的`tests` 子目录中。要运行单元测试,请键入 `ninja test`。有些测试使用 `gtest`,但有些只是在测试失败时返回非零状态码。 如果代码已经被功能测试覆盖(并且功能测试通常使用起来要简单得多),则不一定要进行单元测试。 - -## 性能测试 +## 性能测试 {#xing-neng-ce-shi} 性能测试允许测量和比较综合查询中 ClickHouse 的某些独立部分的性能。测试位于`dbms/tests/performance` 目录中。每个测试都由 `.xml` 文件表示,并附有测试用例的描述。使用 `clickhouse performance-test` 工具(嵌入在 `clickhouse` 二进制文件中)运行测试。请参阅 `--help` 以进行调用。 -每个测试在循环中运行一个或多个查询(可能带有参数组合),并具有一些停止条件(如“最大执行速度不会在三秒内更改”)并测量一些有关查询性能的指标(如“最大执行速度”))。某些测试可以包含预加载的测试数据集的前提条件。 +每个测试在循环中运行一个或多个查询(可能带有参数组合),并具有一些停止条件(如«最大执行速度不会在三秒内更改»)并测量一些有关查询性能的指标(如«最大执行速度»))。某些测试可以包含预加载的测试数据集的前提条件。 如果要在某些情况下提高 ClickHouse 的性能,并且如果可以在简单查询上观察到改进,则强烈建议编写性能测试。在测试过程中使用 `perf top` 或其他 perf 工具总是有意义的。 - 性能测试不是基于每个提交运行的。不收集性能测试结果,我们手动比较它们。 - -## 测试工具和脚本 +## 测试工具和脚本 {#ce-shi-gong-ju-he-jiao-ben} `tests`目录中的一些程序不是准备测试,而是测试工具。例如,对于`Lexer`,有一个工具`dbms/src/Parsers/tests/lexer` 标准输出。您可以使用这些工具作为代码示例以及探索和手动测试。 您还可以将一对文件 `.sh` 和 `.reference` 与工具放在一些预定义的输入上运行它 - 然后可以将脚本结果与 `.reference` 文件进行比较。这些测试不是自动化的。 - -## 杂项测试 +## 杂项测试 {#za-xiang-ce-shi} 有一些外部字典的测试位于 
`dbms/tests/external_dictionaries`,机器学习模型在`dbms/tests/external_models`目录。这些测试未更新,必须转移到集成测试。 对于分布式数据的插入,有单独的测试。此测试在单独的服务器上运行 ClickHouse 集群并模拟各种故障情况:网络拆分,数据包丢弃(ClickHouse 节点之间,ClickHouse 和 ZooKeeper之间,ClickHouse 服务器和客户端之间等),进行 `kill -9`,`kill -STOP` 和`kill -CONT` 等操作,类似[Jepsen](https://aphyr.com/tags/Jepsen)。然后,测试检查是否已写入所有已确认的插入,并且所有已拒绝的插入都未写入。 - 在 ClickHouse 开源之前,分布式测试是由单独的团队编写的,但该团队不再使用 ClickHouse,测试是在 Java 中意外编写的。由于这些原因,必须重写分布式测试并将其移至集成测试。 - -## 手动测试 +## 手动测试 {#shou-dong-ce-shi} 当您开发了新的功能,做手动测试也是合理的。可以按照以下步骤来进行: @@ -84,56 +74,47 @@ 当您的系统上已经安装了 ClickHouse 时,您可以构建一个新的 `clickhouse` 二进制文件并替换现有的二进制文件: -``` -sudo service clickhouse-server stop -sudo cp ./clickhouse /usr/bin/ -sudo service clickhouse-server start -``` + sudo service clickhouse-server stop + sudo cp ./clickhouse /usr/bin/ + sudo service clickhouse-server start 您也可以停止 clickhouse-server 并使用相同的配置运行您自己的服务器,日志打印到终端: -``` -sudo service clickhouse-server stop -sudo -u clickhouse /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml -``` + + sudo service clickhouse-server stop + sudo -u clickhouse /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml 使用 gdb 的一个示例: -``` -sudo -u clickhouse gdb --args /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml -``` + + sudo -u clickhouse gdb --args /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml 如果 clickhouse-server 已经运行并且您不想停止它,您可以更改 `config.xml` 中的端口号(或在 `config.d` 目录中的文件中覆盖它们),配置适当的数据路径,然后运行它。 `clickhouse` 二进制文件几乎没有依赖关系,适用于各种 Linux 发行版。要快速地测试服务器上的更改,您可以简单地将新建的 `clickhouse` 二进制文件 `scp` 到其他服务器,然后按照上面的示例运行它。 +## 测试环境 {#ce-shi-huan-jing} -## 测试环境 +在将版本发布为稳定之前,我们将其部署在测试环境中 测试环境是一个处理\[Yandex.Metrica\](https://metrica.yandex.com/)总数据的1/39部分大小的集群。 我们与 Yandex.Metrica 团队公用我们的测试环境。ClickHouse 在现有数据的基础上无需停机即可升级。 我们首先看到数据处理成功而不会实时滞后,复制继续工作,并且 Yandex.Metrica 团队无法看到问题。 首先的检查可以通过以下方式完成: -在将版本发布为稳定之前,我们将其部署在测试环境中 测试环境是一个处理[Yandex.Metrica](https://metrica.yandex.com/)总数据的1/39部分大小的集群。 我们与 Yandex.Metrica 团队公用我们的测试环境。ClickHouse 在现有数据的基础上无需停机即可升级。 我们首先看到数据处理成功而不会实时滞后,复制继续工作,并且 Yandex.Metrica 团队无法看到问题。 首先的检查可以通过以下方式完成: - -``` -SELECT hostName() AS h, any(version()), any(uptime()), max(UTCEventTime), count() FROM remote('example01-01-{1..3}t', merge, hits) WHERE EventDate >= today() - 2 GROUP BY h ORDER BY h; -``` + SELECT hostName() AS h, any(version()), any(uptime()), max(UTCEventTime), count() FROM remote('example01-01-{1..3}t', merge, hits) WHERE EventDate >= today() - 2 GROUP BY h ORDER BY h; 在某些情况下,我们还部署到 Yandex 的合作团队的测试环境:市场,云等。此外,我们还有一些用于开发目的的硬件服务器。 -## 负载测试 +## 负载测试 {#fu-zai-ce-shi} 部署到测试环境后,我们使用生产群集中的查询运行负载测试。 这是手动完成的。 确保在生产集群中开启了 `query_log` 选项。 收集一天或更多的查询日志: -``` -clickhouse-client --query="SELECT DISTINCT query FROM system.query_log WHERE event_date = today() AND query LIKE '%ym:%' AND query NOT LIKE '%system.query_log%' AND type = 2 AND is_initial_query" > queries.tsv -``` + + clickhouse-client --query="SELECT DISTINCT query FROM system.query_log WHERE event_date = today() AND query LIKE '%ym:%' AND query NOT LIKE '%system.query_log%' AND type = 2 AND is_initial_query" > queries.tsv 这是一个复杂的例子。`type = 2` 将过滤成功执行的查询。`query LIKE'%ym:%'` 用于从 Yandex.Metrica 中选择相关查询。`is_initial_query` 是仅选择由客户端发起的查询,而不是由 ClickHouse 本身(作为分布式查询处理的一部分)。 `scp` 这份日志到测试机器,并运行以下操作: -``` -clickhouse benchmark --concurrency 16 < queries.tsv -``` + clickhouse benchmark --concurrency 16 < queries.tsv + (可能你需要指定运行的用户 `--user`) 然后离开它一晚或周末休息一下。 @@ -142,8 +123,7 @@ clickhouse benchmark --concurrency 16 < queries.tsv 由于查询和环境的高度可变性,不会记录精确的查询执行时序并且不进行比较。 - 
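The load-testing workflow described above (collect a day of production queries, copy them to a test machine, replay them) can be strung together into one small script. This is only an illustrative sketch, not part of the official procedure: `perftest-host` is a placeholder for the test machine, and the Metrica-specific `query LIKE '%ym:%'` filter from the example above is omitted.

``` bash
#!/usr/bin/env bash
set -e

# Collect successfully executed, client-initiated queries from today's query_log.
# type = 2 keeps only queries that finished successfully; is_initial_query drops
# internal queries issued by ClickHouse itself during distributed processing.
clickhouse-client --query="SELECT DISTINCT query FROM system.query_log WHERE event_date = today() AND type = 2 AND is_initial_query" > queries.tsv

# Copy the workload to the test machine and replay it with 16 concurrent connections.
scp queries.tsv perftest-host:~/
ssh perftest-host 'clickhouse benchmark --concurrency 16 < queries.tsv'
```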
-## 编译测试 +## 编译测试 {#bian-yi-ce-shi} 构建测试允许检查构建在各种替代配置和某些外部系统上是否被破坏。测试位于`ci`目录。 它们从 Docker,Vagrant 中的源代码运行构建,有时在 Docker 中运行 `qemu-user-static`。这些测试正在开发中,测试运行不是自动化的。 @@ -151,22 +131,20 @@ clickhouse benchmark --concurrency 16 < queries.tsv 通常我们会在 ClickHouse 构建的单个版本上发布并运行所有测试。 但是有一些未经过彻底测试的替代构建版本。 例子: -- 在 FreeBSD 中的构建; -- 在 Debian 中使用系统包中的库进行构建; -- 使用库的共享链接构建; -- 在 AArch64 平台进行构建。 +- 在 FreeBSD 中的构建; +- 在 Debian 中使用系统包中的库进行构建; +- 使用库的共享链接构建; +- 在 AArch64 平台进行构建。 例如,使用系统包构建是不好的做法,因为我们无法保证系统具有的确切版本的软件包。但 Debian 维护者确实需要这样做。出于这个原因,我们至少必须支持这种构建。另一个例子:共享链接是一个常见的麻烦来源,但是对于一些爱好者来说需要它。 虽然我们无法对所有构建版本运行所有测试,但我们想要检查至少不会破坏各种构建变体。为此,我们使用构建测试。 - -## 测试协议兼容性 +## 测试协议兼容性 {#ce-shi-xie-yi-jian-rong-xing} 当我们扩展 ClickHouse 网络协议时,我们手动测试旧的 clickhouse-client 与新的 clickhouse-server 和新的clickhouse-client 一起使用旧的 clickhouse-server (只需从相应的包中运行二进制文件) - -## 来自编译器的提示 +## 来自编译器的提示 {#lai-zi-bian-yi-qi-de-ti-shi} ClickHouse 主要的代码 (位于`dbms`目录中) 使用 `-Wall -Wextra -Werror` 构建,并带有一些其他已启用的警告。 虽然没有为第三方库启用这些选项。 @@ -174,7 +152,7 @@ Clang 有更多有用的警告 - 您可以使用 `-Weverything` 查找它们并 对于生产构建,使用 gcc(它仍然生成比 clang 稍高效的代码)。对于开发来说,clang 通常更方便使用。您可以使用调试模式在自己的机器上构建(以节省笔记本电脑的电量),但请注意,由于更好的控制流程和过程分析,编译器使用 `-O3` 会生成更多警告。 当使用 clang 构建时,使用 `libc++` 而不是 `libstdc++`,并且在使用调试模式构建时,使用调试版本的 `libc++`,它允许在运行时捕获更多错误。 -## Sanitizers +## Sanitizers {#sanitizers} **Address sanitizer**. 我们在每个提交的基础上在 ASan 下运行功能和集成测试。 @@ -196,30 +174,27 @@ Clang 有更多有用的警告 - 您可以使用 `-Weverything` 查找它们并 更多请参阅 `dbms/tests/instructions/sanitizers.txt`。 - -## 模糊测试 +## 模糊测试 {#mo-hu-ce-shi} 我们使用简单的模糊测试来生成随机SQL查询并检查服务器是否正常,使用 Address sanitizer 执行模糊测试。你可以在`00746_sql_fuzzy.pl` 找到它。 测试应连续进行(过夜和更长时间)。 截至2018年12月,我们仍然不使用库代码的孤立模糊测试。 -## 安全审计 +## 安全审计 {#an-quan-shen-ji} Yandex Cloud 部门的人员从安全角度对 ClickHouse 功能进行了一些基本概述。 - -## 静态分析 +## 静态分析 {#jing-tai-fen-xi} 我们偶尔使用静态分析。我们已经评估过 `clang-tidy`, `Coverity`, `cppcheck`, `PVS-Studio`, `tscancode`。您将在 `dbms/tests/instructions/` 目录中找到使用说明。你也可以阅读[俄文文章](https://habr.com/company/yandex/blog/342018/). 如果您使用 `CLion` 作为 IDE,您可以开箱即用一些 `clang-tidy` 检查。 -## 其他强化 +## 其他强化 {#qi-ta-qiang-hua} 默认情况下使用 `FORTIFY_SOURCE`。它几乎没用,但在极少数情况下仍然有意义,我们不会禁用它。 - -## 代码风格 +## 代码风格 {#dai-ma-feng-ge} 代码风格在[这里](https://clickhouse.tech/docs/en/development/style/) 有说明。 @@ -231,24 +206,21 @@ Yandex Cloud 部门的人员从安全角度对 ClickHouse 功能进行了一些 `CLion` 有自己的代码格式化程序,必须调整为我们的代码风格。 - -## Metrica B2B 测试 +## Metrica B2B 测试 {#metrica-b2b-ce-shi} 每个 ClickHouse 版本都经过 Yandex Metrica 和 AppMetrica 引擎的测试。测试和稳定版本的 ClickHouse 部署在虚拟机上,并使用处理输入数据固定样本的度量引擎的小副本运行。 将度量引擎的两个实例的结果一起进行比较 这些测试是由单独的团队自动完成的。由于移动部件的数量很多,大部分时间的测试都是完全无关的,很难弄清楚。很可能这些测试对我们来说是负值。然而,这些测试被证明是有用的大约一个或两个倍的数百。 - -## 测试覆盖率 +## 测试覆盖率 {#ce-shi-fu-gai-lu} 截至2018年7月,我们不会跟踪测试复盖率。 +## 自动化测试 {#zi-dong-hua-ce-shi} -## 自动化测试 +我们使用 Yandex 内部 CI 和名为«沙箱»的作业自动化系统运行测试。 我们还继续使用 Jenkins(可在Yandex内部使用)。 -我们使用 Yandex 内部 CI 和名为"沙箱"的作业自动化系统运行测试。 我们还继续使用 Jenkins(可在Yandex内部使用)。 - -构建作业和测试在沙箱中按每次提交的基础上运行。结果包和测试结果发布在 GitHub 上,可以通过直接链接下载,结果会被永久存储。当您在 GitHub 上发送拉取请求时,我们将其标记为"可以测试",我们的 CI 系统将为您构建 ClickHouse 包(发布,调试,地址消除等)。 +构建作业和测试在沙箱中按每次提交的基础上运行。结果包和测试结果发布在 GitHub 上,可以通过直接链接下载,结果会被永久存储。当您在 GitHub 上发送拉取请求时,我们将其标记为«可以测试»,我们的 CI 系统将为您构建 ClickHouse 包(发布,调试,地址消除等)。 由于时间和计算能力的限制,我们不使用 Travis CI。 diff --git a/docs/zh/faq/general.md b/docs/zh/faq/general.md index f8498cc3156..17f4fe9b11b 100644 --- a/docs/zh/faq/general.md +++ b/docs/zh/faq/general.md @@ -1,6 +1,6 @@ -# 常见问题 +# 常见问题 {#chang-jian-wen-ti} -## 为什么不使用MapReduce之类的产品呢? +## 为什么不使用MapReduce之类的产品呢? 
{#wei-shi-yao-bu-shi-yong-mapreducezhi-lei-de-chan-pin-ni} 我们可以将MapReduce这类的系统称为分布式计算系统,其reduce操作基于分布式排序。其中最常见的开源解决方案是 [Apache Hadoop](http://hadoop.apache.org)。 Yandex使用他们的内部解决方案YT。 @@ -8,15 +8,12 @@ 大多数MapReduce系统允许您在集群上执行任意代码。但是,声明性查询语言更适合OLAP,以便快速运行实验。例如,Hadoop包含Hive和Pig,Cloudera Impala或Shark(过时)for Spark,以及Spark SQL、Presto和Apache Drill。与专业系统相比,运行此类任务时的性能非常不理想,所以将这些系统用作Web接口的后端服务是不现实的,因为延迟相对较高。 - ## What to do if I have a problem with encodings when using Oracle through ODBC? {#oracle-odbc-encodings} -If you use Oracle through ODBC driver as a source of external dictionaries, you need to set up correctly value for the `NLS_LANG` variable in the `/etc/default/clickhouse`. For more details see the [Oracle NLS_LANG FAQ](https://www.oracle.com/technetwork/products/globalization/nls-lang-099431.html). +If you use Oracle through ODBC driver as a source of external dictionaries, you need to set up correctly value for the `NLS_LANG` variable in the `/etc/default/clickhouse`. For more details see the [Oracle NLS\_LANG FAQ](https://www.oracle.com/technetwork/products/globalization/nls-lang-099431.html). **Example** -``` -NLS_LANG=CHINESE_CHINA.ZHS16GBK -``` + NLS_LANG=CHINESE_CHINA.ZHS16GBK [来源文章](https://clickhouse.tech/docs/zh/faq/general/) diff --git a/docs/zh/getting_started/example_datasets/amplab_benchmark.md b/docs/zh/getting_started/example_datasets/amplab_benchmark.md index d0c43a23862..fc78daa6a46 100644 --- a/docs/zh/getting_started/example_datasets/amplab_benchmark.md +++ b/docs/zh/getting_started/example_datasets/amplab_benchmark.md @@ -1,12 +1,12 @@ -# AMPLab 大数据基准测试 +# AMPLab 大数据基准测试 {#amplab-da-shu-ju-ji-zhun-ce-shi} -参考 +参考 https://amplab.cs.berkeley.edu/benchmark/ -需要您在注册一个免费的账号。注册时需要您提供信用卡、邮箱、电话等信息。之后可以在获取新的访问密钥 +需要您在https://aws.amazon.com注册一个免费的账号。注册时需要您提供信用卡、邮箱、电话等信息。之后可以在https://console.aws.amazon.com/iam/home?nc2=h\_m\_sc\#security\_credential获取新的访问密钥 在控制台运行以下命令: -```bash +``` bash $ sudo apt-get install s3cmd $ mkdir tiny; cd tiny; $ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ . @@ -21,7 +21,7 @@ $ cd .. 
在ClickHouse运行如下查询: -```sql +``` sql CREATE TABLE rankings_tiny ( pageURL String, @@ -85,7 +85,7 @@ CREATE TABLE uservisits_5nodes_on_single 回到控制台运行如下命令: -```bash +``` bash $ for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done $ for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done $ for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done @@ -119,5 +119,4 @@ ORDER BY totalRevenue DESC LIMIT 1 ``` - [Original article](https://clickhouse.tech/docs/en/getting_started/example_datasets/amplab_benchmark/) diff --git a/docs/zh/getting_started/example_datasets/criteo.md b/docs/zh/getting_started/example_datasets/criteo.md index 0c78b06ed75..6083566113a 100644 --- a/docs/zh/getting_started/example_datasets/criteo.md +++ b/docs/zh/getting_started/example_datasets/criteo.md @@ -1,16 +1,16 @@ -# Criteo TB级别点击日志 +# Criteo TB级别点击日志 {#criteo-tbji-bie-dian-ji-ri-zhi} -可以从上下载数据 +可以从http://labs.criteo.com/downloads/download-terabyte-click-logs/上下载数据 创建原始数据对应的表结构: -```sql +``` sql CREATE TABLE criteo_log (date Date, clicked UInt8, int1 Int32, int2 Int32, int3 Int32, int4 Int32, int5 Int32, int6 Int32, int7 Int32, int8 Int32, int9 Int32, int10 Int32, int11 Int32, int12 Int32, int13 Int32, cat1 String, cat2 String, cat3 String, cat4 String, cat5 String, cat6 String, cat7 String, cat8 String, cat9 String, cat10 String, cat11 String, cat12 String, cat13 String, cat14 String, cat15 String, cat16 String, cat17 String, cat18 String, cat19 String, cat20 String, cat21 String, cat22 String, cat23 String, cat24 String, cat25 String, cat26 String) ENGINE = Log ``` 下载数据: -```bash +``` bash $ for i in {00..23}; do echo $i; zcat datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done ``` @@ -65,11 +65,10 @@ CREATE TABLE criteo 将第一张表中的原始数据转化写入到第二张表中去: -```sql +``` sql INSERT INTO criteo SELECT date, clicked, int1, int2, int3, int4, int5, int6, int7, int8, int9, int10, int11, int12, int13, reinterpretAsUInt32(unhex(cat1)) AS icat1, reinterpretAsUInt32(unhex(cat2)) AS icat2, reinterpretAsUInt32(unhex(cat3)) AS icat3, reinterpretAsUInt32(unhex(cat4)) AS icat4, reinterpretAsUInt32(unhex(cat5)) AS icat5, reinterpretAsUInt32(unhex(cat6)) AS icat6, reinterpretAsUInt32(unhex(cat7)) AS icat7, reinterpretAsUInt32(unhex(cat8)) AS icat8, reinterpretAsUInt32(unhex(cat9)) AS icat9, reinterpretAsUInt32(unhex(cat10)) AS icat10, reinterpretAsUInt32(unhex(cat11)) AS icat11, reinterpretAsUInt32(unhex(cat12)) AS icat12, reinterpretAsUInt32(unhex(cat13)) AS icat13, reinterpretAsUInt32(unhex(cat14)) AS icat14, reinterpretAsUInt32(unhex(cat15)) AS icat15, reinterpretAsUInt32(unhex(cat16)) AS icat16, reinterpretAsUInt32(unhex(cat17)) AS icat17, reinterpretAsUInt32(unhex(cat18)) AS icat18, reinterpretAsUInt32(unhex(cat19)) AS icat19, reinterpretAsUInt32(unhex(cat20)) AS icat20, reinterpretAsUInt32(unhex(cat21)) AS icat21, reinterpretAsUInt32(unhex(cat22)) AS icat22, reinterpretAsUInt32(unhex(cat23)) AS icat23, reinterpretAsUInt32(unhex(cat24)) AS icat24, reinterpretAsUInt32(unhex(cat25)) AS icat25, reinterpretAsUInt32(unhex(cat26)) AS icat26 FROM criteo_log; DROP TABLE 
criteo_log; ``` - [Original article](https://clickhouse.tech/docs/en/getting_started/example_datasets/criteo/) diff --git a/docs/zh/getting_started/example_datasets/nyc_taxi.md b/docs/zh/getting_started/example_datasets/nyc_taxi.md index 9f7625391cc..50dcbed0988 100644 --- a/docs/zh/getting_started/example_datasets/nyc_taxi.md +++ b/docs/zh/getting_started/example_datasets/nyc_taxi.md @@ -1,20 +1,20 @@ -# 纽约市出租车数据 +# 纽约市出租车数据 {#niu-yue-shi-chu-zu-che-shu-ju} 纽约市出租车数据有以下两个方式获取: 从原始数据导入 下载预处理好的分区数据 -## 怎样导入原始数据 +## 怎样导入原始数据 {#zen-yang-dao-ru-yuan-shi-shu-ju} -可以参考中的关于数据集结构描述与数据下载指令说明。 +可以参考https://github.com/toddwschneider/nyc-taxi-data和http://tech.marksblogg.com/billion-nyc-taxi-rides-redshift.html中的关于数据集结构描述与数据下载指令说明。 数据集包含227GB的CSV文件。这大约需要一个小时的下载时间(1Gbit带宽下,并行下载大概是一半时间)。 下载时注意损坏的文件。可以检查文件大小并重新下载损坏的文件。 有些文件中包含一些无效的行,您可以使用如下语句修复他们: -```bash +``` bash sed -E '/(.*,){18,}/d' data/yellow_tripdata_2010-02.csv > data/yellow_tripdata_2010-02.csv_ sed -E '/(.*,){18,}/d' data/yellow_tripdata_2010-03.csv > data/yellow_tripdata_2010-03.csv_ mv data/yellow_tripdata_2010-02.csv_ data/yellow_tripdata_2010-02.csv @@ -29,7 +29,7 @@ mv data/yellow_tripdata_2010-03.csv_ data/yellow_tripdata_2010-03.csv 您可以按如下方式检查下载的行数: -```bash +``` bash $ time psql nyc-taxi-data -c "SELECT count(*) FROM trips;" ## Count 1298979494 @@ -44,7 +44,7 @@ PostgreSQL处理这些数据大概需要370GB的磁盘空间。 从PostgreSQL中导出数据: -```sql +``` sql COPY ( SELECT trips.id, @@ -119,7 +119,7 @@ COPY 在ClickHouse中创建临时表: -```sql +``` sql CREATE TABLE trips ( trip_id UInt32, @@ -178,7 +178,7 @@ dropoff_puma Nullable(String) 接下来,需要将字段转换为更正确的数据类型,并且在可能的情况下,消除NULL。 -```bash +``` bash $ time clickhouse-client --query="INSERT INTO trips FORMAT TabSeparated" < trips.tsv real 75m56.214s @@ -190,13 +190,13 @@ real 75m56.214s (也可以直接使用`COPY ... TO PROGRAM`从Postgres中导入数据) -由于数据中与天气相关的所有数据(precipitation......average_wind_speed)都填充了NULL。 所以,我们将从最终数据集中删除它们 +由于数据中与天气相关的所有数据(precipitation……average\_wind\_speed)都填充了NULL。 所以,我们将从最终数据集中删除它们 首先,我们使用单台服务器创建表,后面我们将在多台节点上创建这些表。 创建表结构并写入数据: -```sql +``` sql CREATE TABLE trips_mergetree ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) AS SELECT @@ -259,14 +259,15 @@ FROM trips ``` 这需要3030秒,速度约为每秒428,000行。 -要加快速度,可以使用`Log`引擎替换'MergeTree`引擎来创建表。 在这种情况下,下载速度超过200秒。 +要加快速度,可以使用`Log`引擎替换’MergeTree\`引擎来创建表。 在这种情况下,下载速度超过200秒。 这个表需要使用126GB的磁盘空间。 -```sql +``` sql SELECT formatReadableSize(sum(bytes)) FROM system.parts WHERE table = 'trips_mergetree' AND active ``` -```text + +``` text ┌─formatReadableSize(sum(bytes))─┐ │ 126.18 GiB │ └────────────────────────────────┘ @@ -274,9 +275,9 @@ SELECT formatReadableSize(sum(bytes)) FROM system.parts WHERE table = 'trips_mer 除此之外,你还可以在MergeTree上运行OPTIMIZE查询来进行优化。但这不是必须的,因为即使在没有进行优化的情况下它的表现依然是很好的。 -## 下载预处理好的分区数据 +## 下载预处理好的分区数据 {#xia-zai-yu-chu-li-hao-de-fen-qu-shu-ju} -```bash +``` bash $ curl -O https://clickhouse-datasets.s3.yandex.net/trips_mergetree/partitions/trips_mergetree.tar $ tar xvf trips_mergetree.tar -C /var/lib/clickhouse # path to ClickHouse data directory $ # check permissions of unpacked data, fix if required @@ -284,16 +285,15 @@ $ sudo service clickhouse-server restart $ clickhouse-client --query "select count(*) from datasets.trips_mergetree" ``` -!!!info - 如果要运行下面的SQL查询,必须使用完整的表名, - `datasets.trips_mergetree`。 +!!! 
info "Info" + 如果要运行下面的SQL查询,必须使用完整的表名, +`datasets.trips_mergetree`。 - -## 单台服务器运行结果 +## 单台服务器运行结果 {#dan-tai-fu-wu-qi-yun-xing-jie-guo} Q1: -```sql +``` sql SELECT cab_type, count(*) FROM trips_mergetree GROUP BY cab_type ``` @@ -301,7 +301,7 @@ SELECT cab_type, count(*) FROM trips_mergetree GROUP BY cab_type Q2: -```sql +``` sql SELECT passenger_count, avg(total_amount) FROM trips_mergetree GROUP BY passenger_count ``` @@ -309,7 +309,7 @@ SELECT passenger_count, avg(total_amount) FROM trips_mergetree GROUP BY passenge Q3: -```sql +``` sql SELECT passenger_count, toYear(pickup_date) AS year, count(*) FROM trips_mergetree GROUP BY passenger_count, year ``` @@ -317,7 +317,7 @@ SELECT passenger_count, toYear(pickup_date) AS year, count(*) FROM trips_mergetr Q4: -```sql +``` sql SELECT passenger_count, toYear(pickup_date) AS year, round(trip_distance) AS distance, count(*) FROM trips_mergetree GROUP BY passenger_count, year, distance @@ -336,19 +336,19 @@ Two Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz, 16 physical kernels total,128 GiB 在每台服务器中运行: -```sql +``` sql CREATE TABLE default.trips_mergetree_third ( trip_id UInt32, vendor_id Enum8('1' = 1, '2' = 2, 'CMT' = 3, 'VTS' = 4, 'DDS' = 5, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14), pickup_date Date, pickup_datetime DateTime, dropoff_date Date, dropoff_datetime DateTime, store_and_fwd_flag UInt8, rate_code_id UInt8, pickup_longitude Float64, pickup_latitude Float64, dropoff_longitude Float64, dropoff_latitude Float64, passenger_count UInt8, trip_distance Float64, fare_amount Float32, extra Float32, mta_tax Float32, tip_amount Float32, tolls_amount Float32, ehail_fee Float32, improvement_surcharge Float32, total_amount Float32, payment_type_ Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4), trip_type UInt8, pickup FixedString(25), dropoff FixedString(25), cab_type Enum8('yellow' = 1, 'green' = 2, 'uber' = 3), pickup_nyct2010_gid UInt8, pickup_ctlabel Float32, pickup_borocode UInt8, pickup_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), pickup_ct2010 FixedString(6), pickup_boroct2010 FixedString(7), pickup_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), pickup_ntacode FixedString(4), pickup_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East 
Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. 
Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), pickup_puma UInt16, dropoff_nyct2010_gid UInt8, dropoff_ctlabel Float32, dropoff_borocode UInt8, dropoff_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), dropoff_ct2010 FixedString(6), dropoff_boroct2010 FixedString(7), dropoff_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), dropoff_ntacode FixedString(4), dropoff_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), dropoff_puma UInt16) ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) ``` 在之前的服务器中运行: -```sql +``` sql CREATE TABLE trips_mergetree_x3 AS trips_mergetree_third ENGINE = Distributed(perftest, default, trips_mergetree_third, rand()) ``` 运行如下查询重新分布数据: -```sql +``` sql INSERT INTO trips_mergetree_x3 SELECT * FROM trips_mergetree ``` @@ -373,13 +373,12 @@ Q4: 0.072 sec. 在这种情况下,查询处理时间首先由网络延迟确定。 我们使用位于芬兰的Yandex数据中心中的客户端去位于俄罗斯的集群上运行查询,这增加了大约20毫秒的延迟。 -## 总结 - -| servers | Q1 | Q2 | Q3 | Q4 | -| ------- | ----- | ----- | ----- | ----- | -| 1 | 0.490 | 1.224 | 2.104 | 3.593 | -| 3 | 0.212 | 0.438 | 0.733 | 1.241 | -| 140 | 0.028 | 0.043 | 0.051 | 0.072 | +## 总结 {#zong-jie} +| servers | Q1 | Q2 | Q3 | Q4 | +|---------|-------|-------|-------|-------| +| 1 | 0.490 | 1.224 | 2.104 | 3.593 | +| 3 | 0.212 | 0.438 | 0.733 | 1.241 | +| 140 | 0.028 | 0.043 | 0.051 | 0.072 | [Original article](https://clickhouse.tech/docs/en/getting_started/example_datasets/nyc_taxi/) diff --git a/docs/zh/getting_started/example_datasets/ontime.md b/docs/zh/getting_started/example_datasets/ontime.md index 11994bfa97b..6db294b12fb 100644 --- a/docs/zh/getting_started/example_datasets/ontime.md +++ b/docs/zh/getting_started/example_datasets/ontime.md @@ -1,16 +1,15 @@ - -# 航班飞行数据 +# 航班飞行数据 {#hang-ban-fei-xing-shu-ju} 航班飞行数据有以下两个方式获取: -- 从原始数据导入 -- 下载预处理好的分区数据 +- 从原始数据导入 +- 下载预处理好的分区数据 -## 从原始数据导入 +## 从原始数据导入 {#cong-yuan-shi-shu-ju-dao-ru} 下载数据: -```bash +``` bash for s in `seq 1987 2018` do for m in `seq 1 12` @@ -20,11 +19,11 @@ done done ``` -(引用 ) +(引用 https://github.com/Percona-Lab/ontime-airline-performance/blob/master/download.sh ) 创建表结构: -```sql +``` sql CREATE TABLE `ontime` ( `Year` UInt16, `Quarter` UInt8, @@ -135,36 +134,37 @@ CREATE TABLE `ontime` ( `Div5LongestGTime` String, `Div5WheelsOff` String, `Div5TailNum` String -) ENGINE = MergeTree -PARTITION BY Year -ORDER BY (Carrier, FlightDate) +) ENGINE = MergeTree +PARTITION BY Year +ORDER BY (Carrier, FlightDate) SETTINGS index_granularity = 8192; ``` 加载数据: -```bash +``` bash $ for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done ``` -## 下载预处理好的分区数据 +## 下载预处理好的分区数据 {#xia-zai-yu-chu-li-hao-de-fen-qu-shu-ju} -```bash +``` bash $ curl -O https://clickhouse-datasets.s3.yandex.net/ontime/partitions/ontime.tar $ tar xvf ontime.tar -C /var/lib/clickhouse # path to ClickHouse data directory $ # check permissions of unpacked data, fix if required $ sudo service clickhouse-server restart $ clickhouse-client --query "select count(*) from datasets.ontime" ``` -!!!info - 如果要运行下面的SQL查询,必须使用完整的表名, - `datasets.ontime`。 -## 查询: +!!! info "Info" + 如果要运行下面的SQL查询,必须使用完整的表名, +`datasets.ontime`。 + +## 查询: {#cha-xun} Q0. -```sql +``` sql SELECT avg(c1) FROM ( @@ -176,7 +176,7 @@ FROM Q1. 查询从2000年到2008年每天的航班数 -```sql +``` sql SELECT DayOfWeek, count(*) AS c FROM ontime WHERE Year>=2000 AND Year<=2008 @@ -186,7 +186,7 @@ ORDER BY c DESC; Q2. 
查询从2000年到2008年每周延误超过10分钟的航班数。 -```sql +``` sql SELECT DayOfWeek, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year>=2000 AND Year<=2008 @@ -196,7 +196,7 @@ ORDER BY c DESC; Q3. 查询2000年到2008年每个机场延误超过10分钟以上的次数 -```sql +``` sql SELECT Origin, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year>=2000 AND Year<=2008 @@ -207,7 +207,7 @@ LIMIT 10; Q4. 查询2007年各航空公司延误超过10分钟以上的次数 -```sql +``` sql SELECT Carrier, count(*) FROM ontime WHERE DepDelay>10 AND Year=2007 @@ -217,7 +217,7 @@ ORDER BY count(*) DESC; Q5. 查询2007年各航空公司延误超过10分钟以上的百分比 -```sql +``` sql SELECT Carrier, c, c2, c*100/c2 as c3 FROM ( @@ -243,7 +243,7 @@ ORDER BY c3 DESC; 更好的查询版本: -```sql +``` sql SELECT Carrier, avg(DepDelay>10)*100 AS c3 FROM ontime WHERE Year=2007 @@ -253,7 +253,7 @@ ORDER BY c3 DESC Q6. 同上一个查询一致,只是查询范围扩大到2000年到2008年 -```sql +``` sql SELECT Carrier, c, c2, c*100/c2 as c3 FROM ( @@ -279,7 +279,7 @@ ORDER BY c3 DESC; 更好的查询版本: -```sql +``` sql SELECT Carrier, avg(DepDelay>10)*100 AS c3 FROM ontime WHERE Year>=2000 AND Year<=2008 @@ -289,7 +289,7 @@ ORDER BY c3 DESC; Q7. 每年航班延误超过10分钟的百分比 -```sql +``` sql SELECT Year, c1/c2 FROM ( @@ -313,7 +313,7 @@ ORDER BY Year; 更好的查询版本: -```sql +``` sql SELECT Year, avg(DepDelay>10)*100 FROM ontime GROUP BY Year @@ -322,7 +322,7 @@ ORDER BY Year; Q8. 每年更受人们喜爱的目的地 -```sql +``` sql SELECT DestCityName, uniqExact(OriginCityName) AS u FROM ontime WHERE Year >= 2000 and Year <= 2010 @@ -332,7 +332,7 @@ ORDER BY u DESC LIMIT 10; Q9. -```sql +``` sql SELECT Year, count(*) AS c1 FROM ontime GROUP BY Year; @@ -340,7 +340,7 @@ GROUP BY Year; Q10. -```sql +``` sql SELECT min(Year), max(Year), Carrier, count(*) AS cnt, sum(ArrDelayMinutes>30) AS flights_delayed, @@ -358,7 +358,7 @@ LIMIT 1000; Bonus: -```sql +``` sql SELECT avg(cnt) FROM ( @@ -396,9 +396,9 @@ LIMIT 10; 这个性能测试由Vadim Tkachenko提供。参考: -- -- -- -- -- -- +- https://www.percona.com/blog/2009/10/02/analyzing-air-traffic-performance-with-infobright-and-monetdb/ +- https://www.percona.com/blog/2009/10/26/air-traffic-queries-in-luciddb/ +- https://www.percona.com/blog/2009/11/02/air-traffic-queries-in-infinidb-early-alpha/ +- https://www.percona.com/blog/2014/04/21/using-apache-hadoop-and-impala-together-with-mysql-for-data-analysis/ +- https://www.percona.com/blog/2016/01/07/apache-spark-with-air-ontime-performance-data/ +- http://nickmakos.blogspot.ru/2012/08/analyzing-air-traffic-performance-with.html diff --git a/docs/zh/getting_started/example_datasets/star_schema.md b/docs/zh/getting_started/example_datasets/star_schema.md index 1bd83283163..4680fe652b2 100644 --- a/docs/zh/getting_started/example_datasets/star_schema.md +++ b/docs/zh/getting_started/example_datasets/star_schema.md @@ -1,8 +1,8 @@ -# Star Schema Benchmark +# Star Schema Benchmark {#star-schema-benchmark} 编译 dbgen: -```bash +``` bash $ git clone git@github.com:vadimtk/ssb-dbgen.git $ cd ssb-dbgen $ make @@ -10,7 +10,7 @@ $ make 开始生成数据: -```bash +``` bash $ ./dbgen -s 1000 -T c $ ./dbgen -s 1000 -T l $ ./dbgen -s 1000 -T p @@ -20,7 +20,7 @@ $ ./dbgen -s 1000 -T d 在ClickHouse中创建表结构: -```sql +``` sql CREATE TABLE customer ( C_CUSTKEY UInt32, @@ -85,16 +85,16 @@ ENGINE = MergeTree ORDER BY S_SUPPKEY; 写入数据: -```bash +``` bash $ clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl $ clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl $ clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl $ clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl ``` -将“星型模型”转换为非规范化的“平面模型”: 
+将«星型模型»转换为非规范化的«平面模型»: -```sql +``` sql SET max_memory_usage = 20000000000, allow_experimental_multiple_joins_emulation = 1; CREATE TABLE lineorder_flat @@ -113,55 +113,80 @@ ALTER TABLE lineorder_flat DROP COLUMN C_CUSTKEY, DROP COLUMN S_SUPPKEY, DROP CO Running the queries: Q1.1 -```sql + +``` sql SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toYear(LO_ORDERDATE) = 1993 AND LO_DISCOUNT BETWEEN 1 AND 3 AND LO_QUANTITY < 25; ``` + Q1.2 -```sql + +``` sql SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toYYYYMM(LO_ORDERDATE) = 199401 AND LO_DISCOUNT BETWEEN 4 AND 6 AND LO_QUANTITY BETWEEN 26 AND 35; ``` + Q1.3 -```sql + +``` sql SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toISOWeek(LO_ORDERDATE) = 6 AND toYear(LO_ORDERDATE) = 1994 AND LO_DISCOUNT BETWEEN 5 AND 7 AND LO_QUANTITY BETWEEN 26 AND 35; ``` + Q2.1 -```sql + +``` sql SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_CATEGORY = 'MFGR#12' AND S_REGION = 'AMERICA' GROUP BY year, P_BRAND ORDER BY year, P_BRAND; ``` + Q2.2 -```sql + +``` sql SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_BRAND BETWEEN 'MFGR#2221' AND 'MFGR#2228' AND S_REGION = 'ASIA' GROUP BY year, P_BRAND ORDER BY year, P_BRAND; ``` + Q2.3 -```sql + +``` sql SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_BRAND = 'MFGR#2239' AND S_REGION = 'EUROPE' GROUP BY year, P_BRAND ORDER BY year, P_BRAND; ``` + Q3.1 -```sql + +``` sql SELECT C_NATION, S_NATION, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE C_REGION = 'ASIA' AND S_REGION = 'ASIA' AND year >= 1992 AND year <= 1997 GROUP BY C_NATION, S_NATION, year ORDER BY year asc, revenue desc; ``` + Q3.2 -```sql + +``` sql SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE C_NATION = 'UNITED STATES' AND S_NATION = 'UNITED STATES' AND year >= 1992 AND year <= 1997 GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc; ``` + Q3.3 -```sql + +``` sql SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND year >= 1992 AND year <= 1997 GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc; ``` + Q3.4 -```sql + +``` sql SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND toYYYYMM(LO_ORDERDATE) = '199712' GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc; ``` + Q4.1 -```sql + +``` sql SELECT toYear(LO_ORDERDATE) AS year, C_NATION, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') GROUP BY year, C_NATION ORDER BY year, C_NATION; ``` + Q4.2 -```sql + +``` sql SELECT toYear(LO_ORDERDATE) AS year, S_NATION, P_CATEGORY, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (year = 1997 OR year = 1998) AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') GROUP BY year, S_NATION, P_CATEGORY ORDER BY year, S_NATION, P_CATEGORY; ``` + Q4.3 -```sql + +``` sql SELECT toYear(LO_ORDERDATE) AS year, S_CITY, P_BRAND, sum(LO_REVENUE - 
LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE S_NATION = 'UNITED STATES' AND (year = 1997 OR year = 1998) AND P_CATEGORY = 'MFGR#14' GROUP BY year, S_CITY, P_BRAND ORDER BY year, S_CITY, P_BRAND; ``` diff --git a/docs/zh/getting_started/example_datasets/wikistat.md b/docs/zh/getting_started/example_datasets/wikistat.md index 218f8fdb5b0..aacbdbf37f0 100644 --- a/docs/zh/getting_started/example_datasets/wikistat.md +++ b/docs/zh/getting_started/example_datasets/wikistat.md @@ -1,10 +1,10 @@ -# 维基访问数据 +# 维基访问数据 {#wei-ji-fang-wen-shu-ju} -参考: +参考: http://dumps.wikimedia.org/other/pagecounts-raw/ 创建表结构: -```sql +``` sql CREATE TABLE wikistat ( date Date, @@ -19,11 +19,10 @@ CREATE TABLE wikistat 加载数据: -```bash +``` bash $ for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt $ cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done $ ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done ``` - [Original article](https://clickhouse.tech/docs/en/getting_started/example_datasets/wikistat/) diff --git a/docs/zh/getting_started/index.md b/docs/zh/getting_started/index.md index e8edf392cbd..d6830aa6c84 100644 --- a/docs/zh/getting_started/index.md +++ b/docs/zh/getting_started/index.md @@ -1,10 +1,10 @@ -# 入门 +# 入门 {#ru-men} 如果您是ClickHouse的新手,并希望亲身体验它的性能,首先您需要通过 [安装过程](install.md). 
之后,您可以选择以下选项之一: -* [通过详细的教程](tutorial.md) -* [试验示例数据集](example_datasets/ontime.md) +- [通过详细的教程](tutorial.md) +- [试验示例数据集](example_datasets/ontime.md) [来源文章](https://clickhouse.tech/docs/zh/getting_started/) diff --git a/docs/zh/getting_started/install.md b/docs/zh/getting_started/install.md index 2017f41e666..bf0ace6824f 100644 --- a/docs/zh/getting_started/install.md +++ b/docs/zh/getting_started/install.md @@ -1,8 +1,8 @@ -## 系统要求 +## 系统要求 {#xi-tong-yao-qiu} ClickHouse可以在任何具有x86\_64,AArch64或PowerPC64LE CPU架构的Linux,FreeBSD或Mac OS X上运行。 -虽然预构建的二进制文件通常是为x86 \ _64编译并利用SSE 4.2指令集,但除非另有说明,否则使用支持它的CPU将成为额外的系统要求。这是检查当前CPU是否支持SSE 4.2的命令: +虽然预构建的二进制文件通常是为x86  \_64编译并利用SSE 4.2指令集,但除非另有说明,否则使用支持它的CPU将成为额外的系统要求。这是检查当前CPU是否支持SSE 4.2的命令: ``` bash $ grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported" @@ -10,41 +10,41 @@ $ grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not 要在不支持SSE 4.2或具有AArch64或PowerPC64LE体系结构的处理器上运行ClickHouse,您应该[通过源构建ClickHouse](#from-sources)进行适当的配置调整。 -##可用的安装选项 +## 可用的安装选项 {#ke-yong-de-an-zhuang-xuan-xiang} -### 为Debian/Ubuntu安装 {#from-deb-packages} +\#\#\# 为Debian/Ubuntu安装 {\#from-deb-packages} 在`/etc/apt/sources.list` (或创建`/etc/apt/sources.list.d/clickhouse.list`文件)中添加仓库: -```text +``` text deb http://repo.yandex.ru/clickhouse/deb/stable/ main/ ``` -如果你想使用最新的测试版本,请使用'testing'替换'stable'。 +如果你想使用最新的测试版本,请使用’testing’替换’stable’。 然后运行: -```bash +``` bash sudo apt-get install dirmngr # optional sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 # optional sudo apt-get update sudo apt-get install clickhouse-client clickhouse-server ``` -你也可以从这里手动下载安装包:。 +你也可以从这里手动下载安装包:https://repo.yandex.ru/clickhouse/deb/stable/main/。 -ClickHouse包含访问控制配置,它们位于`users.xml`文件中(与'config.xml'同目录)。 +ClickHouse包含访问控制配置,它们位于`users.xml`文件中(与’config.xml’同目录)。 默认情况下,允许从任何地方使用默认的‘default’用户无密码的访问ClickHouse。参考‘user/default/networks’。 -有关更多信息,请参考"Configuration files"部分。 +有关更多信息,请参考«Configuration files»部分。 -###来自RPM包 {#from-rpm-packages} +### 来自RPM包 {#from-rpm-packages} Yandex ClickHouse团队建议使用官方预编译的`rpm`软件包,用于CentOS,RedHat和所有其他基于rpm的Linux发行版。 首先,您需要添加官方存储库: -```bash +``` bash sudo yum install yum-utils sudo rpm --import https://repo.yandex.ru/clickhouse/CLICKHOUSE-KEY.GPG sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/x86_64 @@ -54,13 +54,13 @@ sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/ 然后运行这些命令以实际安装包: -```bash +``` bash sudo yum install clickhouse-server clickhouse-client ``` -您也可以从此处手动下载和安装软件包:。 +您也可以从此处手动下载和安装软件包:https://repo.yandex.ru/clickhouse/rpm/stable/x86\_64。 -###来自Docker {#from-docker-image} +### 来自Docker {#from-docker-image} 要在Docker中运行ClickHouse,请遵循[Docker Hub](https://hub.docker.com/r/yandex/clickhouse-server/)上的指南。那些图像使用官方的`deb`包。 @@ -71,14 +71,14 @@ sudo yum install clickhouse-server clickhouse-client 你可以编译并安装它们。 你也可以直接使用而不进行安装。 -```text +``` text Client: dbms/programs/clickhouse-client Server: dbms/programs/clickhouse-server ``` 在服务器中为数据创建如下目录: -```text +``` text /opt/clickhouse/data/default/ /opt/clickhouse/metadata/default/ ``` @@ -88,11 +88,11 @@ Server: dbms/programs/clickhouse-server 日志的路径可以在server config (src/dbms/programs/server/config.xml)中配置。 -## 启动 +## 启动 {#qi-dong} 可以运行如下命令在后台启动服务: -```bash +``` bash sudo service clickhouse-server start ``` @@ -102,31 +102,31 @@ sudo service clickhouse-server start 你也可以在控制台中直接启动服务: -```bash +``` bash clickhouse-server --config-file=/etc/clickhouse-server/config.xml ``` 在这种情况下,日志将被打印到控制台中,这在开发过程中很方便。 
-如果配置文件在当前目录中,你可以不指定‘--config-file’参数。它默认使用‘./config.xml’。 +如果配置文件在当前目录中,你可以不指定‘–config-file’参数。它默认使用‘./config.xml’。 你可以使用命令行客户端连接到服务: -```bash +``` bash clickhouse-client ``` 默认情况下它使用‘default’用户无密码的与localhost:9000服务建立连接。 客户端也可以用于连接远程服务,例如: -```bash +``` bash clickhouse-client --host=example.com ``` -有关更多信息,请参考"Command-line client"部分。 +有关更多信息,请参考«Command-line client»部分。 检查系统是否工作: -```bash +``` bash milovidov@hostname:~/work/metrica/src/dbms/src/Client$ ./clickhouse-client ClickHouse client version 0.0.18749. Connecting to localhost:9000. @@ -149,5 +149,4 @@ SELECT 1 为了继续进行实验,你可以尝试下载测试数据集。 - [Original article](https://clickhouse.tech/docs/en/getting_started/install/) diff --git a/docs/zh/images/logo.svg b/docs/zh/images/logo.svg index 865b96d98c7..b5ab923ff65 100644 --- a/docs/zh/images/logo.svg +++ b/docs/zh/images/logo.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/docs/zh/index.md b/docs/zh/index.md index cc48a721a86..b10fafebe93 100644 --- a/docs/zh/index.md +++ b/docs/zh/index.md @@ -1,15 +1,15 @@ -# 什么是ClickHouse? +# 什么是ClickHouse? {#shi-yao-shi-clickhouse} ClickHouse是一个用于联机分析(OLAP)的列式数据库管理系统(DBMS)。 在传统的行式数据库系统中,数据按如下顺序存储: | Row | WatchID | JavaEnable | Title | GoodEvent | EventTime | -| --- | ----------- | ---------- | ------------------ | --------- | ------------------- | -| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | -| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | -| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | -| #N | ... | ... | ... | ... | ... | +|-----|-------------|------------|--------------------|-----------|---------------------| +| \#0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | +| \#1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | +| \#2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | +| \#N | … | … | … | … | … | 处于同一行中的数据总是被物理的存储在一起。 @@ -18,15 +18,13 @@ ClickHouse是一个用于联机分析(OLAP)的列式数据库管理系统(DBMS) 在列式数据库系统中,数据按如下的顺序存储: - -| Row: | #0 | #1 | #2 | #N | -| ----------- | ------------------- | ------------------- | ------------------- | ------------------- | -| WatchID: | 89354350662 | 90329509958 | 89953706054 | ... | -| JavaEnable: | 1 | 0 | 1 | ... | -| Title: | Investor Relations | Contact us | Mission | ... | -| GoodEvent: | 1 | 1 | 1 | ... | -| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | ... 
| - +| Row: | \#0 | \#1 | \#2 | \#N | +|-------------|---------------------|---------------------|---------------------|-----| +| WatchID: | 89354350662 | 90329509958 | 89953706054 | … | +| JavaEnable: | 1 | 0 | 1 | … | +| Title: | Investor Relations | Contact us | Mission | … | +| GoodEvent: | 1 | 1 | 1 | … | +| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | … | 该示例中只展示了数据在列式数据库中数据的排列顺序。 对于存储而言,列式数据库总是将同一列的数据存储在一起,不同列的数据也总是分开存储。 @@ -38,25 +36,25 @@ ClickHouse是一个用于联机分析(OLAP)的列式数据库管理系统(DBMS) 系统负载越高,根据使用场景进行定制化就越重要,并且定制将会变的越精细。没有一个系统同样适用于明显不同的场景。如果系统适用于广泛的场景,在负载高的情况下,所有的场景可以会被公平但低效处理,或者高效处理一小部分场景。 -## OLAP场景的关键特征 +## OLAP场景的关键特征 {#olapchang-jing-de-guan-jian-te-zheng} -- 大多数是读请求 -- 数据总是以相当大的批(> 1000 rows)进行写入 -- 不修改已添加的数据 -- 每次查询都从数据库中读取大量的行,但是同时又仅需要少量的列 -- 宽表,即每个表包含着大量的列 -- 较少的查询(通常每台服务器每秒数百个查询或更少) -- 对于简单查询,允许延迟大约50毫秒 -- 列中的数据相对较小: 数字和短字符串(例如,每个URL 60个字节) -- 处理单个查询时需要高吞吐量(每个服务器每秒高达数十亿行) -- 事务不是必须的 -- 对数据一致性要求低 -- 每一个查询除了一个大表外都很小 -- 查询结果明显小于源数据,换句话说,数据被过滤或聚合后能够被盛放在单台服务器的内存中 +- 大多数是读请求 +- 数据总是以相当大的批(\> 1000 rows)进行写入 +- 不修改已添加的数据 +- 每次查询都从数据库中读取大量的行,但是同时又仅需要少量的列 +- 宽表,即每个表包含着大量的列 +- 较少的查询(通常每台服务器每秒数百个查询或更少) +- 对于简单查询,允许延迟大约50毫秒 +- 列中的数据相对较小: 数字和短字符串(例如,每个URL 60个字节) +- 处理单个查询时需要高吞吐量(每个服务器每秒高达数十亿行) +- 事务不是必须的 +- 对数据一致性要求低 +- 每一个查询除了一个大表外都很小 +- 查询结果明显小于源数据,换句话说,数据被过滤或聚合后能够被盛放在单台服务器的内存中 很容易可以看出,OLAP场景与其他流行场景(例如,OLTP或K/V)有很大的不同, 因此想要使用OLTP或Key-Value数据库去高效的处理分析查询是没有意义的,例如,使用OLAP数据库去处理分析请求通常要优于使用MongoDB或Redis去处理分析请求。 -## 列式数据库更适合OLAP场景的原因 +## 列式数据库更适合OLAP场景的原因 {#lie-shi-shu-ju-ku-geng-gua-he-olapchang-jing-de-yuan-yin} 列式数据库更适合于OLAP场景(对于大多数查询而言,处理速度至少提高了100倍),下面详细解释了原因(通过图片更有利于直观理解): @@ -70,69 +68,71 @@ ClickHouse是一个用于联机分析(OLAP)的列式数据库管理系统(DBMS) 看到差别了么?下面将详细介绍为什么会发生这种情况。 -### Input/output +### Input/output {#inputoutput} -1. 针对分析类查询,通常只需要读取表的一小部分列。在列式数据库中你可以只读取你需要的数据。例如,如果只需要读取100列中的5列,这将帮助你最少减少20倍的I/O消耗。 -2. 由于数据总是打包成批量读取的,所以压缩是非常容易的。同时数据按列分别存储这也更容易压缩。这进一步降低了I/O的体积。 -3. 由于I/O的降低,这将帮助更多的数据被系统缓存。 +1. 针对分析类查询,通常只需要读取表的一小部分列。在列式数据库中你可以只读取你需要的数据。例如,如果只需要读取100列中的5列,这将帮助你最少减少20倍的I/O消耗。 +2. 由于数据总是打包成批量读取的,所以压缩是非常容易的。同时数据按列分别存储这也更容易压缩。这进一步降低了I/O的体积。 +3. 由于I/O的降低,这将帮助更多的数据被系统缓存。 -例如,查询“统计每个广告平台的记录数量”需要读取“广告平台ID”这一列,它在未压缩的情况下需要1个字节进行存储。如果大部分流量不是来自广告平台,那么这一列至少可以以十倍的压缩率被压缩。当采用快速压缩算法,它的解压速度最少在十亿字节(未压缩数据)每秒。换句话说,这个查询可以在单个服务器上以每秒大约几十亿行的速度进行处理。这实际上是当前实现的速度。 +例如,查询«统计每个广告平台的记录数量»需要读取«广告平台ID»这一列,它在未压缩的情况下需要1个字节进行存储。如果大部分流量不是来自广告平台,那么这一列至少可以以十倍的压缩率被压缩。当采用快速压缩算法,它的解压速度最少在十亿字节(未压缩数据)每秒。换句话说,这个查询可以在单个服务器上以每秒大约几十亿行的速度进行处理。这实际上是当前实现的速度。 -
示例 -``` -$ clickhouse-client -ClickHouse client version 0.0.52053. -Connecting to localhost:9000. -Connected to ClickHouse server version 0.0.52053. +
-:) SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 +示例 -SELECT - CounterID, - count() -FROM hits -GROUP BY CounterID -ORDER BY count() DESC -LIMIT 20 + $ clickhouse-client + ClickHouse client version 0.0.52053. + Connecting to localhost:9000. + Connected to ClickHouse server version 0.0.52053. -┌─CounterID─┬──count()─┐ -│ 114208 │ 56057344 │ -│ 115080 │ 51619590 │ -│ 3228 │ 44658301 │ -│ 38230 │ 42045932 │ -│ 145263 │ 42042158 │ -│ 91244 │ 38297270 │ -│ 154139 │ 26647572 │ -│ 150748 │ 24112755 │ -│ 242232 │ 21302571 │ -│ 338158 │ 13507087 │ -│ 62180 │ 12229491 │ -│ 82264 │ 12187441 │ -│ 232261 │ 12148031 │ -│ 146272 │ 11438516 │ -│ 168777 │ 11403636 │ -│ 4120072 │ 11227824 │ -│ 10938808 │ 10519739 │ -│ 74088 │ 9047015 │ -│ 115079 │ 8837972 │ -│ 337234 │ 8205961 │ -└───────────┴──────────┘ + :) SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 -20 rows in set. Elapsed: 0.153 sec. Processed 1.00 billion rows, 4.00 GB (6.53 billion rows/s., 26.10 GB/s.) + SELECT + CounterID, + count() + FROM hits + GROUP BY CounterID + ORDER BY count() DESC + LIMIT 20 + + ┌─CounterID─┬──count()─┐ + │ 114208 │ 56057344 │ + │ 115080 │ 51619590 │ + │ 3228 │ 44658301 │ + │ 38230 │ 42045932 │ + │ 145263 │ 42042158 │ + │ 91244 │ 38297270 │ + │ 154139 │ 26647572 │ + │ 150748 │ 24112755 │ + │ 242232 │ 21302571 │ + │ 338158 │ 13507087 │ + │ 62180 │ 12229491 │ + │ 82264 │ 12187441 │ + │ 232261 │ 12148031 │ + │ 146272 │ 11438516 │ + │ 168777 │ 11403636 │ + │ 4120072 │ 11227824 │ + │ 10938808 │ 10519739 │ + │ 74088 │ 9047015 │ + │ 115079 │ 8837972 │ + │ 337234 │ 8205961 │ + └───────────┴──────────┘ + + 20 rows in set. Elapsed: 0.153 sec. Processed 1.00 billion rows, 4.00 GB (6.53 billion rows/s., 26.10 GB/s.) + + :) -:) -```
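The query described above, counting records per advertising platform, can be sketched on the same dataset as the example. This assumes the `test.hits` table used elsewhere in these docs and that the advertising platform ID column is named `AdvEngineID`; both names are assumptions here:

``` bash
# Only the single, highly compressible AdvEngineID column has to be read from disk.
# --time prints the elapsed query time to stderr, which makes the effect easy to observe.
clickhouse-client --time --query="SELECT AdvEngineID, count() FROM test.hits GROUP BY AdvEngineID ORDER BY count() DESC"
```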
-### CPU +### CPU {#cpu} 由于执行一个查询需要处理大量的行,因此在整个向量上执行所有操作将比在每一行上执行所有操作更加高效。同时这将有助于实现一个几乎没有调用成本的查询引擎。如果你不这样做,使用任何一个机械硬盘,查询引擎都不可避免的停止CPU进行等待。所以,在数据按列存储并且按列执行是很有意义的。 有两种方法可以做到这一点: -1. 向量引擎:所有的操作都是为向量而不是为单个值编写的。这意味着多个操作之间的不再需要频繁的调用,并且调用的成本基本可以忽略不计。操作代码包含一个优化的内部循环。 +1. 向量引擎:所有的操作都是为向量而不是为单个值编写的。这意味着多个操作之间的不再需要频繁的调用,并且调用的成本基本可以忽略不计。操作代码包含一个优化的内部循环。 -2. 代码生成:生成一段代码,包含查询中的所有操作。 +2. 代码生成:生成一段代码,包含查询中的所有操作。 这是不应该在一个通用数据库中实现的,因为这在运行简单查询时是没有意义的。但是也有例外,例如,MemSQL使用代码生成来减少处理SQL查询的延迟(只是为了比较,分析型数据库通常需要优化的是吞吐而不是延迟)。 diff --git a/docs/zh/interfaces/cli.md b/docs/zh/interfaces/cli.md index 91a6f71ca5a..7a7f3dda8de 100644 --- a/docs/zh/interfaces/cli.md +++ b/docs/zh/interfaces/cli.md @@ -1,17 +1,17 @@ -# 命令行客户端 +# 命令行客户端 {#ming-ling-xing-ke-hu-duan} 通过命令行来访问 ClickHouse,您可以使用 `clickhouse-client` -```bash +``` bash $ clickhouse-client ClickHouse client version 0.0.26176. Connecting to localhost:9000. Connected to ClickHouse server version 0.0.26176.:) ``` -该客户端支持命令行参数以及配置文件。查看更多,请看 "[配置](#interfaces_cli_configuration)" +该客户端支持命令行参数以及配置文件。查看更多,请看 «[配置](#interfaces_cli_configuration)» -## 使用方式 +## 使用方式 {#shi-yong-fang-shi} 这个客户端可以选择使用交互式与非交互式(批量)两种模式。 使用批量模式,要指定 `query` 参数,或者发送数据到 `stdin`(它会检查 `stdin` 是否是 Terminal),或者两种同时使用。 @@ -19,7 +19,7 @@ Connected to ClickHouse server version 0.0.26176.:) 使用这个客户端插入数据的示例: -```bash +``` bash echo -ne "1, 'some text', '2016-08-14 00:00:00'\n2, 'some more text', '2016-08-14 00:00:01'" | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; cat <<_EOF | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; @@ -37,7 +37,7 @@ cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FORMA 在交互模式下,每条查询过后,你可以直接输入下一条查询命令。 -如果 `multiline` 没有指定(默认没指定):为了执行查询,按下 Enter 即可。查询语句不是必须使用分号结尾。如果需要写一个多行的查询语句,可以在换行之前输入一个反斜杠` \ `,然后在您按下 Enter 键后,您就可以输入当前语句的下一行查询了。 +如果 `multiline` 没有指定(默认没指定):为了执行查询,按下 Enter 即可。查询语句不是必须使用分号结尾。如果需要写一个多行的查询语句,可以在换行之前输入一个反斜杠`\`,然后在您按下 Enter 键后,您就可以输入当前语句的下一行查询了。 如果 `multiline` 指定了:为了执行查询,需要以分号结尾并且按下 Enter 键。如果行末没有分号,将认为当前语句并没有输入完而要求继续输入下一行。 @@ -54,60 +54,59 @@ cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FORMA 当执行一个查询的时候,客户端会显示: -1. 进度, 进度会每秒更新十次 (默认情况下)。 对于很快的查询,进度可能没有时间显示。 -2. 为了调试会显示解析且格式化后的查询语句。 -3. 指定格式的输出结果。 -4. 输出结果的行数的行数,经过的时间,以及查询处理的速度。 +1. 进度, 进度会每秒更新十次 (默认情况下)。 对于很快的查询,进度可能没有时间显示。 +2. 为了调试会显示解析且格式化后的查询语句。 +3. 指定格式的输出结果。 +4. 输出结果的行数的行数,经过的时间,以及查询处理的速度。 您可以通过 Ctrl+C 来取消一个长时间的查询。然而,您依然需要等待服务端来中止请求。在某个阶段去取消查询是不可能的。如果您不等待并再次按下 Ctrl + C,客户端将会退出。 -命令行客户端允许通过外部数据 (外部临时表) 来查询。更多相关信息,请参考 "[外部数据查询处理](../operations/table_engines/external_data.md)". +命令行客户端允许通过外部数据 (外部临时表) 来查询。更多相关信息,请参考 «[外部数据查询处理](../operations/table_engines/external_data.md)». -## 配置 {#interfaces_cli_configuration} +## 配置 {#interfaces-cli-configuration} 您可以通过以下方式传入参数到 `clickhouse-client` 中 (所有的参数都有默认值): -- 通过命令行 +- 通过命令行 - 命令行参数会覆盖默认值和配置文件的配置。 + 命令行参数会覆盖默认值和配置文件的配置。 -- 配置文件 +- 配置文件 - 配置文件的配置会覆盖默认值 + 配置文件的配置会覆盖默认值 -### 命令行参数 +### 命令行参数 {#ming-ling-xing-can-shu} -- `--host, -h` -– 服务端的 host 名称, 默认是 'localhost'。 您可以选择使用 host 名称或者 IPv4 或 IPv6 地址。 -- `--port` – 连接的端口,默认值: 9000。注意 HTTP 接口以及 TCP 原生接口是使用不同端口的。 -- `--user, -u` – 用户名。 默认值: default。 -- `--password` – 密码。 默认值: 空字符串。 -- `--query, -q` – 非交互模式下的查询语句. -- `--database, -d` – 默认当前操作的数据库. 
默认值: 服务端默认的配置 (默认是 `default`)。 -- `--multiline, -m` – 如果指定,允许多行语句查询(Enter 仅代表换行,不代表查询语句完结)。 -- `--multiquery, -n` – 如果指定, 允许处理用逗号分隔的多个查询,只在非交互模式下生效。 -- `--format, -f` – 使用指定的默认格式输出结果。 -- `--vertical, -E` – 如果指定,默认情况下使用垂直格式输出结果。这与 '--format=Vertical' 相同。在这种格式中,每个值都在单独的行上打印,这种方式对显示宽表很有帮助。 -- `--time, -t` – 如果指定,非交互模式下会打印查询执行的时间到 'stderr' 中。 -- `--stacktrace` – 如果指定,如果出现异常,会打印堆栈跟踪信息。 -- `--config-file` – 配置文件的名称。 +- `--host, -h` -– 服务端的 host 名称, 默认是 ‘localhost’。 您可以选择使用 host 名称或者 IPv4 或 IPv6 地址。 +- `--port` – 连接的端口,默认值: 9000。注意 HTTP 接口以及 TCP 原生接口是使用不同端口的。 +- `--user, -u` – 用户名。 默认值: default。 +- `--password` – 密码。 默认值: 空字符串。 +- `--query, -q` – 非交互模式下的查询语句. +- `--database, -d` – 默认当前操作的数据库. 默认值: 服务端默认的配置 (默认是 `default`)。 +- `--multiline, -m` – 如果指定,允许多行语句查询(Enter 仅代表换行,不代表查询语句完结)。 +- `--multiquery, -n` – 如果指定, 允许处理用逗号分隔的多个查询,只在非交互模式下生效。 +- `--format, -f` – 使用指定的默认格式输出结果。 +- `--vertical, -E` – 如果指定,默认情况下使用垂直格式输出结果。这与 ‘–format=Vertical’ 相同。在这种格式中,每个值都在单独的行上打印,这种方式对显示宽表很有帮助。 +- `--time, -t` – 如果指定,非交互模式下会打印查询执行的时间到 ‘stderr’ 中。 +- `--stacktrace` – 如果指定,如果出现异常,会打印堆栈跟踪信息。 +- `--config-file` – 配置文件的名称。 -### 配置文件 +### 配置文件 {#pei-zhi-wen-jian} `clickhouse-client` 使用一下第一个存在的文件: -- 通过 `--config-file` 参数指定的文件. -- `./clickhouse-client.xml` -- `\~/.clickhouse-client/config.xml` -- `/etc/clickhouse-client/config.xml` +- 通过 `--config-file` 参数指定的文件. +- `./clickhouse-client.xml` +- `\~/.clickhouse-client/config.xml` +- `/etc/clickhouse-client/config.xml` 配置文件示例: -```xml +``` xml username password ``` - [来源文章](https://clickhouse.tech/docs/zh/interfaces/cli/) diff --git a/docs/zh/interfaces/cpp.md b/docs/zh/interfaces/cpp.md index 7d7bf7c0cbf..6f162036e01 100644 --- a/docs/zh/interfaces/cpp.md +++ b/docs/zh/interfaces/cpp.md @@ -1,4 +1,4 @@ -# C ++客户端库 +# C ++客户端库 {#c-ke-hu-duan-ku} 请参阅以下网站的自述文件[clickhouse-cpp](https://github.com/ClickHouse/clickhouse-cpp)资料库。 diff --git a/docs/zh/interfaces/formats.md b/docs/zh/interfaces/formats.md index 7b7f839447a..dee61745cd4 100644 --- a/docs/zh/interfaces/formats.md +++ b/docs/zh/interfaces/formats.md @@ -4,40 +4,40 @@ ClickHouse 可以接受多种数据格式,可以在 (`INSERT`) 以及 (`SELECT 下列表格列出了支持的数据格式以及在 (`INSERT`) 以及 (`SELECT`) 请求中使用它们的方式。 -| 格式 | INSERT | SELECT | -| ------- | -------- | -------- | -| [TabSeparated](#tabseparated) | ✔ | ✔ | -| [TabSeparatedRaw](#tabseparatedraw) | ✗ | ✔ | -| [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ | -| [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ | -| [Template](#format-template) | ✔ | ✔ | -| [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | -| [CSV](#csv) | ✔ | ✔ | -| [CSVWithNames](#csvwithnames) | ✔ | ✔ | -| [CustomSeparated](#format-customseparated) | ✔ | ✔ | -| [Values](#data-format-values) | ✔ | ✔ | -| [Vertical](#vertical) | ✗ | ✔ | -| VerticalRaw | ✗ | ✔ | -| [JSON](#json) | ✗ | ✔ | -| [JSONCompact](#jsoncompact) | ✗ | ✔ | -| [JSONEachRow](#jsoneachrow) | ✔ | ✔ | -| [TSKV](#tskv) | ✔ | ✔ | -| [Pretty](#pretty) | ✗ | ✔ | -| [PrettyCompact](#prettycompact) | ✗ | ✔ | -| [PrettyCompactMonoBlock](#prettycompactmonoblock) | ✗ | ✔ | -| [PrettyNoEscapes](#prettynoescapes) | ✗ | ✔ | -| [PrettySpace](#prettyspace) | ✗ | ✔ | -| [Protobuf](#protobuf) | ✔ | ✔ | -| [Avro](#data-format-avro) | ✔ | ✔ | -| [AvroConfluent](#data-format-avro-confluent) | ✔ | ✗ | -| [Parquet](#data-format-parquet) | ✔ | ✔ | -| [ORC](#data-format-orc) | ✔ | ✗ | -| [RowBinary](#rowbinary) | ✔ | ✔ | -| [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | -| [Native](#native) | ✔ | ✔ | -| [Null](#null) | ✗ | ✔ | -| [XML](#xml) 
| ✗ | ✔ | -| [CapnProto](#capnproto) | ✔ | ✔ | +| 格式 | INSERT | SELECT | +|-----------------------------------------------------------------|--------|--------| +| [TabSeparated](#tabseparated) | ✔ | ✔ | +| [TabSeparatedRaw](#tabseparatedraw) | ✗ | ✔ | +| [TabSeparatedWithNames](#tabseparatedwithnames) | ✔ | ✔ | +| [TabSeparatedWithNamesAndTypes](#tabseparatedwithnamesandtypes) | ✔ | ✔ | +| [Template](#format-template) | ✔ | ✔ | +| [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | +| [CSV](#csv) | ✔ | ✔ | +| [CSVWithNames](#csvwithnames) | ✔ | ✔ | +| [CustomSeparated](#format-customseparated) | ✔ | ✔ | +| [Values](#data-format-values) | ✔ | ✔ | +| [Vertical](#vertical) | ✗ | ✔ | +| VerticalRaw | ✗ | ✔ | +| [JSON](#json) | ✗ | ✔ | +| [JSONCompact](#jsoncompact) | ✗ | ✔ | +| [JSONEachRow](#jsoneachrow) | ✔ | ✔ | +| [TSKV](#tskv) | ✔ | ✔ | +| [Pretty](#pretty) | ✗ | ✔ | +| [PrettyCompact](#prettycompact) | ✗ | ✔ | +| [PrettyCompactMonoBlock](#prettycompactmonoblock) | ✗ | ✔ | +| [PrettyNoEscapes](#prettynoescapes) | ✗ | ✔ | +| [PrettySpace](#prettyspace) | ✗ | ✔ | +| [Protobuf](#protobuf) | ✔ | ✔ | +| [Avro](#data-format-avro) | ✔ | ✔ | +| [AvroConfluent](#data-format-avro-confluent) | ✔ | ✗ | +| [Parquet](#data-format-parquet) | ✔ | ✔ | +| [ORC](#data-format-orc) | ✔ | ✗ | +| [RowBinary](#rowbinary) | ✔ | ✔ | +| [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ | +| [Native](#native) | ✔ | ✔ | +| [Null](#null) | ✗ | ✔ | +| [XML](#xml) | ✗ | ✔ | +| [CapnProto](#capnproto) | ✔ | ✔ | ## TabSeparated {#tabseparated} @@ -47,32 +47,30 @@ ClickHouse 可以接受多种数据格式,可以在 (`INSERT`) 以及 (`SELECT TabSeparated 格式非常方便用于自定义程序或脚本处理数据。HTTP 客户端接口默认会用这种格式,命令行客户端批量模式下也会用这种格式。这种格式允许在不同数据库之间传输数据。例如,从 MYSQL 中导出数据然后导入到 ClickHouse 中,反之亦然。 -TabSeparated 格式支持输出数据总值(当使用 WITH TOTALS) 以及极值(当 'extremes' 设置是1)。这种情况下,总值和极值输出在主数据的后面。主要的数据,总值,极值会以一个空行隔开,例如: +TabSeparated 格式支持输出数据总值(当使用 WITH TOTALS) 以及极值(当 ‘extremes’ 设置是1)。这种情况下,总值和极值输出在主数据的后面。主要的数据,总值,极值会以一个空行隔开,例如: ``` sql SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT TabSeparated`` ``` -``` -2014-03-17 1406958 -2014-03-18 1383658 -2014-03-19 1405797 -2014-03-20 1353623 -2014-03-21 1245779 -2014-03-22 1031592 -2014-03-23 1046491 + 2014-03-17 1406958 + 2014-03-18 1383658 + 2014-03-19 1405797 + 2014-03-20 1353623 + 2014-03-21 1245779 + 2014-03-22 1031592 + 2014-03-23 1046491 -0000-00-00 8873898 + 0000-00-00 8873898 -2014-03-17 1031592 -2014-03-23 1406958 -``` + 2014-03-17 1031592 + 2014-03-23 1406958 -### 数据解析方式 +### 数据解析方式 {#shu-ju-jie-xi-fang-shi} 整数以十进制形式写入。数字在开头可以包含额外的 `+` 字符(解析时忽略,格式化时不记录)。非负数不能包含负号。 读取时,允许将空字符串解析为零,或者(对于带符号的类型)将仅包含负号的字符串解析为零。 不符合相应数据类型的数字可能会被解析为不同的数字,而不会显示错误消息。 -浮点数以十进制形式写入。点号用作小数点分隔符。支持指数等符号,如'inf','+ inf','-inf'和'nan'。 浮点数的输入可以以小数点开始或结束。 +浮点数以十进制形式写入。点号用作小数点分隔符。支持指数等符号,如’inf’,‘+ inf’,‘-inf’和’nan’。 浮点数的输入可以以小数点开始或结束。 格式化的时候,浮点数的精确度可能会丢失。 解析的时候,没有严格需要去读取与机器可以表示的最接近的数值。 @@ -85,12 +83,10 @@ SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORD 字符串以反斜线转义的特殊字符输出。 以下转义序列用于输出:`\b`,`\f`,`\r`,`\n`,`\t`,`\0`,`\'`,`\\`。 解析还支持`\a`,`\v`和`\xHH`(十六进制转义字符)和任何`\c`字符,其中`c`是任何字符(这些序列被转换为`c`)。 因此,读取数据支持可以将换行符写为`\n`或`\`的格式,或者换行。例如,字符串 `Hello world` 在单词之间换行而不是空格可以解析为以下任何形式: -``` -Hello\nworld + Hello\nworld -Hello\ -world -``` + Hello\ + world 第二种形式是支持的,因为 MySQL 读取 tab-separated 格式数据集的时候也会使用它。 @@ -115,65 +111,66 @@ world 在解析过程中,第一行完全被忽略。您不能使用列名来确定其位置或检查其正确性。 (未来可能会加入解析头行的功能) -这种格式也可以使用名称 ` TSVWithNames` 来表示。 +这种格式也可以使用名称 `TSVWithNames` 来表示。 ## TabSeparatedWithNamesAndTypes 
{#tabseparatedwithnamesandtypes} 与 `TabSeparated` 格式不一样的是,第一行会显示列的名称,第二行会显示列的类型。 在解析过程中,第一行和第二行完全被忽略。 -这种格式也可以使用名称 ` TSVWithNamesAndTypes` 来表示。 +这种格式也可以使用名称 `TSVWithNamesAndTypes` 来表示。 ## Template {#format-template} This format allows to specify a custom format string with placeholders for values with specified escaping rule. -It uses settings `format_schema`, `format_schema_rows`, `format_schema_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) +It uses settings `format_schema`, `format_schema_rows`, `format_schema_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) Format string `format_schema_rows` specifies rows format with the following syntax: - `delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`, +`delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`, - where `delimiter_i` is a delimiter between values (`$` symbol can be escaped as `$$`), - `column_i` is a name of a column whose values are to be selected or inserted (if empty, then column will be skipped), - `serializeAs_i` is an escaping rule for the column values. The following escaping rules are supported: + where `delimiter_i` is a delimiter between values (`$` symbol can be escaped as `$$`), + `column_i` is a name of a column whose values are to be selected or inserted (if empty, then column will be skipped), + `serializeAs_i` is an escaping rule for the column values. The following escaping rules are supported: - - `CSV`, `JSON`, `XML` (similarly to the formats of the same names) - - `Escaped` (similarly to `TSV`) - - `Quoted` (similarly to `Values`) - - `Raw` (without escaping, similarly to `TSVRaw`) - - `None` (no escaping rule, see further) + - `CSV`, `JSON`, `XML` (similarly to the formats of the same names) + - `Escaped` (similarly to `TSV`) + - `Quoted` (similarly to `Values`) + - `Raw` (without escaping, similarly to `TSVRaw`) + - `None` (no escaping rule, see further) - If escaping rule is omitted, then`None` will be used. `XML` and `Raw` are suitable only for output. + If escaping rule is omitted, then`None` will be used. `XML` and `Raw` are suitable only for output. - So, for the following format string: + So, for the following format string: - `Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}, ad price: $$${price:JSON};` + `Search phrase: ${SearchPhrase:Quoted}, count: ${c:Escaped}, ad price: $$${price:JSON};` - the values of `SearchPhrase`, `c` and `price` columns, which are escaped as `Quoted`, `Escaped` and `JSON` will be printed (for select) or will be expected (for insert) between `Search phrase: `, `, count: `, `, ad price: $` and `;` delimiters respectively. For example: + the values of `SearchPhrase`, `c` and `price` columns, which are escaped as `Quoted`, `Escaped` and `JSON` will be printed (for select) or will be expected (for insert) between `Search phrase: `, `, count: `, `, ad price: $` and `;` delimiters respectively. 
For example: - `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` + `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` - The `format_schema_rows_between_delimiter` setting specifies delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default) +The `format_schema_rows_between_delimiter` setting specifies delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default) Format string `format_schema` has the same syntax as `format_schema_rows` and allows to specify a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names: - - `data` is the rows with data in `format_schema_rows` format, separated by `format_schema_rows_between_delimiter`. This placeholder must be the first placeholder in the format string. - - `totals` is the row with total values in `format_schema_rows` format (when using WITH TOTALS) - - `min` is the row with minimum values in `format_schema_rows` format (when extremes is set to 1) - - `max` is the row with maximum values in `format_schema_rows` format (when extremes is set to 1) - - `rows` is the total number of output rows - - `rows_before_limit` is the minimal number of rows there would have been without LIMIT. Output only if the query contains LIMIT. If the query contains GROUP BY, rows_before_limit_at_least is the exact number of rows there would have been without a LIMIT. - - `time` is the request execution time in seconds - - `rows_read` is the number of rows have been read - - `bytes_read` is the number of bytes (uncompressed) have been read +- `data` is the rows with data in `format_schema_rows` format, separated by `format_schema_rows_between_delimiter`. This placeholder must be the first placeholder in the format string. +- `totals` is the row with total values in `format_schema_rows` format (when using WITH TOTALS) +- `min` is the row with minimum values in `format_schema_rows` format (when extremes is set to 1) +- `max` is the row with maximum values in `format_schema_rows` format (when extremes is set to 1) +- `rows` is the total number of output rows +- `rows_before_limit` is the minimal number of rows there would have been without LIMIT. Output only if the query contains LIMIT. If the query contains GROUP BY, rows\_before\_limit\_at\_least is the exact number of rows there would have been without a LIMIT. +- `time` is the request execution time in seconds +- `rows_read` is the number of rows have been read +- `bytes_read` is the number of bytes (uncompressed) have been read - The placeholders `data`, `totals`, `min` and `max` must not have escaping rule specified (or `None` must be specified explicitly). The remaining placeholders may have any escaping rule specified. - If the `format_schema` setting is an empty string, `${data}` is used as default value. - For insert queries format allows to skip some columns or some fields if prefix or suffix (see example). +The placeholders `data`, `totals`, `min` and `max` must not have escaping rule specified (or `None` must be specified explicitly). The remaining placeholders may have any escaping rule specified. +If the `format_schema` setting is an empty string, `${data}` is used as default value. +For insert queries format allows to skip some columns or some fields if prefix or suffix (see example). 
- `Select` example: -```sql +`Select` example: + +``` sql SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 5 FORMAT Template SETTINGS format_schema = ' @@ -192,7 +189,8 @@ SETTINGS format_schema = ' format_schema_rows = '
', format_schema_rows_between_delimiter = '\n ' ``` -```html + +``` html Search phrases @@ -213,26 +211,28 @@ format_schema_rows_between_delimiter = '\n ' ``` `Insert` example: -``` -Some header -Page views: 5, User id: 4324182021466249494, Useless field: hello, Duration: 146, Sign: -1 -Page views: 6, User id: 4324182021466249494, Useless field: world, Duration: 185, Sign: 1 -Total rows: 2 -``` -```sql + + Some header + Page views: 5, User id: 4324182021466249494, Useless field: hello, Duration: 146, Sign: -1 + Page views: 6, User id: 4324182021466249494, Useless field: world, Duration: 185, Sign: 1 + Total rows: 2 + +``` sql INSERT INTO UserActivity FORMAT Template SETTINGS format_schema = 'Some header\n${data}\nTotal rows: ${:CSV}\n', format_schema_rows = 'Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV}' ``` -`PageViews`, `UserID`, `Duration` and `Sign` inside placeholders are names of columns in the table. Values after `Useless field` in rows and after `\nTotal rows: ` in suffix will be ignored. + +`PageViews`, `UserID`, `Duration` and `Sign` inside placeholders are names of columns in the table. Values after `Useless field` in rows and after `\nTotal rows:` in suffix will be ignored. All delimiters in the input data must be strictly equal to delimiters in specified format strings. ## TemplateIgnoreSpaces {#templateignorespaces} This format is suitable only for input. -Similar to `Template`, but skips whitespace characters between delimiters and values in the input stream. However, if format strings contain whitespace characters, these characters will be expected in the input stream. Also allows to specify empty placeholders (`${}` or `${:None}`) to split some delimiter into separate parts to ignore spaces between them. Such placeholders are used only for skipping whitespace characters. -It's possible to read `JSON` using this format, if values of columns have the same order in all rows. For example, the following request can be used for inserting data from output example of format [JSON](#json): -```sql +Similar to `Template`, but skips whitespace characters between delimiters and values in the input stream. However, if format strings contain whitespace characters, these characters will be expected in the input stream. Also allows to specify empty placeholders (`${}` or `${:None}`) to split some delimiter into separate parts to ignore spaces between them. Such placeholders are used only for skipping whitespace characters. +It’s possible to read `JSON` using this format, if values of columns have the same order in all rows. 
For example, the following request can be used for inserting data from output example of format [JSON](#json): + +``` sql INSERT INTO table_name FORMAT TemplateIgnoreSpaces SETTINGS format_schema = '{${}"meta"${}:${:JSON},${}"data"${}:${}[${data}]${},${}"totals"${}:${:JSON},${}"extremes"${}:${:JSON},${}"rows"${}:${:JSON},${}"rows_before_limit_at_least"${}:${:JSON}${}}', format_schema_rows = '{${}"SearchPhrase"${}:${}${phrase:JSON}${},${}"c"${}:${}${cnt:JSON}${}}', @@ -243,18 +243,16 @@ format_schema_rows_between_delimiter = ',' 与 `TabSeparated` 格式类似,但它输出的是 `name=value` 的格式。名称会和 `TabSeparated` 格式一样被转义,`=` 字符也会被转义。 -``` -SearchPhrase= count()=8267016 -SearchPhrase=bathroom interior design count()=2166 -SearchPhrase=yandex count()=1655 -SearchPhrase=2014 spring fashion count()=1549 -SearchPhrase=freeform photos count()=1480 -SearchPhrase=angelina jolie count()=1245 -SearchPhrase=omsk count()=1112 -SearchPhrase=photos of dog breeds count()=1091 -SearchPhrase=curtain designs count()=1064 -SearchPhrase=baku count()=1000 -``` + SearchPhrase= count()=8267016 + SearchPhrase=bathroom interior design count()=2166 + SearchPhrase=yandex count()=1655 + SearchPhrase=2014 spring fashion count()=1549 + SearchPhrase=freeform photos count()=1480 + SearchPhrase=angelina jolie count()=1245 + SearchPhrase=omsk count()=1112 + SearchPhrase=photos of dog breeds count()=1091 + SearchPhrase=curtain designs count()=1064 + SearchPhrase=baku count()=1000 [NULL](../query_language/syntax.md) 输出为 `\N`。 @@ -262,9 +260,7 @@ SearchPhrase=baku count()=1000 SELECT * FROM t_null FORMAT TSKV ``` -``` -x=1 y=\N -``` + x=1 y=\N 当有大量的小列时,这种格式是低效的,通常没有理由使用它。它被用于 Yandex 公司的一些部门。 @@ -278,12 +274,9 @@ x=1 y=\N 格式化的时候,行是用双引号括起来的。字符串中的双引号会以两个双引号输出,除此之外没有其他规则来做字符转义了。日期和时间也会以双引号包括。数字的输出不带引号。值由一个单独的字符隔开,这个字符默认是 `,`。行使用 Unix 换行符(LF)分隔。 数组序列化成 CSV 规则如下:首先将数组序列化为 TabSeparated 格式的字符串,然后将结果字符串用双引号包括输出到 CSV。CSV 格式的元组被序列化为单独的列(即它们在元组中的嵌套关系会丢失)。 + clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV" < data.csv -``` -clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV" < data.csv -``` - -*默认情况下间隔符是 `,` ,在 [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter) 中可以了解更多间隔符配置。 +\*默认情况下间隔符是 `,` ,在 [format\_csv\_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter) 中可以了解更多间隔符配置。 解析的时候,可以使用或不使用引号来解析所有值。支持双引号和单引号。行也可以不用引号排列。 在这种情况下,它们被解析为逗号或换行符(CR 或 LF)。在解析不带引号的行时,若违反 RFC 规则,会忽略前导和尾随的空格和制表符。 对于换行,全部支持 Unix(LF),Windows(CR LF)和 Mac OS Classic(CR LF)。 @@ -291,7 +284,7 @@ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMA CSV 格式是和 TabSeparated 一样的方式输出总数和极值。 -## CSVWithNames +## CSVWithNames {#csvwithnames} 会输出带头部行,和 `TabSeparatedWithNames` 一样。 @@ -308,7 +301,7 @@ There is also `CustomSeparatedIgnoreSpaces` format, which is similar to `Templat SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTALS ORDER BY c DESC LIMIT 5 FORMAT JSON ``` -```json +``` json { "meta": [ @@ -372,12 +365,12 @@ SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTA } ``` -JSON 与 JavaScript 兼容。为了确保这一点,一些字符被另外转义:斜线`/`被转义为`\/`; 替代的换行符 `U+2028` 和 `U+2029` 会打断一些浏览器解析,它们会被转义为 `\uXXXX`。 ASCII 控制字符被转义:退格,换页,换行,回车和水平制表符被替换为`\b`,`\f`,`\n`,`\r`,`\t` 作为使用`\uXXXX`序列的00-1F范围内的剩余字节。 无效的 UTF-8 序列更改为替换字符 ,因此输出文本将包含有效的 UTF-8 序列。 为了与 JavaScript 兼容,默认情况下,Int64 和 UInt64 整数用双引号引起来。要除去引号,可以将配置参数 output_format_json_quote_64bit_integers 设置为0。 +JSON 与 JavaScript 兼容。为了确保这一点,一些字符被另外转义:斜线`/`被转义为`\/`; 替代的换行符 
`U+2028` 和 `U+2029` 会打断一些浏览器解析,它们会被转义为 `\uXXXX`。 ASCII 控制字符被转义:退格,换页,换行,回车和水平制表符被替换为`\b`,`\f`,`\n`,`\r`,`\t` 作为使用`\uXXXX`序列的00-1F范围内的剩余字节。 无效的 UTF-8 序列更改为替换字符 ,因此输出文本将包含有效的 UTF-8 序列。 为了与 JavaScript 兼容,默认情况下,Int64 和 UInt64 整数用双引号引起来。要除去引号,可以将配置参数 output\_format\_json\_quote\_64bit\_integers 设置为0。 `rows` – 结果输出的行数。 `rows_before_limit_at_least` 去掉 LIMIT 过滤后的最小行总数。 只会在查询包含 LIMIT 条件时输出。 -若查询包含 GROUP BY,rows_before_limit_at_least 就是去掉 LIMIT 后过滤后的准确行数。 +若查询包含 GROUP BY,rows\_before\_limit\_at\_least 就是去掉 LIMIT 后过滤后的准确行数。 `totals` – 总值 (当使用 TOTALS 条件时)。 @@ -395,7 +388,7 @@ ClickHouse 支持 [NULL](../query_language/syntax.md), 在 JSON 格式中以 `nu 示例: -```json +``` json { "meta": [ @@ -439,7 +432,7 @@ ClickHouse 支持 [NULL](../query_language/syntax.md), 在 JSON 格式中以 `nu 将数据结果每一行以 JSON 结构体输出(换行分割 JSON 结构体)。 -```json +``` json {"SearchPhrase":"","count()":"8267016"} {"SearchPhrase": "bathroom interior design","count()": "2166"} {"SearchPhrase":"yandex","count()":"1655"} @@ -458,23 +451,23 @@ ClickHouse 支持 [NULL](../query_language/syntax.md), 在 JSON 格式中以 `nu ### Usage of Nested Structures {#jsoneachrow-nested} -If you have a table with the [Nested](../data_types/nested_data_structures/nested.md) data type columns, you can insert JSON data having the same structure. Enable this functionality with the [input_format_import_nested_json](../operations/settings/settings.md#settings-input_format_import_nested_json) setting. +If you have a table with the [Nested](../data_types/nested_data_structures/nested.md) data type columns, you can insert JSON data having the same structure. Enable this functionality with the [input\_format\_import\_nested\_json](../operations/settings/settings.md#settings-input_format_import_nested_json) setting. For example, consider the following table: -```sql +``` sql CREATE TABLE json_each_row_nested (n Nested (s String, i Int32) ) ENGINE = Memory ``` As you can find in the `Nested` data type description, ClickHouse treats each component of the nested structure as a separate column, `n.s` and `n.i` for our table. So you can insert the data the following way: -```sql +``` sql INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n.s": ["abc", "def"], "n.i": [1, 23]} ``` -To insert data as hierarchical JSON object set [input_format_import_nested_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json). +To insert data as hierarchical JSON object set [input\_format\_import\_nested\_json=1](../operations/settings/settings.md#settings-input_format_import_nested_json). -```json +``` json { "n": { "s": ["abc", "def"], @@ -485,26 +478,31 @@ To insert data as hierarchical JSON object set [input_format_import_nested_json= Without this setting ClickHouse throws the exception. -```sql +``` sql SELECT name, value FROM system.settings WHERE name = 'input_format_import_nested_json' ``` -```text + +``` text ┌─name────────────────────────────┬─value─┐ │ input_format_import_nested_json │ 0 │ └─────────────────────────────────┴───────┘ ``` -```sql + +``` sql INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} ``` -```text + +``` text Code: 117. 
DB::Exception: Unknown field found while parsing JSONEachRow format: n: (at row 1) ``` -```sql + +``` sql SET input_format_import_nested_json=1 INSERT INTO json_each_row_nested FORMAT JSONEachRow {"n": {"s": ["abc", "def"], "i": [1, 23]}} SELECT * FROM json_each_row_nested ``` -```text + +``` text ┌─n.s───────────┬─n.i────┐ │ ['abc','def'] │ [1,23] │ └───────────────┴────────┘ @@ -512,7 +510,7 @@ SELECT * FROM json_each_row_nested ## Native {#native} -最高性能的格式。 据通过二进制格式的块进行写入和读取。对于每个块,该块中的行数,列数,列名称和类型以及列的部分将被相继记录。 换句话说,这种格式是 “列式”的 - 它不会将列转换为行。 这是用于在服务器之间进行交互的本地界面中使用的格式,用于使用命令行客户端和 C++ 客户端。 +最高性能的格式。 据通过二进制格式的块进行写入和读取。对于每个块,该块中的行数,列数,列名称和类型以及列的部分将被相继记录。 换句话说,这种格式是 «列式»的 - 它不会将列转换为行。 这是用于在服务器之间进行交互的本地界面中使用的格式,用于使用命令行客户端和 C++ 客户端。 您可以使用此格式快速生成只能由 ClickHouse DBMS 读取的格式。但自己处理这种格式是没有意义的。 @@ -533,13 +531,11 @@ SELECT * FROM json_each_row_nested SELECT * FROM t_null ``` -``` -┌─x─┬────y─┐ -│ 1 │ ᴺᵁᴸᴸ │ -└───┴──────┘ -``` + ┌─x─┬────y─┐ + │ 1 │ ᴺᵁᴸᴸ │ + └───┴──────┘ -为避免将太多数据传输到终端,只打印前10,000行。 如果行数大于或等于10,000,则会显示消息“Showed first 10 000”。 +为避免将太多数据传输到终端,只打印前10,000行。 如果行数大于或等于10,000,则会显示消息«Showed first 10 000»。 该格式仅适用于输出查询结果,但不适用于解析输入(将数据插入到表中)。 Pretty格式支持输出总值(当使用 WITH TOTALS 时)和极值(当 `extremes` 设置为1时)。 在这些情况下,总数值和极值在主数据之后以单独的表格形式输出。 示例(以 PrettyCompact 格式显示): @@ -548,28 +544,26 @@ Pretty格式支持输出总值(当使用 WITH TOTALS 时)和极值(当 `ex SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT PrettyCompact ``` -``` -┌──EventDate─┬───────c─┐ -│ 2014-03-17 │ 1406958 │ -│ 2014-03-18 │ 1383658 │ -│ 2014-03-19 │ 1405797 │ -│ 2014-03-20 │ 1353623 │ -│ 2014-03-21 │ 1245779 │ -│ 2014-03-22 │ 1031592 │ -│ 2014-03-23 │ 1046491 │ -└────────────┴─────────┘ + ┌──EventDate─┬───────c─┐ + │ 2014-03-17 │ 1406958 │ + │ 2014-03-18 │ 1383658 │ + │ 2014-03-19 │ 1405797 │ + │ 2014-03-20 │ 1353623 │ + │ 2014-03-21 │ 1245779 │ + │ 2014-03-22 │ 1031592 │ + │ 2014-03-23 │ 1046491 │ + └────────────┴─────────┘ -Totals: -┌──EventDate─┬───────c─┐ -│ 0000-00-00 │ 8873898 │ -└────────────┴─────────┘ + Totals: + ┌──EventDate─┬───────c─┐ + │ 0000-00-00 │ 8873898 │ + └────────────┴─────────┘ -Extremes: -┌──EventDate─┬───────c─┐ -│ 2014-03-17 │ 1031592 │ -│ 2014-03-23 │ 1406958 │ -└────────────┴─────────┘ -``` + Extremes: + ┌──EventDate─┬───────c─┐ + │ 2014-03-17 │ 1031592 │ + │ 2014-03-23 │ 1406958 │ + └────────────┴─────────┘ ## PrettyCompact {#prettycompact} @@ -585,23 +579,23 @@ Extremes: 示例: -```bash +``` bash watch -n1 "clickhouse-client --query='SELECT event, value FROM system.events FORMAT PrettyCompactNoEscapes'" ``` 您可以使用 HTTP 接口来获取数据,显示在浏览器中。 -### PrettyCompactNoEscapes +### PrettyCompactNoEscapes {#prettycompactnoescapes} 用法类似上述。 -### PrettySpaceNoEscapes +### PrettySpaceNoEscapes {#prettyspacenoescapes} 用法类似上述。 ## PrettySpace {#prettyspace} -与 `PrettyCompact`(#prettycompact) 格式不一样的是,它使用空格来代替网格来显示数据。 +与 `PrettyCompact`(\#prettycompact) 格式不一样的是,它使用空格来代替网格来显示数据。 ## RowBinary {#rowbinary} @@ -622,9 +616,9 @@ FixedString 被简单地表示为一个字节序列。 Similar to [RowBinary](#rowbinary), but with added header: - * [LEB128](https://en.wikipedia.org/wiki/LEB128)-encoded number of columns (N) - * N `String`s specifying column names - * N `String`s specifying column types +- [LEB128](https://en.wikipedia.org/wiki/LEB128)-encoded number of columns (N) +- N `String`s specifying column names +- N `String`s specifying column types ## Values {#data-format-values} @@ -646,50 +640,43 @@ Similar to [RowBinary](#rowbinary), but with added header: SELECT * FROM t_null FORMAT Vertical ``` -``` -Row 1: -────── -x: 1 -y: ᴺᵁᴸᴸ -``` 
+ Row 1: + ────── + x: 1 + y: ᴺᵁᴸᴸ 该格式仅适用于输出查询结果,但不适用于解析输入(将数据插入到表中)。 -## VerticalRaw +## VerticalRaw {#verticalraw} 和 `Vertical` 格式不同点在于,行是不会被转义的。 这种格式仅仅适用于输出,但不适用于解析输入(将数据插入到表中)。 示例: -``` -:) SHOW CREATE TABLE geonames FORMAT VerticalRaw; -Row 1: -────── -statement: CREATE TABLE default.geonames ( geonameid UInt32, date Date DEFAULT CAST('2017-12-08' AS Date)) ENGINE = MergeTree(date, geonameid, 8192) + :) SHOW CREATE TABLE geonames FORMAT VerticalRaw; + Row 1: + ────── + statement: CREATE TABLE default.geonames ( geonameid UInt32, date Date DEFAULT CAST('2017-12-08' AS Date)) ENGINE = MergeTree(date, geonameid, 8192) -:) SELECT 'string with \'quotes\' and \t with some special \n characters' AS test FORMAT VerticalRaw; -Row 1: -────── -test: string with 'quotes' and with some special - characters -``` + :) SELECT 'string with \'quotes\' and \t with some special \n characters' AS test FORMAT VerticalRaw; + Row 1: + ────── + test: string with 'quotes' and with some special + characters 和 Vertical 格式相比: -``` -:) SELECT 'string with \'quotes\' and \t with some special \n characters' AS test FORMAT Vertical; -Row 1: -────── -test: string with \'quotes\' and \t with some special \n characters -``` + :) SELECT 'string with \'quotes\' and \t with some special \n characters' AS test FORMAT Vertical; + Row 1: + ────── + test: string with \'quotes\' and \t with some special \n characters ## XML {#xml} 该格式仅适用于输出查询结果,但不适用于解析输入,示例: - -```xml +``` xml @@ -760,9 +747,9 @@ test: string with \'quotes\' and \t with some special \n characters ## CapnProto {#capnproto} -Cap'n Proto 是一种二进制消息格式,类似 Protocol Buffers 和 Thriftis,但与 JSON 或 MessagePack 格式不一样。 +Cap’n Proto 是一种二进制消息格式,类似 Protocol Buffers 和 Thriftis,但与 JSON 或 MessagePack 格式不一样。 -Cap'n Proto 消息格式是严格类型的,而不是自我描述,这意味着它们不需要外部的描述。这种格式可以实时地应用,并针对每个查询进行缓存。 +Cap’n Proto 消息格式是严格类型的,而不是自我描述,这意味着它们不需要外部的描述。这种格式可以实时地应用,并针对每个查询进行缓存。 ``` sql SELECT SearchPhrase, count() AS c FROM test.hits @@ -771,17 +758,14 @@ SELECT SearchPhrase, count() AS c FROM test.hits 其中 `schema.capnp` 描述如下: -``` -struct Message { - SearchPhrase @0 :Text; - c @1 :Uint64; -} -``` + struct Message { + SearchPhrase @0 :Text; + c @1 :Uint64; + } -格式文件存储的目录可以在服务配置中的 [ format_schema_path ](../operations/server_settings/settings.md) 指定。 - -Cap'n Proto 反序列化是很高效的,通常不会增加系统的负载。 +格式文件存储的目录可以在服务配置中的 [format\_schema\_path](../operations/server_settings/settings.md) 指定。 +Cap’n Proto 反序列化是很高效的,通常不会增加系统的负载。 ## Protobuf {#protobuf} @@ -792,17 +776,17 @@ ClickHouse supports both `proto2` and `proto3` syntaxes. Repeated/optional/requi Usage examples: -```sql +``` sql SELECT * FROM test.table FORMAT Protobuf SETTINGS format_schema = 'schemafile:MessageType' ``` -```bash +``` bash cat protobuf_messages.bin | clickhouse-client --query "INSERT INTO test.table FORMAT Protobuf SETTINGS format_schema='schemafile:MessageType'" ``` where the file `schemafile.proto` looks like this: -```capnp +``` capnp syntax = "proto3"; message MessageType { @@ -813,13 +797,13 @@ message MessageType { }; ``` -To find the correspondence between table columns and fields of Protocol Buffers' message type ClickHouse compares their names. +To find the correspondence between table columns and fields of Protocol Buffers’ message type ClickHouse compares their names. This comparison is case-insensitive and the characters `_` (underscore) and `.` (dot) are considered as equal. -If types of a column and a field of Protocol Buffers' message are different the necessary conversion is applied. 
+If types of a column and a field of Protocol Buffers’ message are different the necessary conversion is applied. Nested messages are supported. For example, for the field `z` in the following message type -```capnp +``` capnp message MessageType { message XType { message YType { @@ -836,7 +820,7 @@ Nested messages are suitable to input or output a [nested data structures](../da Default values defined in a protobuf schema like this -```capnp +``` capnp syntax = "proto2"; message MessageType { @@ -852,43 +836,41 @@ See also [how to read/write length-delimited protobuf messages in popular langua ## Avro {#data-format-avro} -[Apache Avro](http://avro.apache.org/) is a row-oriented data serialization framework developed within Apache's Hadoop project. +[Apache Avro](http://avro.apache.org/) is a row-oriented data serialization framework developed within Apache’s Hadoop project. ClickHouse Avro format supports reading and writing [Avro data files](http://avro.apache.org/docs/current/spec.html#Object+Container+Files). -### Data Types Matching +### Data Types Matching {#data-types-matching} The table below shows supported data types and how they match ClickHouse [data types](../data_types/index.md) in `INSERT` and `SELECT` queries. -| Avro data type `INSERT` | ClickHouse data type | Avro data type `SELECT` | -| -------------------- | -------------------- | ------------------ | -| `boolean`, `int`, `long`, `float`, `double` | [Int(8\|16\|32\)](../data_types/int_uint.md), [UInt(8\|16\|32)](../data_types/int_uint.md) | `int` | -| `boolean`, `int`, `long`, `float`, `double` | [Int64](../data_types/int_uint.md), [UInt64](../data_types/int_uint.md) | `long` | -| `boolean`, `int`, `long`, `float`, `double` | [Float32](../data_types/float.md) | `float` | -| `boolean`, `int`, `long`, `float`, `double` | [Float64](../data_types/float.md) | `double` | -| `bytes`, `string`, `fixed`, `enum` | [String](../data_types/string.md) | `bytes` | -| `bytes`, `string`, `fixed` | [FixedString(N)](../data_types/fixedstring.md) | `fixed(N)` | -| `enum` | [Enum(8\|16)](../data_types/enum.md) | `enum` | -| `array(T)` | [Array(T)](../data_types/array.md) | `array(T)` | -| `union(null, T)`, `union(T, null)` | [Nullable(T)](../data_types/date.md) | `union(null, T)`| -| `null` | [Nullable(Nothing)](../data_types/special_data_types/nothing.md) | `null` | -| `int (date)` * | [Date](../data_types/date.md) | `int (date)` * | -| `long (timestamp-millis)` * | [DateTime64(3)](../data_types/datetime.md) | `long (timestamp-millis)` * | -| `long (timestamp-micros)` * | [DateTime64(6)](../data_types/datetime.md) | `long (timestamp-micros)` * | +| Avro data type `INSERT` | ClickHouse data type | Avro data type `SELECT` | +|---------------------------------------------|-------------------------------------------------------------------------------------------|------------------------------| +| `boolean`, `int`, `long`, `float`, `double` | [Int(8\|16\|32)](../data_types/int_uint.md), [UInt(8\|16\|32)](../data_types/int_uint.md) | `int` | +| `boolean`, `int`, `long`, `float`, `double` | [Int64](../data_types/int_uint.md), [UInt64](../data_types/int_uint.md) | `long` | +| `boolean`, `int`, `long`, `float`, `double` | [Float32](../data_types/float.md) | `float` | +| `boolean`, `int`, `long`, `float`, `double` | [Float64](../data_types/float.md) | `double` | +| `bytes`, `string`, `fixed`, `enum` | [String](../data_types/string.md) | `bytes` | +| `bytes`, `string`, `fixed` | [FixedString(N)](../data_types/fixedstring.md) | `fixed(N)` | +| `enum` 
| [Enum(8\|16)](../data_types/enum.md) | `enum` | +| `array(T)` | [Array(T)](../data_types/array.md) | `array(T)` | +| `union(null, T)`, `union(T, null)` | [Nullable(T)](../data_types/date.md) | `union(null, T)` | +| `null` | [Nullable(Nothing)](../data_types/special_data_types/nothing.md) | `null` | +| `int (date)` \* | [Date](../data_types/date.md) | `int (date)` \* | +| `long (timestamp-millis)` \* | [DateTime64(3)](../data_types/datetime.md) | `long (timestamp-millis)` \* | +| `long (timestamp-micros)` \* | [DateTime64(6)](../data_types/datetime.md) | `long (timestamp-micros)` \* | \* [Avro logical types](http://avro.apache.org/docs/current/spec.html#Logical+Types) - - Unsupported Avro data types: `record` (non-root), `map` Unsupported Avro logical data types: `uuid`, `time-millis`, `time-micros`, `duration` -### Inserting Data +### Inserting Data {#inserting-data} To insert data from an Avro file into ClickHouse table: -```bash +``` bash $ cat file.avro | clickhouse-client --query="INSERT INTO {some_table} FORMAT Avro" ``` @@ -899,20 +881,20 @@ Unused fields are skipped. Data types of a ClickHouse table columns can differ from the corresponding fields of the Avro data inserted. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) the data to corresponding column type. -### Selecting Data +### Selecting Data {#selecting-data} To select data from ClickHouse table into an Avro file: -```bash +``` bash $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Avro" > file.avro ``` Column names must: -- start with `[A-Za-z_]` -- subsequently contain only `[A-Za-z0-9_]` +- start with `[A-Za-z_]` +- subsequently contain only `[A-Za-z0-9_]` -Output Avro file compression and sync interval can be configured with [output_format_avro_codec](../operations/settings/settings.md#settings-output_format_avro_codec) and [output_format_avro_sync_interval](../operations/settings/settings.md#settings-output_format_avro_sync_interval) respectively. +Output Avro file compression and sync interval can be configured with [output\_format\_avro\_codec](../operations/settings/settings.md#settings-output_format_avro_codec) and [output\_format\_avro\_sync\_interval](../operations/settings/settings.md#settings-output_format_avro_sync_interval) respectively. ## AvroConfluent {#data-format-avro-confluent} @@ -922,17 +904,17 @@ Each Avro message embeds a schema id that can be resolved to the actual schema w Schemas are cached once resolved. 
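As a point of reference (an assumption about the Confluent framing, not something required by ClickHouse itself): each framed message starts with a one-byte magic `0x00` and a 4-byte big-endian schema id, followed by the Avro-encoded payload. A rough way to peek at the embedded schema id of a single message, reusing the `kafkacat` invocation style from the Usage example below:

``` bash
# Dump the first 5 bytes of one message: magic byte 0x00 plus the 4-byte schema id.
kafkacat -b kafka-broker -C -t topic1 -o beginning -c 1 -f '%s' | xxd -l 5
```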
-Schema Registry URL is configured with [format_avro_schema_registry_url](../operations/settings/settings.md#settings-format_avro_schema_registry_url) +Schema Registry URL is configured with [format\_avro\_schema\_registry\_url](../operations/settings/settings.md#settings-format_avro_schema_registry_url) -### Data Types Matching +### Data Types Matching {#data-types-matching-1} Same as [Avro](#data-format-avro) -### Usage +### Usage {#usage} To quickly verify schema resolution you can use [kafkacat](https://github.com/edenhill/kafkacat) with [clickhouse-local](../operations/utils/clickhouse-local.md): -```bash +``` bash $ kafkacat -b kafka-broker -C -t topic1 -o beginning -f '%s' -c 3 | clickhouse-local --input-format AvroConfluent --format_avro_schema_registry_url 'http://schema-registry' -S "field1 Int64, field2 String" -q 'select * from table' 1 a 2 b @@ -940,7 +922,8 @@ $ kafkacat -b kafka-broker -C -t topic1 -o beginning -f '%s' -c 3 | clickhouse- ``` To use `AvroConfluent` with [Kafka](../operations/table_engines/kafka.md): -```sql + +``` sql CREATE TABLE topic1_stream ( field1 String, @@ -959,34 +942,33 @@ SELECT * FROM topic1_stream; ``` !!! note "Warning" - Setting `format_avro_schema_registry_url` needs to be configured in `users.xml` to maintain it's value after a restart. - + Setting `format_avro_schema_registry_url` needs to be configured in `users.xml` to maintain it’s value after a restart. ## Parquet {#data-format-parquet} [Apache Parquet](http://parquet.apache.org/) is a columnar storage format widespread in the Hadoop ecosystem. ClickHouse supports read and write operations for this format. -### Data Types Matching +### Data Types Matching {#data-types-matching-2} The table below shows supported data types and how they match ClickHouse [data types](../data_types/index.md) in `INSERT` and `SELECT` queries. 
-| Parquet data type (`INSERT`) | ClickHouse data type | Parquet data type (`SELECT`) | -| -------------------- | ------------------ | ---- | -| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) | `UINT8` | -| `INT8` | [Int8](../data_types/int_uint.md) | `INT8` | -| `UINT16` | [UInt16](../data_types/int_uint.md) | `UINT16` | -| `INT16` | [Int16](../data_types/int_uint.md) | `INT16` | -| `UINT32` | [UInt32](../data_types/int_uint.md) | `UINT32` | -| `INT32` | [Int32](../data_types/int_uint.md) | `INT32` | -| `UINT64` | [UInt64](../data_types/int_uint.md) | `UINT64` | -| `INT64` | [Int64](../data_types/int_uint.md) | `INT64` | -| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) | `FLOAT` | -| `DOUBLE` | [Float64](../data_types/float.md) | `DOUBLE` | -| `DATE32` | [Date](../data_types/date.md) | `UINT16` | -| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) | `UINT32` | -| `STRING`, `BINARY` | [String](../data_types/string.md) | `STRING` | -| — | [FixedString](../data_types/fixedstring.md) | `STRING` | -| `DECIMAL` | [Decimal](../data_types/decimal.md) | `DECIMAL` | +| Parquet data type (`INSERT`) | ClickHouse data type | Parquet data type (`SELECT`) | +|------------------------------|---------------------------------------------|------------------------------| +| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) | `UINT8` | +| `INT8` | [Int8](../data_types/int_uint.md) | `INT8` | +| `UINT16` | [UInt16](../data_types/int_uint.md) | `UINT16` | +| `INT16` | [Int16](../data_types/int_uint.md) | `INT16` | +| `UINT32` | [UInt32](../data_types/int_uint.md) | `UINT32` | +| `INT32` | [Int32](../data_types/int_uint.md) | `INT32` | +| `UINT64` | [UInt64](../data_types/int_uint.md) | `UINT64` | +| `INT64` | [Int64](../data_types/int_uint.md) | `INT64` | +| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) | `FLOAT` | +| `DOUBLE` | [Float64](../data_types/float.md) | `DOUBLE` | +| `DATE32` | [Date](../data_types/date.md) | `UINT16` | +| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) | `UINT32` | +| `STRING`, `BINARY` | [String](../data_types/string.md) | `STRING` | +| — | [FixedString](../data_types/fixedstring.md) | `STRING` | +| `DECIMAL` | [Decimal](../data_types/decimal.md) | `DECIMAL` | ClickHouse supports configurable precision of `Decimal` type. The `INSERT` query treats the Parquet `DECIMAL` type as the ClickHouse `Decimal128` type. @@ -994,17 +976,17 @@ Unsupported Parquet data types: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, Data types of a ClickHouse table columns can differ from the corresponding fields of the Parquet data inserted. When inserting data, ClickHouse interprets data types according to the table above and then [cast](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) the data to that data type which is set for the ClickHouse table column. 
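A hypothetical illustration of that casting behaviour (the table name, file name and column layout below are invented for the example; the insert command itself mirrors the one in the next subsection):

``` bash
# Suppose the Parquet file stores "user_id" as INT32 while the ClickHouse column is UInt64;
# the values are read as Int32 per the table above and then cast to UInt64 on insert.
clickhouse-client --query="CREATE TABLE parquet_demo (user_id UInt64, name String) ENGINE = Memory"
cat users.parquet | clickhouse-client --query="INSERT INTO parquet_demo FORMAT Parquet"
```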
-### Inserting and Selecting Data +### Inserting and Selecting Data {#inserting-and-selecting-data} You can insert Parquet data from a file into ClickHouse table by the following command: -```bash +``` bash $ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet" ``` You can select data from a ClickHouse table and save them into some file in the Parquet format by the following command: -```bash +``` bash $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_file.pq} ``` @@ -1014,38 +996,38 @@ To exchange data with Hadoop, you can use [HDFS table engine](../operations/tabl [Apache ORC](https://orc.apache.org/) is a columnar storage format widespread in the Hadoop ecosystem. You can only insert data in this format to ClickHouse. -### Data Types Matching +### Data Types Matching {#data-types-matching-3} The table below shows supported data types and how they match ClickHouse [data types](../data_types/index.md) in `INSERT` queries. -| ORC data type (`INSERT`) | ClickHouse data type | -| -------------------- | ------------------ | -| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) | -| `INT8` | [Int8](../data_types/int_uint.md) | -| `UINT16` | [UInt16](../data_types/int_uint.md) | -| `INT16` | [Int16](../data_types/int_uint.md) | -| `UINT32` | [UInt32](../data_types/int_uint.md) | -| `INT32` | [Int32](../data_types/int_uint.md) | -| `UINT64` | [UInt64](../data_types/int_uint.md) | -| `INT64` | [Int64](../data_types/int_uint.md) | -| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) | -| `DOUBLE` | [Float64](../data_types/float.md) | -| `DATE32` | [Date](../data_types/date.md) | -| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) | -| `STRING`, `BINARY` | [String](../data_types/string.md) | -| `DECIMAL` | [Decimal](../data_types/decimal.md) | +| ORC data type (`INSERT`) | ClickHouse data type | +|--------------------------|---------------------------------------| +| `UINT8`, `BOOL` | [UInt8](../data_types/int_uint.md) | +| `INT8` | [Int8](../data_types/int_uint.md) | +| `UINT16` | [UInt16](../data_types/int_uint.md) | +| `INT16` | [Int16](../data_types/int_uint.md) | +| `UINT32` | [UInt32](../data_types/int_uint.md) | +| `INT32` | [Int32](../data_types/int_uint.md) | +| `UINT64` | [UInt64](../data_types/int_uint.md) | +| `INT64` | [Int64](../data_types/int_uint.md) | +| `FLOAT`, `HALF_FLOAT` | [Float32](../data_types/float.md) | +| `DOUBLE` | [Float64](../data_types/float.md) | +| `DATE32` | [Date](../data_types/date.md) | +| `DATE64`, `TIMESTAMP` | [DateTime](../data_types/datetime.md) | +| `STRING`, `BINARY` | [String](../data_types/string.md) | +| `DECIMAL` | [Decimal](../data_types/decimal.md) | ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type. Unsupported ORC data types: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. -The data types of ClickHouse table columns don't have to match the corresponding ORC data fields. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) the data to the data type set for the ClickHouse table column. +The data types of ClickHouse table columns don’t have to match the corresponding ORC data fields. 
When inserting data, ClickHouse interprets data types according to the table above and then [casts](../query_language/functions/type_conversion_functions/#type_conversion_function-cast) the data to the data type set for the ClickHouse table column. -### Inserting Data +### Inserting Data {#inserting-data-1} You can insert ORC data from a file into ClickHouse table by the following command: -```bash +``` bash $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` @@ -1054,9 +1036,9 @@ To exchange data with Hadoop, you can use [HDFS table engine](../operations/tabl ## Format Schema {#formatschema} The file name containing the format schema is set by the setting `format_schema`. -It's required to set this setting when it is used one of the formats `Cap'n Proto` and `Protobuf`. +It’s required to set this setting when it is used one of the formats `Cap'n Proto` and `Protobuf`. The format schema is a combination of a file name and the name of a message type in this file, delimited by colon, -e.g. `schemafile.proto:MessageType`. +e.g. `schemafile.proto:MessageType`. If the file has the standard extension for the format (for example, `.proto` for `Protobuf`), it can be omitted and in this case the format schema looks like `schemafile:MessageType`. @@ -1065,18 +1047,17 @@ can contain an absolute path or a path relative to the current directory on the If you use the client in the batch mode, the path to the schema must be relative due to security reasons. If you input or output data via the [HTTP interface](../interfaces/http.md) the file name specified in the format schema -should be located in the directory specified in [format_schema_path](../operations/server_settings/settings.md#server_settings-format_schema_path) +should be located in the directory specified in [format\_schema\_path](../operations/server_settings/settings.md#server_settings-format_schema_path) in the server configuration. [Original article](https://clickhouse.tech/docs/en/interfaces/formats/) ## Skipping Errors {#skippingerrors} -Some formats such as `CSV`, `TabSeparated`, `TSKV`, `JSONEachRow`, `Template`, `CustomSeparated` and `Protobuf` can skip broken row if parsing error occurred and continue parsing from the beginning of next row. See [input_format_allow_errors_num](../operations/settings/settings.md#settings-input_format_allow_errors_num) and -[input_format_allow_errors_ratio](../operations/settings/settings.md#settings-input_format_allow_errors_ratio) settings. +Some formats such as `CSV`, `TabSeparated`, `TSKV`, `JSONEachRow`, `Template`, `CustomSeparated` and `Protobuf` can skip broken row if parsing error occurred and continue parsing from the beginning of next row. See [input\_format\_allow\_errors\_num](../operations/settings/settings.md#settings-input_format_allow_errors_num) and +[input\_format\_allow\_errors\_ratio](../operations/settings/settings.md#settings-input_format_allow_errors_ratio) settings. Limitations: - - In case of parsing error `JSONEachRow` skips all data until the new line (or EOF), so rows must be delimited by `\n` to count errors correctly. - - `Template` and `CustomSeparated` use delimiter after the last column and delimiter between rows to find the beginning of next row, so skipping errors works only if at least one of them is not empty. - +- In case of parsing error `JSONEachRow` skips all data until the new line (or EOF), so rows must be delimited by `\n` to count errors correctly. 
+- `Template` and `CustomSeparated` use delimiter after the last column and delimiter between rows to find the beginning of next row, so skipping errors works only if at least one of them is not empty. [来源文章](https://clickhouse.tech/docs/zh/interfaces/formats/) diff --git a/docs/zh/interfaces/http.md b/docs/zh/interfaces/http.md index a775d489e05..1dfbe87b7e0 100644 --- a/docs/zh/interfaces/http.md +++ b/docs/zh/interfaces/http.md @@ -1,11 +1,11 @@ -# HTTP 客户端 +# HTTP 客户端 {#http-ke-hu-duan} HTTP 接口可以让你通过任何平台和编程语言来使用 ClickHouse。我们用 Java 和 Perl 以及 shell 脚本来访问它。在其他的部门中,HTTP 接口会用在 Perl,Python 以及 Go 中。HTTP 接口比 TCP 原生接口更为局限,但是却有更好的兼容性。 默认情况下,clickhouse-server 会在端口 8123 上监控 HTTP 请求(这可以在配置中修改)。 -如果你发送了一个不带参数的 GET 请求,它会返回一个字符串 "Ok."(结尾有换行)。可以将它用在健康检查脚本中。 +如果你发送了一个不带参数的 GET 请求,它会返回一个字符串 «Ok.»(结尾有换行)。可以将它用在健康检查脚本中。 -```bash +``` bash $ curl 'http://localhost:8123/' Ok. ``` @@ -19,7 +19,7 @@ Ok. Examples: -```bash +``` bash $ curl 'http://localhost:8123/?query=SELECT%201' 1 @@ -39,7 +39,7 @@ Date: Fri, 16 Nov 2012 19:21:50 GMT 可以看到,curl 命令由于空格需要 URL 转义,所以不是很方便。尽管 wget 命令对url做了 URL 转义,但我们并不推荐使用他,因为在 HTTP 1.1 协议下使用 keep-alive 和 Transfer-Encoding: chunked 头部设置它并不能很好的工作。 -```bash +``` bash $ echo 'SELECT 1' | curl 'http://localhost:8123/' --data-binary @- 1 @@ -53,17 +53,17 @@ $ echo '1' | curl 'http://localhost:8123/?query=SELECT' --data-binary @- 如果一部分请求是通过参数发送的,另外一部分通过 POST 主体发送,两部分查询之间会一行空行插入。 错误示例: -```bash +``` bash $ echo 'ECT 1' | curl 'http://localhost:8123/?query=SEL' --data-binary @- Code: 59, e.displayText() = DB::Exception: Syntax error: failed at position 0: SEL ECT 1 , expected One of: SHOW TABLES, SHOW DATABASES, SELECT, INSERT, CREATE, ATTACH, RENAME, DROP, DETACH, USE, SET, OPTIMIZE., e.what() = DB::Exception ``` -默认情况下,返回的数据是 TabSeparated 格式的,更多信息,见 "[数据格式]" 部分。 +默认情况下,返回的数据是 TabSeparated 格式的,更多信息,见 «\[数据格式\]» 部分。 可以使用 FORMAT 设置查询来请求不同格式。 -```bash +``` bash $ echo 'SELECT 1 FORMAT Pretty' | curl 'http://localhost:8123/?' --data-binary @- ┏━━━┓ ┃ 1 ┃ @@ -76,37 +76,37 @@ INSERT 必须通过 POST 方法来插入数据。这种情况下,你可以将 示例: 创建一个表: -```bash +``` bash echo 'CREATE TABLE t (a UInt8) ENGINE = Memory' | POST 'http://localhost:8123/' ``` 使用类似 INSERT 的查询来插入数据: -```bash +``` bash echo 'INSERT INTO t VALUES (1),(2),(3)' | POST 'http://localhost:8123/' ``` 数据可以从查询中单独发送: -```bash +``` bash echo '(4),(5),(6)' | POST 'http://localhost:8123/?query=INSERT INTO t VALUES' ``` 可以指定任何数据格式。值的格式和写入表 `t` 的值的格式相同: -```bash +``` bash echo '(7),(8),(9)' | POST 'http://localhost:8123/?query=INSERT INTO t FORMAT Values' ``` 若要插入 tab 分割的数据,需要指定对应的格式: -```bash +``` bash echo -ne '10\n11\n12\n' | POST 'http://localhost:8123/?query=INSERT INTO t FORMAT TabSeparated' ``` 从表中读取内容。由于查询处理是并行的,数据以随机顺序输出。 -```bash +``` bash $ GET 'http://localhost:8123/?query=SELECT a FROM t' 7 8 @@ -124,7 +124,7 @@ $ GET 'http://localhost:8123/?query=SELECT a FROM t' 删除表。 -```bash +``` bash POST 'http://localhost:8123/?query=DROP TABLE t' ``` @@ -137,9 +137,9 @@ POST 'http://localhost:8123/?query=DROP TABLE t' 可以通过为每份数据进行立即压缩来减少大规模数据传输中的网络压力。 -可以指定 'database' 参数来指定默认的数据库。 +可以指定 ‘database’ 参数来指定默认的数据库。 -```bash +``` bash $ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?database=system' --data-binary @- 0 1 @@ -157,24 +157,28 @@ $ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?data 用户名密码可以通过以下两种方式指定: -1. 通过 HTTP Basic Authentication。示例: +1. 通过 HTTP Basic Authentication。示例: -```bash + + +``` bash echo 'SELECT 1' | curl 'http://user:password@localhost:8123/' -d @- ``` -2. 
通过 URL 参数 中的 'user' 和 'password'。示例: +1. 通过 URL 参数 中的 ‘user’ 和 ‘password’。示例: -```bash + + +``` bash echo 'SELECT 1' | curl 'http://localhost:8123/?user=user&password=password' -d @- ``` 如果用户名没有指定,默认的用户是 `default`。如果密码没有指定,默认会使用空密码。 可以使用 URL 参数指定配置或者设置整个配置文件来处理单个查询。示例:`http://localhost:8123/?profile=web&max_rows_to_read=1000000000&query=SELECT+1` -更多信息,参见 "[设置](../operations/settings/index.md#settings)" 部分。 +更多信息,参见 «[设置](../operations/settings/index.md#settings)» 部分。 -```bash +``` bash $ echo 'SELECT number FROM system.numbers LIMIT 10' | curl 'http://localhost:8123/?' --data-binary @- 0 1 @@ -188,17 +192,17 @@ $ echo 'SELECT number FROM system.numbers LIMIT 10' | curl 'http://localhost:812 9 ``` -更多关于其他参数的信息,参见 "[设置](../operations/settings/index.md#settings)" 部分。 +更多关于其他参数的信息,参见 «[设置](../operations/settings/index.md#settings)» 部分。 相比起 TCP 原生接口,HTTP 接口不支持会话和会话设置的概念,不允许中止查询(准确地说,只在少数情况下允许),不显示查询处理的进展。执行解析和数据格式化都是在服务端处理,网络上会比 TCP 原生接口更低效。 -可选的 `query_id` 参数可能当做 query ID 传入(或者任何字符串)。更多信息,参见 "[设置 replace_running_query](../operations/settings/settings.md)" 部分。 +可选的 `query_id` 参数可能当做 query ID 传入(或者任何字符串)。更多信息,参见 «[设置 replace\_running\_query](../operations/settings/settings.md)» 部分。 -可选的 `quota_key` 参数可能当做 quota key 传入(或者任何字符串)。更多信息,参见 "[配额](../operations/quotas.md#quotas)" 部分。 +可选的 `quota_key` 参数可能当做 quota key 传入(或者任何字符串)。更多信息,参见 «[配额](../operations/quotas.md#quotas)» 部分。 -HTTP 接口允许传入额外的数据(外部临时表)来查询。更多信息,参见 "[外部数据查询处理](../operations/table_engines/external_data.md)" 部分。 +HTTP 接口允许传入额外的数据(外部临时表)来查询。更多信息,参见 «[外部数据查询处理](../operations/table_engines/external_data.md)» 部分。 -## 响应缓冲 +## 响应缓冲 {#xiang-ying-huan-chong} 可以在服务器端启用响应缓冲。提供了 `buffer_size` 和 `wait_end_of_query` 两个URL 参数来达此目的。 @@ -208,7 +212,7 @@ HTTP 接口允许传入额外的数据(外部临时表)来查询。更多信 示例: -```bash +``` bash curl -sS 'http://localhost:8123/?max_result_bytes=4000000&buffer_size=3000000&wait_end_of_query=1' -d 'SELECT toUInt8(number) FROM system.numbers LIMIT 9000000 FORMAT RowBinary' ``` diff --git a/docs/zh/interfaces/index.md b/docs/zh/interfaces/index.md index 577cf707e57..df0313cc3d2 100644 --- a/docs/zh/interfaces/index.md +++ b/docs/zh/interfaces/index.md @@ -2,18 +2,18 @@ ClickHouse提供了两个网络接口(两者都可以选择包装在TLS中以提高安全性): -* [HTTP](http.md),记录在案,易于使用. -* [本地TCP](tcp.md),这有较少的开销. +- [HTTP](http.md),记录在案,易于使用. +- [本地TCP](tcp.md),这有较少的开销. 
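As a minimal illustration of the two interfaces (a hedged sketch assuming a local server on the default ports, 8123 for HTTP and 9000 for the native protocol), the same query can be sent either way:

``` bash
# HTTP interface: pass the query in the POST body with curl
echo 'SELECT version()' | curl 'http://localhost:8123/' --data-binary @-

# Native TCP interface: the bundled command-line client
clickhouse-client --query="SELECT version()"
```

The HTTP form is the easiest to reach from any platform, while the native client has less overhead, which matches the trade-off noted above.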
在大多数情况下,建议使用适当的工具或库,而不是直接与这些工具或库进行交互。 Yandex的官方支持如下: -* [命令行客户端](cli.md) -* [JDBC驱动程序](jdbc.md) -* [ODBC驱动程序](odbc.md) -* [C++客户端库](cpp.md) +\* [命令行客户端](cli.md) +\* [JDBC驱动程序](jdbc.md) +\* [ODBC驱动程序](odbc.md) +\* [C++客户端库](cpp.md) 还有许多第三方库可供使用ClickHouse: -* [客户端库](third-party/client_libraries.md) -* [集成](third-party/integrations.md) -* [可视界面](third-party/gui.md) +\* [客户端库](third-party/client_libraries.md) +\* [集成](third-party/integrations.md) +\* [可视界面](third-party/gui.md) [来源文章](https://clickhouse.tech/docs/zh/interfaces/) diff --git a/docs/zh/interfaces/jdbc.md b/docs/zh/interfaces/jdbc.md index 1a38d71c693..a2aac229cca 100644 --- a/docs/zh/interfaces/jdbc.md +++ b/docs/zh/interfaces/jdbc.md @@ -1,9 +1,8 @@ -# JDBC 驱动 - -* **[官方JDBC 的驱动](https://github.com/ClickHouse/clickhouse-jdbc)** -* 三方提供的 JDBC 驱动: - * [ClickHouse-Native-JDBC](https://github.com/housepower/ClickHouse-Native-JDBC) - * [clickhouse4j](https://github.com/blynkkk/clickhouse4j) +# JDBC 驱动 {#jdbc-qu-dong} +- **[官方JDBC 的驱动](https://github.com/ClickHouse/clickhouse-jdbc)** +- 三方提供的 JDBC 驱动: + - [ClickHouse-Native-JDBC](https://github.com/housepower/ClickHouse-Native-JDBC) + - [clickhouse4j](https://github.com/blynkkk/clickhouse4j) [来源文章](https://clickhouse.tech/docs/zh/interfaces/jdbc/) diff --git a/docs/zh/interfaces/odbc.md b/docs/zh/interfaces/odbc.md index b3014e718c4..b45c54f8507 100644 --- a/docs/zh/interfaces/odbc.md +++ b/docs/zh/interfaces/odbc.md @@ -1,5 +1,5 @@ -# ODBC 驱动 +# ODBC 驱动 {#odbc-qu-dong} -- ClickHouse官方有 ODBC 的驱动。 见 [这里](https://github.com/ClickHouse/clickhouse-odbc)。 +- ClickHouse官方有 ODBC 的驱动。 见 [这里](https://github.com/ClickHouse/clickhouse-odbc)。 [来源文章](https://clickhouse.tech/docs/zh/interfaces/odbc/) diff --git a/docs/zh/interfaces/tcp.md b/docs/zh/interfaces/tcp.md index a5adda268c9..9fd4e6b108f 100644 --- a/docs/zh/interfaces/tcp.md +++ b/docs/zh/interfaces/tcp.md @@ -1,4 +1,4 @@ -# 原生客户端接口(TCP) +# 原生客户端接口(TCP) {#yuan-sheng-ke-hu-duan-jie-kou-tcp} 本机协议用于 [命令行客户端](cli.md),用于分布式查询处理期间的服务器间通信,以及其他C ++程序。 不幸的是,本机ClickHouse协议还没有正式的规范,但它可以从ClickHouse源代码进行逆向工程 [从这里开始](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/src/Client))和/或拦截和分析TCP流量。 diff --git a/docs/zh/interfaces/third-party/client_libraries.md b/docs/zh/interfaces/third-party/client_libraries.md index 54675acf4b1..bf74c490092 100644 --- a/docs/zh/interfaces/third-party/client_libraries.md +++ b/docs/zh/interfaces/third-party/client_libraries.md @@ -1,49 +1,49 @@ -# 第三方开发的库 +# 第三方开发的库 {#di-san-fang-kai-fa-de-ku} !!! 
warning "放弃" Yandex不维护下面列出的库,也没有进行任何广泛的测试以确保其质量。 -- Python - - [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm) - - [clickhouse-driver](https://github.com/mymarilyn/clickhouse-driver) - - [clickhouse-client](https://github.com/yurial/clickhouse-client) -- PHP - - [SeasClick](https://github.com/SeasX/SeasClick) - - [phpClickHouse](https://github.com/smi2/phpClickHouse) - - [clickhouse-php-client](https://github.com/8bitov/clickhouse-php-client) - - [clickhouse-client](https://github.com/bozerkins/clickhouse-client) - - [PhpClickHouseClient](https://github.com/SevaCode/PhpClickHouseClient) -- Go - - [clickhouse](https://github.com/kshvakov/clickhouse/) - - [go-clickhouse](https://github.com/roistat/go-clickhouse) - - [mailrugo-clickhouse](https://github.com/mailru/go-clickhouse) - - [golang-clickhouse](https://github.com/leprosus/golang-clickhouse) -- NodeJs - - [clickhouse (NodeJs)](https://github.com/TimonKK/clickhouse) - - [node-clickhouse](https://github.com/apla/node-clickhouse) -- Perl - - [perl-DBD-ClickHouse](https://github.com/elcamlost/perl-DBD-ClickHouse) - - [HTTP-ClickHouse](https://metacpan.org/release/HTTP-ClickHouse) - - [AnyEvent-ClickHouse](https://metacpan.org/release/AnyEvent-ClickHouse) -- Ruby - - [ClickHouse (Ruby)](https://github.com/shlima/click_house) -- R - - [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r) - - [RClickhouse](https://github.com/IMSMWU/RClickhouse) -- Java - - [clickhouse-client-java](https://github.com/VirtusAI/clickhouse-client-java) -- Scala - - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) -- Kotlin - - [AORM](https://github.com/TanVD/AORM) -- C# - - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) - - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) - - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) -- Elixir - - [clickhousex](https://github.com/appodeal/clickhousex/) -- Nim - - [nim-clickhouse](https://github.com/leonardoce/nim-clickhouse) - +- Python + - [infi.clickhouse\_orm](https://github.com/Infinidat/infi.clickhouse_orm) + - [clickhouse-driver](https://github.com/mymarilyn/clickhouse-driver) + - [clickhouse-client](https://github.com/yurial/clickhouse-client) +- PHP + - [smi2/phpclickhouse](https://packagist.org/packages/smi2/phpClickHouse) + - [8bitov/clickhouse-php-client](https://packagist.org/packages/8bitov/clickhouse-php-client) + - [bozerkins/clickhouse-client](https://packagist.org/packages/bozerkins/clickhouse-client) + - [simpod/clickhouse-client](https://packagist.org/packages/simpod/clickhouse-client) + - [seva-code/php-click-house-client](https://packagist.org/packages/seva-code/php-click-house-client) + - [SeasClick C++ client](https://github.com/SeasX/SeasClick) +- Go + - [clickhouse](https://github.com/kshvakov/clickhouse/) + - [go-clickhouse](https://github.com/roistat/go-clickhouse) + - [mailrugo-clickhouse](https://github.com/mailru/go-clickhouse) + - [golang-clickhouse](https://github.com/leprosus/golang-clickhouse) +- NodeJs + - [clickhouse (NodeJs)](https://github.com/TimonKK/clickhouse) + - [node-clickhouse](https://github.com/apla/node-clickhouse) +- Perl + - [perl-DBD-ClickHouse](https://github.com/elcamlost/perl-DBD-ClickHouse) + - [HTTP-ClickHouse](https://metacpan.org/release/HTTP-ClickHouse) + - [AnyEvent-ClickHouse](https://metacpan.org/release/AnyEvent-ClickHouse) +- Ruby + - [ClickHouse (Ruby)](https://github.com/shlima/click_house) +- R + - 
[clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r) + - [RClickhouse](https://github.com/IMSMWU/RClickhouse) +- Java + - [clickhouse-client-java](https://github.com/VirtusAI/clickhouse-client-java) +- Scala + - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) +- Kotlin + - [AORM](https://github.com/TanVD/AORM) +- C\# + - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) + - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) + - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) +- Elixir + - [clickhousex](https://github.com/appodeal/clickhousex/) +- Nim + - [nim-clickhouse](https://github.com/leonardoce/nim-clickhouse) [来源文章](https://clickhouse.tech/docs/zh/interfaces/third-party/client_libraries/) diff --git a/docs/zh/interfaces/third-party/gui.md b/docs/zh/interfaces/third-party/gui.md index 64234f4f8de..83656d18858 100644 --- a/docs/zh/interfaces/third-party/gui.md +++ b/docs/zh/interfaces/third-party/gui.md @@ -1,34 +1,33 @@ -# 第三方开发的可视化界面 +# 第三方开发的可视化界面 {#di-san-fang-kai-fa-de-ke-shi-hua-jie-mian} -## 开源 +## 开源 {#kai-yuan} -### Tabix +### Tabix {#tabix} ClickHouse Web 界面 [Tabix](https://github.com/tabixio/tabix). 主要功能: -- 浏览器直接连接 ClickHouse,不需要安装其他软件。 -- 高亮语法的编辑器。 -- 自动命令补全。 -- 查询命令执行的图形分析工具。 -- 配色方案选项。 +- 浏览器直接连接 ClickHouse,不需要安装其他软件。 +- 高亮语法的编辑器。 +- 自动命令补全。 +- 查询命令执行的图形分析工具。 +- 配色方案选项。 [Tabix 文档](https://tabix.io/doc/). - -### HouseOps +### HouseOps {#houseops} [HouseOps](https://github.com/HouseOps/HouseOps) 是一个交互式 UI/IDE 工具,可以运行在 OSX, Linux and Windows 平台中。 主要功能: -- 查询高亮语法提示,可以以表格或 JSON 格式查看数据。 -- 支持导出 CSV 或 JSON 格式数据。 -- 支持查看查询执行的详情,支持 KILL 查询。 -- 图形化显示,支持显示数据库中所有的表和列的详细信息。 -- 快速查看列占用的空间。 -- 服务配置。 +- 查询高亮语法提示,可以以表格或 JSON 格式查看数据。 +- 支持导出 CSV 或 JSON 格式数据。 +- 支持查看查询执行的详情,支持 KILL 查询。 +- 图形化显示,支持显示数据库中所有的表和列的详细信息。 +- 快速查看列占用的空间。 +- 服务配置。 以下功能正在计划开发: - 数据库管理 @@ -38,44 +37,44 @@ ClickHouse Web 界面 [Tabix](https://github.com/tabixio/tabix). - 集群管理 - 监控副本情况以及 Kafka 引擎表 -### LightHouse +### LightHouse {#lighthouse} [LightHouse](https://github.com/VKCOM/lighthouse) 是ClickHouse的轻量级Web界面。 特征: - - 包含过滤和元数据的表列表。 - - 带有过滤和排序的表格预览。 - - 只读查询执行。 - -### DBeaver +- 包含过滤和元数据的表列表。 +- 带有过滤和排序的表格预览。 +- 只读查询执行。 + +### DBeaver {#dbeaver} [DBeaver](https://dbeaver.io/) 具有ClickHouse支持的通用桌面数据库客户端。 特征: - - 使用语法高亮显示查询开发。 - - 表格预览。 - - 自动完成。 +- 使用语法高亮显示查询开发。 +- 表格预览。 +- 自动完成。 -### clickhouse-cli +### clickhouse-cli {#clickhouse-cli} [clickhouse-cli](https://github.com/hatarist/clickhouse-cli) 是ClickHouse的替代命令行客户端,用Python 3编写。 特征: - - 自动完成。 - - 查询和数据输出的语法高亮显示。 - - 寻呼机支持数据输出。 - - 自定义PostgreSQL类命令。 +- 自动完成。 +- 查询和数据输出的语法高亮显示。 +- 寻呼机支持数据输出。 +- 自定义PostgreSQL类命令。 -### clickhouse-flamegraph +### clickhouse-flamegraph {#clickhouse-flamegraph} - [clickhouse-flamegraph](https://github.com/Slach/clickhouse-flamegraph) 是一个可视化的专业工具`system.trace_log`如[flamegraph](http://www.brendangregg.com/flamegraphs.html). + [clickhouse-flamegraph](https://github.com/Slach/clickhouse-flamegraph) 是一个可视化的专业工具`system.trace_log`如[flamegraph](http://www.brendangregg.com/flamegraphs.html). -## 商业 +## 商业 {#shang-ye} -### Holistics Software +### Holistics Software {#holistics-software} [Holistics](https://www.holistics.io/) 在2019年被Gartner FrontRunners列为可用性最高排名第二的商业智能工具之一。 Holistics是一个基于SQL的全栈数据平台和商业智能工具,用于设置您的分析流程。 @@ -87,17 +86,17 @@ ClickHouse Web 界面 [Tabix](https://github.com/tabixio/tabix). 
-数据准备和ETL功能。 -SQL数据建模支持数据的关系映射。 -### DataGrip +### DataGrip {#datagrip} [DataGrip](https://www.jetbrains.com/datagrip/) 是JetBrains的数据库IDE,专门支持ClickHouse。 它还嵌入到其他基于IntelliJ的工具中:PyCharm,IntelliJ IDEA,GoLand,PhpStorm等。 特征: - - 非常快速的代码完成。 - - ClickHouse语法高亮显示。 - - 支持ClickHouse特有的功能,例如嵌套列,表引擎。 - - 数据编辑器。 - - 重构。 - - 搜索和导航。 +- 非常快速的代码完成。 +- ClickHouse语法高亮显示。 +- 支持ClickHouse特有的功能,例如嵌套列,表引擎。 +- 数据编辑器。 +- 重构。 +- 搜索和导航。 [来源文章](https://clickhouse.tech/docs/zh/interfaces/third-party/gui/) diff --git a/docs/zh/interfaces/third-party/integrations.md b/docs/zh/interfaces/third-party/integrations.md index a37e7610656..e1aa87573c9 100644 --- a/docs/zh/interfaces/third-party/integrations.md +++ b/docs/zh/interfaces/third-party/integrations.md @@ -1,84 +1,85 @@ -# 第三方集成库 +# 第三方集成库 {#di-san-fang-ji-cheng-ku} !!! warning "声明" Yandex不维护下面列出的库,也没有进行任何广泛的测试以确保其质量。 -## 基建产品 -- 关系数据库管理系统 - - [MySQL](https://www.mysql.com) - - [ProxySQL](https://github.com/sysown/proxysql/wiki/ClickHouse-Support) - - [clickhouse-mysql-data-reader](https://github.com/Altinity/clickhouse-mysql-data-reader) - - [horgh-replicator](https://github.com/larsnovikov/horgh-replicator) - - [PostgreSQL](https://www.postgresql.org) - - [clickhousedb_fdw](https://github.com/Percona-Lab/clickhousedb_fdw) - - [infi.clickhouse_fdw](https://github.com/Infinidat/infi.clickhouse_fdw) (使用 [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm)) - - [pg2ch](https://github.com/mkabilov/pg2ch) - - [MSSQL](https://en.wikipedia.org/wiki/Microsoft_SQL_Server) - - [ClickHouseMightrator](https://github.com/zlzforever/ClickHouseMigrator) -- 消息队列 - - [Kafka](https://kafka.apache.org) - - [clickhouse_sinker](https://github.com/housepower/clickhouse_sinker) (使用 [Go client](https://github.com/kshvakov/clickhouse/)) -- 对象存储 - - [S3](https://en.wikipedia.org/wiki/Amazon_S3) - - [clickhouse-backup](https://github.com/AlexAkulov/clickhouse-backup) -- 容器编排 - - [Kubernetes](https://kubernetes.io) - - [clickhouse-operator](https://github.com/Altinity/clickhouse-operator) -- 配置管理 - - [puppet](https://puppet.com) - - [innogames/clickhouse](https://forge.puppet.com/innogames/clickhouse) - - [mfedotov/clickhouse](https://forge.puppet.com/mfedotov/clickhouse) -- 监控 - - [Graphite](https://graphiteapp.org) - - [graphouse](https://github.com/yandex/graphouse) - - [carbon-clickhouse](https://github.com/lomik/carbon-clickhouse) + - - [graphite-clickhouse](https://github.com/lomik/graphite-clickhouse) - - [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer) - optimizes staled partitions in [\*GraphiteMergeTree](../../operations/table_engines/graphitemergetree.md#graphitemergetree) if rules from [rollup configuration](../../operations/table_engines/graphitemergetree.md#rollup-configuration) could be applied - - [Grafana](https://grafana.com/) - - [clickhouse-grafana](https://github.com/Vertamedia/clickhouse-grafana) - - [Prometheus](https://prometheus.io/) - - [clickhouse_exporter](https://github.com/f1yegor/clickhouse_exporter) - - [PromHouse](https://github.com/Percona-Lab/PromHouse) - - [Nagios](https://www.nagios.org/) - - [check_clickhouse](https://github.com/exogroup/check_clickhouse/) - - [Zabbix](https://www.zabbix.com) - - [clickhouse-zabbix-template](https://github.com/Altinity/clickhouse-zabbix-template) - - [Sematext](https://sematext.com/) - - [clickhouse积分](https://github.com/sematext/sematext-agent-integrations/tree/master/clickhouse) -- 记录 - - [rsyslog](https://www.rsyslog.com/) - - 
[omclickhouse](https://www.rsyslog.com/doc/master/configuration/modules/omclickhouse.html) - - [fluentd](https://www.fluentd.org) - - [loghouse](https://github.com/flant/loghouse) (对于 [Kubernetes](https://kubernetes.io)) - - [logagent](https://www.sematext.com/logagent) - - [logagent output-plugin-clickhouse](https://sematext.com/docs/logagent/output-plugin-clickhouse/) -- 地理 - - [MaxMind](https://dev.maxmind.com/geoip/) - - [clickhouse-maxmind-geoip](https://github.com/AlexeyKupershtokh/clickhouse-maxmind-geoip) +## 基建产品 {#ji-jian-chan-pin} -## 编程语言生态系统 +- 关系数据库管理系统 + - [MySQL](https://www.mysql.com) + - [ProxySQL](https://github.com/sysown/proxysql/wiki/ClickHouse-Support) + - [clickhouse-mysql-data-reader](https://github.com/Altinity/clickhouse-mysql-data-reader) + - [horgh-replicator](https://github.com/larsnovikov/horgh-replicator) + - [PostgreSQL](https://www.postgresql.org) + - [clickhousedb\_fdw](https://github.com/Percona-Lab/clickhousedb_fdw) + - [infi.clickhouse\_fdw](https://github.com/Infinidat/infi.clickhouse_fdw) (使用 [infi.clickhouse\_orm](https://github.com/Infinidat/infi.clickhouse_orm)) + - [pg2ch](https://github.com/mkabilov/pg2ch) + - [MSSQL](https://en.wikipedia.org/wiki/Microsoft_SQL_Server) + - [ClickHouseMightrator](https://github.com/zlzforever/ClickHouseMigrator) +- 消息队列 + - [Kafka](https://kafka.apache.org) + - [clickhouse\_sinker](https://github.com/housepower/clickhouse_sinker) (使用 [Go client](https://github.com/kshvakov/clickhouse/)) +- 对象存储 + - [S3](https://en.wikipedia.org/wiki/Amazon_S3) + - [clickhouse-backup](https://github.com/AlexAkulov/clickhouse-backup) +- 容器编排 + - [Kubernetes](https://kubernetes.io) + - [clickhouse-operator](https://github.com/Altinity/clickhouse-operator) +- 配置管理 + - [puppet](https://puppet.com) + - [innogames/clickhouse](https://forge.puppet.com/innogames/clickhouse) + - [mfedotov/clickhouse](https://forge.puppet.com/mfedotov/clickhouse) +- 监控 + - [Graphite](https://graphiteapp.org) + - [graphouse](https://github.com/yandex/graphouse) + - [carbon-clickhouse](https://github.com/lomik/carbon-clickhouse) + + - [graphite-clickhouse](https://github.com/lomik/graphite-clickhouse) + - [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer) - optimizes staled partitions in [\*GraphiteMergeTree](../../operations/table_engines/graphitemergetree.md#graphitemergetree) if rules from [rollup configuration](../../operations/table_engines/graphitemergetree.md#rollup-configuration) could be applied + - [Grafana](https://grafana.com/) + - [clickhouse-grafana](https://github.com/Vertamedia/clickhouse-grafana) + - [Prometheus](https://prometheus.io/) + - [clickhouse\_exporter](https://github.com/f1yegor/clickhouse_exporter) + - [PromHouse](https://github.com/Percona-Lab/PromHouse) + - [Nagios](https://www.nagios.org/) + - [check\_clickhouse](https://github.com/exogroup/check_clickhouse/) + - [Zabbix](https://www.zabbix.com) + - [clickhouse-zabbix-template](https://github.com/Altinity/clickhouse-zabbix-template) + - [Sematext](https://sematext.com/) + - [clickhouse积分](https://github.com/sematext/sematext-agent-integrations/tree/master/clickhouse) +- 记录 + - [rsyslog](https://www.rsyslog.com/) + - [omclickhouse](https://www.rsyslog.com/doc/master/configuration/modules/omclickhouse.html) + - [fluentd](https://www.fluentd.org) + - [loghouse](https://github.com/flant/loghouse) (对于 [Kubernetes](https://kubernetes.io)) + - [logagent](https://www.sematext.com/logagent) + - [logagent 
output-plugin-clickhouse](https://sematext.com/docs/logagent/output-plugin-clickhouse/) +- 地理 + - [MaxMind](https://dev.maxmind.com/geoip/) + - [clickhouse-maxmind-geoip](https://github.com/AlexeyKupershtokh/clickhouse-maxmind-geoip) -- Python - - [SQLAlchemy](https://www.sqlalchemy.org) - - [sqlalchemy-clickhouse](https://github.com/cloudflare/sqlalchemy-clickhouse) (使用 [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm)) - - [pandas](https://pandas.pydata.org) - - [pandahouse](https://github.com/kszucs/pandahouse) -- R - - [dplyr](https://db.rstudio.com/dplyr/) - - [RClickhouse](https://github.com/IMSMWU/RClickhouse) (使用 [clickhouse-cpp](https://github.com/artpaul/clickhouse-cpp)) -- Java - - [Hadoop](http://hadoop.apache.org) - - [clickhouse-hdfs-loader](https://github.com/jaykelin/clickhouse-hdfs-loader) (使用 [JDBC](../../query_language/table_functions/jdbc.md)) -- Scala - - [Akka](https://akka.io) - - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) -- C# - - [ADO.NET](https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/ado-net-overview) - - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) - - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) - - [ClickHouse.Net.Migrations](https://github.com/ilyabreev/ClickHouse.Net.Migrations) -- Elixir - - [Ecto](https://github.com/elixir-ecto/ecto) - - [clickhouse_ecto](https://github.com/appodeal/clickhouse_ecto) +## 编程语言生态系统 {#bian-cheng-yu-yan-sheng-tai-xi-tong} + +- Python + - [SQLAlchemy](https://www.sqlalchemy.org) + - [sqlalchemy-clickhouse](https://github.com/cloudflare/sqlalchemy-clickhouse) (使用 [infi.clickhouse\_orm](https://github.com/Infinidat/infi.clickhouse_orm)) + - [pandas](https://pandas.pydata.org) + - [pandahouse](https://github.com/kszucs/pandahouse) +- R + - [dplyr](https://db.rstudio.com/dplyr/) + - [RClickhouse](https://github.com/IMSMWU/RClickhouse) (使用 [clickhouse-cpp](https://github.com/artpaul/clickhouse-cpp)) +- Java + - [Hadoop](http://hadoop.apache.org) + - [clickhouse-hdfs-loader](https://github.com/jaykelin/clickhouse-hdfs-loader) (使用 [JDBC](../../query_language/table_functions/jdbc.md)) +- Scala + - [Akka](https://akka.io) + - [clickhouse-scala-client](https://github.com/crobox/clickhouse-scala-client) +- C\# + - [ADO.NET](https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/ado-net-overview) + - [ClickHouse.Ado](https://github.com/killwort/ClickHouse-Net) + - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) + - [ClickHouse.Net.Migrations](https://github.com/ilyabreev/ClickHouse.Net.Migrations) +- Elixir + - [Ecto](https://github.com/elixir-ecto/ecto) + - [clickhouse\_ecto](https://github.com/appodeal/clickhouse_ecto) [来源文章](https://clickhouse.tech/docs/zh/interfaces/third-party/integrations/) diff --git a/docs/zh/interfaces/third-party/proxy.md b/docs/zh/interfaces/third-party/proxy.md index df92bd4dbe5..727bff00cbb 100644 --- a/docs/zh/interfaces/third-party/proxy.md +++ b/docs/zh/interfaces/third-party/proxy.md @@ -1,4 +1,4 @@ -# 来自第三方开发人员的代理服务器 +# 来自第三方开发人员的代理服务器 {#lai-zi-di-san-fang-kai-fa-ren-yuan-de-dai-li-fu-wu-qi} [chproxy](https://github.com/Vertamedia/chproxy) 是ClickHouse数据库的http代理和负载均衡器。 @@ -6,11 +6,11 @@ *每用户路由和响应缓存。 *灵活的限制。 -*自动SSL证书续订。 +\*自动SSL证书续订。 在Go中实现。 -## KittenHouse +## KittenHouse {#kittenhouse} [KittenHouse](https://github.com/VKCOM/kittenhouse) 设计为ClickHouse和应用程序服务器之间的本地代理,以防在应用程序端缓冲INSERT数据是不可能或不方便的。 @@ -18,11 +18,11 @@ *内存和磁盘数据缓冲。 *每表路由。 -*负载平衡和健康检查。 +\*负载平衡和健康检查。 在Go中实现。 -## 
ClickHouse-Bulk +## ClickHouse-Bulk {#clickhouse-bulk} [ClickHouse-Bulk](https://github.com/nikepan/clickhouse-bulk) 是一个简单的ClickHouse插入收集器。 @@ -30,7 +30,7 @@ *分组请求并按阈值或间隔发送。 *多个远程服务器。 -*基本身份验证。 +\*基本身份验证。 在Go中实现。 diff --git a/docs/zh/introduction/distinctive_features.md b/docs/zh/introduction/distinctive_features.md index 394627f1d87..250a1a20e87 100644 --- a/docs/zh/introduction/distinctive_features.md +++ b/docs/zh/introduction/distinctive_features.md @@ -1,61 +1,61 @@ -# ClickHouse的独特功能 +# ClickHouse的独特功能 {#clickhousede-du-te-gong-neng} -## 真正的列式数据库管理系统 +## 真正的列式数据库管理系统 {#zhen-zheng-de-lie-shi-shu-ju-ku-guan-li-xi-tong} -在一个真正的列式数据库管理系统中,除了数据本身外不应该存在其他额外的数据。这意味着为了避免在值旁边存储它们的长度“number”,你必须支持固定长度数值类型。例如,10亿个UInt8类型的数据在未压缩的情况下大约消耗1GB左右的空间,如果不是这样的话,这将对CPU的使用产生强烈影响。即使是在未压缩的情况下,紧凑的存储数据也是非常重要的,因为解压缩的速度主要取决于未压缩数据的大小。 +在一个真正的列式数据库管理系统中,除了数据本身外不应该存在其他额外的数据。这意味着为了避免在值旁边存储它们的长度«number»,你必须支持固定长度数值类型。例如,10亿个UInt8类型的数据在未压缩的情况下大约消耗1GB左右的空间,如果不是这样的话,这将对CPU的使用产生强烈影响。即使是在未压缩的情况下,紧凑的存储数据也是非常重要的,因为解压缩的速度主要取决于未压缩数据的大小。 这是非常值得注意的,因为在一些其他系统中也可以将不同的列分别进行存储,但由于对其他场景进行的优化,使其无法有效的处理分析查询。例如: HBase,BigTable,Cassandra,HyperTable。在这些系统中,你可以得到每秒数十万的吞吐能力,但是无法得到每秒几亿行的吞吐能力。 需要说明的是,ClickHouse不单单是一个数据库, 它是一个数据库管理系统。因为它允许在运行时创建表和数据库、加载数据和运行查询,而无需重新配置或重启服务。 -## 数据压缩 +## 数据压缩 {#shu-ju-ya-suo} 在一些列式数据库管理系统中(例如:InfiniDB CE 和 MonetDB) 并没有使用数据压缩。但是, 若想达到比较优异的性能,数据压缩确实起到了至关重要的作用。 -## 数据的磁盘存储 +## 数据的磁盘存储 {#shu-ju-de-ci-pan-cun-chu} 许多的列式数据库(如 SAP HANA, Google PowerDrill)只能在内存中工作,这种方式会造成比实际更多的设备预算。ClickHouse被设计用于工作在传统磁盘上的系统,它提供每GB更低的存储成本,但如果有可以使用SSD和内存,它也会合理的利用这些资源。 -## 多核心并行处理 +## 多核心并行处理 {#duo-he-xin-bing-xing-chu-li} ClickHouse会使用服务器上一切可用的资源,从而以最自然的方式并行处理大型查询。 -## 多服务器分布式处理 +## 多服务器分布式处理 {#duo-fu-wu-qi-fen-bu-shi-chu-li} 上面提到的列式数据库管理系统中,几乎没有一个支持分布式的查询处理。 在ClickHouse中,数据可以保存在不同的shard上,每一个shard都由一组用于容错的replica组成,查询可以并行地在所有shard上进行处理。这些对用户来说是透明的 -## 支持SQL +## 支持SQL {#zhi-chi-sql} ClickHouse支持基于SQL的声明式查询语言,该语言大部分情况下是与SQL标准兼容的。 支持的查询包括 GROUP BY,ORDER BY,IN,JOIN以及非相关子查询。 不支持窗口函数和相关子查询。 -## 向量引擎 +## 向量引擎 {#xiang-liang-yin-qing} 为了高效的使用CPU,数据不仅仅按列存储,同时还按向量(列的一部分)进行处理,这样可以更加高效地使用CPU。 -## 实时的数据更新 +## 实时的数据更新 {#shi-shi-de-shu-ju-geng-xin} ClickHouse支持在表中定义主键。为了使查询能够快速在主键中进行范围查找,数据总是以增量的方式有序的存储在MergeTree中。因此,数据可以持续不断地高效的写入到表中,并且写入的过程中不会存在任何加锁的行为。 -## 索引 +## 索引 {#suo-yin} 按照主键对数据进行排序,这将帮助ClickHouse在几十毫秒以内完成对数据特定值或范围的查找。 -## 适合在线查询 +## 适合在线查询 {#gua-he-zai-xian-cha-xun} 在线查询意味着在没有对数据做任何预处理的情况下以极低的延迟处理查询并将结果加载到用户的页面中。 -## 支持近似计算 +## 支持近似计算 {#zhi-chi-jin-si-ji-suan} ClickHouse提供各种各样在允许牺牲数据精度的情况下对查询进行加速的方法: -1. 用于近似计算的各类聚合函数,如:distinct values, medians, quantiles -2. 基于数据的部分样本进行近似查询。这时,仅会从磁盘检索少部分比例的数据。 -3. 不使用全部的聚合条件,通过随机选择有限个数据聚合条件进行聚合。这在数据聚合条件满足某些分布条件下,在提供相当准确的聚合结果的同时降低了计算资源的使用。 +1. 用于近似计算的各类聚合函数,如:distinct values, medians, quantiles +2. 基于数据的部分样本进行近似查询。这时,仅会从磁盘检索少部分比例的数据。 +3. 不使用全部的聚合条件,通过随机选择有限个数据聚合条件进行聚合。这在数据聚合条件满足某些分布条件下,在提供相当准确的聚合结果的同时降低了计算资源的使用。 -## 支持数据复制和数据完整性 +## 支持数据复制和数据完整性 {#zhi-chi-shu-ju-fu-zhi-he-shu-ju-wan-zheng-xing} ClickHouse使用异步的多主复制技术。当数据被写入任何一个可用副本后,系统会在后台将数据分发给其他副本,以保证系统在不同副本上保持相同的数据。在大多数情况下ClickHouse能在故障后自动恢复,在一些少数的复杂情况下需要手动恢复。 diff --git a/docs/zh/introduction/features_considered_disadvantages.md b/docs/zh/introduction/features_considered_disadvantages.md index 9f916b2c20b..67ce1b847f7 100644 --- a/docs/zh/introduction/features_considered_disadvantages.md +++ b/docs/zh/introduction/features_considered_disadvantages.md @@ -1,7 +1,7 @@ -# ClickHouse可以认为是缺点的功能 +# ClickHouse可以认为是缺点的功能 {#clickhouseke-yi-ren-wei-shi-que-dian-de-gong-neng} -1. 没有完整的事务支持。 -2. 
缺少高频率,低延迟的修改或删除已存在数据的能力。仅能用于批量删除或修改数据,但这符合 [GDPR](https://gdpr-info.eu)。 -3. 稀疏索引使得ClickHouse不适合通过其键检索单行的点查询。 +1. 没有完整的事务支持。 +2. 缺少高频率,低延迟的修改或删除已存在数据的能力。仅能用于批量删除或修改数据,但这符合 [GDPR](https://gdpr-info.eu)。 +3. 稀疏索引使得ClickHouse不适合通过其键检索单行的点查询。 [来源文章](https://clickhouse.tech/docs/zh/introduction/features_considered_disadvantages/) diff --git a/docs/zh/introduction/history.md b/docs/zh/introduction/history.md index cd25af50942..7c1a058ea76 100644 --- a/docs/zh/introduction/history.md +++ b/docs/zh/introduction/history.md @@ -1,4 +1,4 @@ -# ClickHouse历史 +# ClickHouse历史 {#clickhouseli-shi} ClickHouse最初是为 [Yandex.Metrica](https://metrica.yandex.com/) [世界第二大Web分析平台](http://w3techs.com/technologies/overview/traffic_analysis/all) 而开发的。多年来一直作为该系统的核心组件被该系统持续使用着。目前为止,该系统在ClickHouse中有超过13万亿条记录,并且每天超过200多亿个事件被处理。它允许直接从原始数据中动态查询并生成报告。本文简要介绍了ClickHouse在其早期发展阶段的目标。 @@ -6,35 +6,35 @@ Yandex.Metrica基于用户定义的字段,对实时访问、连接会话,生 截至2014年4月,Yandex.Metrica每天跟踪大约120亿个事件(用户的点击和浏览)。为了可以创建自定义的报表,我们必须存储全部这些事件。同时,这些查询可能需要在几百毫秒内扫描数百万行的数据,或在几秒内扫描数亿行的数据。 -## Yandex.Metrica以及其他Yandex服务的使用案例 +## Yandex.Metrica以及其他Yandex服务的使用案例 {#yandex-metricayi-ji-qi-ta-yandexfu-wu-de-shi-yong-an-li} 在Yandex.Metrica中,ClickHouse被用于多个场景中。 它的主要任务是使用原始数据在线的提供各种数据报告。它使用374台服务器的集群,存储了20.3万亿行的数据。在去除重复与副本数据的情况下,压缩后的数据达到了2PB。未压缩前(TSV格式)它大概有17PB。 ClickHouse还被使用在: -- 存储来自Yandex.Metrica回话重放数据。 -- 处理中间数据 -- 与Analytics一起构建全球报表。 -- 为调试Yandex.Metrica引擎运行查询 -- 分析来自API和用户界面的日志数据 +- 存储来自Yandex.Metrica回话重放数据。 +- 处理中间数据 +- 与Analytics一起构建全球报表。 +- 为调试Yandex.Metrica引擎运行查询 +- 分析来自API和用户界面的日志数据 ClickHouse在其他Yandex服务中至少有12个安装:search verticals, Market, Direct, business analytics, mobile development, AdFox, personal services等。 -## 聚合与非聚合数据 +## 聚合与非聚合数据 {#ju-he-yu-fei-ju-he-shu-ju} 有一种流行的观点认为,想要有效的计算统计数据,必须要聚合数据,因为聚合将降低数据量。 但是数据聚合是一个有诸多限制的解决方案,例如: -- 你必须提前知道用户定义的报表的字段列表 -- 用户无法自定义报表 -- 当聚合条件过多时,可能不会减少数据,聚合是无用的。 -- 存在大量报表时,有太多的聚合变化(组合爆炸) -- 当聚合条件有非常大的基数时(如:url),数据量没有太大减少(少于两倍) -- 聚合的数据量可能会增长而不是收缩 -- 用户不会查看我们为他生成的所有报告,大部分计算将是无用的 -- 各种聚合可能违背了数据的逻辑完整性 +- 你必须提前知道用户定义的报表的字段列表 +- 用户无法自定义报表 +- 当聚合条件过多时,可能不会减少数据,聚合是无用的。 +- 存在大量报表时,有太多的聚合变化(组合爆炸) +- 当聚合条件有非常大的基数时(如:url),数据量没有太大减少(少于两倍) +- 聚合的数据量可能会增长而不是收缩 +- 用户不会查看我们为他生成的所有报告,大部分计算将是无用的 +- 各种聚合可能违背了数据的逻辑完整性 如果我们直接使用非聚合数据而不进行任何聚合时,我们的计算量可能是减少的。 @@ -46,5 +46,4 @@ OLAPServer可以很好的工作在非聚合数据上,但是它有诸多限制 为了消除OLAPServer的这些局限性,解决所有报表使用非聚合数据的问题,我们开发了ClickHouse数据库管理系统。 - [来源文章](https://clickhouse.tech/docs/en/introduction/ya_metrika_task/) diff --git a/docs/zh/introduction/performance.md b/docs/zh/introduction/performance.md index 9131b803ffd..ed44ec760bb 100644 --- a/docs/zh/introduction/performance.md +++ b/docs/zh/introduction/performance.md @@ -1,24 +1,24 @@ -# Performance +# Performance {#performance} 根据Yandex的内部测试结果,ClickHouse表现出了比同类可比较产品更优的性能。你可以在 [这里](https://clickhouse.tech/benchmark.html) 查看具体的测试结果。 许多其他的测试也证实这一点。你可以使用互联网搜索到它们,或者你也可以从 [我们收集的部分相关连接](https://clickhouse.tech/#independent-benchmarks) 中查看。 -## 单个大查询的吞吐量 +## 单个大查询的吞吐量 {#dan-ge-da-cha-xun-de-tun-tu-liang} 吞吐量可以使用每秒处理的行数或每秒处理的字节数来衡量。如果数据被放置在page cache中,则一个不太复杂的查询在单个服务器上大约能够以2-10GB/s(未压缩)的速度进行处理(对于简单的查询,速度可以达到30GB/s)。如果数据没有在page cache中的话,那么速度将取决于你的磁盘系统和数据的压缩率。例如,如果一个磁盘允许以400MB/s的速度读取数据,并且数据压缩率是3,则数据的处理速度为1.2GB/s。这意味着,如果你是在提取一个10字节的列,那么它的处理速度大约是1-2亿行每秒。 对于分布式处理,处理速度几乎是线性扩展的,但这受限于聚合或排序的结果不是那么大的情况下。 -## 处理短查询的延迟时间 +## 处理短查询的延迟时间 {#chu-li-duan-cha-xun-de-yan-chi-shi-jian} 如果一个查询使用主键并且没有太多行(几十万)进行处理,并且没有查询太多的列,那么在数据被page cache缓存的情况下,它的延迟应该小于50毫秒(在最佳的情况下应该小于10毫秒)。 否则,延迟取决于数据的查找次数。如果你当前使用的是HDD,在数据没有加载的情况下,查询所需要的延迟可以通过以下公式计算得知: 查找时间(10 ms) \* 查询的列的数量 \* 查询的数据块的数量。 -## 处理大量短查询的吞吐量 +## 
处理大量短查询的吞吐量 {#chu-li-da-liang-duan-cha-xun-de-tun-tu-liang} 在相同的情况下,ClickHouse可以在单个服务器上每秒处理数百个查询(在最佳的情况下最多可以处理数千个)。但是由于这不适用于分析型场景。因此我们建议每秒最多查询100次。 -## 数据的写入性能 +## 数据的写入性能 {#shu-ju-de-xie-ru-xing-neng} 我们建议每次写入不少于1000行的批量写入,或每秒不超过一个写入请求。当使用tab-separated格式将一份数据写入到MergeTree表中时,写入速度大约为50到200MB/s。如果您写入的数据每行为1Kb,那么写入的速度为50,000到200,000行每秒。如果您的行更小,那么写入速度将更高。为了提高写入性能,您可以使用多个INSERT进行并行写入,这将带来线性的性能提升。 diff --git a/docs/zh/operations/access_rights.md b/docs/zh/operations/access_rights.md index ebe3a2e7c0f..1c648a29f26 100644 --- a/docs/zh/operations/access_rights.md +++ b/docs/zh/operations/access_rights.md @@ -1,10 +1,10 @@ -# Access Rights +# Access Rights {#access-rights} Users and access rights are set up in the user config. This is usually `users.xml`. Users are recorded in the `users` section. Here is a fragment of the `users.xml` file: -```xml +``` xml @@ -61,15 +61,15 @@ Users are recorded in the `users` section. Here is a fragment of the `users.xml` You can see a declaration from two users: `default`and`web`. We added the `web` user separately. -The `default` user is chosen in cases when the username is not passed. The `default` user is also used for distributed query processing, if the configuration of the server or cluster doesn't specify the `user` and `password` (see the section on the [Distributed](../operations/table_engines/distributed.md) engine). +The `default` user is chosen in cases when the username is not passed. The `default` user is also used for distributed query processing, if the configuration of the server or cluster doesn’t specify the `user` and `password` (see the section on the [Distributed](../operations/table_engines/distributed.md) engine). The user that is used for exchanging information between servers combined in a cluster must not have substantial restrictions or quotas – otherwise, distributed queries will fail. -The password is specified in clear text (not recommended) or in SHA-256. The hash isn't salted. In this regard, you should not consider these passwords as providing security against potential malicious attacks. Rather, they are necessary for protection from employees. +The password is specified in clear text (not recommended) or in SHA-256. The hash isn’t salted. In this regard, you should not consider these passwords as providing security against potential malicious attacks. Rather, they are necessary for protection from employees. A list of networks is specified that access is allowed from. In this example, the list of networks for both users is loaded from a separate file (`/etc/metrika.xml`) containing the `networks` substitution. Here is a fragment of it: -```xml +``` xml ... @@ -81,22 +81,21 @@ A list of networks is specified that access is allowed from. In this example, th ``` -You could define this list of networks directly in `users.xml`, or in a file in the `users.d` directory (for more information, see the section "[Configuration files](configuration_files.md#configuration_files)"). +You could define this list of networks directly in `users.xml`, or in a file in the `users.d` directory (for more information, see the section «[Configuration files](configuration_files.md#configuration_files)»). The config includes comments explaining how to open access from everywhere. For use in production, only specify `ip` elements (IP addresses and their masks), since using `host` and `hoost_regexp` might cause extra latency. -Next the user settings profile is specified (see the section "[Settings profiles](settings/settings_profiles.md)"). 
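For instance, a hedged sketch of such a user entry dropped into the `users.d` override directory (the user name, password, network mask and referenced profile below are only illustrative, not part of the original example):

``` bash
# Illustrative only: a user entry that points at a settings profile and a quota
cat > /etc/clickhouse-server/users.d/readonly_user.xml <<'EOF'
<yandex>
    <users>
        <readonly_user>
            <password>some_password</password>
            <networks>
                <ip>::/0</ip>
            </networks>
            <profile>readonly</profile>
            <quota>default</quota>
        </readonly_user>
    </users>
</yandex>
EOF
```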
You can specify the default profile, `default'`. The profile can have any name. You can specify the same profile for different users. The most important thing you can write in the settings profile is `readonly=1`, which ensures read-only access. -Then specify the quota to be used (see the section "[Quotas](quotas.md#quotas)"). You can specify the default quota: `default`. It is set in the config by default to only count resource usage, without restricting it. The quota can have any name. You can specify the same quota for different users – in this case, resource usage is calculated for each user individually. +Next the user settings profile is specified (see the section «[Settings profiles](settings/settings_profiles.md)»). You can specify the default profile, `default'`. The profile can have any name. You can specify the same profile for different users. The most important thing you can write in the settings profile is `readonly=1`, which ensures read-only access. +Then specify the quota to be used (see the section «[Quotas](quotas.md#quotas)»). You can specify the default quota: `default`. It is set in the config by default to only count resource usage, without restricting it. The quota can have any name. You can specify the same quota for different users – in this case, resource usage is calculated for each user individually. In the optional `` section, you can also specify a list of databases that the user can access. By default, all databases are available to the user. You can specify the `default` database. In this case, the user will receive access to the database by default. Access to the `system` database is always allowed (since this database is used for processing queries). -The user can get a list of all databases and tables in them by using `SHOW` queries or system tables, even if access to individual databases isn't allowed. - -Database access is not related to the [readonly](settings/permissions_for_queries.md#settings_readonly) setting. You can't grant full access to one database and `readonly` access to another one. +The user can get a list of all databases and tables in them by using `SHOW` queries or system tables, even if access to individual databases isn’t allowed. +Database access is not related to the [readonly](settings/permissions_for_queries.md#settings_readonly) setting. You can’t grant full access to one database and `readonly` access to another one. [Original article](https://clickhouse.tech/docs/en/operations/access_rights/) diff --git a/docs/zh/operations/configuration_files.md b/docs/zh/operations/configuration_files.md index c86e5720247..3054f6e01c2 100644 --- a/docs/zh/operations/configuration_files.md +++ b/docs/zh/operations/configuration_files.md @@ -1,4 +1,4 @@ -# Configuration Files {#configuration_files} +# Configuration Files {#configuration-files} The main server config file is `config.xml`. It resides in the `/etc/clickhouse-server/` directory. @@ -12,13 +12,14 @@ If `replace` is specified, it replaces the entire element with the specified one If `remove` is specified, it deletes the element. -The config can also define "substitutions". If an element has the `incl` attribute, the corresponding substitution from the file will be used as the value. By default, the path to the file with substitutions is `/etc/metrika.xml`. This can be changed in the [include_from](server_settings/settings.md#server_settings-include_from) element in the server config. The substitution values are specified in `/yandex/substitution_name` elements in this file. 
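A hedged sketch of how such a substitution could be wired up, following the `networks` example used earlier (treat the exact file contents and values as placeholders):

``` bash
# Illustrative only: define a `networks` substitution in the default substitutions file
cat > /etc/metrika.xml <<'EOF'
<yandex>
    <networks>
        <ip>10.0.0.0/8</ip>
    </networks>
</yandex>
EOF

# In the main config, an element then pulls the substitution in by name, e.g.:
#     <networks incl="networks" />
```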
If a substitution specified in `incl` does not exist, it is recorded in the log. To prevent ClickHouse from logging missing substitutions, specify the `optional="true"` attribute (for example, settings for [macros](#macros) server_settings/settings.md)). +The config can also define «substitutions». If an element has the `incl` attribute, the corresponding substitution from the file will be used as the value. By default, the path to the file with substitutions is `/etc/metrika.xml`. This can be changed in the [include\_from](server_settings/settings.md#server_settings-include_from) element in the server config. The substitution values are specified in `/yandex/substitution_name` elements in this file. If a substitution specified in `incl` does not exist, it is recorded in the log. To prevent ClickHouse from logging missing substitutions, specify the `optional="true"` attribute (for example, settings for [macros](#macros) server\_settings/settings.md)). Substitutions can also be performed from ZooKeeper. To do this, specify the attribute `from_zk = "/path/to/node"`. The element value is replaced with the contents of the node at `/path/to/node` in ZooKeeper. You can also put an entire XML subtree on the ZooKeeper node and it will be fully inserted into the source element. -The `config.xml` file can specify a separate config with user settings, profiles, and quotas. The relative path to this config is set in the 'users_config' element. By default, it is `users.xml`. If `users_config` is omitted, the user settings, profiles, and quotas are specified directly in `config.xml`. +The `config.xml` file can specify a separate config with user settings, profiles, and quotas. The relative path to this config is set in the ‘users\_config’ element. By default, it is `users.xml`. If `users_config` is omitted, the user settings, profiles, and quotas are specified directly in `config.xml`. In addition, `users_config` may have overrides in files from the `users_config.d` directory (for example, `users.d`) and substitutions. 
For example, you can have separate config file for each user like this: + ``` xml $ cat /etc/clickhouse-server/users.d/alice.xml diff --git a/docs/zh/operations/index.md b/docs/zh/operations/index.md index 1c5bb3198e3..596ec065f40 100644 --- a/docs/zh/operations/index.md +++ b/docs/zh/operations/index.md @@ -1,4 +1,3 @@ -# Operations - +# Operations {#operations} [Original article](https://clickhouse.tech/docs/en/operations/) diff --git a/docs/zh/operations/monitoring.md b/docs/zh/operations/monitoring.md index bd177fe15b2..97cb8329b2b 100644 --- a/docs/zh/operations/monitoring.md +++ b/docs/zh/operations/monitoring.md @@ -1,37 +1,37 @@ -# 监控 +# 监控 {#jian-kong} 可以监控到: -- 硬件资源的利用率。 -- ClickHouse 服务的指标。 +- 硬件资源的利用率。 +- ClickHouse 服务的指标。 -## 硬件资源利用率 +## 硬件资源利用率 {#ying-jian-zi-yuan-li-yong-lu} ClickHouse 本身不会去监控硬件资源的状态。 强烈推荐监控以下监控项: -- 处理器上的负载和温度。 +- 处理器上的负载和温度。 - 可以使用 [dmesg](https://en.wikipedia.org/wiki/Dmesg), [turbostat](https://www.linux.org/docs/man8/turbostat.html) 或者其他工具。 + 可以使用 [dmesg](https://en.wikipedia.org/wiki/Dmesg), [turbostat](https://www.linux.org/docs/man8/turbostat.html) 或者其他工具。 -- 磁盘存储,RAM和网络的使用率。 +- 磁盘存储,RAM和网络的使用率。 -## ClickHouse 服务的指标。 +## ClickHouse 服务的指标。 {#clickhouse-fu-wu-de-zhi-biao} ClickHouse服务本身具有用于自我状态监视指标。 -要跟踪服务器事件,请观察服务器日志。 请参阅配置文件的[logger](server_settings/settings.md#server_settings-logger)部分。 +要跟踪服务器事件,请观察服务器日志。 请参阅配置文件的\[logger\](server\_settings/settings.md\#server\_settings-logger)部分。 ClickHouse 收集的指标项: -- 服务用于计算的资源占用的各种指标。 -- 关于查询处理的常见统计信息。 +- 服务用于计算的资源占用的各种指标。 +- 关于查询处理的常见统计信息。 -可以在 [system.metrics](system_tables.md#system_tables-metrics) ,[system.events](system_tables.md#system_tables-events) 以及[system.asynchronous_metrics](system_tables.md#system_tables-asynchronous_metrics) 等系统表查看所有的指标项。 +可以在 [system.metrics](system_tables.md#system_tables-metrics) ,[system.events](system_tables.md#system_tables-events) 以及[system.asynchronous\_metrics](system_tables.md#system_tables-asynchronous_metrics) 等系统表查看所有的指标项。 可以配置ClickHouse 往 [Graphite](https://github.com/graphite-project)导入指标。 参考 [Graphite section](server_settings/settings.md#server_settings-graphite) 配置文件。在配置指标导出之前,需要参考Graphite[官方教程](https://graphite.readthedocs.io/en/latest/install.html)搭建服务。 此外,您可以通过HTTP API监视服务器可用性。 将HTTP GET请求发送到 `/ping`。 如果服务器可用,它将以 `200 OK` 响应。 -要监视服务器集群的配置中,应设置[max_replica_delay_for_distributed_queries](settings/settings.md#settings-max_replica_delay_for_distributed_queries)参数并使用HTTP资源`/replicas_status`。 如果副本可用,并且不延迟在其他副本之后,则对`/replicas_status`的请求将返回200 OK。 如果副本滞后,请求将返回 `503 HTTP_SERVICE_UNAVAILABLE`,包括有关待办事项大小的信息。 +要监视服务器集群的配置中,应设置[max\_replica\_delay\_for\_distributed\_queries](settings/settings.md#settings-max_replica_delay_for_distributed_queries)参数并使用HTTP资源`/replicas_status`。 如果副本可用,并且不延迟在其他副本之后,则对`/replicas_status`的请求将返回200 OK。 如果副本滞后,请求将返回 `503 HTTP_SERVICE_UNAVAILABLE`,包括有关待办事项大小的信息。 diff --git a/docs/zh/operations/performance/sampling_query_profiler_example_result.txt b/docs/zh/operations/performance/sampling_query_profiler_example_result.txt new file mode 120000 index 00000000000..58c5abe7122 --- /dev/null +++ b/docs/zh/operations/performance/sampling_query_profiler_example_result.txt @@ -0,0 +1 @@ +../../../en/operations/performance/sampling_query_profiler_example_result.txt \ No newline at end of file diff --git a/docs/zh/operations/quotas.md b/docs/zh/operations/quotas.md index 85ba0577f03..06f25f57016 100644 --- a/docs/zh/operations/quotas.md +++ b/docs/zh/operations/quotas.md @@ -1,18 +1,18 @@ # Quotas {#quotas} Quotas allow you to limit resource usage over a 
period of time, or simply track the use of resources. -Quotas are set up in the user config. This is usually 'users.xml'. +Quotas are set up in the user config. This is usually ‘users.xml’. -The system also has a feature for limiting the complexity of a single query. See the section "Restrictions on query complexity"). +The system also has a feature for limiting the complexity of a single query. See the section «Restrictions on query complexity»). In contrast to query complexity restrictions, quotas: -- Place restrictions on a set of queries that can be run over a period of time, instead of limiting a single query. -- Account for resources spent on all remote servers for distributed query processing. +- Place restrictions on a set of queries that can be run over a period of time, instead of limiting a single query. +- Account for resources spent on all remote servers for distributed query processing. -Let's look at the section of the 'users.xml' file that defines quotas. +Let’s look at the section of the ‘users.xml’ file that defines quotas. -```xml +``` xml @@ -35,7 +35,7 @@ Let's look at the section of the 'users.xml' file that defines quotas. By default, the quota just tracks resource consumption for each hour, without limiting usage. The resource consumption calculated for each interval is output to the server log after each request. -```xml +``` xml @@ -61,7 +61,7 @@ The resource consumption calculated for each interval is output to the server lo ``` -For the 'statbox' quota, restrictions are set for every hour and for every 24 hours (86,400 seconds). The time interval is counted starting from an implementation-defined fixed moment in time. In other words, the 24-hour interval doesn't necessarily begin at midnight. +For the ‘statbox’ quota, restrictions are set for every hour and for every 24 hours (86,400 seconds). The time interval is counted starting from an implementation-defined fixed moment in time. In other words, the 24-hour interval doesn’t necessarily begin at midnight. When the interval ends, all collected values are cleared. For the next hour, the quota calculation starts over. @@ -79,9 +79,9 @@ Here are the amounts that can be restricted: If the limit is exceeded for at least one time interval, an exception is thrown with a text about which restriction was exceeded, for which interval, and when the new interval begins (when queries can be sent again). -Quotas can use the "quota key" feature in order to report on resources for multiple keys independently. Here is an example of this: +Quotas can use the «quota key» feature in order to report on resources for multiple keys independently. Here is an example of this: -```xml +``` xml diff --git a/docs/zh/operations/server_settings/index.md b/docs/zh/operations/server_settings/index.md index 39f3a5bb6d4..4a1276a2ce1 100644 --- a/docs/zh/operations/server_settings/index.md +++ b/docs/zh/operations/server_settings/index.md @@ -1,12 +1,11 @@ -# Server configuration parameters {#server_settings} +# Server configuration parameters {#server-settings} This section contains descriptions of server settings that cannot be changed at the session or query level. These settings are stored in the `config.xml` file on the ClickHouse server. -Other settings are described in the "[Settings](../settings/index.md#settings)" section. +Other settings are described in the «[Settings](../settings/index.md#settings)» section. 
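As a quick, hedged illustration of where such parameters live, a server-level override can be dropped next to `config.xml` (the file name and the chosen setting below are only examples):

``` bash
# Illustrative only: override a server configuration parameter via an extra config file
cat > /etc/clickhouse-server/config.d/listen.xml <<'EOF'
<yandex>
    <listen_host>0.0.0.0</listen_host>
</yandex>
EOF
# The override is merged into config.xml; a server restart is typically needed for it to take effect.
```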
Before studying the settings, read the [Configuration files](../configuration_files.md#configuration_files) section and note the use of substitutions (the `incl` and `optional` attributes). - [Original article](https://clickhouse.tech/docs/en/operations/server_settings/) diff --git a/docs/zh/operations/settings/index.md b/docs/zh/operations/settings/index.md index 1d914cb2a24..6d3d96dfbf3 100644 --- a/docs/zh/operations/settings/index.md +++ b/docs/zh/operations/settings/index.md @@ -5,20 +5,21 @@ Settings are configured in layers, so each subsequent layer redefines the previo Ways to configure settings, in order of priority: -- Settings in the `users.xml` server configuration file. +- Settings in the `users.xml` server configuration file. - Set in the element ``. + Set in the element ``. -- Session settings. +- Session settings. - Send ` SET setting=value` from the ClickHouse console client in interactive mode. -Similarly, you can use ClickHouse sessions in the HTTP protocol. To do this, you need to specify the `session_id` HTTP parameter. + Send ` SET setting=value` from the ClickHouse console client in interactive mode. -- Query settings. - - When starting the ClickHouse console client in non-interactive mode, set the startup parameter `--setting=value`. - - When using the HTTP API, pass CGI parameters (`URL?setting_1=value&setting_2=value...`). + Similarly, you can use ClickHouse sessions in the HTTP protocol. To do this, you need to specify the `session_id` HTTP parameter. + +- Query settings. + + - When starting the ClickHouse console client in non-interactive mode, set the startup parameter `--setting=value`. + - When using the HTTP API, pass CGI parameters (`URL?setting_1=value&setting_2=value...`). Settings that can only be made in the server config file are not covered in this section. - [Original article](https://clickhouse.tech/docs/en/operations/settings/) diff --git a/docs/zh/operations/settings/query_complexity.md b/docs/zh/operations/settings/query_complexity.md index b1d5484ae4e..bdd4a150bb6 100644 --- a/docs/zh/operations/settings/query_complexity.md +++ b/docs/zh/operations/settings/query_complexity.md @@ -1,36 +1,36 @@ -# Restrictions on query complexity +# Restrictions on query complexity {#restrictions-on-query-complexity} Restrictions on query complexity are part of the settings. They are used in order to provide safer execution from the user interface. Almost all the restrictions only apply to SELECTs.For distributed query processing, restrictions are applied on each server separately. -Restrictions on the "maximum amount of something" can take the value 0, which means "unrestricted". -Most restrictions also have an 'overflow_mode' setting, meaning what to do when the limit is exceeded. -It can take one of two values: `throw` or `break`. Restrictions on aggregation (group_by_overflow_mode) also have the value `any`. +Restrictions on the «maximum amount of something» can take the value 0, which means «unrestricted». +Most restrictions also have an ‘overflow\_mode’ setting, meaning what to do when the limit is exceeded. +It can take one of two values: `throw` or `break`. Restrictions on aggregation (group\_by\_overflow\_mode) also have the value `any`. `throw` – Throw an exception (default). `break` – Stop executing the query and return the partial result, as if the source data ran out. -`any (only for group_by_overflow_mode)` – Continuing aggregation for the keys that got into the set, but don't add new keys to the set. 
+`any (only for group_by_overflow_mode)` – Continuing aggregation for the keys that got into the set, but don’t add new keys to the set. -## readonly {#query_complexity_readonly} +## readonly {#query-complexity-readonly} With a value of 0, you can execute any queries. With a value of 1, you can only execute read requests (such as SELECT and SHOW). Requests for writing and changing settings (INSERT, SET) are prohibited. With a value of 2, you can process read queries (SELECT, SHOW) and change settings (SET). -After enabling readonly mode, you can't disable it in the current session. +After enabling readonly mode, you can’t disable it in the current session. -When using the GET method in the HTTP interface, 'readonly = 1' is set automatically. In other words, for queries that modify data, you can only use the POST method. You can send the query itself either in the POST body, or in the URL parameter. +When using the GET method in the HTTP interface, ‘readonly = 1’ is set automatically. In other words, for queries that modify data, you can only use the POST method. You can send the query itself either in the POST body, or in the URL parameter. -## max_memory_usage {#settings_max_memory_usage} +## max\_memory\_usage {#settings-max-memory-usage} The maximum amount of RAM to use for running a query on a single server. In the default configuration file, the maximum is 10 GB. -The setting doesn't consider the volume of available memory or the total volume of memory on the machine. +The setting doesn’t consider the volume of available memory or the total volume of memory on the machine. The restriction applies to a single query within a single server. You can use `SHOW PROCESSLIST` to see the current memory consumption for each query. In addition, the peak memory consumption is tracked for each query and written to the log. @@ -41,153 +41,153 @@ Memory usage is not fully tracked for states of the aggregate functions `min`, ` Memory consumption is also restricted by the parameters `max_memory_usage_for_user` and `max_memory_usage_for_all_queries`. -## max_memory_usage_for_user +## max\_memory\_usage\_for\_user {#max-memory-usage-for-user} -The maximum amount of RAM to use for running a user's queries on a single server. +The maximum amount of RAM to use for running a user’s queries on a single server. Default values are defined in [Settings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Interpreters/Settings.h#L244). By default, the amount is not restricted (`max_memory_usage_for_user = 0`). -See also the description of [max_memory_usage](#settings_max_memory_usage). +See also the description of [max\_memory\_usage](#settings_max_memory_usage). -## max_memory_usage_for_all_queries +## max\_memory\_usage\_for\_all\_queries {#max-memory-usage-for-all-queries} The maximum amount of RAM to use for running all queries on a single server. Default values are defined in [Settings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Interpreters/Settings.h#L245). By default, the amount is not restricted (`max_memory_usage_for_all_queries = 0`). -See also the description of [max_memory_usage](#settings_max_memory_usage). +See also the description of [max\_memory\_usage](#settings_max_memory_usage). -## max_rows_to_read +## max\_rows\_to\_read {#max-rows-to-read} The following restrictions can be checked on each block (instead of on each row). That is, the restrictions can be broken a little. 
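In practice, any of the limits in this section can be supplied per query, for example as command-line options of the console client; a hedged sketch (the table name and the values are only examples):

``` bash
# Illustrative only: cap memory usage and rows read for a single query
clickhouse-client \
    --max_memory_usage=10000000000 \
    --max_rows_to_read=1000000000 \
    --query="SELECT count() FROM some_table"
```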
When running a query in multiple threads, the following restrictions apply to each thread separately. Maximum number of rows that can be read from a table when running a query. -## max_bytes_to_read +## max\_bytes\_to\_read {#max-bytes-to-read} Maximum number of bytes (uncompressed data) that can be read from a table when running a query. -## read_overflow_mode +## read\_overflow\_mode {#read-overflow-mode} -What to do when the volume of data read exceeds one of the limits: 'throw' or 'break'. By default, throw. +What to do when the volume of data read exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. -## max_rows_to_group_by +## max\_rows\_to\_group\_by {#max-rows-to-group-by} Maximum number of unique keys received from aggregation. This setting lets you limit memory consumption when aggregating. -## group_by_overflow_mode +## group\_by\_overflow\_mode {#group-by-overflow-mode} -What to do when the number of unique keys for aggregation exceeds the limit: 'throw', 'break', or 'any'. By default, throw. -Using the 'any' value lets you run an approximation of GROUP BY. The quality of this approximation depends on the statistical nature of the data. +What to do when the number of unique keys for aggregation exceeds the limit: ‘throw’, ‘break’, or ‘any’. By default, throw. +Using the ‘any’ value lets you run an approximation of GROUP BY. The quality of this approximation depends on the statistical nature of the data. -## max_rows_to_sort +## max\_rows\_to\_sort {#max-rows-to-sort} Maximum number of rows before sorting. This allows you to limit memory consumption when sorting. -## max_bytes_to_sort +## max\_bytes\_to\_sort {#max-bytes-to-sort} Maximum number of bytes before sorting. -## sort_overflow_mode +## sort\_overflow\_mode {#sort-overflow-mode} -What to do if the number of rows received before sorting exceeds one of the limits: 'throw' or 'break'. By default, throw. +What to do if the number of rows received before sorting exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. -## max_result_rows +## max\_result\_rows {#max-result-rows} Limit on the number of rows in the result. Also checked for subqueries, and on remote servers when running parts of a distributed query. -## max_result_bytes +## max\_result\_bytes {#max-result-bytes} Limit on the number of bytes in the result. The same as the previous setting. -## result_overflow_mode +## result\_overflow\_mode {#result-overflow-mode} -What to do if the volume of the result exceeds one of the limits: 'throw' or 'break'. By default, throw. -Using 'break' is similar to using LIMIT. +What to do if the volume of the result exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. +Using ‘break’ is similar to using LIMIT. -## max_execution_time +## max\_execution\_time {#max-execution-time} Maximum query execution time in seconds. At this time, it is not checked for one of the sorting stages, or when merging and finalizing aggregate functions. -## timeout_overflow_mode +## timeout\_overflow\_mode {#timeout-overflow-mode} -What to do if the query is run longer than 'max_execution_time': 'throw' or 'break'. By default, throw. +What to do if the query is run longer than ‘max\_execution\_time’: ‘throw’ or ‘break’. By default, throw. -## min_execution_speed +## min\_execution\_speed {#min-execution-speed} -Minimal execution speed in rows per second. Checked on every data block when 'timeout_before_checking_execution_speed' expires. If the execution speed is lower, an exception is thrown. 
+Minimal execution speed in rows per second. Checked on every data block when ‘timeout\_before\_checking\_execution\_speed’ expires. If the execution speed is lower, an exception is thrown. -## timeout_before_checking_execution_speed +## timeout\_before\_checking\_execution\_speed {#timeout-before-checking-execution-speed} -Checks that execution speed is not too slow (no less than 'min_execution_speed'), after the specified time in seconds has expired. +Checks that execution speed is not too slow (no less than ‘min\_execution\_speed’), after the specified time in seconds has expired. -## max_columns_to_read +## max\_columns\_to\_read {#max-columns-to-read} Maximum number of columns that can be read from a table in a single query. If a query requires reading a greater number of columns, it throws an exception. -## max_temporary_columns +## max\_temporary\_columns {#max-temporary-columns} Maximum number of temporary columns that must be kept in RAM at the same time when running a query, including constant columns. If there are more temporary columns than this, it throws an exception. -## max_temporary_non_const_columns +## max\_temporary\_non\_const\_columns {#max-temporary-non-const-columns} -The same thing as 'max_temporary_columns', but without counting constant columns. +The same thing as ‘max\_temporary\_columns’, but without counting constant columns. Note that constant columns are formed fairly often when running a query, but they require approximately zero computing resources. -## max_subquery_depth +## max\_subquery\_depth {#max-subquery-depth} Maximum nesting depth of subqueries. If subqueries are deeper, an exception is thrown. By default, 100. -## max_pipeline_depth +## max\_pipeline\_depth {#max-pipeline-depth} Maximum pipeline depth. Corresponds to the number of transformations that each data block goes through during query processing. Counted within the limits of a single server. If the pipeline depth is greater, an exception is thrown. By default, 1000. -## max_ast_depth +## max\_ast\_depth {#max-ast-depth} Maximum nesting depth of a query syntactic tree. If exceeded, an exception is thrown. -At this time, it isn't checked during parsing, but only after parsing the query. That is, a syntactic tree that is too deep can be created during parsing, but the query will fail. By default, 1000. +At this time, it isn’t checked during parsing, but only after parsing the query. That is, a syntactic tree that is too deep can be created during parsing, but the query will fail. By default, 1000. -## max_ast_elements +## max\_ast\_elements {#max-ast-elements} Maximum number of elements in a query syntactic tree. If exceeded, an exception is thrown. In the same way as the previous setting, it is checked only after parsing the query. By default, 50,000. -## max_rows_in_set +## max\_rows\_in\_set {#max-rows-in-set} Maximum number of rows for a data set in the IN clause created from a subquery. -## max_bytes_in_set +## max\_bytes\_in\_set {#max-bytes-in-set} Maximum number of bytes (uncompressed data) used by a set in the IN clause created from a subquery. -## set_overflow_mode +## set\_overflow\_mode {#set-overflow-mode} -What to do when the amount of data exceeds one of the limits: 'throw' or 'break'. By default, throw. +What to do when the amount of data exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. -## max_rows_in_distinct +## max\_rows\_in\_distinct {#max-rows-in-distinct} Maximum number of different rows when using DISTINCT. 
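For illustration only (this example is not part of the original patch; the `hits` table and its `UserID` column are assumptions), several of the limits described above can be overridden for a single query with a `SETTINGS` clause:

``` sql
-- Assumed table and column names; the settings are the ones documented in this file.
SELECT DISTINCT UserID
FROM hits
SETTINGS
    max_rows_to_read = 100000000,
    read_overflow_mode = 'break',   -- stop reading instead of throwing an exception
    max_rows_in_distinct = 1000000,
    max_execution_time = 60
```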
-## max_bytes_in_distinct +## max\_bytes\_in\_distinct {#max-bytes-in-distinct} Maximum number of bytes used by a hash table when using DISTINCT. -## distinct_overflow_mode +## distinct\_overflow\_mode {#distinct-overflow-mode} -What to do when the amount of data exceeds one of the limits: 'throw' or 'break'. By default, throw. +What to do when the amount of data exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. -## max_rows_to_transfer +## max\_rows\_to\_transfer {#max-rows-to-transfer} Maximum number of rows that can be passed to a remote server or saved in a temporary table when using GLOBAL IN. -## max_bytes_to_transfer +## max\_bytes\_to\_transfer {#max-bytes-to-transfer} Maximum number of bytes (uncompressed data) that can be passed to a remote server or saved in a temporary table when using GLOBAL IN. -## transfer_overflow_mode +## transfer\_overflow\_mode {#transfer-overflow-mode} -What to do when the amount of data exceeds one of the limits: 'throw' or 'break'. By default, throw. +What to do when the amount of data exceeds one of the limits: ‘throw’ or ‘break’. By default, throw. [Original article](https://clickhouse.tech/docs/en/operations/settings/query_complexity/) diff --git a/docs/zh/operations/table_engines/aggregatingmergetree.md b/docs/zh/operations/table_engines/aggregatingmergetree.md index f4e1319620a..2b18b2fbe48 100644 --- a/docs/zh/operations/table_engines/aggregatingmergetree.md +++ b/docs/zh/operations/table_engines/aggregatingmergetree.md @@ -1,5 +1,4 @@ - -# AggregatingMergeTree +# AggregatingMergeTree {#aggregatingmergetree} 该引擎继承自 [MergeTree](mergetree.md),并改变了数据片段的合并逻辑。 ClickHouse 会将相同主键的所有行(在一个数据片段内)替换为单个存储一系列聚合函数状态的行。 @@ -9,7 +8,7 @@ 如果要按一组规则来合并减少行数,则使用 `AggregatingMergeTree` 是合适的。 -## 建表 +## 建表 {#jian-biao} ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] @@ -30,12 +29,14 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] 创建 `AggregatingMergeTree` 表时,需用跟创建 `MergeTree` 表一样的[子句](mergetree.md)。 -
已弃用的建表方法</summary>
+<details markdown="1">
-!!! 注意 +已弃用的建表方法 + +!!! 注意 "注意" 不要在新项目中使用该方法,可能的话,请将旧项目切换到上述方法。 -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -47,14 +48,14 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] 上面的所有参数跟 `MergeTree` 中的一样。
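For illustration only (not part of the original patch; the table and column names are assumptions), a minimal `AggregatingMergeTree` table created with the current syntax could look like this:

``` sql
-- Each (CounterID, StartDate) key stores aggregate-function states instead of raw rows.
CREATE TABLE test.agg_visits
(
    CounterID UInt32,
    StartDate Date,
    Visits    AggregateFunction(sum, Int8),
    Users     AggregateFunction(uniq, UInt64)
)
ENGINE = AggregatingMergeTree()
PARTITION BY toYYYYMM(StartDate)
ORDER BY (CounterID, StartDate)
```

Data for such a table would be written with the `-State` combinators (for example `sumState`, `uniqState`) and read back with the matching `-Merge` combinators, as described in the next section.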
-## SELECT 和 INSERT +## SELECT 和 INSERT {#select-he-insert} 插入数据,需使用带有聚合 -State- 函数的 [INSERT SELECT](../../query_language/insert_into.md) 语句。 从 `AggregatingMergeTree` 表中查询数据时,需使用 `GROUP BY` 子句并且要使用与插入时相同的聚合函数,但后缀要改为 `-Merge` 。 在 `SELECT` 查询的结果中,对于 ClickHouse 的所有输出格式 `AggregateFunction` 类型的值都实现了特定的二进制表示法。如果直接用 `SELECT` 导出这些数据,例如如用 `TabSeparated` 格式,那么这些导出数据也能直接用 `INSERT` 语句加载导入。 -## 聚合物化视图的示例 +## 聚合物化视图的示例 {#ju-he-wu-hua-shi-tu-de-shi-li} 创建一个跟踪 `test.visits` 表的 `AggregatingMergeTree` 物化视图: diff --git a/docs/zh/operations/table_engines/buffer.md b/docs/zh/operations/table_engines/buffer.md index aec20029933..70fc8c6cbb2 100644 --- a/docs/zh/operations/table_engines/buffer.md +++ b/docs/zh/operations/table_engines/buffer.md @@ -1,18 +1,16 @@ -# Buffer +# Buffer {#buffer} 缓冲数据写入 RAM 中,周期性地将数据刷新到另一个表。在读取操作时,同时从缓冲区和另一个表读取数据。 -``` -Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes) -``` + Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes) -引擎的参数:database,table - 要刷新数据的表。可以使用返回字符串的常量表达式而不是数据库名称。 num_layers - 并行层数。在物理上,该表将表示为 num_layers 个独立缓冲区。建议值为16。min_time,max_time,min_rows,max_rows,min_bytes,max_bytes - 从缓冲区刷新数据的条件。 +引擎的参数:database,table - 要刷新数据的表。可以使用返回字符串的常量表达式而不是数据库名称。 num\_layers - 并行层数。在物理上,该表将表示为 num\_layers 个独立缓冲区。建议值为16。min\_time,max\_time,min\_rows,max\_rows,min\_bytes,max\_bytes - 从缓冲区刷新数据的条件。 -如果满足所有 "min" 条件或至少一个 "max" 条件,则从缓冲区刷新数据并将其写入目标表。min_time,max_time — 从第一次写入缓冲区时起以秒为单位的时间条件。min_rows,max_rows - 缓冲区中行数的条件。min_bytes,max_bytes - 缓冲区中字节数的条件。 +如果满足所有 «min» 条件或至少一个 «max» 条件,则从缓冲区刷新数据并将其写入目标表。min\_time,max\_time — 从第一次写入缓冲区时起以秒为单位的时间条件。min\_rows,max\_rows - 缓冲区中行数的条件。min\_bytes,max\_bytes - 缓冲区中字节数的条件。 -写入时,数据从 num_layers 个缓冲区中随机插入。或者,如果插入数据的大小足够大(大于 max_rows 或 max_bytes ),则会绕过缓冲区将其写入目标表。 +写入时,数据从 num\_layers 个缓冲区中随机插入。或者,如果插入数据的大小足够大(大于 max\_rows 或 max\_bytes ),则会绕过缓冲区将其写入目标表。 -每个 "num_layers" 缓冲区刷新数据的条件是分别计算。例如,如果 num_layers = 16 且 max_bytes = 100000000,则最大RAM消耗将为1.6 GB。 +每个 «num\_layers» 缓冲区刷新数据的条件是分别计算。例如,如果 num\_layers = 16 且 max\_bytes = 100000000,则最大RAM消耗将为1.6 GB。 示例: @@ -20,7 +18,7 @@ Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_ CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10, 100, 10000, 1000000, 10000000, 100000000) ``` -创建一个 "merge.hits_buffer" 表,其结构与 "merge.hits" 相同,并使用 Buffer 引擎。写入此表时,数据缓冲在 RAM 中,然后写入 "merge.hits" 表。创建了16个缓冲区。如果已经过了100秒,或者已写入100万行,或者已写入100 MB数据,则刷新每个缓冲区的数据;或者如果同时已经过了10秒并且已经写入了10,000行和10 MB的数据。例如,如果只写了一行,那么在100秒之后,都会被刷新。但是如果写了很多行,数据将会更快地刷新。 +创建一个 «merge.hits\_buffer» 表,其结构与 «merge.hits» 相同,并使用 Buffer 引擎。写入此表时,数据缓冲在 RAM 中,然后写入 «merge.hits» 表。创建了16个缓冲区。如果已经过了100秒,或者已写入100万行,或者已写入100 MB数据,则刷新每个缓冲区的数据;或者如果同时已经过了10秒并且已经写入了10,000行和10 MB的数据。例如,如果只写了一行,那么在100秒之后,都会被刷新。但是如果写了很多行,数据将会更快地刷新。 当服务器停止时,使用 DROP TABLE 或 DETACH TABLE,缓冲区数据也会刷新到目标表。 @@ -42,15 +40,14 @@ PREWHERE,FINAL 和 SAMPLE 对缓冲表不起作用。这些条件将传递到 将数据添加到缓冲区时,其中一个缓冲区被锁定。如果同时从表执行读操作,则会导致延迟。 -插入到 Buffer 表中的数据可能以不同的顺序和不同的块写入目标表中。因此,Buffer 表很难用于正确写入 CollapsingMergeTree。为避免出现问题,您可以将 "num_layers" 设置为1。 +插入到 Buffer 表中的数据可能以不同的顺序和不同的块写入目标表中。因此,Buffer 表很难用于正确写入 CollapsingMergeTree。为避免出现问题,您可以将 «num\_layers» 设置为1。 -如果目标表是复制表,则在写入 Buffer 表时会丢失复制表的某些预期特征。数据部分的行次序和大小的随机变化导致数据不能去重,这意味着无法对复制表进行可靠的 "exactly once" 写入。 +如果目标表是复制表,则在写入 Buffer 表时会丢失复制表的某些预期特征。数据部分的行次序和大小的随机变化导致数据不能去重,这意味着无法对复制表进行可靠的 «exactly once» 写入。 由于这些缺点,我们只建议在极少数情况下使用 Buffer 表。 当在单位时间内从大量服务器接收到太多 INSERTs 并且在插入之前无法缓冲数据时使用 Buffer 表,这意味着这些 INSERTs 不能足够快地执行。 -请注意,一次插入一行数据是没有意义的,即使对于 Buffer 
表也是如此。这将只产生每秒几千行的速度,而插入更大的数据块每秒可以产生超过一百万行(参见 "性能" 部分)。 - +请注意,一次插入一行数据是没有意义的,即使对于 Buffer 表也是如此。这将只产生每秒几千行的速度,而插入更大的数据块每秒可以产生超过一百万行(参见 «性能» 部分)。 [Original article](https://clickhouse.tech/docs/zh/operations/table_engines/buffer/) diff --git a/docs/zh/operations/table_engines/collapsingmergetree.md b/docs/zh/operations/table_engines/collapsingmergetree.md index 2aa695b155a..c72d823e8b6 100644 --- a/docs/zh/operations/table_engines/collapsingmergetree.md +++ b/docs/zh/operations/table_engines/collapsingmergetree.md @@ -1,4 +1,4 @@ -# CollapsingMergeTree {#table_engine-collapsingmergetree} +# CollapsingMergeTree {#table-engine-collapsingmergetree} 该引擎继承于 [MergeTree](mergetree.md),并在数据块合并算法中添加了折叠行的逻辑。 @@ -6,9 +6,9 @@ 因此,该引擎可以显著的降低存储量并提高 `SELECT` 查询效率。 -## 建表 +## 建表 {#jian-biao} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -25,20 +25,22 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **CollapsingMergeTree 参数** -- `sign` — 类型列的名称: `1` 是“状态”行,`-1` 是“取消”行。 +- `sign` — 类型列的名称: `1` 是«状态»行,`-1` 是«取消»行。 - 列数据类型 — `Int8`。 + 列数据类型 — `Int8`。 **子句** 创建 `CollapsingMergeTree` 表时,需要与创建 `MergeTree` 表时相同的[子句](mergetree.md#table_engine-mergetree-creating-a-table)。 -
已弃用的建表方法</summary>
+<details markdown="1">
+ +已弃用的建表方法 !!! attention "注意" 不要在新项目中使用该方法,可能的话,请将旧项目切换到上述方法。 -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -49,36 +51,32 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] 除了 `sign` 的所有参数都与 `MergeTree` 中的含义相同。 -- `sign` — 类型列的名称: `1` 是“状态”行,`-1` 是“取消”行。 +- `sign` — 类型列的名称: `1` 是«状态»行,`-1` 是«取消»行。 + + 列数据类型 — `Int8`。 - 列数据类型 — `Int8`。
+## 折叠 {#table-engine-collapsingmergetree-collapsing} -## 折叠 {#table_engine-collapsingmergetree-collapsing} - -### 数据 +### 数据 {#shu-ju} 考虑你需要为某个对象保存不断变化的数据的情景。似乎为一个对象保存一行记录并在其发生任何变化时更新记录是合乎逻辑的,但是更新操作对 DBMS 来说是昂贵且缓慢的,因为它需要重写存储中的数据。如果你需要快速的写入数据,则更新操作是不可接受的,但是你可以按下面的描述顺序地更新一个对象的变化。 -在写入行的时候使用特定的列 `Sign`。如果 `Sign = 1` 则表示这一行是对象的状态,我们称之为“状态”行。如果 `Sign = -1` 则表示是对具有相同属性的状态行的取消,我们称之为“取消”行。 +在写入行的时候使用特定的列 `Sign`。如果 `Sign = 1` 则表示这一行是对象的状态,我们称之为«状态»行。如果 `Sign = -1` 则表示是对具有相同属性的状态行的取消,我们称之为«取消»行。 例如,我们想要计算用户在某个站点访问的页面页面数以及他们在那里停留的时间。在某个时候,我们将用户的活动状态写入下面这样的行。 -``` -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ -│ 4324182021466249494 │ 5 │ 146 │ 1 │ -└─────────────────────┴───────────┴──────────┴──────┘ -``` + ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ + │ 4324182021466249494 │ 5 │ 146 │ 1 │ + └─────────────────────┴───────────┴──────────┴──────┘ 一段时间后,我们写入下面的两行来记录用户活动的变化。 -``` -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ -│ 4324182021466249494 │ 5 │ 146 │ -1 │ -│ 4324182021466249494 │ 6 │ 185 │ 1 │ -└─────────────────────┴───────────┴──────────┴──────┘ -``` + ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ + │ 4324182021466249494 │ 5 │ 146 │ -1 │ + │ 4324182021466249494 │ 6 │ 185 │ 1 │ + └─────────────────────┴───────────┴──────────┴──────┘ 第一行取消了这个对象(用户)的状态。它需要复制被取消的状态行的所有除了 `Sign` 的属性。 @@ -86,12 +84,10 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] 因为我们只需要用户活动的最后状态,这些行 -``` -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ -│ 4324182021466249494 │ 5 │ 146 │ 1 │ -│ 4324182021466249494 │ 5 │ 146 │ -1 │ -└─────────────────────┴───────────┴──────────┴──────┘ -``` + ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ + │ 4324182021466249494 │ 5 │ 146 │ 1 │ + │ 4324182021466249494 │ 5 │ 146 │ -1 │ + └─────────────────────┴───────────┴──────────┴──────┘ 可以在折叠对象的失效(老的)状态的时候被删除。`CollapsingMergeTree` 会在合并数据片段的时候做这件事。 @@ -99,27 +95,27 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **这种方法的特殊属性** -1. 写入的程序应该记住对象的状态从而可以取消它。“取消”字符串应该是“状态”字符串的复制,除了相反的 `Sign`。它增加了存储的初始数据的大小,但使得写入数据更快速。 -2. 由于写入的负载,列中长的增长阵列会降低引擎的效率。数据越简单,效率越高。 -3. `SELECT` 的结果很大程度取决于对象变更历史的一致性。在准备插入数据时要准确。在不一致的数据中会得到不可预料的结果,例如,像会话深度这种非负指标的负值。 +1. 写入的程序应该记住对象的状态从而可以取消它。«取消»字符串应该是«状态»字符串的复制,除了相反的 `Sign`。它增加了存储的初始数据的大小,但使得写入数据更快速。 +2. 由于写入的负载,列中长的增长阵列会降低引擎的效率。数据越简单,效率越高。 +3. `SELECT` 的结果很大程度取决于对象变更历史的一致性。在准备插入数据时要准确。在不一致的数据中会得到不可预料的结果,例如,像会话深度这种非负指标的负值。 -### 算法 {#table_engine-collapsingmergetree-collapsing-algorithm} +### 算法 {#table-engine-collapsingmergetree-collapsing-algorithm} -当 ClickHouse 合并数据片段时,每组具有相同主键的连续行被减少到不超过两行,一行 `Sign = 1`(“状态”行),另一行 `Sign = -1` (“取消”行),换句话说,数据项被折叠了。 +当 ClickHouse 合并数据片段时,每组具有相同主键的连续行被减少到不超过两行,一行 `Sign = 1`(«状态»行),另一行 `Sign = -1` («取消»行),换句话说,数据项被折叠了。 对每个结果的数据部分 ClickHouse 保存: - 1. 第一个“取消”和最后一个“状态”行,如果“状态”和“取消”行的数量匹配和最后一个行是“状态”行 - 2. 最后一个“状态”行,如果“状态”行比“取消”行多一个或一个以上。 - 3. 第一个“取消”行,如果“取消”行比“状态”行多一个或一个以上。 - 4. 没有行,在其他所有情况下。 + 1. 第一个«取消»和最后一个«状态»行,如果«状态»和«取消»行的数量匹配和最后一个行是«状态»行 + 2. 最后一个«状态»行,如果«状态»行比«取消»行多一个或一个以上。 + 3. 第一个«取消»行,如果«取消»行比«状态»行多一个或一个以上。 + 4. 
没有行,在其他所有情况下。 - 合并会继续,但是 ClickHouse 会把此情况视为逻辑错误并将其记录在服务日志中。这个错误会在相同的数据被插入超过一次时出现。 + 合并会继续,但是 ClickHouse 会把此情况视为逻辑错误并将其记录在服务日志中。这个错误会在相同的数据被插入超过一次时出现。 因此,折叠不应该改变统计数据的结果。 变化逐渐地被折叠,因此最终几乎每个对象都只剩下了最后的状态。 -`Sign` 是必须的因为合并算法不保证所有有相同主键的行都会在同一个结果数据片段中,甚至是在同一台物理服务器上。ClickHouse 用多线程来处理 `SELECT` 请求,所以它不能预测结果中行的顺序。如果要从 `CollapsingMergeTree` 表中获取完全“折叠”后的数据,则需要聚合。 +`Sign` 是必须的因为合并算法不保证所有有相同主键的行都会在同一个结果数据片段中,甚至是在同一台物理服务器上。ClickHouse 用多线程来处理 `SELECT` 请求,所以它不能预测结果中行的顺序。如果要从 `CollapsingMergeTree` 表中获取完全«折叠»后的数据,则需要聚合。 要完成折叠,请使用 `GROUP BY` 子句和用于处理符号的聚合函数编写请求。例如,要计算数量,使用 `sum(Sign)` 而不是 `count()`。要计算某物的总和,使用 `sum(Sign * x)` 而不是 `sum(x)`,并添加 `HAVING sum(Sign) > 0` 子句。 @@ -127,21 +123,19 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] 如果你需要在不进行聚合的情况下获取数据(例如,要检查是否存在最新值与特定条件匹配的行),你可以在 `FROM` 从句中使用 `FINAL` 修饰符。这种方法显然是更低效的。 -## 示例 +## 示例 {#shi-li} 示例数据: -``` -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ -│ 4324182021466249494 │ 5 │ 146 │ 1 │ -│ 4324182021466249494 │ 5 │ 146 │ -1 │ -│ 4324182021466249494 │ 6 │ 185 │ 1 │ -└─────────────────────┴───────────┴──────────┴──────┘ -``` + ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ + │ 4324182021466249494 │ 5 │ 146 │ 1 │ + │ 4324182021466249494 │ 5 │ 146 │ -1 │ + │ 4324182021466249494 │ 6 │ 185 │ 1 │ + └─────────────────────┴───────────┴──────────┴──────┘ 建表: -```sql +``` sql CREATE TABLE UAct ( UserID UInt64, @@ -155,10 +149,11 @@ ORDER BY UserID 插入数据: -```sql +``` sql INSERT INTO UAct VALUES (4324182021466249494, 5, 146, 1) ``` -```sql + +``` sql INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1),(4324182021466249494, 6, 185, 1) ``` @@ -166,19 +161,15 @@ INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1),(4324182021466249494, 获取数据: -``` -SELECT * FROM UAct -``` + SELECT * FROM UAct -``` -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ -│ 4324182021466249494 │ 5 │ 146 │ -1 │ -│ 4324182021466249494 │ 6 │ 185 │ 1 │ -└─────────────────────┴───────────┴──────────┴──────┘ -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ -│ 4324182021466249494 │ 5 │ 146 │ 1 │ -└─────────────────────┴───────────┴──────────┴──────┘ -``` + ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ + │ 4324182021466249494 │ 5 │ 146 │ -1 │ + │ 4324182021466249494 │ 6 │ 185 │ 1 │ + └─────────────────────┴───────────┴──────────┴──────┘ + ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ + │ 4324182021466249494 │ 5 │ 146 │ 1 │ + └─────────────────────┴───────────┴──────────┴──────┘ 我们看到了什么,哪里有折叠? 
@@ -186,7 +177,7 @@ SELECT * FROM UAct 因此我们需要聚合: -```sql +``` sql SELECT UserID, sum(PageViews * Sign) AS PageViews, @@ -195,22 +186,20 @@ FROM UAct GROUP BY UserID HAVING sum(Sign) > 0 ``` -``` -┌──────────────UserID─┬─PageViews─┬─Duration─┐ -│ 4324182021466249494 │ 6 │ 185 │ -└─────────────────────┴───────────┴──────────┘ -``` + + ┌──────────────UserID─┬─PageViews─┬─Duration─┐ + │ 4324182021466249494 │ 6 │ 185 │ + └─────────────────────┴───────────┴──────────┘ 如果我们不需要聚合并想要强制进行折叠,我们可以在 `FROM` 从句中使用 `FINAL` 修饰语。 -```sql +``` sql SELECT * FROM UAct FINAL ``` -``` -┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ -│ 4324182021466249494 │ 6 │ 185 │ 1 │ -└─────────────────────┴───────────┴──────────┴──────┘ -``` + + ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ + │ 4324182021466249494 │ 6 │ 185 │ 1 │ + └─────────────────────┴───────────┴──────────┴──────┘ 这种查询数据的方法是非常低效的。不要在大表中使用它。 diff --git a/docs/zh/operations/table_engines/custom_partitioning_key.md b/docs/zh/operations/table_engines/custom_partitioning_key.md index 0517e1ec7d5..1a8cea2d0e2 100644 --- a/docs/zh/operations/table_engines/custom_partitioning_key.md +++ b/docs/zh/operations/table_engines/custom_partitioning_key.md @@ -1,4 +1,4 @@ -# 自定义分区键 +# 自定义分区键 {#zi-ding-yi-fen-qu-jian} [MergeTree](mergetree.md) 系列的表(包括 [可复制表](replication.md) )可以使用分区。基于 MergeTree 表的 [物化视图](materializedview.md) 也支持分区。 @@ -9,8 +9,8 @@ ``` sql CREATE TABLE visits ( - VisitDate Date, - Hour UInt8, + VisitDate Date, + Hour UInt8, ClientID UUID ) ENGINE = MergeTree() @@ -36,36 +36,34 @@ ORDER BY (CounterID, StartDate, intHash32(UserID)); 可以通过 [system.parts](../system_tables.md#system_tables-parts) 表查看表片段和分区信息。例如,假设我们有一个 `visits` 表,按月分区。对 `system.parts` 表执行 `SELECT`: ``` sql -SELECT +SELECT partition, - name, + name, active -FROM system.parts +FROM system.parts WHERE table = 'visits' ``` -``` -┌─partition─┬─name───────────┬─active─┐ -│ 201901 │ 201901_1_3_1 │ 0 │ -│ 201901 │ 201901_1_9_2 │ 1 │ -│ 201901 │ 201901_8_8_0 │ 0 │ -│ 201901 │ 201901_9_9_0 │ 0 │ -│ 201902 │ 201902_4_6_1 │ 1 │ -│ 201902 │ 201902_10_10_0 │ 1 │ -│ 201902 │ 201902_11_11_0 │ 1 │ -└───────────┴────────────────┴────────┘ -``` + ┌─partition─┬─name───────────┬─active─┐ + │ 201901 │ 201901_1_3_1 │ 0 │ + │ 201901 │ 201901_1_9_2 │ 1 │ + │ 201901 │ 201901_8_8_0 │ 0 │ + │ 201901 │ 201901_9_9_0 │ 0 │ + │ 201902 │ 201902_4_6_1 │ 1 │ + │ 201902 │ 201902_10_10_0 │ 1 │ + │ 201902 │ 201902_11_11_0 │ 1 │ + └───────────┴────────────────┴────────┘ -`partition` 列存储分区的名称。此示例中有两个分区:`201901` 和 `201902`。在 [ALTER ... PARTITION](#alter_manipulations-with-partitions) 语句中你可以使用该列值来指定分区名称。 +`partition` 列存储分区的名称。此示例中有两个分区:`201901` 和 `201902`。在 [ALTER … PARTITION](#alter_manipulations-with-partitions) 语句中你可以使用该列值来指定分区名称。 `name` 列为分区中数据片段的名称。在 [ALTER ATTACH PART](#alter_attach-partition) 语句中你可以使用此列值中来指定片段名称。 这里我们拆解下第一部分的名称:`201901_1_3_1`: -- `201901` 是分区名称。 -- `1` 是数据块的最小编号。 -- `3` 是数据块的最大编号。 -- `1` 是块级别(即在由块组成的合并树中,该块在树中的深度)。 +- `201901` 是分区名称。 +- `1` 是数据块的最小编号。 +- `3` 是数据块的最大编号。 +- `1` 是块级别(即在由块组成的合并树中,该块在树中的深度)。 !!! 
attention "注意" 旧类型表的片段名称为:`20190117_20190123_2_2_0`(最小日期 - 最大日期 - 最小块编号 - 最大块编号 - 块级别)。 @@ -78,24 +76,22 @@ WHERE table = 'visits' OPTIMIZE TABLE visits PARTITION 201902; ``` -``` -┌─partition─┬─name───────────┬─active─┐ -│ 201901 │ 201901_1_3_1 │ 0 │ -│ 201901 │ 201901_1_9_2 │ 1 │ -│ 201901 │ 201901_8_8_0 │ 0 │ -│ 201901 │ 201901_9_9_0 │ 0 │ -│ 201902 │ 201902_4_6_1 │ 0 │ -│ 201902 │ 201902_4_11_2 │ 1 │ -│ 201902 │ 201902_10_10_0 │ 0 │ -│ 201902 │ 201902_11_11_0 │ 0 │ -└───────────┴────────────────┴────────┘ -``` + ┌─partition─┬─name───────────┬─active─┐ + │ 201901 │ 201901_1_3_1 │ 0 │ + │ 201901 │ 201901_1_9_2 │ 1 │ + │ 201901 │ 201901_8_8_0 │ 0 │ + │ 201901 │ 201901_9_9_0 │ 0 │ + │ 201902 │ 201902_4_6_1 │ 0 │ + │ 201902 │ 201902_4_11_2 │ 1 │ + │ 201902 │ 201902_10_10_0 │ 0 │ + │ 201902 │ 201902_11_11_0 │ 0 │ + └───────────┴────────────────┴────────┘ 非激活片段会在合并后的10分钟左右删除。 查看片段和分区信息的另一种方法是进入表的目录:`/var/lib/clickhouse/data//
<database>/<table>
/`。例如: -```bash +``` bash dev:/var/lib/clickhouse/data/default/visits$ ls -l total 40 drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 201901_1_3_1 @@ -109,7 +105,7 @@ drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 12:09 201902_4_6_1 drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 detached ``` -文件夹 '201901_1_1_0','201901_1_7_1' 等是片段的目录。每个片段都与一个对应的分区相关,并且只包含这个月的数据(本例中的表按月分区)。 +文件夹 ‘201901\_1\_1\_0’,‘201901\_1\_7\_1’ 等是片段的目录。每个片段都与一个对应的分区相关,并且只包含这个月的数据(本例中的表按月分区)。 `detached` 目录存放着使用 [DETACH](#alter_detach-partition) 语句从表中分离的片段。损坏的片段也会移到该目录,而不是删除。服务器不使用`detached`目录中的片段。可以随时添加,删除或修改此目录中的数据 – 在运行 [ATTACH](../../query_language/alter.md#alter_attach-partition) 语句前,服务器不会感知到。 diff --git a/docs/zh/operations/table_engines/dictionary.md b/docs/zh/operations/table_engines/dictionary.md index d08155f2ca0..3bd6b9d78b6 100644 --- a/docs/zh/operations/table_engines/dictionary.md +++ b/docs/zh/operations/table_engines/dictionary.md @@ -1,11 +1,10 @@ - -# Dictionary +# Dictionary {#dictionary} `Dictionary` 引擎将字典数据展示为一个ClickHouse的表。 例如,考虑使用一个具有以下配置的 `products` 字典: -```xml +``` xml products @@ -39,7 +38,7 @@ 查询字典中的数据: ``` sql -select name, type, key, attribute.names, attribute.types, bytes_allocated, element_count,source from system.dictionaries where name = 'products'; +select name, type, key, attribute.names, attribute.types, bytes_allocated, element_count,source from system.dictionaries where name = 'products'; SELECT name, @@ -54,21 +53,17 @@ FROM system.dictionaries WHERE name = 'products' ``` -``` -┌─name─────┬─type─┬─key────┬─attribute.names─┬─attribute.types─┬─bytes_allocated─┬─element_count─┬─source──────────┐ -│ products │ Flat │ UInt64 │ ['title'] │ ['String'] │ 23065376 │ 175032 │ ODBC: .products │ -└──────────┴──────┴────────┴─────────────────┴─────────────────┴─────────────────┴───────────────┴─────────────────┘ -``` + ┌─name─────┬─type─┬─key────┬─attribute.names─┬─attribute.types─┬─bytes_allocated─┬─element_count─┬─source──────────┐ + │ products │ Flat │ UInt64 │ ['title'] │ ['String'] │ 23065376 │ 175032 │ ODBC: .products │ + └──────────┴──────┴────────┴─────────────────┴─────────────────┴─────────────────┴───────────────┴─────────────────┘ -你可以使用 [dictGet*](../../query_language/functions/ext_dict_functions.md) 函数来获取这种格式的字典数据。 +你可以使用 [dictGet\*](../../query_language/functions/ext_dict_functions.md) 函数来获取这种格式的字典数据。 当你需要获取原始数据,或者是想要使用 `JOIN` 操作的时候,这种视图并没有什么帮助。对于这些情况,你可以使用 `Dictionary` 引擎,它可以将字典数据展示在表中。 语法: -``` -CREATE TABLE %table_name% (%fields%) engine = Dictionary(%dictionary_name%)` -``` + CREATE TABLE %table_name% (%fields%) engine = Dictionary(%dictionary_name%)` 示例: @@ -83,11 +78,9 @@ CREATE TABLE products ENGINE = Dictionary(products) ``` -``` -Ok. + Ok. -0 rows in set. Elapsed: 0.004 sec. -``` + 0 rows in set. Elapsed: 0.004 sec. 看一看表中的内容。 @@ -99,13 +92,10 @@ FROM products LIMIT 1 ``` -``` -┌────product_id─┬─title───────────┐ -│ 152689 │ Some item │ -└───────────────┴─────────────────┘ - -1 rows in set. Elapsed: 0.006 sec. -``` + ┌────product_id─┬─title───────────┐ + │ 152689 │ Some item │ + └───────────────┴─────────────────┘ + 1 rows in set. Elapsed: 0.006 sec. 
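As an additional illustrative sketch (not part of the original patch; the `orders` table and its columns are assumptions), a `Dictionary`-engine table can take part in a regular `JOIN` like any other table:

``` sql
-- Hypothetical orders table joined against the Dictionary-engine table created above.
SELECT o.product_id, p.title, count() AS orders_count
FROM orders AS o
INNER JOIN products AS p ON o.product_id = p.product_id
GROUP BY o.product_id, p.title
```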
[来源文章](https://clickhouse.tech/docs/en/operations/table_engines/dictionary/) diff --git a/docs/zh/operations/table_engines/distributed.md b/docs/zh/operations/table_engines/distributed.md index 06b600b373c..b81e52348e6 100644 --- a/docs/zh/operations/table_engines/distributed.md +++ b/docs/zh/operations/table_engines/distributed.md @@ -1,16 +1,13 @@ - -# Distributed +# Distributed {#distributed} **分布式引擎本身不存储数据**, 但可以在多个服务器上进行分布式查询。 读是自动并行的。读取时,远程服务器表的索引(如果有的话)会被使用。 分布式引擎参数:服务器配置文件中的集群名,远程数据库名,远程表名,数据分片键(可选)。 示例: -``` -Distributed(logs, default, hits[, sharding_key]) -``` + Distributed(logs, default, hits[, sharding_key]) -将会从位于“logs”集群中 default.hits 表所有服务器上读取数据。 +将会从位于«logs»集群中 default.hits 表所有服务器上读取数据。 远程服务器不仅用于读取数据,还会对尽可能数据做部分处理。 例如,对于使用 GROUP BY 的查询,数据首先在远程服务器聚合,之后返回聚合函数的中间状态给查询请求的服务器。再在请求的服务器上进一步汇总数据。 @@ -20,7 +17,7 @@ logs – 服务器配置文件中的集群名称。 集群示例配置如下: -```xml +``` xml @@ -61,15 +58,14 @@ logs – 服务器配置文件中的集群名称。 集群名称不能包含点号。 每个服务器需要指定 `host`,`port`,和可选的 `user`,`password`,`secure`,`compression` 的参数: +- `host` – 远程服务器地址。可以域名、IPv4或IPv6。如果指定域名,则服务在启动时发起一个 DNS 请求,并且请求结果会在服务器运行期间一直被记录。如果 DNS 请求失败,则服务不会启动。如果你修改了 DNS 记录,则需要重启服务。 +- `port` – 消息传递的 TCP 端口(「tcp\_port」配置通常设为 9000)。不要跟 http\_port 混淆。 +- `user` – 用于连接远程服务器的用户名。默认值:default。该用户必须有权限访问该远程服务器。访问权限配置在 users.xml 文件中。更多信息,请查看«访问权限»部分。 +- `password` – 用于连接远程服务器的密码。默认值:空字符串。 +- `secure` – 是否使用ssl进行连接,设为true时,通常也应该设置 `port` = 9440。服务器也要监听 9440 并有正确的证书。 +- `compression` - 是否使用数据压缩。默认值:true。 -: - `host` – 远程服务器地址。可以域名、IPv4或IPv6。如果指定域名,则服务在启动时发起一个 DNS 请求,并且请求结果会在服务器运行期间一直被记录。如果 DNS 请求失败,则服务不会启动。如果你修改了 DNS 记录,则需要重启服务。 - - `port` – 消息传递的 TCP 端口(「tcp_port」配置通常设为 9000)。不要跟 http_port 混淆。 - - `user` – 用于连接远程服务器的用户名。默认值:default。该用户必须有权限访问该远程服务器。访问权限配置在 users.xml 文件中。更多信息,请查看“访问权限”部分。 - - `password` – 用于连接远程服务器的密码。默认值:空字符串。 - - `secure` – 是否使用ssl进行连接,设为true时,通常也应该设置 `port` = 9440。服务器也要监听 9440 并有正确的证书。 - - `compression` - 是否使用数据压缩。默认值:true。 - -配置了副本,读取操作会从每个分片里选择一个可用的副本。可配置负载平衡算法(挑选副本的方式) - 请参阅“load_balancing”设置。 +配置了副本,读取操作会从每个分片里选择一个可用的副本。可配置负载平衡算法(挑选副本的方式) - 请参阅«load\_balancing»设置。 如果跟服务器的连接不可用,则在尝试短超时的重连。如果重连失败,则选择下一个副本,依此类推。如果跟所有副本的连接尝试都失败,则尝试用相同的方式再重复几次。 该机制有利于系统可用性,但不保证完全容错:如有远程服务器能够接受连接,但无法正常工作或状况不佳。 @@ -77,17 +73,17 @@ logs – 服务器配置文件中的集群名称。 可以在配置中配置任意数量的集群。 -要查看集群,可使用“system.clusters”表。 +要查看集群,可使用«system.clusters»表。 通过分布式引擎可以像使用本地服务器一样使用集群。但是,集群不是自动扩展的:你必须编写集群配置到服务器配置文件中(最好,给所有集群的服务器写上完整配置)。 -不支持用分布式表查询别的分布式表(除非该表只有一个分片)。或者说,要用分布表查查询“最终”的数据表。 +不支持用分布式表查询别的分布式表(除非该表只有一个分片)。或者说,要用分布表查查询«最终»的数据表。 -分布式引擎需要将集群信息写入配置文件。配置文件中的集群信息会即时更新,无需重启服务器。如果你每次是要向不确定的一组分片和副本发送查询,则不适合创建分布式表 - 而应该使用“远程”表函数。 请参阅“表函数”部分。 +分布式引擎需要将集群信息写入配置文件。配置文件中的集群信息会即时更新,无需重启服务器。如果你每次是要向不确定的一组分片和副本发送查询,则不适合创建分布式表 - 而应该使用«远程»表函数。 请参阅«表函数»部分。 向集群写数据的方法有两种: -一,自已指定要将哪些数据写入哪些服务器,并直接在每个分片上执行写入。换句话说,在分布式表上“查询”,在数据表上 INSERT。 +一,自已指定要将哪些数据写入哪些服务器,并直接在每个分片上执行写入。换句话说,在分布式表上«查询»,在数据表上 INSERT。 这是最灵活的解决方案 – 你可以使用任何分片方案,对于复杂业务特性的需求,这可能是非常重要的。 这也是最佳解决方案,因为数据可以完全独立地写入不同的分片。 @@ -96,15 +92,15 @@ logs – 服务器配置文件中的集群名称。 每个分片都可以在配置文件中定义权重。默认情况下,权重等于1。数据依据分片权重按比例分发到分片上。例如,如果有两个分片,第一个分片的权重是9,而第二个分片的权重是10,则发送 9 / 19 的行到第一个分片, 10 / 19 的行到第二个分片。 -分片可在配置文件中定义 'internal_replication' 参数。 +分片可在配置文件中定义 ‘internal\_replication’ 参数。 -此参数设置为“true”时,写操作只选一个正常的副本写入数据。如果分布式表的子表是复制表(*ReplicaMergeTree),请使用此方案。换句话说,这其实是把数据的复制工作交给实际需要写入数据的表本身而不是分布式表。 +此参数设置为«true»时,写操作只选一个正常的副本写入数据。如果分布式表的子表是复制表(\*ReplicaMergeTree),请使用此方案。换句话说,这其实是把数据的复制工作交给实际需要写入数据的表本身而不是分布式表。 -若此参数设置为“false”(默认值),写操作会将数据写入所有副本。实质上,这意味着要分布式表本身来复制数据。这种方式不如使用复制表的好,因为不会检查副本的一致性,并且随着时间的推移,副本数据可能会有些不一样。 
+若此参数设置为«false»(默认值),写操作会将数据写入所有副本。实质上,这意味着要分布式表本身来复制数据。这种方式不如使用复制表的好,因为不会检查副本的一致性,并且随着时间的推移,副本数据可能会有些不一样。 -选择将一行数据发送到哪个分片的方法是,首先计算分片表达式,然后将这个计算结果除以所有分片的权重总和得到余数。该行会发送到那个包含该余数的从'prev_weight'到'prev_weights + weight'的半闭半开区间对应的分片上,其中 'prev_weights' 是该分片前面的所有分片的权重和,'weight' 是该分片的权重。例如,如果有两个分片,第一个分片权重为9,而第二个分片权重为10,则余数在 \[0,9) 中的行发给第一个分片,余数在 \[9,19) 中的行发给第二个分片。 +选择将一行数据发送到哪个分片的方法是,首先计算分片表达式,然后将这个计算结果除以所有分片的权重总和得到余数。该行会发送到那个包含该余数的从’prev\_weight’到’prev\_weights + weight’的半闭半开区间对应的分片上,其中 ‘prev\_weights’ 是该分片前面的所有分片的权重和,‘weight’ 是该分片的权重。例如,如果有两个分片,第一个分片权重为9,而第二个分片权重为10,则余数在 \[0,9) 中的行发给第一个分片,余数在 \[9,19) 中的行发给第二个分片。 -分片表达式可以是由常量和表列组成的任何返回整数表达式。例如,您可以使用表达式 'rand()' 来随机分配数据,或者使用 'UserID' 来按用户 ID 的余数分布(相同用户的数据将分配到单个分片上,这可降低带有用户信息的 IN 和 JOIN 的语句运行的复杂度)。如果该列数据分布不够均匀,可以将其包装在散列函数中:intHash64(UserID)。 +分片表达式可以是由常量和表列组成的任何返回整数表达式。例如,您可以使用表达式 ‘rand()’ 来随机分配数据,或者使用 ‘UserID’ 来按用户 ID 的余数分布(相同用户的数据将分配到单个分片上,这可降低带有用户信息的 IN 和 JOIN 的语句运行的复杂度)。如果该列数据分布不够均匀,可以将其包装在散列函数中:intHash64(UserID)。 这种简单的用余数来选择分片的方案是有局限的,并不总适用。它适用于中型和大型数据(数十台服务器)的场景,但不适用于巨量数据(数百台或更多服务器)的场景。后一种情况下,应根据业务特性需求考虑的分片方案,而不是直接用分布式表的多分片。 @@ -112,14 +108,13 @@ SELECT 查询会被发送到所有分片,并且无论数据在分片中如何 下面的情况,你需要关注分片方案: -- 使用需要特定键连接数据( IN 或 JOIN )的查询。如果数据是用该键进行分片,则应使用本地 IN 或 JOIN 而不是 GLOBAL IN 或 GLOBAL JOIN,这样效率更高。 -- 使用大量服务器(上百或更多),但有大量小查询(个别客户的查询 - 网站,广告商或合作伙伴)。为了使小查询不影响整个集群,让单个客户的数据处于单个分片上是有意义的。或者,正如我们在 Yandex.Metrica 中所做的那样,你可以配置两级分片:将整个集群划分为“层”,一个层可以包含多个分片。单个客户的数据位于单个层上,根据需要将分片添加到层中,层中的数据随机分布。然后给每层创建分布式表,再创建一个全局的分布式表用于全局的查询。 +- 使用需要特定键连接数据( IN 或 JOIN )的查询。如果数据是用该键进行分片,则应使用本地 IN 或 JOIN 而不是 GLOBAL IN 或 GLOBAL JOIN,这样效率更高。 +- 使用大量服务器(上百或更多),但有大量小查询(个别客户的查询 - 网站,广告商或合作伙伴)。为了使小查询不影响整个集群,让单个客户的数据处于单个分片上是有意义的。或者,正如我们在 Yandex.Metrica 中所做的那样,你可以配置两级分片:将整个集群划分为«层»,一个层可以包含多个分片。单个客户的数据位于单个层上,根据需要将分片添加到层中,层中的数据随机分布。然后给每层创建分布式表,再创建一个全局的分布式表用于全局的查询。 数据是异步写入的。对于分布式表的 INSERT,数据块只写本地文件系统。之后会尽快地在后台发送到远程服务器。你可以通过查看表目录中的文件列表(等待发送的数据)来检查数据是否成功发送:/var/lib/clickhouse/data/database/table/ 。 -如果在 INSERT 到分布式表时服务器节点丢失或重启(如,设备故障),则插入的数据可能会丢失。如果在表目录中检测到损坏的数据分片,则会将其转移到“broken”子目录,并不再使用。 - -启用 max_parallel_replicas 选项后,会在分表的所有副本上并行查询处理。更多信息,请参阅“设置,max_parallel_replicas”部分。 +如果在 INSERT 到分布式表时服务器节点丢失或重启(如,设备故障),则插入的数据可能会丢失。如果在表目录中检测到损坏的数据分片,则会将其转移到«broken»子目录,并不再使用。 +启用 max\_parallel\_replicas 选项后,会在分表的所有副本上并行查询处理。更多信息,请参阅«设置,max\_parallel\_replicas»部分。 [Original article](https://clickhouse.tech/docs/en/operations/table_engines/distributed/) diff --git a/docs/zh/operations/table_engines/external_data.md b/docs/zh/operations/table_engines/external_data.md index 4c1d80183c4..d993a796e52 100644 --- a/docs/zh/operations/table_engines/external_data.md +++ b/docs/zh/operations/table_engines/external_data.md @@ -1,6 +1,6 @@ -# External Data for Query Processing +# External Data for Query Processing {#external-data-for-query-processing} -ClickHouse 允许向服务器发送处理查询所需的数据以及 SELECT 查询。这些数据放在一个临时表中(请参阅 "临时表" 一节),可以在查询中使用(例如,在 IN 操作符中)。 +ClickHouse 允许向服务器发送处理查询所需的数据以及 SELECT 查询。这些数据放在一个临时表中(请参阅 «临时表» 一节),可以在查询中使用(例如,在 IN 操作符中)。 例如,如果您有一个包含重要用户标识符的文本文件,则可以将其与使用此列表过滤的查询一起上传到服务器。 @@ -10,27 +10,27 @@ ClickHouse 允许向服务器发送处理查询所需的数据以及 SELECT 查 在命令行客户端中,您可以指定格式的参数部分 -```bash +``` bash --external --file=... [--name=...] [--format=...] [--types=...|--structure=...] 
``` 对于传输的表的数量,可能有多个这样的部分。 -**--external** – 标记子句的开始。 -**--file** – 带有表存储的文件的路径,或者,它指的是STDIN。 +**–external** – 标记子句的开始。 +**–file** – 带有表存储的文件的路径,或者,它指的是STDIN。 只能从 stdin 中检索单个表。 -以下的参数是可选的:**--name** – 表的名称,如果省略,则采用 _data。 -**--format** – 文件中的数据格式。 如果省略,则使用 TabSeparated。 +以下的参数是可选的:**–name** – 表的名称,如果省略,则采用 \_data。 +**–format** – 文件中的数据格式。 如果省略,则使用 TabSeparated。 -以下的参数必选一个:**--types** – 逗号分隔列类型的列表。例如:`UInt64,String`。列将被命名为 _1,_2,... -**--structure**– 表结构的格式 `UserID UInt64`,`URL String`。定义列的名字以及类型。 +以下的参数必选一个:**–types** – 逗号分隔列类型的列表。例如:`UInt64,String`。列将被命名为 \_1,\_2,… +**–structure**– 表结构的格式 `UserID UInt64`,`URL String`。定义列的名字以及类型。 -在 "file" 中指定的文件将由 "format" 中指定的格式解析,使用在 "types" 或 "structure" 中指定的数据类型。该表将被上传到服务器,并在作为名称为 "name"临时表。 +在 «file» 中指定的文件将由 «format» 中指定的格式解析,使用在 «types» 或 «structure» 中指定的数据类型。该表将被上传到服务器,并在作为名称为 «name»临时表。 示例: -```bash +``` bash echo -ne "1\n2\n3\n" | clickhouse-client --query="SELECT count() FROM test.visits WHERE TraficSourceID IN _data" --external --file=- --types=Int8 849897 cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, count() AS c FROM passwd GROUP BY shell ORDER BY c DESC" --external --file=- --name=passwd --structure='login String, unused String, uid UInt16, gid UInt16, comment String, home String, shell String' @@ -41,11 +41,11 @@ cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, coun /bin/sync 1 ``` -当使用HTTP接口时,外部数据以 multipart/form-data 格式传递。每个表作为一个单独的文件传输。表名取自文件名。"query_string" 传递参数 "name_format"、"name_types"和"name_structure",其中 "name" 是这些参数对应的表的名称。参数的含义与使用命令行客户端时的含义相同。 +当使用HTTP接口时,外部数据以 multipart/form-data 格式传递。每个表作为一个单独的文件传输。表名取自文件名。«query\_string» 传递参数 «name\_format»、«name\_types»和«name\_structure»,其中 «name» 是这些参数对应的表的名称。参数的含义与使用命令行客户端时的含义相同。 示例: -```bash +``` bash cat /etc/passwd | sed 's/:/\t/g' > passwd.tsv curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+count()+AS+c+FROM+passwd+GROUP+BY+shell+ORDER+BY+c+DESC&passwd_structure=login+String,+unused+String,+uid+UInt16,+gid+UInt16,+comment+String,+home+String,+shell+String' @@ -58,5 +58,4 @@ curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+count( 对于分布式查询,将临时表发送到所有远程服务器。 - [Original article](https://clickhouse.tech/docs/zh/operations/table_engines/external_data/) diff --git a/docs/zh/operations/table_engines/file.md b/docs/zh/operations/table_engines/file.md index dcdc091a84b..1109a6629a3 100644 --- a/docs/zh/operations/table_engines/file.md +++ b/docs/zh/operations/table_engines/file.md @@ -1,18 +1,16 @@ -# File(InputFormat) {#table_engines-file} +# File(InputFormat) {#table-engines-file} 数据源是以 Clickhouse 支持的一种输入格式(TabSeparated,Native等)存储数据的文件。 用法示例: -- 从 ClickHouse 导出数据到文件。 -- 将数据从一种格式转换为另一种格式。 -- 通过编辑磁盘上的文件来更新 ClickHouse 中的数据。 +- 从 ClickHouse 导出数据到文件。 +- 将数据从一种格式转换为另一种格式。 +- 通过编辑磁盘上的文件来更新 ClickHouse 中的数据。 -## 在 ClickHouse 服务器中的使用 +## 在 ClickHouse 服务器中的使用 {#zai-clickhouse-fu-wu-qi-zhong-de-shi-yong} -``` -File(Format) -``` + File(Format) 选用的 `Format` 需要支持 `INSERT` 或 `SELECT` 。有关支持格式的完整列表,请参阅 [格式](../../interfaces/formats.md#formats)。 @@ -22,7 +20,7 @@ ClickHouse 不支持给 `File` 指定文件系统路径。它使用服务器配 你也可以在服务器文件系统中手动创建这些子文件夹和文件,然后通过 [ATTACH](../../query_language/misc.md) 将其创建为具有对应名称的表,这样你就可以从该文件中查询数据了。 -!!! 注意 +!!! 
注意 "注意" 注意这个功能,因为 ClickHouse 不会跟踪这些文件在外部的更改。在 ClickHouse 中和 ClickHouse 外部同时写入会造成结果是不确定的。 **示例:** @@ -37,10 +35,10 @@ CREATE TABLE file_engine_table (name String, value UInt32) ENGINE=File(TabSepara **2.** 手动创建 `/var/lib/clickhouse/data/default/file_engine_table/data.TabSeparated` 文件,并且包含内容: -```bash +``` bash $ cat data.TabSeparated -one 1 -two 2 +one 1 +two 2 ``` **3.** 查询这些数据: @@ -49,29 +47,27 @@ two 2 SELECT * FROM file_engine_table ``` -``` -┌─name─┬─value─┐ -│ one │ 1 │ -│ two │ 2 │ -└──────┴───────┘ -``` + ┌─name─┬─value─┐ + │ one │ 1 │ + │ two │ 2 │ + └──────┴───────┘ -## 在 Clickhouse-local 中的使用 +## 在 Clickhouse-local 中的使用 {#zai-clickhouse-local-zhong-de-shi-yong} 使用 [clickhouse-local](../utils/clickhouse-local.md) 时,File 引擎除了 `Format` 之外,还可以接受文件路径参数。可以使用数字或人类可读的名称来指定标准输入/输出流,例如 `0` 或 `stdin`,`1` 或 `stdout`。 **例如:** -```bash +``` bash $ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin); SELECT a, b FROM table; DROP TABLE table" ``` -## 功能实现 +## 功能实现 {#gong-neng-shi-xian} -- 读操作可支持并发,但写操作不支持 -- 不支持: - - `ALTER` - - `SELECT ... SAMPLE` - - 索引 - - 副本 +- 读操作可支持并发,但写操作不支持 +- 不支持: + - `ALTER` + - `SELECT ... SAMPLE` + - 索引 + - 副本 [来源文章](https://clickhouse.tech/docs/en/operations/table_engines/file/) diff --git a/docs/zh/operations/table_engines/index.md b/docs/zh/operations/table_engines/index.md index 6ffc3149d39..9924a0b8b67 100644 --- a/docs/zh/operations/table_engines/index.md +++ b/docs/zh/operations/table_engines/index.md @@ -1,68 +1,73 @@ -# 表引擎 +# 表引擎 {#biao-yin-qing} 表引擎(即表的类型)决定了: -* 数据的存储方式和位置,写到哪里以及从哪里读取数据 -* 支持哪些查询以及如何支持。 -* 并发数据访问。 -* 索引的使用(如果存在)。 -* 是否可以执行多线程请求。 -* 数据复制参数。 +- 数据的存储方式和位置,写到哪里以及从哪里读取数据 +- 支持哪些查询以及如何支持。 +- 并发数据访问。 +- 索引的使用(如果存在)。 +- 是否可以执行多线程请求。 +- 数据复制参数。 -# 引擎类型 -## MergeTree +# 引擎类型 {#yin-qing-lei-xing} -适用于高负载任务的最通用和功能最强大的表引擎。这些引擎的共同特点是可以快速插入数据并进行后续的后台数据处理。 MergeTree系列引擎支持数据复制(使用[Replicated*](replication.md) 的引擎版本),分区和一些其他引擎不支持的其他功能。 +## MergeTree {#mergetree} + +适用于高负载任务的最通用和功能最强大的表引擎。这些引擎的共同特点是可以快速插入数据并进行后续的后台数据处理。 MergeTree系列引擎支持数据复制(使用[Replicated\*](replication.md) 的引擎版本),分区和一些其他引擎不支持的其他功能。 该类型的引擎: -* [MergeTree](mergetree.md) -* [ReplacingMergeTree](replacingmergetree.md) -* [SummingMergeTree](summingmergetree.md) -* [AggregatingMergeTree](aggregatingmergetree.md) -* [CollapsingMergeTree](collapsingmergetree.md) -* [VersionedCollapsingMergeTree](versionedcollapsingmergetree.md) -* [GraphiteMergeTree](graphitemergetree.md) +\* [MergeTree](mergetree.md) +\* [ReplacingMergeTree](replacingmergetree.md) +\* [SummingMergeTree](summingmergetree.md) +\* [AggregatingMergeTree](aggregatingmergetree.md) +\* [CollapsingMergeTree](collapsingmergetree.md) +\* [VersionedCollapsingMergeTree](versionedcollapsingmergetree.md) +\* [GraphiteMergeTree](graphitemergetree.md) + +## Log {#log} -## Log 具有最小功能的[轻量级引擎](log_family.md)。当您需要快速写入许多小表(最多约100万行)并在以后整体读取它们时,该类型的引擎是最有效的。 该类型的引擎: -* [TinyLog](tinylog/) -* [StripeLog](stripelog/) -* [Log](log/) +- \[TinyLog\](tinylog/) +- \[StripeLog\](stripelog/) +- [Log](#log)(log/) + +## Intergation engines {#intergation-engines} -## Intergation engines 用于与其他的数据存储与处理系统集成的引擎。 该类型的引擎: -* [Kafka](kafka.md) -* [MySQL](mysql.md) -* [ODBC](odbc.md) -* [JDBC](jdbc.md) -* [HDFS](hdfs.md) +- [Kafka](kafka.md) +- [MySQL](mysql.md) +- [ODBC](odbc.md) +- [JDBC](jdbc.md) +- [HDFS](hdfs.md) + +## 用于其他特定功能的引擎 {#yong-yu-qi-ta-te-ding-gong-neng-de-yin-qing} -## 用于其他特定功能的引擎 该类型的引擎: -* [Distributed](distributed.md) -* [MaterializedView](materializedview.md) -* [Dictionary](dictionary.md) -* 
[Merge](merge.md) -* [File](file.md) -* [Null](null.md) -* [Set](set.md) -* [Join](join.md) -* [URL](url.md) -* [View](view.md) -* [Memory](memory.md) -* [Buffer](buffer.md) +- [Distributed](distributed.md) +- [MaterializedView](materializedview.md) +- [Dictionary](dictionary.md) +- [Merge](merge.md) +- [File](file.md) +- [Null](null.md) +- [Set](set.md) +- [Join](join.md) +- [URL](url.md) +- [View](view.md) +- [Memory](memory.md) +- [Buffer](buffer.md) + +# 虚拟列 {#xu-ni-lie} -#虚拟列 虚拟列是表引擎组成的一部分,它在对应的表引擎的源代码中定义。 您不能在 `CREATE TABLE` 中指定虚拟列,并且虚拟列不会包含在 `SHOW CREATE TABLE` 和 `DESCRIBE TABLE` 的查询结果中。虚拟列是只读的,所以您不能向虚拟列中写入数据。 -如果想要查询虚拟列中的数据,您必须在SELECT查询中包含虚拟列的名字。SELECT * 不会返回虚拟列的内容。 +如果想要查询虚拟列中的数据,您必须在SELECT查询中包含虚拟列的名字。SELECT \* 不会返回虚拟列的内容。 若您创建的表中有一列与虚拟列的名字相同,那么虚拟列将不能再被访问。我们不建议您这样做。为了避免这种列名的冲突,虚拟列的名字一般都以下划线开头。 diff --git a/docs/zh/operations/table_engines/join.md b/docs/zh/operations/table_engines/join.md index 397506daaec..024530cc0b7 100644 --- a/docs/zh/operations/table_engines/join.md +++ b/docs/zh/operations/table_engines/join.md @@ -1,10 +1,8 @@ -# Join +# Join {#join} 加载好的 JOIN 表数据会常驻内存中。 -``` -Join(ANY|ALL, LEFT|INNER, k1[, k2, ...]) -``` + Join(ANY|ALL, LEFT|INNER, k1[, k2, ...]) 引擎参数:`ANY|ALL` – 连接修饰;`LEFT|INNER` – 连接类型。更多信息可参考 [JOIN子句](../../query_language/select.md#select-join)。 这些参数设置不用带引号,但必须与要 JOIN 表匹配。 k1,k2,……是 USING 子句中要用于连接的关键列。 @@ -19,12 +17,12 @@ Join(ANY|ALL, LEFT|INNER, k1[, k2, ...]) When creating a table, the following settings are applied: -- join_use_nulls -- max_rows_in_join -- max_bytes_in_join -- join_overflow_mode -- join_any_take_last_row +- join\_use\_nulls +- max\_rows\_in\_join +- max\_bytes\_in\_join +- join\_overflow\_mode +- join\_any\_take\_last\_row -The `Join`-engine tables can't be used in `GLOBAL JOIN` operations. +The `Join`-engine tables can’t be used in `GLOBAL JOIN` operations. 
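A minimal sketch of creating and using such a table (not part of the original patch; the `visits` table is an assumption):

``` sql
CREATE TABLE user_names
(
    UserID UInt64,
    Name   String
)
ENGINE = Join(ANY, LEFT, UserID);

INSERT INTO user_names VALUES (1, 'Alice'), (2, 'Bob');

-- The strictness (ANY) and type (LEFT) of the join must match the engine parameters.
SELECT UserID, Name
FROM visits
ANY LEFT JOIN user_names USING (UserID)
```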
[来源文章](https://clickhouse.tech/docs/en/operations/table_engines/join/) diff --git a/docs/zh/operations/table_engines/kafka.md b/docs/zh/operations/table_engines/kafka.md index de70c4a6d35..e992a76519e 100644 --- a/docs/zh/operations/table_engines/kafka.md +++ b/docs/zh/operations/table_engines/kafka.md @@ -1,47 +1,43 @@ -# Kafka +# Kafka {#kafka} 此引擎与 [Apache Kafka](http://kafka.apache.org/) 结合使用。 Kafka 特性: -- 发布或者订阅数据流。 -- 容错存储机制。 -- 处理流数据。 +- 发布或者订阅数据流。 +- 容错存储机制。 +- 处理流数据。 老版格式: -``` -Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format - [, kafka_row_delimiter, kafka_schema, kafka_num_consumers]) -``` + Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format + [, kafka_row_delimiter, kafka_schema, kafka_num_consumers]) 新版格式: -``` -Kafka SETTINGS - kafka_broker_list = 'localhost:9092', - kafka_topic_list = 'topic1,topic2', - kafka_group_name = 'group1', - kafka_format = 'JSONEachRow', - kafka_row_delimiter = '\n', - kafka_schema = '', - kafka_num_consumers = 2 -``` + Kafka SETTINGS + kafka_broker_list = 'localhost:9092', + kafka_topic_list = 'topic1,topic2', + kafka_group_name = 'group1', + kafka_format = 'JSONEachRow', + kafka_row_delimiter = '\n', + kafka_schema = '', + kafka_num_consumers = 2 必要参数: -- `kafka_broker_list` – 以逗号分隔的 brokers 列表 (`localhost:9092`)。 -- `kafka_topic_list` – topic 列表 (`my_topic`)。 -- `kafka_group_name` – Kafka 消费组名称 (`group1`)。如果不希望消息在集群中重复,请在每个分片中使用相同的组名。 -- `kafka_format` – 消息体格式。使用与 SQL 部分的 `FORMAT` 函数相同表示方法,例如 `JSONEachRow`。了解详细信息,请参考 `Formats` 部分。 +- `kafka_broker_list` – 以逗号分隔的 brokers 列表 (`localhost:9092`)。 +- `kafka_topic_list` – topic 列表 (`my_topic`)。 +- `kafka_group_name` – Kafka 消费组名称 (`group1`)。如果不希望消息在集群中重复,请在每个分片中使用相同的组名。 +- `kafka_format` – 消息体格式。使用与 SQL 部分的 `FORMAT` 函数相同表示方法,例如 `JSONEachRow`。了解详细信息,请参考 `Formats` 部分。 可选参数: -- `kafka_row_delimiter` - 每个消息体(记录)之间的分隔符。 -- `kafka_schema` – 如果解析格式需要一个 schema 时,此参数必填。例如,[Cap'n Proto](https://capnproto.org/) 需要 schema 文件路径以及根对象 `schema.capnp:Message` 的名字。 -- `kafka_num_consumers` – 单个表的消费者数量。默认值是:`1`,如果一个消费者的吞吐量不足,则指定更多的消费者。消费者的总数不应该超过 topic 中分区的数量,因为每个分区只能分配一个消费者。 +- `kafka_row_delimiter` - 每个消息体(记录)之间的分隔符。 +- `kafka_schema` – 如果解析格式需要一个 schema 时,此参数必填。例如,[Cap’n Proto](https://capnproto.org/) 需要 schema 文件路径以及根对象 `schema.capnp:Message` 的名字。 +- `kafka_num_consumers` – 单个表的消费者数量。默认值是:`1`,如果一个消费者的吞吐量不足,则指定更多的消费者。消费者的总数不应该超过 topic 中分区的数量,因为每个分区只能分配一个消费者。 示例: @@ -75,13 +71,13 @@ Kafka SETTINGS 消费的消息会被自动追踪,因此每个消息在不同的消费组里只会记录一次。如果希望获得两次数据,则使用另一个组名创建副本。 -消费组可以灵活配置并且在集群之间同步。例如,如果群集中有10个主题和5个表副本,则每个副本将获得2个主题。 如果副本数量发生变化,主题将自动在副本中重新分配。了解更多信息请访问 [http://kafka.apache.org/intro](http://kafka.apache.org/intro)。 +消费组可以灵活配置并且在集群之间同步。例如,如果群集中有10个主题和5个表副本,则每个副本将获得2个主题。 如果副本数量发生变化,主题将自动在副本中重新分配。了解更多信息请访问 http://kafka.apache.org/intro。 `SELECT` 查询对于读取消息并不是很有用(调试除外),因为每条消息只能被读取一次。使用物化视图创建实时线程更实用。您可以这样做: -1. 使用引擎创建一个 Kafka 消费者并作为一条数据流。 -2. 创建一个结构表。 -3. 创建物化视图,改视图会在后台转换引擎中的数据并将其放入之前创建的表中。 +1. 使用引擎创建一个 Kafka 消费者并作为一条数据流。 +2. 创建一个结构表。 +3. 
创建物化视图,改视图会在后台转换引擎中的数据并将其放入之前创建的表中。 当 `MATERIALIZED VIEW` 添加至引擎,它将会在后台收集数据。可以持续不断地从 Kafka 收集数据并通过 `SELECT` 将数据转换为所需要的格式。 @@ -107,22 +103,20 @@ Kafka SETTINGS SELECT level, sum(total) FROM daily GROUP BY level; ``` -为了提高性能,接受的消息被分组为 [max_insert_block_size](../settings/settings.md#settings-max_insert_block_size) 大小的块。如果未在 [stream_flush_interval_ms](../settings/settings.md) 毫秒内形成块,则不关心块的完整性,都会将数据刷新到表中。 +为了提高性能,接受的消息被分组为 [max\_insert\_block\_size](../settings/settings.md#settings-max_insert_block_size) 大小的块。如果未在 [stream\_flush\_interval\_ms](../settings/settings.md) 毫秒内形成块,则不关心块的完整性,都会将数据刷新到表中。 停止接收主题数据或更改转换逻辑,请 detach 物化视图: -``` - DETACH TABLE consumer; - ATTACH MATERIALIZED VIEW consumer; -``` + DETACH TABLE consumer; + ATTACH MATERIALIZED VIEW consumer; 如果使用 `ALTER` 更改目标表,为了避免目标表与视图中的数据之间存在差异,推荐停止物化视图。 -## 配置 +## 配置 {#pei-zhi} 与 `GraphiteMergeTree` 类似,Kafka 引擎支持使用ClickHouse配置文件进行扩展配置。可以使用两个配置键:全局 (`kafka`) 和 主题级别 (`kafka_*`)。首先应用全局配置,然后应用主题级配置(如果存在)。 -```xml +``` xml cgrp diff --git a/docs/zh/operations/table_engines/log.md b/docs/zh/operations/table_engines/log.md index 47ad15f7ea8..852575181cd 100644 --- a/docs/zh/operations/table_engines/log.md +++ b/docs/zh/operations/table_engines/log.md @@ -1,7 +1,5 @@ -# Log - -日志与 TinyLog 的不同之处在于,"标记" 的小文件与列文件存在一起。这些标记写在每个数据块上,并且包含偏移量,这些偏移量指示从哪里开始读取文件以便跳过指定的行数。这使得可以在多个线程中读取表数据。对于并发数据访问,可以同时执行读取操作,而写入操作则阻塞读取和其它写入。Log 引擎不支持索引。同样,如果写入表失败,则该表将被破坏,并且从该表读取将返回错误。Log 引擎适用于临时数据,write-once 表以及测试或演示目的。 +# Log {#log} +日志与 TinyLog 的不同之处在于,«标记» 的小文件与列文件存在一起。这些标记写在每个数据块上,并且包含偏移量,这些偏移量指示从哪里开始读取文件以便跳过指定的行数。这使得可以在多个线程中读取表数据。对于并发数据访问,可以同时执行读取操作,而写入操作则阻塞读取和其它写入。Log 引擎不支持索引。同样,如果写入表失败,则该表将被破坏,并且从该表读取将返回错误。Log 引擎适用于临时数据,write-once 表以及测试或演示目的。 [Original article](https://clickhouse.tech/docs/zh/operations/table_engines/log/) - diff --git a/docs/zh/operations/table_engines/log_family.md b/docs/zh/operations/table_engines/log_family.md index 0726040ff07..407c3c3a0fb 100644 --- a/docs/zh/operations/table_engines/log_family.md +++ b/docs/zh/operations/table_engines/log_family.md @@ -1,39 +1,42 @@ -# 日志引擎系列 {#table_engines-log_engine_family} +# 日志引擎系列 {#table-engines-log-engine-family} 这些引擎是为了需要写入许多小数据量(少于一百万行)的表的场景而开发的。 这系列的引擎有: -- [StripeLog](stripelog.md) -- [Log](log.md) -- [TinyLog](tinylog.md) +- [StripeLog](stripelog.md) +- [Log](log.md) +- [TinyLog](tinylog.md) -## 共同属性 {#table_engines-log_engine_family-common_properties} +## 共同属性 {#table-engines-log-engine-family-common-properties} 引擎: -- 数据存储在磁盘上。 -- 写入时将数据追加在文件末尾。 -- 不支持[突变](../../query_language/alter.md#alter-mutations)操作。 -- 不支持索引。 +- 数据存储在磁盘上。 - 这意味着 `SELECT` 在范围查询时效率不高。 - -- 非原子地写入数据。 +- 写入时将数据追加在文件末尾。 - 如果某些事情破坏了写操作,例如服务器的异常关闭,你将会得到一张包含了损坏数据的表。 +- 不支持[突变](../../query_language/alter.md#alter-mutations)操作。 -## 差异 {#table_engines-log_engine_family-differences} +- 不支持索引。 + + 这意味着 `SELECT` 在范围查询时效率不高。 + +- 非原子地写入数据。 + + 如果某些事情破坏了写操作,例如服务器的异常关闭,你将会得到一张包含了损坏数据的表。 + +## 差异 {#table-engines-log-engine-family-differences} `Log` 和 `StripeLog` 引擎支持: -- 并发访问数据的锁。 - - `INSERT` 请求执行过程中表会被锁定,并且其他的读写数据的请求都会等待直到锁定被解除。如果没有写数据的请求,任意数量的读请求都可以并发执行。 +- 并发访问数据的锁。 -- 并行读取数据。 + `INSERT` 请求执行过程中表会被锁定,并且其他的读写数据的请求都会等待直到锁定被解除。如果没有写数据的请求,任意数量的读请求都可以并发执行。 - 在读取数据时,ClickHouse 使用多线程。 每个线程处理不同的数据块。 +- 并行读取数据。 + + 在读取数据时,ClickHouse 使用多线程。 每个线程处理不同的数据块。 `Log` 引擎为表中的每一列使用不同的文件。`StripeLog` 将所有的数据存储在一个文件中。因此 `StripeLog` 引擎在操作系统中使用更少的描述符,但是 `Log` 引擎提供更高的读性能。 diff --git a/docs/zh/operations/table_engines/materializedview.md b/docs/zh/operations/table_engines/materializedview.md index c0c6aee88e5..b22700fe3c6 100644 --- 
a/docs/zh/operations/table_engines/materializedview.md +++ b/docs/zh/operations/table_engines/materializedview.md @@ -1,6 +1,5 @@ -# 物化视图 +# 物化视图 {#wu-hua-shi-tu} 物化视图的使用(更多信息请参阅 [CREATE TABLE](../../query_language/create.md) )。它需要使用一个不同的引擎来存储数据,这个引擎要在创建物化视图时指定。当从表中读取时,它就会使用该引擎。 - [来源文章](https://clickhouse.tech/docs/en/operations/table_engines/materializedview/) diff --git a/docs/zh/operations/table_engines/memory.md b/docs/zh/operations/table_engines/memory.md index 7d2d6dc0f60..a48308f7b17 100644 --- a/docs/zh/operations/table_engines/memory.md +++ b/docs/zh/operations/table_engines/memory.md @@ -1,9 +1,7 @@ -# Memory +# Memory {#memory} Memory 引擎以未压缩的形式将数据存储在 RAM 中。数据完全以读取时获得的形式存储。换句话说,从这张表中读取是很轻松的。并发数据访问是同步的。锁范围小:读写操作不会相互阻塞。不支持索引。阅读是并行化的。在简单查询上达到最大生产率(超过10 GB /秒),因为没有磁盘读取,不需要解压缩或反序列化数据。(值得注意的是,在许多情况下,与 MergeTree 引擎的性能几乎一样高)。重新启动服务器时,表中的数据消失,表将变为空。通常,使用此表引擎是不合理的。但是,它可用于测试,以及在相对较少的行(最多约100,000,000)上需要最高性能的查询。 -Memory 引擎是由系统用于临时表进行外部数据的查询(请参阅 "外部数据用于请求处理" 部分),以及用于实现 `GLOBAL IN`(请参见 "IN 运算符" 部分)。 - +Memory 引擎是由系统用于临时表进行外部数据的查询(请参阅 «外部数据用于请求处理» 部分),以及用于实现 `GLOBAL IN`(请参见 «IN 运算符» 部分)。 [Original article](https://clickhouse.tech/docs/zh/operations/table_engines/memory/) - diff --git a/docs/zh/operations/table_engines/merge.md b/docs/zh/operations/table_engines/merge.md index 722f510d135..bbcbf8772b4 100644 --- a/docs/zh/operations/table_engines/merge.md +++ b/docs/zh/operations/table_engines/merge.md @@ -1,4 +1,4 @@ -# Merge +# Merge {#merge} `Merge` 引擎 (不要跟 `MergeTree` 引擎混淆) 本身不存储数据,但可用于同时从任意多个其他的表中读取数据。 读是自动并行的,不支持写入。读取时,那些被真正读取到数据的表的索引(如果有的话)会被使用。 @@ -6,16 +6,14 @@ 示例: -``` -Merge(hits, '^WatchLog') -``` + Merge(hits, '^WatchLog') -数据会从 `hits` 数据库中表名匹配正则 '`^WatchLog`' 的表中读取。 +数据会从 `hits` 数据库中表名匹配正则 ‘`^WatchLog`’ 的表中读取。 除了数据库名,你也可以用一个返回字符串的常量表达式。例如, `currentDatabase()` 。 正则表达式 — [re2](https://github.com/google/re2) (支持 PCRE 一个子集的功能),大小写敏感。 -了解关于正则表达式中转义字符的说明可参看 "match" 一节。 +了解关于正则表达式中转义字符的说明可参看 «match» 一节。 当选择需要读的表时,`Merge` 表本身会被排除,即使它匹配上了该正则。这样设计为了避免循环。 当然,是能够创建两个相互无限递归读取对方数据的 `Merge` 表的,但这并没有什么意义。 @@ -24,46 +22,42 @@ Merge(hits, '^WatchLog') 示例 2 : -我们假定你有一个旧表(WatchLog_old),你想改变数据分区了,但又不想把旧数据转移到新表(WatchLog_new)里,并且你需要同时能看到这两个表的数据。 +我们假定你有一个旧表(WatchLog\_old),你想改变数据分区了,但又不想把旧数据转移到新表(WatchLog\_new)里,并且你需要同时能看到这两个表的数据。 -``` -CREATE TABLE WatchLog_old(date Date, UserId Int64, EventType String, Cnt UInt64) -ENGINE=MergeTree(date, (UserId, EventType), 8192); -INSERT INTO WatchLog_old VALUES ('2018-01-01', 1, 'hit', 3); + CREATE TABLE WatchLog_old(date Date, UserId Int64, EventType String, Cnt UInt64) + ENGINE=MergeTree(date, (UserId, EventType), 8192); + INSERT INTO WatchLog_old VALUES ('2018-01-01', 1, 'hit', 3); -CREATE TABLE WatchLog_new(date Date, UserId Int64, EventType String, Cnt UInt64) -ENGINE=MergeTree PARTITION BY date ORDER BY (UserId, EventType) SETTINGS index_granularity=8192; -INSERT INTO WatchLog_new VALUES ('2018-01-02', 2, 'hit', 3); + CREATE TABLE WatchLog_new(date Date, UserId Int64, EventType String, Cnt UInt64) + ENGINE=MergeTree PARTITION BY date ORDER BY (UserId, EventType) SETTINGS index_granularity=8192; + INSERT INTO WatchLog_new VALUES ('2018-01-02', 2, 'hit', 3); -CREATE TABLE WatchLog as WatchLog_old ENGINE=Merge(currentDatabase(), '^WatchLog'); + CREATE TABLE WatchLog as WatchLog_old ENGINE=Merge(currentDatabase(), '^WatchLog'); -SELECT * -FROM WatchLog + SELECT * + FROM WatchLog -┌───────date─┬─UserId─┬─EventType─┬─Cnt─┐ -│ 2018-01-01 │ 1 │ hit │ 3 │ -└────────────┴────────┴───────────┴─────┘ -┌───────date─┬─UserId─┬─EventType─┬─Cnt─┐ -│ 2018-01-02 │ 2 │ hit │ 3 │ 
-└────────────┴────────┴───────────┴─────┘ + ┌───────date─┬─UserId─┬─EventType─┬─Cnt─┐ + │ 2018-01-01 │ 1 │ hit │ 3 │ + └────────────┴────────┴───────────┴─────┘ + ┌───────date─┬─UserId─┬─EventType─┬─Cnt─┐ + │ 2018-01-02 │ 2 │ hit │ 3 │ + └────────────┴────────┴───────────┴─────┘ -``` - -## 虚拟列 +## 虚拟列 {#xu-ni-lie} 虚拟列是一种由表引擎提供而不是在表定义中的列。换种说法就是,这些列并没有在 `CREATE TABLE` 中指定,但可以在 `SELECT` 中使用。 下面列出虚拟列跟普通列的不同点: -- 虚拟列不在表结构定义里指定。 -- 不能用 `INSERT` 向虚拟列写数据。 -- 使用不指定列名的 `INSERT` 语句时,虚拟列要会被忽略掉。 -- 使用星号通配符( `SELECT *` )时虚拟列不会包含在里面。 -- 虚拟列不会出现在 `SHOW CREATE TABLE` 和 `DESC TABLE` 的查询结果里。 +- 虚拟列不在表结构定义里指定。 +- 不能用 `INSERT` 向虚拟列写数据。 +- 使用不指定列名的 `INSERT` 语句时,虚拟列要会被忽略掉。 +- 使用星号通配符( `SELECT *` )时虚拟列不会包含在里面。 +- 虚拟列不会出现在 `SHOW CREATE TABLE` 和 `DESC TABLE` 的查询结果里。 `Merge` 类型的表包括一个 `String` 类型的 `_table` 虚拟列。(如果该表本来已有了一个 `_table` 的列,那这个虚拟列会命名为 `_table1` ;如果 `_table1` 也本就存在了,那这个虚拟列会被命名为 `_table2` ,依此类推)该列包含被读数据的表名。 如果 `WHERE/PREWHERE` 子句包含了带 `_table` 的条件,并且没有依赖其他的列(如作为表达式谓词链接的一个子项或作为整个的表达式),这些条件的作用会像索引一样。这些条件会在那些可能被读数据的表的表名上执行,并且读操作只会在那些满足了该条件的表上去执行。 - [来源文章](https://clickhouse.tech/docs/en/operations/table_engines/merge/) diff --git a/docs/zh/operations/table_engines/mergetree.md b/docs/zh/operations/table_engines/mergetree.md index c1705029e2e..dcb6254b0f1 100644 --- a/docs/zh/operations/table_engines/mergetree.md +++ b/docs/zh/operations/table_engines/mergetree.md @@ -1,4 +1,4 @@ -# MergeTree {#table_engines-mergetree} +# MergeTree {#table-engines-mergetree} Clickhouse 中最强大的表引擎当属 `MergeTree` (合并树)引擎及该系列(`*MergeTree`)中的其他引擎。 @@ -6,43 +6,40 @@ Clickhouse 中最强大的表引擎当属 `MergeTree` (合并树)引擎及 主要特点: -- 存储的数据按主键排序。 +- 存储的数据按主键排序。 - 这让你可以创建一个用于快速检索数据的小稀疏索引。 + 这让你可以创建一个用于快速检索数据的小稀疏索引。 -- 允许使用分区,如果指定了 [分区键](custom_partitioning_key.md) 的话。 +- 允许使用分区,如果指定了 [分区键](custom_partitioning_key.md) 的话。 - 在相同数据集和相同结果集的情况下 ClickHouse 中某些带分区的操作会比普通操作更快。查询中指定了分区键时 ClickHouse 会自动截取分区数据。这也有效增加了查询性能。 + 在相同数据集和相同结果集的情况下 ClickHouse 中某些带分区的操作会比普通操作更快。查询中指定了分区键时 ClickHouse 会自动截取分区数据。这也有效增加了查询性能。 -- 支持数据副本。 +- 支持数据副本。 - `ReplicatedMergeTree` 系列的表便是用于此。更多信息,请参阅 [数据副本](replication.md) 一节。 + `ReplicatedMergeTree` 系列的表便是用于此。更多信息,请参阅 [数据副本](replication.md) 一节。 -- 支持数据采样。 +- 支持数据采样。 - 需要的话,你可以给表设置一个采样方法。 + 需要的话,你可以给表设置一个采样方法。 -!!! 注意 +!!! 注意 "注意" [Merge](merge.md) 引擎并不属于 `*MergeTree` 系列。 +## 建表 {#table-engine-mergetree-creating-a-table} -## 建表 {#table_engine-mergetree-creating-a-table} - -``` -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], - ... - INDEX index_name1 expr1 TYPE type1(...) GRANULARITY value1, - INDEX index_name2 expr2 TYPE type2(...) GRANULARITY value2 -) ENGINE = MergeTree() -[PARTITION BY expr] -[ORDER BY expr] -[PRIMARY KEY expr] -[SAMPLE BY expr] -[SETTINGS name=value, ...] -``` + CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] + ( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... + INDEX index_name1 expr1 TYPE type1(...) GRANULARITY value1, + INDEX index_name2 expr2 TYPE type2(...) GRANULARITY value2 + ) ENGINE = MergeTree() + [PARTITION BY expr] + [ORDER BY expr] + [PRIMARY KEY expr] + [SAMPLE BY expr] + [SETTINGS name=value, ...] 请求参数的描述,参考 [请求描述](../../query_language/create.md) 。 @@ -50,42 +47,41 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **子句** -- `ENGINE` - 引擎名和参数。 `ENGINE = MergeTree()`. `MergeTree` 引擎没有参数。 +- `ENGINE` - 引擎名和参数。 `ENGINE = MergeTree()`. 
`MergeTree` 引擎没有参数。 -- `PARTITION BY` — [分区键](custom_partitioning_key.md) 。 +- `PARTITION BY` — [分区键](custom_partitioning_key.md) 。 - 要按月分区,可以使用表达式 `toYYYYMM(date_column)` ,这里的 `date_column` 是一个 [Date](../../data_types/date.md) 类型的列。这里该分区名格式会是 `"YYYYMM"` 这样。 + 要按月分区,可以使用表达式 `toYYYYMM(date_column)` ,这里的 `date_column` 是一个 [Date](../../data_types/date.md) 类型的列。这里该分区名格式会是 `"YYYYMM"` 这样。 -- `ORDER BY` — 表的排序键。 +- `ORDER BY` — 表的排序键。 - 可以是一组列的元组或任意的表达式。 例如: `ORDER BY (CounterID, EventDate)` 。 + 可以是一组列的元组或任意的表达式。 例如: `ORDER BY (CounterID, EventDate)` 。 -- `PRIMARY KEY` - 主键,如果要设成 [跟排序键不相同](mergetree.md)。 +- `PRIMARY KEY` - 主键,如果要设成 [跟排序键不相同](mergetree.md)。 - 默认情况下主键跟排序键(由 `ORDER BY` 子句指定)相同。 - 因此,大部分情况下不需要再专门指定一个 `PRIMARY KEY` 子句。 + 默认情况下主键跟排序键(由 `ORDER BY` 子句指定)相同。 + 因此,大部分情况下不需要再专门指定一个 `PRIMARY KEY` 子句。 -- `SAMPLE BY` — 用于抽样的表达式。 +- `SAMPLE BY` — 用于抽样的表达式。 - 如果要用抽样表达式,主键中必须包含这个表达式。例如: - `SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID))` 。 + 如果要用抽样表达式,主键中必须包含这个表达式。例如: + `SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID))` 。 -- `SETTINGS` — 影响 `MergeTree` 性能的额外参数: - - `index_granularity` — 索引粒度。即索引中相邻『标记』间的数据行数。默认值,8192 。该列表中所有可用的参数可以从这里查看 [MergeTreeSettings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Storages/MergeTree/MergeTreeSettings.h) 。 - - `index_granularity_bytes` — 索引粒度,以字节为单位,默认值: 10Mb。如果仅按数据行数限制索引粒度, 请设置为0(不建议)。 - - `enable_mixed_granularity_parts` — 启用或禁用通过 `index_granularity_bytes` 控制索引粒度的大小。在19.11版本之前, 只有 `index_granularity` 配置能够用于限制索引粒度的大小。当从大表(数十或数百兆)中查询数据时候,`index_granularity_bytes` 配置能够提升ClickHouse的性能。如果你的表内数据量很大,可以开启这项配置用以提升`SELECT` 查询的性能。 - - `use_minimalistic_part_header_in_zookeeper` — 数据片段头在 ZooKeeper 中的存储方式。如果设置了 `use_minimalistic_part_header_in_zookeeper=1` ,ZooKeeper 会存储更少的数据。更多信息参考『服务配置参数』这章中的 [设置描述](../server_settings/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) 。 - - `min_merge_bytes_to_use_direct_io` — 使用直接 I/O 来操作磁盘的合并操作时要求的最小数据量。合并数据片段时,ClickHouse 会计算要被合并的所有数据的总存储空间。如果大小超过了 `min_merge_bytes_to_use_direct_io` 设置的字节数,则 ClickHouse 将使用直接 I/O 接口(`O_DIRECT` 选项)对磁盘读写。如果设置 `min_merge_bytes_to_use_direct_io = 0` ,则会禁用直接 I/O。默认值:`10 * 1024 * 1024 * 1024` 字节。 - - - `merge_with_ttl_timeout` — TTL合并频率的最小间隔时间。默认值: 86400 (1 天)。 - - `write_final_mark` — 启用或禁用在数据片段尾部写入最终索引标记。默认值: 1(不建议更改)。 - - `storage_policy` — 存储策略。 参见 [使用多个区块装置进行数据存储](#table_engine-mergetree-multiple-volumes). 
+- `SETTINGS` — 影响 `MergeTree` 性能的额外参数: + + - `index_granularity` — 索引粒度。即索引中相邻『标记』间的数据行数。默认值,8192 。该列表中所有可用的参数可以从这里查看 [MergeTreeSettings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Storages/MergeTree/MergeTreeSettings.h) 。 + - `index_granularity_bytes` — 索引粒度,以字节为单位,默认值: 10Mb。如果仅按数据行数限制索引粒度, 请设置为0(不建议)。 + - `enable_mixed_granularity_parts` — 启用或禁用通过 `index_granularity_bytes` 控制索引粒度的大小。在19.11版本之前, 只有 `index_granularity` 配置能够用于限制索引粒度的大小。当从大表(数十或数百兆)中查询数据时候,`index_granularity_bytes` 配置能够提升ClickHouse的性能。如果你的表内数据量很大,可以开启这项配置用以提升`SELECT` 查询的性能。 + - `use_minimalistic_part_header_in_zookeeper` — 数据片段头在 ZooKeeper 中的存储方式。如果设置了 `use_minimalistic_part_header_in_zookeeper=1` ,ZooKeeper 会存储更少的数据。更多信息参考『服务配置参数』这章中的 [设置描述](../server_settings/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) 。 + - `min_merge_bytes_to_use_direct_io` — 使用直接 I/O 来操作磁盘的合并操作时要求的最小数据量。合并数据片段时,ClickHouse 会计算要被合并的所有数据的总存储空间。如果大小超过了 `min_merge_bytes_to_use_direct_io` 设置的字节数,则 ClickHouse 将使用直接 I/O 接口(`O_DIRECT` 选项)对磁盘读写。如果设置 `min_merge_bytes_to_use_direct_io = 0` ,则会禁用直接 I/O。默认值:`10 * 1024 * 1024 * 1024` 字节。 + + - `merge_with_ttl_timeout` — TTL合并频率的最小间隔时间。默认值: 86400 (1 天)。 + - `write_final_mark` — 启用或禁用在数据片段尾部写入最终索引标记。默认值: 1(不建议更改)。 + - `storage_policy` — 存储策略。 参见 [使用多个区块装置进行数据存储](#table_engine-mergetree-multiple-volumes). **示例配置** -``` -ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity=8192 -``` + ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity=8192 示例中,我们设为按月分区。 @@ -93,32 +89,30 @@ ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDa `index_granularity` 可省略,默认值为 8192 。 -
已弃用的建表方法 +
+ +已弃用的建表方法 !!! attention "注意" 不要在新版项目中使用该方法,可能的话,请将旧项目切换到上述方法。 -``` -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], - ... -) ENGINE [=] MergeTree(date-column [, sampling_expression], (primary, key), index_granularity) -``` + CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] + ( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... + ) ENGINE [=] MergeTree(date-column [, sampling_expression], (primary, key), index_granularity) **MergeTree() 参数** -- `date-column` — 类型为 [Date](../../data_types/date.md) 的列名。ClickHouse 会自动依据这个列按月创建分区。分区名格式为 `"YYYYMM"` 。 -- `sampling_expression` — 采样表达式。 -- `(primary, key)` — 主键。类型 — [Tuple()](../../data_types/tuple.md) -- `index_granularity` — 索引粒度。即索引中相邻『标记』间的数据行数。设为 8192 可以适用大部分场景。 +- `date-column` — 类型为 [Date](../../data_types/date.md) 的列名。ClickHouse 会自动依据这个列按月创建分区。分区名格式为 `"YYYYMM"` 。 +- `sampling_expression` — 采样表达式。 +- `(primary, key)` — 主键。类型 — [Tuple()](../../data_types/tuple.md) +- `index_granularity` — 索引粒度。即索引中相邻『标记』间的数据行数。设为 8192 可以适用大部分场景。 **示例** -``` -MergeTree(EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID)), 8192) -``` + MergeTree(EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID)), 8192) 对于主要的配置方法,这里 `MergeTree` 引擎跟前面的例子一样,可以以同样的方式配置。
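To see how the clauses of the current syntax fit together in one complete statement, the sketch below combines the example configuration shown earlier into a full `CREATE TABLE`. It is illustrative only: the table name `hits_example` and the `URL` column are placeholders added here and do not come from the documentation above; the partitioning, ordering, sampling and settings clauses are the ones from the example configuration.

``` sql
-- Illustrative sketch only: `hits_example` and the `URL` column are hypothetical.
CREATE TABLE hits_example
(
    EventDate Date,
    CounterID UInt32,
    UserID    UInt64,
    URL       String
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(EventDate)                    -- monthly partitions
ORDER BY (CounterID, EventDate, intHash32(UserID))  -- sorting key (also the primary key here)
SAMPLE BY intHash32(UserID)                         -- sampling expression; must be part of the key above
SETTINGS index_granularity = 8192;
```

Note that the `SAMPLE BY` expression also appears inside the `ORDER BY` tuple, as required by the description of the sampling clause above.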
@@ -135,24 +129,22 @@ ClickHouse 会为每个数据片段创建一个索引文件,索引文件包含 你可以只用一单一大表并不断地一块块往里面加入数据 – `MergeTree` 引擎的就是为了这样的场景。 -## 主键和索引在查询中的表现 {#primary-keys-and-indexes-in-queries} +## 主键和索引在查询中的表现 {#primary-keys-and-indexes-in-queries} 我们以 `(CounterID, Date)` 以主键。排序好的索引的图示会是下面这样: -``` -全部数据 : [-------------------------------------------------------------------------] -CounterID: [aaaaaaaaaaaaaaaaaabbbbcdeeeeeeeeeeeeefgggggggghhhhhhhhhiiiiiiiiikllllllll] -Date: [1111111222222233331233211111222222333211111112122222223111112223311122333] -标记: | | | | | | | | | | | - a,1 a,2 a,3 b,3 e,2 e,3 g,1 h,2 i,1 i,3 l,3 -标记号: 0 1 2 3 4 5 6 7 8 9 10 -``` + 全部数据 : [-------------------------------------------------------------------------] + CounterID: [aaaaaaaaaaaaaaaaaabbbbcdeeeeeeeeeeeeefgggggggghhhhhhhhhiiiiiiiiikllllllll] + Date: [1111111222222233331233211111222222333211111112122222223111112223311122333] + 标记: | | | | | | | | | | | + a,1 a,2 a,3 b,3 e,2 e,3 g,1 h,2 i,1 i,3 l,3 + 标记号: 0 1 2 3 4 5 6 7 8 9 10 如果指定查询如下: -- `CounterID in ('a', 'h')`,服务器会读取标记号在 `[0, 3)` 和 `[6, 8)` 区间中的数据。 -- `CounterID IN ('a', 'h') AND Date = 3`,服务器会读取标记号在 `[1, 3)` 和 `[7, 8)` 区间中的数据。 -- `Date = 3`,服务器会读取标记号在 `[1, 10]` 区间中的数据。 +- `CounterID in ('a', 'h')`,服务器会读取标记号在 `[0, 3)` 和 `[6, 8)` 区间中的数据。 +- `CounterID IN ('a', 'h') AND Date = 3`,服务器会读取标记号在 `[1, 3)` 和 `[7, 8)` 区间中的数据。 +- `Date = 3`,服务器会读取标记号在 `[1, 10]` 区间中的数据。 上面例子可以看出使用索引通常会比全表描述要高效。 @@ -162,28 +154,27 @@ Date: [111111122222223333123321111122222233321111111212222222311111222 ClickHouse 不要求主键惟一。所以,你可以插入多条具有相同主键的行。 -### 主键的选择 +### 主键的选择 {#zhu-jian-de-xuan-ze} 主键中列的数量并没有明确的限制。依据数据结构,你应该让主键包含多些或少些列。这样可以: -- 改善索引的性能。 +- 改善索引的性能。 - 如果当前主键是 `(a, b)` ,然后加入另一个 `c` 列,满足下面条件时,则可以改善性能: - - 有带有 `c` 列条件的查询。 - - 很长的数据范围( `index_granularity` 的数倍)里 `(a, b)` 都是相同的值,并且这种的情况很普遍。换言之,就是加入另一列后,可以让你的查询略过很长的数据范围。 + 如果当前主键是 `(a, b)` ,然后加入另一个 `c` 列,满足下面条件时,则可以改善性能: + - 有带有 `c` 列条件的查询。 + - 很长的数据范围( `index_granularity` 的数倍)里 `(a, b)` 都是相同的值,并且这种的情况很普遍。换言之,就是加入另一列后,可以让你的查询略过很长的数据范围。 -- 改善数据压缩。 +- 改善数据压缩。 - ClickHouse 以主键排序片段数据,所以,数据的一致性越高,压缩越好。 + ClickHouse 以主键排序片段数据,所以,数据的一致性越高,压缩越好。 -- [CollapsingMergeTree](collapsingmergetree.md#table_engine-collapsingmergetree) 和 [SummingMergeTree](summingmergetree.md) 引擎里,数据合并时,会有额外的处理逻辑。 +- [CollapsingMergeTree](collapsingmergetree.md#table_engine-collapsingmergetree) 和 [SummingMergeTree](summingmergetree.md) 引擎里,数据合并时,会有额外的处理逻辑。 - 在这种情况下,指定一个跟主键不同的 *排序键* 也是有意义的。 + 在这种情况下,指定一个跟主键不同的 *排序键* 也是有意义的。 长的主键会对插入性能和内存消耗有负面影响,但主键中额外的列并不影响 `SELECT` 查询的性能。 - -### 选择跟排序键不一样主键 +### 选择跟排序键不一样主键 {#xuan-ze-gen-pai-xu-jian-bu-yi-yang-zhu-jian} 指定一个跟排序键(用于排序数据片段中行的表达式) 不一样的主键(用于计算写到索引文件的每个标记值的表达式)是可以的。 @@ -202,7 +193,7 @@ ClickHouse 不要求主键惟一。所以,你可以插入多条具有相同主 [排序键的修改](../../query_language/alter.md) 是轻量级的操作,因为一个新列同时被加入到表里和排序键后时,已存在的数据片段并不需要修改。由于旧的排序键是新排序键的前缀,并且刚刚添加的列中没有数据,因此在表修改时的数据对于新旧的排序键来说都是有序的。 -### 索引和分区在查询中的应用 +### 索引和分区在查询中的应用 {#suo-yin-he-fen-qu-zai-cha-xun-zhong-de-ying-yong} 对于 `SELECT` 查询,ClickHouse 分析是否可以使用索引。如果 `WHERE/PREWHERE` 子句具有下面这些表达式(作为谓词链接一子项或整个)则可以使用索引:基于主键或分区键的列或表达式的部分的等式或比较运算表达式;基于主键或分区键的列或表达式的固定前缀的 `IN` 或 `LIKE` 表达式;基于主键或分区键的列的某些函数;基于主键或分区键的表达式的逻辑表达式。 @@ -210,9 +201,7 @@ ClickHouse 不要求主键惟一。所以,你可以插入多条具有相同主 当引擎配置如下时: -``` -ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate) SETTINGS index_granularity=8192 -``` + ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate) SETTINGS index_granularity=8192 这种情况下,这些查询: @@ -232,17 +221,17 @@ ClickHouse 会依据主键索引剪掉不符合的数据,依据按月分区的 SELECT count() FROM table WHERE CounterID = 34 
OR URL LIKE '%upyachka%' ``` -要检查 ClickHouse 执行一个查询时能否使用索引,可设置 [force_index_by_date](../settings/settings.md#settings-force_index_by_date) 和 [force_primary_key](../settings/settings.md) 。 +要检查 ClickHouse 执行一个查询时能否使用索引,可设置 [force\_index\_by\_date](../settings/settings.md#settings-force_index_by_date) 和 [force\_primary\_key](../settings/settings.md) 。 按月分区的分区键是只能读取包含适当范围日期的数据块。这种情况下,数据块会包含很多天(最多整月)的数据。在块中,数据按主键排序,主键第一列可能不包含日期。因此,仅使用日期而没有带主键前缀条件的查询将会导致读取超过这个日期范围。 - -### 跳数索引(分段汇总索引,实验性的) +### 跳数索引(分段汇总索引,实验性的) {#tiao-shu-suo-yin-fen-duan-hui-zong-suo-yin-shi-yan-xing-de} 需要设置 `allow_experimental_data_skipping_indices` 为 1 才能使用此索引。(执行 `SET allow_experimental_data_skipping_indices = 1`)。 此索引在 `CREATE` 语句的列部分里定义。 -```sql + +``` sql INDEX index_name expr TYPE type(...) GRANULARITY granularity_value ``` @@ -251,9 +240,9 @@ INDEX index_name expr TYPE type(...) GRANULARITY granularity_value 这些索引是由数据块按粒度分割后的每部分在指定表达式上汇总信息 `granularity_value` 组成(粒度大小用表引擎里 `index_granularity` 的指定)。 这些汇总信息有助于用 `where` 语句跳过大片不满足的数据,从而减少 `SELECT` 查询从磁盘读取的数据量, - 示例 -```sql + +``` sql CREATE TABLE table_name ( u64 UInt64, @@ -267,58 +256,59 @@ CREATE TABLE table_name ``` 上例中的索引能让 ClickHouse 执行下面这些查询时减少读取数据量。 -```sql + +``` sql SELECT count() FROM table WHERE s < 'z' SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 ``` -#### 索引的可用类型 {#table_engine-mergetree-data_skipping-indexes} +#### 索引的可用类型 {#table-engine-mergetree-data-skipping-indexes} -* `minmax` -存储指定表达式的极值(如果表达式是 `tuple` ,则存储 `tuple` 中每个元素的极值),这些信息用于跳过数据块,类似主键。 +- `minmax` + 存储指定表达式的极值(如果表达式是 `tuple` ,则存储 `tuple` 中每个元素的极值),这些信息用于跳过数据块,类似主键。 -* `set(max_rows)` -存储指定表达式的惟一值(不超过 `max_rows` 个,`max_rows=0` 则表示『无限制』)。这些信息可用于检查 `WHERE` 表达式是否满足某个数据块。 +- `set(max_rows)` + 存储指定表达式的惟一值(不超过 `max_rows` 个,`max_rows=0` 则表示『无限制』)。这些信息可用于检查 `WHERE` 表达式是否满足某个数据块。 -* `ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)` -存储包含数据块中所有 n 元短语的 [布隆过滤器](https://en.wikipedia.org/wiki/Bloom_filter) 。只可用在字符串上。 -可用于优化 `equals` , `like` 和 `in` 表达式的性能。 -`n` -- 短语长度。 -`size_of_bloom_filter_in_bytes` -- 布隆过滤器大小,单位字节。(因为压缩得好,可以指定比较大的值,如256或512)。 -`number_of_hash_functions` -- 布隆过滤器中使用的 hash 函数的个数。 -`random_seed` -- hash 函数的随机种子。 +- `ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)` + 存储包含数据块中所有 n 元短语的 [布隆过滤器](https://en.wikipedia.org/wiki/Bloom_filter) 。只可用在字符串上。 + 可用于优化 `equals` , `like` 和 `in` 表达式的性能。 + `n` – 短语长度。 + `size_of_bloom_filter_in_bytes` – 布隆过滤器大小,单位字节。(因为压缩得好,可以指定比较大的值,如256或512)。 + `number_of_hash_functions` – 布隆过滤器中使用的 hash 函数的个数。 + `random_seed` – hash 函数的随机种子。 -* `tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)` -跟 `ngrambf_v1` 类似,不同于 ngrams 存储字符串指定长度的所有片段。它只存储被非字母数据字符分割的片段。 +- `tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)` + 跟 `ngrambf_v1` 类似,不同于 ngrams 存储字符串指定长度的所有片段。它只存储被非字母数据字符分割的片段。 -```sql + + +``` sql INDEX sample_index (u64 * length(s)) TYPE minmax GRANULARITY 4 INDEX sample_index2 (u64 * length(str), i32 + f64 * 100, date, str) TYPE set(100) GRANULARITY 4 INDEX sample_index3 (lower(str), str) TYPE ngrambf_v1(3, 256, 2, 0) GRANULARITY 4 ``` - -## 并发数据访问 +## 并发数据访问 {#bing-fa-shu-ju-fang-wen} 应对表的并发访问,我们使用多版本机制。换言之,当同时读和更新表时,数据从当前查询到的一组片段中读取。没有冗长的的锁。插入不会阻碍读取。 对表的读操作是自动并行的。 - -## 列和表的TTL {#table_engine-mergetree-ttl} +## 列和表的TTL {#table-engine-mergetree-ttl} TTL可以设置值的生命周期,它既可以为整张表设置,也可以为每个列字段单独设置。如果`TTL`同时作用于表和字段,ClickHouse会使用先到期的那个。 被设置TTL的表,必须拥有[Date](../../data_types/date.md) 或 
[DateTime](../../data_types/datetime.md) 类型的字段。要定义数据的生命周期,需要在这个日期字段上使用操作符,例如: -```sql +``` sql TTL time_column TTL time_column + interval ``` 要定义`interval`, 需要使用 [time interval](../../query_language/operators.md#operators-datetime) 操作符。 -```sql +``` sql TTL date_time + INTERVAL 1 MONTH TTL date_time + INTERVAL 15 HOUR ``` @@ -333,8 +323,8 @@ TTL date_time + INTERVAL 15 HOUR 创建一张包含 `TTL` 的表 -```sql -CREATE TABLE example_table +``` sql +CREATE TABLE example_table ( d DateTime, a Int TTL d + INTERVAL 1 MONTH, @@ -348,7 +338,7 @@ ORDER BY d; 为表中已存在的列字段添加 `TTL` -```sql +``` sql ALTER TABLE example_table MODIFY COLUMN c String TTL d + INTERVAL 1 DAY; @@ -356,7 +346,7 @@ ALTER TABLE example_table 修改列字段的 `TTL` -```sql +``` sql ALTER TABLE example_table MODIFY COLUMN c String TTL d + INTERVAL 1 MONTH; @@ -370,8 +360,8 @@ ALTER TABLE example_table 创建一张包含 `TTL` 的表 -```sql -CREATE TABLE example_table +``` sql +CREATE TABLE example_table ( d DateTime, a Int @@ -384,7 +374,7 @@ TTL d + INTERVAL 1 MONTH; 修改表的 `TTL` -```sql +``` sql ALTER TABLE example_table MODIFY TTL d + INTERVAL 1 DAY; ``` @@ -393,13 +383,12 @@ ALTER TABLE example_table 当ClickHouse合并数据分区时, 会删除TTL过期的数据。 -当ClickHouse发现数据过期时, 它将会执行一个计划外的合并。要控制这类合并的频率, 你可以设置 [merge_with_ttl_timeout](#mergetree_setting-merge_with_ttl_timeout)。如果该值被设置的太低, 它将导致执行许多的计划外合并,这可能会消耗大量资源。 +当ClickHouse发现数据过期时, 它将会执行一个计划外的合并。要控制这类合并的频率, 你可以设置 [merge\_with\_ttl\_timeout](#mergetree_setting-merge_with_ttl_timeout)。如果该值被设置的太低, 它将导致执行许多的计划外合并,这可能会消耗大量资源。 如果在合并的时候执行`SELECT` 查询, 则可能会得到过期的数据。为了避免这种情况,可以在`SELECT`之前使用 [OPTIMIZE](../../query_language/misc.md#misc_operations-optimize) 查询。 +## Using Multiple Block Devices for Data Storage {#table-engine-mergetree-multiple-volumes} -## Using Multiple Block Devices for Data Storage {#table_engine-mergetree-multiple-volumes} - -### Configuration {#table_engine-mergetree-multiple-volumes_configure} +### Configuration {#table-engine-mergetree-multiple-volumes-configure} [来源文章](https://clickhouse.tech/docs/en/operations/table_engines/mergetree/) diff --git a/docs/zh/operations/table_engines/mysql.md b/docs/zh/operations/table_engines/mysql.md index f4974b9a7d8..e97f5f12106 100644 --- a/docs/zh/operations/table_engines/mysql.md +++ b/docs/zh/operations/table_engines/mysql.md @@ -1,24 +1,22 @@ -# MySQL +# MySQL {#mysql} MySQL 引擎可以对存储在远程 MySQL 服务器上的数据执行 `SELECT` 查询。 调用格式: -``` -MySQL('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']); -``` + MySQL('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']); **调用参数** -- `host:port` — MySQL 服务器地址。 -- `database` — 数据库的名称。 -- `table` — 表名称。 -- `user` — 数据库用户。 -- `password` — 用户密码。 -- `replace_query` — 将 `INSERT INTO` 查询是否替换为 `REPLACE INTO` 的标志。如果 `replace_query=1`,则替换查询 -- `'on_duplicate_clause'` — 将 `ON DUPLICATE KEY UPDATE 'on_duplicate_clause'` 表达式添加到 `INSERT` 查询语句中。例如:`impression = VALUES(impression) + impression`。如果需要指定 `'on_duplicate_clause'`,则需要设置 `replace_query=0`。如果同时设置 `replace_query = 1` 和 `'on_duplicate_clause'`,则会抛出异常。 +- `host:port` — MySQL 服务器地址。 +- `database` — 数据库的名称。 +- `table` — 表名称。 +- `user` — 数据库用户。 +- `password` — 用户密码。 +- `replace_query` — 将 `INSERT INTO` 查询是否替换为 `REPLACE INTO` 的标志。如果 `replace_query=1`,则替换查询 +- `'on_duplicate_clause'` — 将 `ON DUPLICATE KEY UPDATE 'on_duplicate_clause'` 表达式添加到 `INSERT` 查询语句中。例如:`impression = VALUES(impression) + impression`。如果需要指定 `'on_duplicate_clause'`,则需要设置 `replace_query=0`。如果同时设置 `replace_query = 1` 和 `'on_duplicate_clause'`,则会抛出异常。 -此时,简单的 `WHERE` 子句(例如 
` =, !=, >, >=, <, <=`)是在 MySQL 服务器上执行。 +此时,简单的 `WHERE` 子句(例如 `=, !=, >, >=, <, <=`)是在 MySQL 服务器上执行。 其余条件以及 `LIMIT` 采样约束语句仅在对MySQL的查询完成后才在ClickHouse中执行。 diff --git a/docs/zh/operations/table_engines/null.md b/docs/zh/operations/table_engines/null.md index dc2e777739a..94f731f756d 100644 --- a/docs/zh/operations/table_engines/null.md +++ b/docs/zh/operations/table_engines/null.md @@ -1,8 +1,7 @@ -# Null +# Null {#null} 当写入 Null 类型的表时,将忽略数据。从 Null 类型的表中读取时,返回空。 但是,可以在 Null 类型的表上创建物化视图。写入表的数据将转发到视图中。 - [Original article](https://clickhouse.tech/docs/en/operations/table_engines/null/) diff --git a/docs/zh/operations/table_engines/replacingmergetree.md b/docs/zh/operations/table_engines/replacingmergetree.md index b72838d9e21..66c3246f272 100644 --- a/docs/zh/operations/table_engines/replacingmergetree.md +++ b/docs/zh/operations/table_engines/replacingmergetree.md @@ -1,4 +1,4 @@ -# ReplacingMergeTree +# ReplacingMergeTree {#replacingmergetree} 该引擎和[MergeTree](mergetree.md)的不同之处在于它会删除具有相同主键的重复项。 @@ -6,9 +6,9 @@ 因此,`ReplacingMergeTree` 适用于在后台清除重复的数据以节省空间,但是它不保证没有重复的数据出现。 -## 建表 +## 建表 {#jian-biao} -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -25,22 +25,24 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **ReplacingMergeTree Parameters** -- `ver` — 版本列。类型为 `UInt*`, `Date` 或 `DateTime`。可选参数。 +- `ver` — 版本列。类型为 `UInt*`, `Date` 或 `DateTime`。可选参数。 - 合并的时候,`ReplacingMergeTree` 从所有具有相同主键的行中选择一行留下: - - 如果 `ver` 列未指定,选择最后一条。 - - 如果 `ver` 列已指定,选择 `ver` 值最大的版本。 + 合并的时候,`ReplacingMergeTree` 从所有具有相同主键的行中选择一行留下: + - 如果 `ver` 列未指定,选择最后一条。 + - 如果 `ver` 列已指定,选择 `ver` 值最大的版本。 **子句** 创建 `ReplacingMergeTree` 表时,需要与创建 `MergeTree` 表时相同的[子句](mergetree.md)。 -
已弃用的建表方法 +
+ +已弃用的建表方法 !!! attention "注意" 不要在新项目中使用该方法,可能的话,请将旧项目切换到上述方法。 -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -51,8 +53,8 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] 除了 `ver` 的所有参数都与 `MergeTree` 中的含义相同。 +- `ver` - 版本列。可选参数,有关说明,请参阅上文。 -- `ver` - 版本列。可选参数,有关说明,请参阅上文。
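To make the merge behaviour of `ReplacingMergeTree` described above easier to follow, here is a minimal, hypothetical sketch; the `replacing_example` table and its columns are illustrative and not part of the original documentation. Because duplicates are only collapsed when data parts are merged in the background, the example forces a merge with `OPTIMIZE ... FINAL` before reading:

``` sql
-- Hypothetical example: table and column names are placeholders.
CREATE TABLE replacing_example
(
    key   UInt32,
    value String,
    ver   UInt32
) ENGINE = ReplacingMergeTree(ver)
ORDER BY key;

INSERT INTO replacing_example VALUES (1, 'first', 1);
INSERT INTO replacing_example VALUES (1, 'second', 2);

-- Merges run in the background at an unspecified time, so both rows may still
-- be visible at this point. OPTIMIZE ... FINAL forces a merge; afterwards only
-- the row with the highest `ver` value ('second') remains for key = 1.
OPTIMIZE TABLE replacing_example FINAL;

SELECT * FROM replacing_example;
```

If the `ver` argument were omitted, the last inserted row would be kept instead, as noted in the parameter description above.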
[来源文章](https://clickhouse.tech/docs/en/operations/table_engines/replacingmergetree/) diff --git a/docs/zh/operations/table_engines/replication.md b/docs/zh/operations/table_engines/replication.md index c79606b9d03..25f0dee3e8b 100644 --- a/docs/zh/operations/table_engines/replication.md +++ b/docs/zh/operations/table_engines/replication.md @@ -1,14 +1,14 @@ -# 数据副本 {#table_engines-replication} +# 数据副本 {#table-engines-replication} 只有 MergeTree 系列里的表可支持副本: -- ReplicatedMergeTree -- ReplicatedSummingMergeTree -- ReplicatedReplacingMergeTree -- ReplicatedAggregatingMergeTree -- ReplicatedCollapsingMergeTree -- ReplicatedVersionedCollapsingMergeTree -- ReplicatedGraphiteMergeTree +- ReplicatedMergeTree +- ReplicatedSummingMergeTree +- ReplicatedReplacingMergeTree +- ReplicatedAggregatingMergeTree +- ReplicatedCollapsingMergeTree +- ReplicatedVersionedCollapsingMergeTree +- ReplicatedGraphiteMergeTree 副本是表级别的,不是整个服务器级的。所以,服务器里可以同时有复制表和非复制表。 @@ -18,13 +18,13 @@ 而 `CREATE`,`DROP`,`ATTACH`,`DETACH` 和 `RENAME` 语句只会在单个服务器上执行,不会被复制。 -- `The CREATE TABLE` 在运行此语句的服务器上创建一个新的可复制表。如果此表已存在其他服务器上,则给该表添加新副本。 -- `The DROP TABLE` 删除运行此查询的服务器上的副本。 -- `The RENAME` 重命名一个副本。换句话说,可复制表不同的副本可以有不同的名称。 +- `The CREATE TABLE` 在运行此语句的服务器上创建一个新的可复制表。如果此表已存在其他服务器上,则给该表添加新副本。 +- `The DROP TABLE` 删除运行此查询的服务器上的副本。 +- `The RENAME` 重命名一个副本。换句话说,可复制表不同的副本可以有不同的名称。 要使用副本,需在配置文件中设置 ZooKeeper 集群的地址。例如: -```xml +``` xml example1 @@ -47,7 +47,7 @@ 如果配置文件中没有设置 ZooKeeper ,则无法创建复制表,并且任何现有的复制表都将变为只读。 - `SELECT` 查询并不需要借助 ZooKeeper ,复本并不影响 `SELECT` 的性能,查询复制表与非复制表速度是一样的。查询分布式表时,ClickHouse的处理方式可通过设置 [max_replica_delay_for_distributed_queries](../settings/settings.md#settings-max_replica_delay_for_distributed_queries) 和 [fallback_to_stale_replicas_for_distributed_queries](../settings/settings.md) 修改。 +`SELECT` 查询并不需要借助 ZooKeeper ,复本并不影响 `SELECT` 的性能,查询复制表与非复制表速度是一样的。查询分布式表时,ClickHouse的处理方式可通过设置 [max\_replica\_delay\_for\_distributed\_queries](../settings/settings.md#settings-max_replica_delay_for_distributed_queries) 和 [fallback\_to\_stale\_replicas\_for\_distributed\_queries](../settings/settings.md) 修改。 对于每个 `INSERT` 语句,会通过几个事务将十来个记录添加到 ZooKeeper。(确切地说,这是针对每个插入的数据块; 每个 INSERT 语句的每 `max_insert_block_size = 1048576` 行和最后剩余的都各算作一个块。)相比非复制表,写 zk 会导致 `INSERT` 的延迟略长一些。但只要你按照建议每秒不超过一个 `INSERT` 地批量插入数据,不会有任何问题。一个 ZooKeeper 集群能给整个 ClickHouse 集群支撑协调每秒几百个 `INSERT`。数据插入的吞吐量(每秒的行数)可以跟不用复制的数据一样高。 @@ -59,7 +59,7 @@ 单个数据块写入是原子的。 INSERT 的数据按每块最多 `max_insert_block_size = 1048576` 行进行分块,换句话说,如果 `INSERT` 插入的行少于 1048576,则该 INSERT 是原子的。 -数据块会去重。对于被多次写的相同数据块(大小相同且具有相同顺序的相同行的数据块),该块仅会写入一次。这样设计的原因是万一在网络故障时客户端应用程序不知道数据是否成功写入DB,此时可以简单地重复 `INSERT` 。把相同的数据发送给多个副本 INSERT 并不会有问题。因为这些 `INSERT` 是完全相同的(会被去重)。去重参数参看服务器设置 [merge_tree](../server_settings/settings.md) 。(注意:Replicated\*MergeTree 才会去重,不需要 zookeeper 的不带 MergeTree 不会去重) +数据块会去重。对于被多次写的相同数据块(大小相同且具有相同顺序的相同行的数据块),该块仅会写入一次。这样设计的原因是万一在网络故障时客户端应用程序不知道数据是否成功写入DB,此时可以简单地重复 `INSERT` 。把相同的数据发送给多个副本 INSERT 并不会有问题。因为这些 `INSERT` 是完全相同的(会被去重)。去重参数参看服务器设置 [merge\_tree](../server_settings/settings.md) 。(注意:Replicated\*MergeTree 才会去重,不需要 zookeeper 的不带 MergeTree 不会去重) 在复制期间,只有要插入的源数据通过网络传输。进一步的数据转换(合并)会在所有副本上以相同的方式进行处理执行。这样可以最大限度地减少网络使用,这意味着即使副本在不同的数据中心,数据同步也能工作良好。(能在不同数据中心中的同步数据是副本机制的主要目标。) @@ -67,20 +67,18 @@ 系统会监视副本数据同步情况,并能在发生故障后恢复。故障转移是自动的(对于小的数据差异)或半自动的(当数据差异很大时,这可能意味是有配置错误)。 - ## 创建复制表 {#creating-replicated-tables} - 在表引擎名称上加上 `Replicated` 前缀。例如:`ReplicatedMergeTree`。 **Replicated\*MergeTree 参数** -- `zoo_path` — ZooKeeper 中该表的路径。 -- `replica_name` — ZooKeeper 中的该表的副本名称。 +- `zoo_path` — ZooKeeper 中该表的路径。 +- `replica_name` — 
ZooKeeper 中的该表的副本名称。 示例: -```sql +``` sql CREATE TABLE table_name ( EventDate DateTime, @@ -94,7 +92,7 @@ SAMPLE BY intHash32(UserID) 已弃用的建表语法示例: -```sql +``` sql CREATE TABLE table_name ( EventDate DateTime, @@ -103,9 +101,9 @@ CREATE TABLE table_name ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/table_name', '{replica}', EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID), EventTime), 8192) ``` -如上例所示,这些参数可以包含宏替换的占位符,即大括号的部分。它们会被替换为配置文件里 'macros' 那部分配置的值。示例: +如上例所示,这些参数可以包含宏替换的占位符,即大括号的部分。它们会被替换为配置文件里 ‘macros’ 那部分配置的值。示例: -```xml +``` xml 05 02 @@ -113,14 +111,14 @@ CREATE TABLE table_name ``` -“ZooKeeper 中该表的路径”对每个可复制表都要是唯一的。不同分片上的表要有不同的路径。 +«ZooKeeper 中该表的路径»对每个可复制表都要是唯一的。不同分片上的表要有不同的路径。 这种情况下,路径包含下面这些部分: `/clickhouse/tables/` 是公共前缀,我们推荐使用这个。 `{layer}-{shard}` 是分片标识部分。在此示例中,由于 Yandex.Metrica 集群使用了两级分片,所以它是由两部分组成的。但对于大多数情况来说,你只需保留 {shard} 占位符即可,它会替换展开为分片标识。 -`table_name` 是该表在 ZooKeeper 中的名称。使其与 ClickHouse 中的表名相同比较好。 这里它被明确定义,跟 ClickHouse 表名不一样,它并不会被 RENAME 语句修改。 +`table_name` 是该表在 ZooKeeper 中的名称。使其与 ClickHouse 中的表名相同比较好。 这里它被明确定义,跟 ClickHouse 表名不一样,它并不会被 RENAME 语句修改。 *HINT*: you could add a database name in front of `table_name` as well. E.g. `db_name.table_name` 副本名称用于标识同一个表分片的不同副本。你可以使用服务器名称,如上例所示。同个分片中不同副本的副本名称要唯一。 @@ -135,7 +133,7 @@ CREATE TABLE table_name 要删除副本,使用 `DROP TABLE`。但它只删除那个 – 位于运行该语句的服务器上的副本。 -## 故障恢复 +## 故障恢复 {#gu-zhang-hui-fu} 如果服务器启动时 ZooKeeper 不可用,则复制表会切换为只读模式。系统会定期尝试去连接 ZooKeeper。 @@ -143,7 +141,7 @@ CREATE TABLE table_name 连接到 ZooKeeper 后,系统会检查本地文件系统中的数据集是否与预期的数据集( ZooKeeper 存储此信息)一致。如果存在轻微的不一致,系统会通过与副本同步数据来解决。 -如果系统检测到损坏的数据片段(文件大小错误)或无法识别的片段(写入文件系统但未记录在 ZooKeeper 中的部分),则会把它们移动到 'detached' 子目录(不会删除)。而副本中其他任何缺少的但正常数据片段都会被复制同步。 +如果系统检测到损坏的数据片段(文件大小错误)或无法识别的片段(写入文件系统但未记录在 ZooKeeper 中的部分),则会把它们移动到 ‘detached’ 子目录(不会删除)。而副本中其他任何缺少的但正常数据片段都会被复制同步。 注意,ClickHouse 不会执行任何破坏性操作,例如自动删除大量数据。 @@ -153,29 +151,28 @@ CREATE TABLE table_name 要触发启动恢复,可在 ZooKeeper 中创建节点 `/path_to_table/replica_name/flags/force_restore_data`,节点值可以是任何内容,或运行命令来恢复所有的可复制表: -```bash +``` bash sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data ``` 然后重启服务器。启动时,服务器会删除这些标志并开始恢复。 -## 在数据完全丢失后的恢复 +## 在数据完全丢失后的恢复 {#zai-shu-ju-wan-quan-diu-shi-hou-de-hui-fu} 如果其中一个服务器的所有数据和元数据都消失了,请按照以下步骤进行恢复: -1. 在服务器上安装 ClickHouse。在包含分片标识符和副本的配置文件中正确定义宏配置,如果有用到的话, -2. 如果服务器上有非复制表则必须手动复制,可以从副本服务器上(在 `/var/lib/clickhouse/data/db_name/table_name/` 目录中)复制它们的数据。 -3. 从副本服务器上中复制位于 `/var/lib/clickhouse/metadata/` 中的表定义信息。如果在表定义信息中显式指定了分片或副本标识符,请更正它以使其对应于该副本。(另外,启动服务器,然后会在 `/var/lib/clickhouse/metadata/` 中的.sql文件中生成所有的 `ATTACH TABLE` 语句。) -4.要开始恢复,ZooKeeper 中创建节点 `/path_to_table/replica_name/flags/force_restore_data`,节点内容不限,或运行命令来恢复所有复制的表:`sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data` +1. 在服务器上安装 ClickHouse。在包含分片标识符和副本的配置文件中正确定义宏配置,如果有用到的话, +2. 如果服务器上有非复制表则必须手动复制,可以从副本服务器上(在 `/var/lib/clickhouse/data/db_name/table_name/` 目录中)复制它们的数据。 +3. 
从副本服务器上中复制位于 `/var/lib/clickhouse/metadata/` 中的表定义信息。如果在表定义信息中显式指定了分片或副本标识符,请更正它以使其对应于该副本。(另外,启动服务器,然后会在 `/var/lib/clickhouse/metadata/` 中的.sql文件中生成所有的 `ATTACH TABLE` 语句。) + 4.要开始恢复,ZooKeeper 中创建节点 `/path_to_table/replica_name/flags/force_restore_data`,节点内容不限,或运行命令来恢复所有复制的表:`sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data` 然后启动服务器(如果它已运行则重启)。数据会从副本中下载。 -另一种恢复方式是从 ZooKeeper(`/path_to_table/replica_name`)中删除有数据丢的副本的所有元信息,然后再按照“[创建可复制表](#creating-replicated-tables)”中的描述重新创建副本。 +另一种恢复方式是从 ZooKeeper(`/path_to_table/replica_name`)中删除有数据丢的副本的所有元信息,然后再按照«[创建可复制表](#creating-replicated-tables)»中的描述重新创建副本。 恢复期间的网络带宽没有限制。特别注意这一点,尤其是要一次恢复很多副本。 - -## MergeTree 转换为 ReplicatedMergeTree +## MergeTree 转换为 ReplicatedMergeTree {#mergetree-zhuan-huan-wei-replicatedmergetree} 我们使用 `MergeTree` 来表示 `MergeTree系列` 中的所有表引擎,`ReplicatedMergeTree` 同理。 @@ -184,21 +181,21 @@ sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data 如果各个副本上的数据不一致,则首先对其进行同步,或者除保留的一个副本外,删除其他所有副本上的数据。 重命名现有的 MergeTree 表,然后使用旧名称创建 `ReplicatedMergeTree` 表。 -将数据从旧表移动到新表(`/var/lib/clickhouse/data/db_name/table_name/`)目录内的 'detached' 目录中。 +将数据从旧表移动到新表(`/var/lib/clickhouse/data/db_name/table_name/`)目录内的 ‘detached’ 目录中。 然后在其中一个副本上运行`ALTER TABLE ATTACH PARTITION`,将这些数据片段添加到工作集中。 -## ReplicatedMergeTree 转换为 MergeTree +## ReplicatedMergeTree 转换为 MergeTree {#replicatedmergetree-zhuan-huan-wei-mergetree} 使用其他名称创建 MergeTree 表。将具有`ReplicatedMergeTree`表数据的目录中的所有数据移动到新表的数据目录中。然后删除`ReplicatedMergeTree`表并重新启动服务器。 如果你想在不启动服务器的情况下清除 `ReplicatedMergeTree` 表: -- 删除元数据目录中的相应 `.sql` 文件(`/var/lib/clickhouse/metadata/`)。 -- 删除 ZooKeeper 中的相应路径(`/path_to_table/replica_name`)。 +- 删除元数据目录中的相应 `.sql` 文件(`/var/lib/clickhouse/metadata/`)。 +- 删除 ZooKeeper 中的相应路径(`/path_to_table/replica_name`)。 之后,你可以启动服务器,创建一个 `MergeTree` 表,将数据移动到其目录,然后重新启动服务器。 -## 当 ZooKeeper 集群中的元数据丢失或损坏时恢复方法 +## 当 ZooKeeper 集群中的元数据丢失或损坏时恢复方法 {#dang-zookeeper-ji-qun-zhong-de-yuan-shu-ju-diu-shi-huo-sun-pi-shi-hui-fu-fang-fa} 如果 ZooKeeper 中的数据丢失或损坏,如上所述,你可以通过将数据转移到非复制表来保存数据。 diff --git a/docs/zh/operations/table_engines/set.md b/docs/zh/operations/table_engines/set.md index d47cc4721a7..e9be9ab7e56 100644 --- a/docs/zh/operations/table_engines/set.md +++ b/docs/zh/operations/table_engines/set.md @@ -1,6 +1,6 @@ -# Set +# Set {#set} -始终存在于 RAM 中的数据集。它适用于IN运算符的右侧(请参见 "IN运算符" 部分)。 +始终存在于 RAM 中的数据集。它适用于IN运算符的右侧(请参见 «IN运算符» 部分)。 可以使用 INSERT 向表中插入数据。新元素将添加到数据集中,而重复项将被忽略。但是不能对此类型表执行 SELECT 语句。检索数据的唯一方法是在 IN 运算符的右半部分使用它。 @@ -8,5 +8,4 @@ 对于强制服务器重启,磁盘上的数据块可能会丢失或损坏。在数据块损坏的情况下,可能需要手动删除包含损坏数据的文件。 - [Original article](https://clickhouse.tech/docs/en/operations/table_engines/set/) diff --git a/docs/zh/operations/table_engines/stripelog.md b/docs/zh/operations/table_engines/stripelog.md index 4def6d86a3b..14f9a9d65ac 100644 --- a/docs/zh/operations/table_engines/stripelog.md +++ b/docs/zh/operations/table_engines/stripelog.md @@ -1,42 +1,40 @@ -# StripeLog {#table_engines-stripelog} +# StripeLog {#table-engines-stripelog} 该引擎属于日志引擎系列。请在[日志引擎系列](log_family.md)文章中查看引擎的共同属性和差异。 在你需要写入许多小数据量(小于一百万行)的表的场景下使用这个引擎。 -## 建表 {#table_engines-stripelog-creating-a-table} +## 建表 {#table-engines-stripelog-creating-a-table} -``` -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - column1_name [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], - column2_name [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], - ... 
-) ENGINE = StripeLog -``` + CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] + ( + column1_name [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + column2_name [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... + ) ENGINE = StripeLog 查看[建表](../../query_language/create.md#create-table-query)请求的详细说明。 -## 写数据 {#table_engines-stripelog-writing-the-data} +## 写数据 {#table-engines-stripelog-writing-the-data} `StripeLog` 引擎将所有列存储在一个文件中。对每一次 `Insert` 请求,ClickHouse 将数据块追加在表文件的末尾,逐列写入。 ClickHouse 为每张表写入以下文件: -- `data.bin` — 数据文件。 -- `index.mrk` — 带标记的文件。标记包含了已插入的每个数据块中每列的偏移量。 +- `data.bin` — 数据文件。 +- `index.mrk` — 带标记的文件。标记包含了已插入的每个数据块中每列的偏移量。 `StripeLog` 引擎不支持 `ALTER UPDATE` 和 `ALTER DELETE` 操作。 -## 读数据 {#table_engines-stripelog-reading-the-data} +## 读数据 {#table-engines-stripelog-reading-the-data} 带标记的文件使得 ClickHouse 可以并行的读取数据。这意味着 `SELECT` 请求返回行的顺序是不可预测的。使用 `ORDER BY` 子句对行进行排序。 -## 使用示例 {#table_engines-stripelog-example-of-use} +## 使用示例 {#table-engines-stripelog-example-of-use} 建表: -```sql +``` sql CREATE TABLE stripe_log_table ( timestamp DateTime, @@ -48,39 +46,37 @@ ENGINE = StripeLog 插入数据: -```sql +``` sql INSERT INTO stripe_log_table VALUES (now(),'REGULAR','The first regular message') INSERT INTO stripe_log_table VALUES (now(),'REGULAR','The second regular message'),(now(),'WARNING','The first warning message') ``` 我们使用两次 `INSERT` 请求从而在 `data.bin` 文件中创建两个数据块。 - + ClickHouse 在查询数据时使用多线程。每个线程读取单独的数据块并在完成后独立的返回结果行。这样的结果是,大多数情况下,输出中块的顺序和输入时相应块的顺序是不同的。例如: -```sql +``` sql SELECT * FROM stripe_log_table ``` -``` -┌───────────timestamp─┬─message_type─┬─message────────────────────┐ -│ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ -│ 2019-01-18 14:34:53 │ WARNING │ The first warning message │ -└─────────────────────┴──────────────┴────────────────────────────┘ -┌───────────timestamp─┬─message_type─┬─message───────────────────┐ -│ 2019-01-18 14:23:43 │ REGULAR │ The first regular message │ -└─────────────────────┴──────────────┴───────────────────────────┘ -``` + + ┌───────────timestamp─┬─message_type─┬─message────────────────────┐ + │ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ + │ 2019-01-18 14:34:53 │ WARNING │ The first warning message │ + └─────────────────────┴──────────────┴────────────────────────────┘ + ┌───────────timestamp─┬─message_type─┬─message───────────────────┐ + │ 2019-01-18 14:23:43 │ REGULAR │ The first regular message │ + └─────────────────────┴──────────────┴───────────────────────────┘ 对结果排序(默认增序): -```sql +``` sql SELECT * FROM stripe_log_table ORDER BY timestamp ``` -``` -┌───────────timestamp─┬─message_type─┬─message────────────────────┐ -│ 2019-01-18 14:23:43 │ REGULAR │ The first regular message │ -│ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ -│ 2019-01-18 14:34:53 │ WARNING │ The first warning message │ -└─────────────────────┴──────────────┴────────────────────────────┘ -``` + + ┌───────────timestamp─┬─message_type─┬─message────────────────────┐ + │ 2019-01-18 14:23:43 │ REGULAR │ The first regular message │ + │ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ + │ 2019-01-18 14:34:53 │ WARNING │ The first warning message │ + └─────────────────────┴──────────────┴────────────────────────────┘ [来源文章](https://clickhouse.tech/docs/en/operations/table_engines/stripelog/) diff --git a/docs/zh/operations/table_engines/summingmergetree.md b/docs/zh/operations/table_engines/summingmergetree.md index 03cd0561b0e..326ccb7118e 100644 --- a/docs/zh/operations/table_engines/summingmergetree.md +++ 
b/docs/zh/operations/table_engines/summingmergetree.md @@ -1,62 +1,60 @@ - -# SummingMergeTree +# SummingMergeTree {#summingmergetree} 该引擎继承自 [MergeTree](mergetree.md)。区别在于,当合并 `SummingMergeTree` 表的数据片段时,ClickHouse 会把所有具有相同主键的行合并为一行,该行包含了被合并的行中具有数值数据类型的列的汇总值。如果主键的组合方式使得单个键值对应于大量的行,则可以显著的减少存储空间并加快数据查询的速度。 我们推荐将该引擎和 `MergeTree` 一起使用。例如,在准备做报告的时候,将完整的数据存储在 `MergeTree` 表中,并且使用 `SummingMergeTree` 来存储聚合数据。这种方法可以使你避免因为使用不正确的主键组合方式而丢失有价值的数据。 -## 建表 +## 建表 {#jian-biao} -``` -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], - ... -) ENGINE = SummingMergeTree([columns]) -[PARTITION BY expr] -[ORDER BY expr] -[SAMPLE BY expr] -[SETTINGS name=value, ...] -``` + CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] + ( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... + ) ENGINE = SummingMergeTree([columns]) + [PARTITION BY expr] + [ORDER BY expr] + [SAMPLE BY expr] + [SETTINGS name=value, ...] 请求参数的描述,参考 [请求描述](../../query_language/create.md)。 **SummingMergeTree 的参数** -- `columns` - 包含了将要被汇总的列的列名的元组。可选参数。 -所选的列必须是数值类型,并且不可位于主键中。 +- `columns` - 包含了将要被汇总的列的列名的元组。可选参数。 + 所选的列必须是数值类型,并且不可位于主键中。 - 如果没有指定 `columns`,ClickHouse 会把所有不在主键中的数值类型的列都进行汇总。 + 如果没有指定 `columns`,ClickHouse 会把所有不在主键中的数值类型的列都进行汇总。 **子句** 创建 `SummingMergeTree` 表时,需要与创建 `MergeTree` 表时相同的[子句](mergetree.md)。 -
已弃用的建表方法 +
+ +已弃用的建表方法 !!! attention "注意" 不要在新项目中使用该方法,可能的话,请将旧项目切换到上述方法。 -``` -CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], - ... -) ENGINE [=] SummingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, [columns]) -``` + CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] + ( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... + ) ENGINE [=] SummingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, [columns]) 除 `columns` 外的所有参数都与 `MergeTree` 中的含义相同。 -- `columns` — 包含将要被汇总的列的列名的元组。可选参数。有关说明,请参阅上文。 +- `columns` — 包含将要被汇总的列的列名的元组。可选参数。有关说明,请参阅上文。 +
-## 用法示例 +## 用法示例 {#yong-fa-shi-li} 考虑如下的表: -```sql +``` sql CREATE TABLE summtt ( key UInt32, @@ -68,23 +66,18 @@ ORDER BY key 向其中插入数据: -``` -:) INSERT INTO summtt Values(1,1),(1,2),(2,1) -``` + :) INSERT INTO summtt Values(1,1),(1,2),(2,1) ClickHouse可能不会完整的汇总所有行([见下文](#data-processing)),因此我们在查询中使用了聚合函数 `sum` 和 `GROUP BY` 子句。 -```sql +``` sql SELECT key, sum(value) FROM summtt GROUP BY key ``` -``` -┌─key─┬─sum(value)─┐ -│ 2 │ 1 │ -│ 1 │ 3 │ -└─────┴────────────┘ -``` - + ┌─key─┬─sum(value)─┐ + │ 2 │ 1 │ + │ 1 │ 3 │ + └─────┴────────────┘ ## 数据处理 {#data-processing} @@ -92,7 +85,7 @@ SELECT key, sum(value) FROM summtt GROUP BY key ClickHouse 会按片段合并数据,以至于不同的数据片段中会包含具有相同主键的行,即单个汇总片段将会是不完整的。因此,聚合函数 [sum()](../../query_language/agg_functions/reference.md#agg_function-sum) 和 `GROUP BY` 子句应该在(`SELECT`)查询语句中被使用,如上文中的例子所述。 -### 汇总的通用规则 +### 汇总的通用规则 {#hui-zong-de-tong-yong-gui-ze} 列中数值类型的值会被汇总。这些列的集合在参数 `columns` 中被定义。 @@ -102,29 +95,27 @@ ClickHouse 会按片段合并数据,以至于不同的数据片段中会包含 主键所在的列中的值不会被汇总。 -### AggregateFunction 列中的汇总 +### AggregateFunction 列中的汇总 {#aggregatefunction-lie-zhong-de-hui-zong} 对于 [AggregateFunction 类型](../../data_types/nested_data_structures/aggregatefunction.md)的列,ClickHouse 根据对应函数表现为 [AggregatingMergeTree](aggregatingmergetree.md) 引擎的聚合。 -### 嵌套结构 +### 嵌套结构 {#qian-tao-jie-gou} 表中可以具有以特殊方式处理的嵌套数据结构。 如果嵌套表的名称以 `Map` 结尾,并且包含至少两个符合以下条件的列: -- 第一列是数值类型 `(*Int*, Date, DateTime)`,我们称之为 `key`, -- 其他的列是可计算的 `(*Int*, Float32/64)`,我们称之为 `(values...)`, +- 第一列是数值类型 `(*Int*, Date, DateTime)`,我们称之为 `key`, +- 其他的列是可计算的 `(*Int*, Float32/64)`,我们称之为 `(values...)`, 然后这个嵌套表会被解释为一个 `key => (values...)` 的映射,当合并它们的行时,两个数据集中的元素会被根据 `key` 合并为相应的 `(values...)` 的汇总值。 示例: -``` -[(1, 100)] + [(2, 150)] -> [(1, 100), (2, 150)] -[(1, 100)] + [(1, 150)] -> [(1, 250)] -[(1, 100)] + [(1, 150), (2, 150)] -> [(1, 250), (2, 150)] -[(1, 100), (2, 150)] + [(1, -100)] -> [(2, 150)] -``` + [(1, 100)] + [(2, 150)] -> [(1, 100), (2, 150)] + [(1, 100)] + [(1, 150)] -> [(1, 250)] + [(1, 100)] + [(1, 150), (2, 150)] -> [(1, 250), (2, 150)] + [(1, 100), (2, 150)] + [(1, -100)] -> [(2, 150)] 请求数据时,使用 [sumMap(key, value)](../../query_language/agg_functions/reference.md) 函数来对 `Map` 进行聚合。 diff --git a/docs/zh/operations/table_engines/tinylog.md b/docs/zh/operations/table_engines/tinylog.md index 620fb36c905..7c9d524d5e6 100644 --- a/docs/zh/operations/table_engines/tinylog.md +++ b/docs/zh/operations/table_engines/tinylog.md @@ -1,15 +1,13 @@ -# TinyLog +# TinyLog {#tinylog} 最简单的表引擎,用于将数据存储在磁盘上。每列都存储在单独的压缩文件中。写入时,数据将附加到文件末尾。 并发数据访问不受任何限制: - - 如果同时从表中读取并在不同的查询中写入,则读取操作将抛出异常 - - 如果同时写入多个查询中的表,则数据将被破坏。 +- 如果同时从表中读取并在不同的查询中写入,则读取操作将抛出异常 +- 如果同时写入多个查询中的表,则数据将被破坏。 这种表引擎的典型用法是 write-once:首先只写入一次数据,然后根据需要多次读取。查询在单个流中执行。换句话说,此引擎适用于相对较小的表(建议最多1,000,000行)。如果您有许多小表,则使用此表引擎是适合的,因为它比Log引擎更简单(需要打开的文件更少)。当您拥有大量小表时,可能会导致性能低下,但在可能已经在其它 DBMS 时使用过,则您可能会发现切换使用 TinyLog 类型的表更容易。**不支持索引**。 在 Yandex.Metrica 中,TinyLog 表用于小批量处理的中间数据。 - [Original article](https://clickhouse.tech/docs/zh/operations/table_engines/tinylog/) - diff --git a/docs/zh/operations/table_engines/url.md b/docs/zh/operations/table_engines/url.md index 8928b9232fd..46927669a77 100644 --- a/docs/zh/operations/table_engines/url.md +++ b/docs/zh/operations/table_engines/url.md @@ -1,9 +1,9 @@ -# URL(URL, Format) {#table_engines-url} +# URL(URL, Format) {#table-engines-url} 用于管理远程 HTTP/HTTPS 服务器上的数据。该引擎类似 [File](file.md) 引擎。 -## 在 ClickHouse 服务器中使用引擎 +## 在 ClickHouse 服务器中使用引擎 {#zai-clickhouse-fu-wu-qi-zhong-shi-yong-yin-qing} `Format` 必须是 ClickHouse 可以用于 `SELECT` 查询的一种格式,若有必要,还要可用于 `INSERT` 
。有关支持格式的完整列表,请查看 @@ -29,7 +29,7 @@ ENGINE=URL('http://127.0.0.1:12345/', CSV) **2.** 用标准的 Python 3 工具库创建一个基本的 HTTP 服务并 启动它: -```python3 +``` python3 from http.server import BaseHTTPRequestHandler, HTTPServer class CSVHTTPServer(BaseHTTPRequestHandler): @@ -45,7 +45,7 @@ if __name__ == "__main__": HTTPServer(server_address, CSVHTTPServer).serve_forever() ``` -```bash +``` bash python3 server.py ``` @@ -55,19 +55,17 @@ python3 server.py SELECT * FROM url_engine_table ``` -``` -┌─word──┬─value─┐ -│ Hello │ 1 │ -│ World │ 2 │ -└───────┴───────┘ -``` + ┌─word──┬─value─┐ + │ Hello │ 1 │ + │ World │ 2 │ + └───────┴───────┘ -## 功能实现 +## 功能实现 {#gong-neng-shi-xian} -- 读写操作都支持并发 -- 不支持: - - `ALTER` 和 `SELECT...SAMPLE` 操作。 - - 索引。 - - 副本。 +- 读写操作都支持并发 +- 不支持: + - `ALTER` 和 `SELECT...SAMPLE` 操作。 + - 索引。 + - 副本。 [来源文章](https://clickhouse.tech/docs/en/operations/table_engines/url/) diff --git a/docs/zh/operations/table_engines/view.md b/docs/zh/operations/table_engines/view.md index 4191eae3974..5d15fc74218 100644 --- a/docs/zh/operations/table_engines/view.md +++ b/docs/zh/operations/table_engines/view.md @@ -1,6 +1,5 @@ -# View +# View {#view} 用于构建视图(有关更多信息,请参阅 `CREATE VIEW 查询`)。 它不存储数据,仅存储指定的 `SELECT` 查询。 从表中读取时,它会运行此查询(并从查询中删除所有不必要的列)。 - [Original article](https://clickhouse.tech/docs/en/operations/table_engines/view/) diff --git a/docs/zh/operations/tips.md b/docs/zh/operations/tips.md index 4780d3a0b8c..8980d74e8b2 100644 --- a/docs/zh/operations/tips.md +++ b/docs/zh/operations/tips.md @@ -1,56 +1,56 @@ -# Usage Recommendations +# Usage Recommendations {#usage-recommendations} -## CPU +## CPU {#cpu} The SSE 4.2 instruction set must be supported. Modern processors (since 2008) support it. When choosing a processor, prefer a large number of cores and slightly slower clock rate over fewer cores and a higher clock rate. For example, 16 cores with 2600 MHz is better than 8 cores with 3600 MHz. -## Hyper-threading +## Hyper-threading {#hyper-threading} -Don't disable hyper-threading. It helps for some queries, but not for others. +Don’t disable hyper-threading. It helps for some queries, but not for others. -## Turbo Boost +## Turbo Boost {#turbo-boost} Turbo Boost is highly recommended. It significantly improves performance with a typical load. -You can use `turbostat` to view the CPU's actual clock rate under a load. +You can use `turbostat` to view the CPU’s actual clock rate under a load. -## CPU Scaling Governor +## CPU Scaling Governor {#cpu-scaling-governor} Always use the `performance` scaling governor. The `on-demand` scaling governor works much worse with constantly high demand. -```bash +``` bash echo 'performance' | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor ``` -## CPU Limitations +## CPU Limitations {#cpu-limitations} -Processors can overheat. Use `dmesg` to see if the CPU's clock rate was limited due to overheating. +Processors can overheat. Use `dmesg` to see if the CPU’s clock rate was limited due to overheating. The restriction can also be set externally at the datacenter level. You can use `turbostat` to monitor it under a load. -## RAM +## RAM {#ram} -For small amounts of data (up to \~200 GB compressed), it is best to use as much memory as the volume of data. +For small amounts of data (up to ~200 GB compressed), it is best to use as much memory as the volume of data. For large amounts of data and when processing interactive (online) queries, you should use a reasonable amount of RAM (128 GB or more) so the hot data subset will fit in the cache of pages. 
-Even for data volumes of \~50 TB per server, using 128 GB of RAM significantly improves query performance compared to 64 GB. +Even for data volumes of ~50 TB per server, using 128 GB of RAM significantly improves query performance compared to 64 GB. -## Swap File +## Swap File {#swap-file} Always disable the swap file. The only reason for not doing this is if you are using ClickHouse on your personal laptop. -## Huge Pages +## Huge Pages {#huge-pages} Always disable transparent huge pages. It interferes with memory allocators, which leads to significant performance degradation. -```bash +``` bash echo 'never' | sudo tee /sys/kernel/mm/transparent_hugepage/enabled ``` Use `perf top` to watch the time spent in the kernel for memory management. Permanent huge pages also do not need to be allocated. -## Storage Subsystem +## Storage Subsystem {#storage-subsystem} If your budget allows you to use SSD, use SSD. If not, use HDD. SATA HDDs 7200 RPM will do. @@ -58,17 +58,17 @@ If not, use HDD. SATA HDDs 7200 RPM will do. Give preference to a lot of servers with local hard drives over a smaller number of servers with attached disk shelves. But for storing archives with rare queries, shelves will work. -## RAID +## RAID {#raid} When using HDD, you can combine their RAID-10, RAID-5, RAID-6 or RAID-50. -For Linux, software RAID is better (with `mdadm`). We don't recommend using LVM. +For Linux, software RAID is better (with `mdadm`). We don’t recommend using LVM. When creating RAID-10, select the `far` layout. If your budget allows, choose RAID-10. If you have more than 4 disks, use RAID-6 (preferred) or RAID-50, instead of RAID-5. -When using RAID-5, RAID-6 or RAID-50, always increase stripe_cache_size, since the default value is usually not the best choice. +When using RAID-5, RAID-6 or RAID-50, always increase stripe\_cache\_size, since the default value is usually not the best choice. -```bash +``` bash echo 4096 | sudo tee /sys/block/md2/md/stripe_cache_size ``` @@ -80,33 +80,33 @@ Never set the block size too small or too large. You can use RAID-0 on SSD. Regardless of RAID use, always use replication for data security. -Enable NCQ with a long queue. For HDD, choose the CFQ scheduler, and for SSD, choose noop. Don't reduce the 'readahead' setting. +Enable NCQ with a long queue. For HDD, choose the CFQ scheduler, and for SSD, choose noop. Don’t reduce the ‘readahead’ setting. For HDD, enable the write cache. -## File System +## File System {#file-system} Ext4 is the most reliable option. Set the mount options `noatime, nobarrier`. -XFS is also suitable, but it hasn't been as thoroughly tested with ClickHouse. +XFS is also suitable, but it hasn’t been as thoroughly tested with ClickHouse. Most other file systems should also work fine. File systems with delayed allocation work better. -## Linux Kernel +## Linux Kernel {#linux-kernel} -Don't use an outdated Linux kernel. +Don’t use an outdated Linux kernel. -## Network +## Network {#network} If you are using IPv6, increase the size of the route cache. The Linux kernel prior to 3.2 had a multitude of problems with IPv6 implementation. Use at least a 10 GB network, if possible. 1 Gb will also work, but it will be much worse for patching replicas with tens of terabytes of data, or for processing distributed queries with a large amount of intermediate data. -## ZooKeeper +## ZooKeeper {#zookeeper} -You are probably already using ZooKeeper for other purposes. You can use the same installation of ZooKeeper, if it isn't already overloaded. 
+You are probably already using ZooKeeper for other purposes. You can use the same installation of ZooKeeper, if it isn’t already overloaded. -It's best to use a fresh version of ZooKeeper – 3.4.9 or later. The version in stable Linux distributions may be outdated. +It’s best to use a fresh version of ZooKeeper – 3.4.9 or later. The version in stable Linux distributions may be outdated. -You should never use manually written scripts to transfer data between different ZooKeeper clusters, because the result will be incorrect for sequential nodes. Never use the "zkcopy" utility for the same reason: https://github.com/ksprojects/zkcopy/issues/15 +You should never use manually written scripts to transfer data between different ZooKeeper clusters, because the result will be incorrect for sequential nodes. Never use the «zkcopy» utility for the same reason: https://github.com/ksprojects/zkcopy/issues/15 If you want to divide an existing ZooKeeper cluster into two, the correct way is to increase the number of its replicas and then reconfigure it as two independent clusters. @@ -114,7 +114,7 @@ Do not run ZooKeeper on the same servers as ClickHouse. Because ZooKeeper is ver With the default settings, ZooKeeper is a time bomb: -> The ZooKeeper server won't delete files from old snapshots and logs when using the default configuration (see autopurge), and this is the responsibility of the operator. +> The ZooKeeper server won’t delete files from old snapshots and logs when using the default configuration (see autopurge), and this is the responsibility of the operator. This bomb must be defused. @@ -122,7 +122,7 @@ The ZooKeeper (3.5.1) configuration below is used in the Yandex.Metrica producti zoo.cfg: -```bash +``` bash # http://hadoop.apache.org/zookeeper/docs/current/zookeeperAdmin.html # The number of milliseconds of each tick @@ -138,9 +138,9 @@ maxClientCnxns=2000 maxSessionTimeout=60000000 # the directory where the snapshot is stored. 
-dataDir=/opt/zookeeper/{{ cluster['name'] }}/data +dataDir=/opt/zookeeper/{{ '{{' }} cluster['name'] {{ '}}' }}/data # Place the dataLogDir to a separate physical disc for better performance -dataLogDir=/opt/zookeeper/{{ cluster['name'] }}/logs +dataLogDir=/opt/zookeeper/{{ '{{' }} cluster['name'] {{ '}}' }}/logs autopurge.snapRetainCount=10 autopurge.purgeInterval=1 @@ -173,20 +173,18 @@ snapCount=3000000 leaderServes=yes standaloneEnabled=false -dynamicConfigFile=/etc/zookeeper-{{ cluster['name'] }}/conf/zoo.cfg.dynamic +dynamicConfigFile=/etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/zoo.cfg.dynamic ``` Java version: -``` -Java(TM) SE Runtime Environment (build 1.8.0_25-b17) -Java HotSpot(TM) 64-Bit Server VM (build 25.25-b02, mixed mode) -``` + Java(TM) SE Runtime Environment (build 1.8.0_25-b17) + Java HotSpot(TM) 64-Bit Server VM (build 25.25-b02, mixed mode) JVM parameters: -```bash -NAME=zookeeper-{{ cluster['name'] }} +``` bash +NAME=zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} ZOOCFGDIR=/etc/$NAME/conf # TODO this is really ugly @@ -205,8 +203,8 @@ JAVA=/usr/bin/java ZOOMAIN="org.apache.zookeeper.server.quorum.QuorumPeerMain" ZOO_LOG4J_PROP="INFO,ROLLINGFILE" JMXLOCALONLY=false -JAVA_OPTS="-Xms{{ cluster.get('xms','128M') }} \ - -Xmx{{ cluster.get('xmx','1G') }} \ +JAVA_OPTS="-Xms{{ '{{' }} cluster.get('xms','128M') {{ '}}' }} \ + -Xmx{{ '{{' }} cluster.get('xmx','1G') {{ '}}' }} \ -Xloggc:/var/log/$NAME/zookeeper-gc.log \ -XX:+UseGCLogFileRotation \ -XX:NumberOfGCLogFiles=16 \ @@ -226,34 +224,31 @@ JAVA_OPTS="-Xms{{ cluster.get('xms','128M') }} \ Salt init: -``` -description "zookeeper-{{ cluster['name'] }} centralized coordination service" + description "zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} centralized coordination service" -start on runlevel [2345] -stop on runlevel [!2345] + start on runlevel [2345] + stop on runlevel [!2345] -respawn + respawn -limit nofile 8192 8192 + limit nofile 8192 8192 -pre-start script - [ -r "/etc/zookeeper-{{ cluster['name'] }}/conf/environment" ] || exit 0 - . /etc/zookeeper-{{ cluster['name'] }}/conf/environment - [ -d $ZOO_LOG_DIR ] || mkdir -p $ZOO_LOG_DIR - chown $USER:$GROUP $ZOO_LOG_DIR -end script - -script - . /etc/zookeeper-{{ cluster['name'] }}/conf/environment - [ -r /etc/default/zookeeper ] && . /etc/default/zookeeper - if [ -z "$JMXDISABLE" ]; then - JAVA_OPTS="$JAVA_OPTS -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.local.only=$JMXLOCALONLY" - fi - exec start-stop-daemon --start -c $USER --exec $JAVA --name zookeeper-{{ cluster['name'] }} \ - -- -cp $CLASSPATH $JAVA_OPTS -Dzookeeper.log.dir=${ZOO_LOG_DIR} \ - -Dzookeeper.root.logger=${ZOO_LOG4J_PROP} $ZOOMAIN $ZOOCFG -end script -``` + pre-start script + [ -r "/etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/environment" ] || exit 0 + . /etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/environment + [ -d $ZOO_LOG_DIR ] || mkdir -p $ZOO_LOG_DIR + chown $USER:$GROUP $ZOO_LOG_DIR + end script + script + . /etc/zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }}/conf/environment + [ -r /etc/default/zookeeper ] && . 
/etc/default/zookeeper + if [ -z "$JMXDISABLE" ]; then + JAVA_OPTS="$JAVA_OPTS -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.local.only=$JMXLOCALONLY" + fi + exec start-stop-daemon --start -c $USER --exec $JAVA --name zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} \ + -- -cp $CLASSPATH $JAVA_OPTS -Dzookeeper.log.dir=${ZOO_LOG_DIR} \ + -Dzookeeper.root.logger=${ZOO_LOG4J_PROP} $ZOOMAIN $ZOOCFG + end script [Original article](https://clickhouse.tech/docs/en/operations/tips/) diff --git a/docs/zh/operations/utils/clickhouse-copier.md b/docs/zh/operations/utils/clickhouse-copier.md index 6366c71d60e..1a1b8599dba 100644 --- a/docs/zh/operations/utils/clickhouse-copier.md +++ b/docs/zh/operations/utils/clickhouse-copier.md @@ -1,5 +1,4 @@ - -# clickhouse-copier +# clickhouse-copier {#clickhouse-copier} Copies data from the tables in one cluster to tables in another (or the same) cluster. @@ -7,36 +6,37 @@ You can run multiple `clickhouse-copier` instances on different servers to perfo After starting, `clickhouse-copier`: -- Connects to ZooKeeper and receives: - - Copying jobs. - - The state of the copying jobs. +- Connects to ZooKeeper and receives: -- It performs the jobs. + - Copying jobs. + - The state of the copying jobs. - Each running process chooses the "closest" shard of the source cluster and copies the data into the destination cluster, resharding the data if necessary. +- It performs the jobs. + + Each running process chooses the "closest" shard of the source cluster and copies the data into the destination cluster, resharding the data if necessary. `clickhouse-copier` tracks the changes in ZooKeeper and applies them on the fly. To reduce network traffic, we recommend running `clickhouse-copier` on the same server where the source data is located. -## Running clickhouse-copier +## Running clickhouse-copier {#running-clickhouse-copier} The utility should be run manually: -```bash +``` bash clickhouse-copier copier --daemon --config zookeeper.xml --task-path /task/path --base-dir /path/to/dir ``` Parameters: -- `daemon` — Starts `clickhouse-copier` in daemon mode. -- `config` — The path to the `zookeeper.xml` file with the parameters for the connection to ZooKeeper. -- `task-path` — The path to the ZooKeeper node. This node is used for syncing `clickhouse-copier` processes and storing tasks. Tasks are stored in `$task-path/description`. -- `base-dir` — The path to logs and auxiliary files. When it starts, `clickhouse-copier` creates `clickhouse-copier_YYYYMMHHSS_` subdirectories in `$base-dir`. If this parameter is omitted, the directories are created in the directory where `clickhouse-copier` was launched. +- `daemon` — Starts `clickhouse-copier` in daemon mode. +- `config` — The path to the `zookeeper.xml` file with the parameters for the connection to ZooKeeper. +- `task-path` — The path to the ZooKeeper node. This node is used for syncing `clickhouse-copier` processes and storing tasks. Tasks are stored in `$task-path/description`. +- `base-dir` — The path to logs and auxiliary files. When it starts, `clickhouse-copier` creates `clickhouse-copier_YYYYMMHHSS_` subdirectories in `$base-dir`. If this parameter is omitted, the directories are created in the directory where `clickhouse-copier` was launched. 
-## Format of zookeeper.xml +## Format of zookeeper.xml {#format-of-zookeeper-xml} -```xml +``` xml trace @@ -53,9 +53,9 @@ Parameters: ``` -## Configuration of copying tasks +## Configuration of copying tasks {#configuration-of-copying-tasks} -```xml +``` xml @@ -164,5 +164,4 @@ Parameters: `clickhouse-copier` tracks the changes in `/task/path/description` and applies them on the fly. For instance, if you change the value of `max_workers`, the number of processes running tasks will also change. - [Original article](https://clickhouse.tech/docs/en/operations/utils/clickhouse-copier/) diff --git a/docs/zh/operations/utils/clickhouse-local.md b/docs/zh/operations/utils/clickhouse-local.md index 9a0880ac9f1..159e914f446 100644 --- a/docs/zh/operations/utils/clickhouse-local.md +++ b/docs/zh/operations/utils/clickhouse-local.md @@ -1,5 +1,4 @@ - -# clickhouse-local +# clickhouse-local {#clickhouse-local} The `clickhouse-local` program enables you to perform fast processing on local files, without having to deploy and configure the ClickHouse server. @@ -9,11 +8,10 @@ Accepts data that represent tables and queries them using [ClickHouse SQL dialec By default `clickhouse-local` does not have access to data on the same host, but it supports loading server configuration using `--config-file` argument. -!!! warning +!!! warning "Warning" It is not recommended to load production server configuration into `clickhouse-local` because data can be damaged in case of human error. - -## Usage +## Usage {#usage} Basic usage: @@ -23,28 +21,27 @@ clickhouse-local --structure "table_structure" --input-format "format_of_incomin Arguments: -- `-S`, `--structure` — table structure for input data. -- `-if`, `--input-format` — input format, `TSV` by default. -- `-f`, `--file` — path to data, `stdin` by default. -- `-q` `--query` — queries to execute with `;` as delimeter. -- `-N`, `--table` — table name where to put output data, `table` by default. -- `-of`, `--format`, `--output-format` — output format, `TSV` by default. -- `--stacktrace` — whether to dump debug output in case of exception. -- `--verbose` — more details on query execution. -- `-s` — disables `stderr` logging. -- `--config-file` — path to configuration file in same format as for ClickHouse server, by default the configuration empty. -- `--help` — arguments references for `clickhouse-local`. +- `-S`, `--structure` — table structure for input data. +- `-if`, `--input-format` — input format, `TSV` by default. +- `-f`, `--file` — path to data, `stdin` by default. +- `-q` `--query` — queries to execute with `;` as delimeter. +- `-N`, `--table` — table name where to put output data, `table` by default. +- `-of`, `--format`, `--output-format` — output format, `TSV` by default. +- `--stacktrace` — whether to dump debug output in case of exception. +- `--verbose` — more details on query execution. +- `-s` — disables `stderr` logging. +- `--config-file` — path to configuration file in same format as for ClickHouse server, by default the configuration empty. +- `--help` — arguments references for `clickhouse-local`. Also there are arguments for each ClickHouse configuration variable which are more commonly used instead of `--config-file`. - -## Examples +## Examples {#examples} ``` bash echo -e "1,2\n3,4" | clickhouse-local -S "a Int64, b Int64" -if "CSV" -q "SELECT * FROM table" Read 2 rows, 32.00 B in 0.000 sec., 5182 rows/sec., 80.97 KiB/sec. 
-1 2 -3 4 +1 2 +3 4 ``` Previous example is the same as: @@ -52,11 +49,11 @@ Previous example is the same as: ``` bash $ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin); SELECT a, b FROM table; DROP TABLE table" Read 2 rows, 32.00 B in 0.000 sec., 4987 rows/sec., 77.93 KiB/sec. -1 2 -3 4 +1 2 +3 4 ``` -Now let's output memory user for each Unix user: +Now let’s output memory user for each Unix user: ``` bash $ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' | clickhouse-local -S "user String, mem Float64" -q "SELECT user, round(sum(mem), 2) as memTotal FROM table GROUP BY user ORDER BY memTotal DESC FORMAT Pretty" diff --git a/docs/zh/operations/utils/index.md b/docs/zh/operations/utils/index.md index 71d46f3f7c8..ebc1396d031 100644 --- a/docs/zh/operations/utils/index.md +++ b/docs/zh/operations/utils/index.md @@ -1,7 +1,7 @@ -# ClickHouse Utility +# ClickHouse Utility {#clickhouse-utility} -- [clickhouse-local](clickhouse-local.md) — Allows running SQL queries on data without stopping the ClickHouse server, similar to how `awk` does this. -- [clickhouse-copier](clickhouse-copier.md) — Copies (and reshards) data from one cluster to another cluster. -- [clickhouse-benchmark](clickhouse-benchmark.md) — Loads server with the custom queries and settings. +- [clickhouse-local](clickhouse-local.md) — Allows running SQL queries on data without stopping the ClickHouse server, similar to how `awk` does this. +- [clickhouse-copier](clickhouse-copier.md) — Copies (and reshards) data from one cluster to another cluster. +- [clickhouse-benchmark](clickhouse-benchmark.md) — Loads server with the custom queries and settings. [Original article](https://clickhouse.tech/docs/en/operations/utils/) diff --git a/docs/zh/query_language/create.md b/docs/zh/query_language/create.md index 36004aed6a0..8ffebe93750 100644 --- a/docs/zh/query_language/create.md +++ b/docs/zh/query_language/create.md @@ -1,4 +1,4 @@ -## CREATE DATABASE +## CREATE DATABASE {#create-database} 该查询用于根据指定名称创建数据库。 @@ -9,12 +9,11 @@ CREATE DATABASE [IF NOT EXISTS] db_name 数据库其实只是用于存放表的一个目录。 如果查询中存在`IF NOT EXISTS`,则当数据库已经存在时,该查询不会返回任何错误。 - ## CREATE TABLE {#create-table-query} 对于`CREATE TABLE`,存在以下几种方式。 -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -66,7 +65,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = engine AS SELECT ... 物化表达式,被该表达式指定的列不能包含在INSERT的列表中,因为它总是被计算出来的。 对于INSERT而言,不需要考虑这些列。 -另外,在SELECT查询中如果包含星号,此列不会被用来替换星号,这是因为考虑到数据转储,在使用`SELECT *`查询出的结果总能够被'INSERT'回表。 +另外,在SELECT查询中如果包含星号,此列不会被用来替换星号,这是因为考虑到数据转储,在使用`SELECT *`查询出的结果总能够被’INSERT’回表。 `ALIAS expr` @@ -84,7 +83,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = engine AS SELECT ... Along with columns descriptions constraints could be defined: -```sql +``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec] [TTL expr1], @@ -98,7 +97,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Adding large amount of constraints can negatively affect performance of big `INSERT` queries. -### TTL Expression +### TTL Expression {#ttl-expression} Defines storage time for values. Can be specified only for MergeTree-family tables. For the detailed description, see [TTL for columns and tables](../operations/table_engines/mergetree.md#table_engine-mergetree-ttl). @@ -106,7 +105,7 @@ Defines storage time for values. 
Can be specified only for MergeTree-family tabl By default, ClickHouse applies the compression method, defined in [server settings](../operations/server_settings/settings.md#server-settings-compression), to columns. You can also define the compression method for each individual column in the `CREATE TABLE` query. -```sql +``` sql CREATE TABLE codec_example ( dt Date CODEC(ZSTD), @@ -119,34 +118,34 @@ ENGINE = ... ``` -If a codec is specified, the default codec doesn't apply. Codecs can be combined in a pipeline, for example, `CODEC(Delta, ZSTD)`. To select the best codec combination for you project, pass benchmarks similar to described in the Altinity [New Encodings to Improve ClickHouse Efficiency](https://www.altinity.com/blog/2019/7/new-encodings-to-improve-clickhouse) article. +If a codec is specified, the default codec doesn’t apply. Codecs can be combined in a pipeline, for example, `CODEC(Delta, ZSTD)`. To select the best codec combination for you project, pass benchmarks similar to described in the Altinity [New Encodings to Improve ClickHouse Efficiency](https://www.altinity.com/blog/2019/7/new-encodings-to-improve-clickhouse) article. -!!!warning "Warning" - You can't decompress ClickHouse database files with external utilities like `lz4`. Instead, use the special [clickhouse-compressor](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/programs/compressor) utility. +!!! warning "Warning" + You can’t decompress ClickHouse database files with external utilities like `lz4`. Instead, use the special [clickhouse-compressor](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/programs/compressor) utility. Compression is supported for the following table engines: -- [MergeTree](../operations/table_engines/mergetree.md) family -- [Log](../operations/table_engines/log_family.md) family -- [Set](../operations/table_engines/set.md) -- [Join](../operations/table_engines/join.md) +- [MergeTree](../operations/table_engines/mergetree.md) family +- [Log](../operations/table_engines/log_family.md) family +- [Set](../operations/table_engines/set.md) +- [Join](../operations/table_engines/join.md) ClickHouse supports common purpose codecs and specialized codecs. #### Specialized Codecs {#create-query-specialized-codecs} -These codecs are designed to make compression more effective by using specific features of data. Some of these codecs don't compress data themself. Instead, they prepare the data for a common purpose codec, which compresses it better than without this preparation. +These codecs are designed to make compression more effective by using specific features of data. Some of these codecs don’t compress data themself. Instead, they prepare the data for a common purpose codec, which compresses it better than without this preparation. Specialized codecs: -- `Delta(delta_bytes)` — Compression approach in which raw values are replaced by the difference of two neighboring values, except for the first value that stays unchanged. Up to `delta_bytes` are used for storing delta values, so `delta_bytes` is the maximum size of raw values. Possible `delta_bytes` values: 1, 2, 4, 8. The default value for `delta_bytes` is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, it's 1. -- `DoubleDelta` — Calculates delta of deltas and writes it in compact binary form. Optimal compression rates are achieved for monotonic sequences with a constant stride, such as time series data. Can be used with any fixed-width type. 
Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Uses 1 extra bit for 32-byte deltas: 5-bit prefixes instead of 4-bit prefixes. For additional information, see Compressing Time Stamps in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). -- `Gorilla` — Calculates XOR between current and previous value and writes it in compact binary form. Efficient when storing a series of floating point values that change slowly, because the best compression rate is achieved when neighboring values are binary equal. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. For additional information, see Compressing Values in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). -- `T64` — Compression approach that crops unused high bits of values in integer data types (including `Enum`, `Date` and `DateTime`). At each step of its algorithm, codec takes a block of 64 values, puts them into 64x64 bit matrix, transposes it, crops the unused bits of values and returns the rest as a sequence. Unused bits are the bits, that don't differ between maximum and minimum values in the whole data part for which the compression is used. +- `Delta(delta_bytes)` — Compression approach in which raw values are replaced by the difference of two neighboring values, except for the first value that stays unchanged. Up to `delta_bytes` are used for storing delta values, so `delta_bytes` is the maximum size of raw values. Possible `delta_bytes` values: 1, 2, 4, 8. The default value for `delta_bytes` is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, it’s 1. +- `DoubleDelta` — Calculates delta of deltas and writes it in compact binary form. Optimal compression rates are achieved for monotonic sequences with a constant stride, such as time series data. Can be used with any fixed-width type. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Uses 1 extra bit for 32-byte deltas: 5-bit prefixes instead of 4-bit prefixes. For additional information, see Compressing Time Stamps in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). +- `Gorilla` — Calculates XOR between current and previous value and writes it in compact binary form. Efficient when storing a series of floating point values that change slowly, because the best compression rate is achieved when neighboring values are binary equal. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. For additional information, see Compressing Values in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). +- `T64` — Compression approach that crops unused high bits of values in integer data types (including `Enum`, `Date` and `DateTime`). At each step of its algorithm, codec takes a block of 64 values, puts them into 64x64 bit matrix, transposes it, crops the unused bits of values and returns the rest as a sequence. Unused bits are the bits, that don’t differ between maximum and minimum values in the whole data part for which the compression is used. `DoubleDelta` and `Gorilla` codecs are used in Gorilla TSDB as the components of its compressing algorithm. Gorilla approach is effective in scenarios when there is a sequence of slowly changing values with their timestamps. 
Timestamps are effectively compressed by the `DoubleDelta` codec, and values are effectively compressed by the `Gorilla` codec. For example, to get an effectively stored table, you can create it in the following configuration: -```sql +``` sql CREATE TABLE codec_example ( timestamp DateTime CODEC(DoubleDelta), @@ -159,27 +158,26 @@ ENGINE = MergeTree() Codecs: -- `NONE` — No compression. -- `LZ4` — Lossless [data compression algorithm](https://github.com/lz4/lz4) used by default. Applies LZ4 fast compression. -- `LZ4HC[(level)]` — LZ4 HC (high compression) algorithm with configurable level. Default level: 9. Setting `level <= 0` applies the default level. Possible levels: [1, 12]. Recommended level range: [4, 9]. -- `ZSTD[(level)]` — [ZSTD compression algorithm](https://en.wikipedia.org/wiki/Zstandard) with configurable `level`. Possible levels: [1, 22]. Default value: 1. +- `NONE` — No compression. +- `LZ4` — Lossless [data compression algorithm](https://github.com/lz4/lz4) used by default. Applies LZ4 fast compression. +- `LZ4HC[(level)]` — LZ4 HC (high compression) algorithm with configurable level. Default level: 9. Setting `level <= 0` applies the default level. Possible levels: \[1, 12\]. Recommended level range: \[4, 9\]. +- `ZSTD[(level)]` — [ZSTD compression algorithm](https://en.wikipedia.org/wiki/Zstandard) with configurable `level`. Possible levels: \[1, 22\]. Default value: 1. High compression levels are useful for asymmetric scenarios, like compress once, decompress repeatedly. Higher levels mean better compression and higher CPU usage. - -## 临时表 +## 临时表 {#lin-shi-biao} ClickHouse支持临时表,其具有以下特征: -- 当回话结束时,临时表将随会话一起消失,这包含链接中断。 -- 临时表仅能够使用Memory表引擎。 -- 无法为临时表指定数据库。它是在数据库之外创建的。 -- 如果临时表与另一个表名称相同,那么当在查询时没有显示的指定db的情况下,将优先使用临时表。 -- 对于分布式处理,查询中使用的临时表将被传递到远程服务器。 +- 当回话结束时,临时表将随会话一起消失,这包含链接中断。 +- 临时表仅能够使用Memory表引擎。 +- 无法为临时表指定数据库。它是在数据库之外创建的。 +- 如果临时表与另一个表名称相同,那么当在查询时没有显示的指定db的情况下,将优先使用临时表。 +- 对于分布式处理,查询中使用的临时表将被传递到远程服务器。 可以使用下面的语法创建一个临时表: -```sql +``` sql CREATE TEMPORARY TABLE [IF NOT EXISTS] table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -190,7 +188,7 @@ CREATE TEMPORARY TABLE [IF NOT EXISTS] table_name [ON CLUSTER cluster] 大多数情况下,临时表不是手动创建的,只有在分布式查询处理中使用`(GLOBAL) IN`时为外部数据创建。更多信息,可以参考相关章节。 -## 分布式DDL查询 (ON CLUSTER 子句) +## 分布式DDL查询 (ON CLUSTER 子句) {#fen-bu-shi-ddlcha-xun-on-cluster-zi-ju} 对于 `CREATE`, `DROP`, `ALTER`,以及`RENAME`查询,系统支持其运行在整个集群上。 例如,以下查询将在`cluster`集群的所有节点上创建名为`all_hits`的`Distributed`表: @@ -202,7 +200,7 @@ CREATE TABLE IF NOT EXISTS all_hits ON CLUSTER cluster (p Date, i Int32) ENGINE 为了能够正确的运行这种查询,每台主机必须具有相同的cluster声明(为了简化配置的同步,你可以使用zookeeper的方式进行配置)。同时这些主机还必须链接到zookeeper服务器。 这个查询将最终在集群的每台主机上运行,即使一些主机当前处于不可用状态。同时它还保证了所有的查询在单台主机中的执行顺序。 -## CREATE VIEW +## CREATE VIEW {#create-view} ``` sql CREATE [MATERIALIZED] VIEW [IF NOT EXISTS] [db.]table_name [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ... @@ -250,7 +248,7 @@ SELECT a, b, c FROM (SELECT ...) 
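As a quick illustration of the view semantics referenced above (`SELECT a, b, c FROM (SELECT ...)`): a non-materialized view only stores the query text, so selecting from it is equivalent to running that stored subquery. The sketch below uses a hypothetical view name and the built-in `system.numbers` table:

``` sql
-- A normal (non-materialized) view stores only the SELECT text; no data is written.
CREATE VIEW view_example AS SELECT number AS n FROM system.numbers LIMIT 10;

-- Reading from the view...
SELECT sum(n) FROM view_example;                                        -- 45

-- ...is equivalent to running the stored subquery directly:
SELECT sum(n) FROM (SELECT number AS n FROM system.numbers LIMIT 10);   -- 45
```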
## CREATE DICTIONARY {#create-dictionary-query} -```sql +``` sql CREATE DICTIONARY [IF NOT EXISTS] [db.]dictionary_name ( key1 type1 [DEFAULT|EXPRESSION expr1] [HIERARCHICAL|INJECTIVE|IS_OBJECT_ID], diff --git a/docs/zh/query_language/functions/arithmetic_functions.md b/docs/zh/query_language/functions/arithmetic_functions.md index 014ec96ce03..36c4d46c755 100644 --- a/docs/zh/query_language/functions/arithmetic_functions.md +++ b/docs/zh/query_language/functions/arithmetic_functions.md @@ -1,4 +1,4 @@ -# 算术函数 +# 算术函数 {#suan-zhu-han-shu} 对于所有算术函数,结果类型为结果适合的最小数字类型(如果存在这样的类型)。最小数字类型是根据数字的位数,是否有符号以及是否是浮点类型而同时进行的。如果没有足够的位,则采用最高位类型。 @@ -8,72 +8,69 @@ SELECT toTypeName(0), toTypeName(0 + 0), toTypeName(0 + 0 + 0), toTypeName(0 + 0 + 0 + 0) ``` -``` -┌─toTypeName(0)─┬─toTypeName(plus(0, 0))─┬─toTypeName(plus(plus(0, 0), 0))─┬─toTypeName(plus(plus(plus(0, 0), 0), 0))─┐ -│ UInt8 │ UInt16 │ UInt32 │ UInt64 │ -└───────────────┴────────────────────────┴─────────────────────────────────┴──────────────────────────────────────────┘ -``` + ┌─toTypeName(0)─┬─toTypeName(plus(0, 0))─┬─toTypeName(plus(plus(0, 0), 0))─┬─toTypeName(plus(plus(plus(0, 0), 0), 0))─┐ + │ UInt8 │ UInt16 │ UInt32 │ UInt64 │ + └───────────────┴────────────────────────┴─────────────────────────────────┴──────────────────────────────────────────┘ 算术函数适用于UInt8,UInt16,UInt32,UInt64,Int8,Int16,Int32,Int64,Float32或Float64中的任何类型。 溢出的产生方式与C++相同。 -## plus(a, b), a + b +## plus(a, b), a + b {#plusa-b-a-b} 计算数字的总和。 您还可以将Date或DateTime与整数进行相加。在Date的情况下,添加的整数意味着添加相应的天数。对于DateTime,这意味这添加相应的描述。 -## minus(a, b), a - b +## minus(a, b), a - b {#minusa-b-a-b} 计算数字之间的差,结果总是有符号的。 -您还可以将Date或DateTime与整数进行相减。见上面的'plus'。 +您还可以将Date或DateTime与整数进行相减。见上面的’plus’。 -## multiply(a, b), a \* b +## multiply(a, b), a \* b {#multiplya-b-a-b} 计算数字的乘积。 -## divide(a, b), a / b +## divide(a, b), a / b {#dividea-b-a-b} 计算数字的商。结果类型始终是浮点类型。 -它不是整数除法。对于整数除法,请使用'intDiv'函数。 -当除以零时,你得到'inf','- inf'或'nan'。 +它不是整数除法。对于整数除法,请使用’intDiv’函数。 +当除以零时,你得到’inf’,‘- inf’或’nan’。 -## intDiv(a, b) +## intDiv(a, b) {#intdiva-b} 计算整数数字的商,向下舍入(按绝对值)。 除以零或将最小负数除以-1时抛出异常。 -## intDivOrZero(a, b) +## intDivOrZero(a, b) {#intdivorzeroa-b} -与'intDiv'的不同之处在于它在除以零或将最小负数除以-1时返回零。 +与’intDiv’的不同之处在于它在除以零或将最小负数除以-1时返回零。 -## modulo(a, b), a % b +## modulo(a, b), a % b {#moduloa-b-a-b} 计算除法后的余数。 如果参数是浮点数,则通过删除小数部分将它们预转换为整数。 其余部分与C++中的含义相同。截断除法用于负数。 除以零或将最小负数除以-1时抛出异常。 -## negate(a), -a +## negate(a), -a {#negatea-a} 计算一个数字的 用反转符号计算一个数字。结果始终是签名的。 Calculates a number with the reverse sign. The result is always signed. 
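The division-related functions above differ mainly in how they handle fractional results and division by zero; a minimal sketch, with the expected results shown as comments:

``` sql
SELECT
    divide(7, 2)       AS float_div,  -- 3.5 (result is always floating point)
    intDiv(7, 2)       AS int_div,    -- 3   (integer quotient)
    intDivOrZero(7, 0) AS safe_div,   -- 0   (returns zero instead of throwing on division by zero)
    modulo(7, 3)       AS rem,        -- 1
    negate(5)          AS neg         -- -5
```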
-## abs(a) {#arithm_func-abs} +## abs(a) {#arithm-func-abs} 计算数字(a)的绝对值。也就是说,如果a &lt; 0,它返回-a。对于无符号类型,它不执行任何操作。对于有符号整数类型,它返回无符号数。 -## gcd(a, b) +## gcd(a, b) {#gcda-b} 返回数字的最大公约数。 除以零或将最小负数除以-1时抛出异常。 -## lcm(a, b) +## lcm(a, b) {#lcma-b} 返回数字的最小公倍数。 除以零或将最小负数除以-1时抛出异常。 - [来源文章](https://clickhouse.tech/docs/en/query_language/functions/arithmetic_functions/) diff --git a/docs/zh/query_language/functions/array_functions.md b/docs/zh/query_language/functions/array_functions.md index 6f87a62102f..00a12dba8bc 100644 --- a/docs/zh/query_language/functions/array_functions.md +++ b/docs/zh/query_language/functions/array_functions.md @@ -1,119 +1,113 @@ -# 数组函数 +# 数组函数 {#shu-zu-han-shu} -## empty +## empty {#empty} 对于空数组返回1,对于非空数组返回0。 结果类型是UInt8。 该函数也适用于字符串。 -## notEmpty +## notEmpty {#notempty} 对于空数组返回0,对于非空数组返回1。 结果类型是UInt8。 该函数也适用于字符串。 -## length {#array_functions-length} +## length {#array-functions-length} 返回数组中的元素个数。 结果类型是UInt64。 该函数也适用于字符串。 -## emptyArrayUInt8, emptyArrayUInt16, emptyArrayUInt32, emptyArrayUInt64 +## emptyArrayUInt8, emptyArrayUInt16, emptyArrayUInt32, emptyArrayUInt64 {#emptyarrayuint8-emptyarrayuint16-emptyarrayuint32-emptyarrayuint64} -## emptyArrayInt8, emptyArrayInt16, emptyArrayInt32, emptyArrayInt64 +## emptyArrayInt8, emptyArrayInt16, emptyArrayInt32, emptyArrayInt64 {#emptyarrayint8-emptyarrayint16-emptyarrayint32-emptyarrayint64} -## emptyArrayFloat32, emptyArrayFloat64 +## emptyArrayFloat32, emptyArrayFloat64 {#emptyarrayfloat32-emptyarrayfloat64} -## emptyArrayDate, emptyArrayDateTime +## emptyArrayDate, emptyArrayDateTime {#emptyarraydate-emptyarraydatetime} -## emptyArrayString +## emptyArrayString {#emptyarraystring} 不接受任何参数并返回适当类型的空数组。 -## emptyArrayToSingle +## emptyArrayToSingle {#emptyarraytosingle} 接受一个空数组并返回一个仅包含一个默认值元素的数组。 -## range(N) +## range(N) {#rangen} 返回从0到N-1的数字数组。 以防万一,如果在数据块中创建总长度超过100,000,000个元素的数组,则抛出异常。 -## array(x1, ...), operator \[x1, ...\] +## array(x1, …), operator \[x1, …\] {#arrayx1-operator-x1} 使用函数的参数作为数组元素创建一个数组。 -参数必须是常量,并且具有最小公共类型的类型。必须至少传递一个参数,否则将不清楚要创建哪种类型的数组。也就是说,你不能使用这个函数来创建一个空数组(为此,使用上面描述的'emptyArray \ *'函数)。 -返回'Array(T)'类型的结果,其中'T'是传递的参数中最小的公共类型。 +参数必须是常量,并且具有最小公共类型的类型。必须至少传递一个参数,否则将不清楚要创建哪种类型的数组。也就是说,你不能使用这个函数来创建一个空数组(为此,使用上面描述的’emptyArray  \*’函数)。 +返回’Array(T)’类型的结果,其中’T’是传递的参数中最小的公共类型。 -## arrayConcat +## arrayConcat {#arrayconcat} 合并参数中传递的所有数组。 -``` -arrayConcat(arrays) -``` + arrayConcat(arrays) **参数** -- `arrays` – 任意数量的[Array](../../data_types/array.md)类型的参数. -**示例** +- `arrays` – 任意数量的[Array](../../data_types/array.md)类型的参数. 
+ **示例** + + ``` sql SELECT arrayConcat([1, 2], [3, 4], [5, 6]) AS res ``` -``` -┌─res───────────┐ -│ [1,2,3,4,5,6] │ -└───────────────┘ -``` + ┌─res───────────┐ + │ [1,2,3,4,5,6] │ + └───────────────┘ -## arrayElement(arr, n), operator arr[n] +## arrayElement(arr, n), operator arr\[n\] {#arrayelementarr-n-operator-arrn} -从数组`arr`中获取索引为“n”的元素。 `n`必须是任何整数类型。 +从数组`arr`中获取索引为«n»的元素。 `n`必须是任何整数类型。 数组中的索引从一开始。 支持负索引。在这种情况下,它选择从末尾开始编号的相应元素。例如,`arr [-1]`是数组中的最后一项。 如果索引超出数组的边界,则返回默认值(数字为0,字符串为空字符串等)。 -## has(arr, elem) +## has(arr, elem) {#hasarr-elem} -检查'arr'数组是否具有'elem'元素。 +检查’arr’数组是否具有’elem’元素。 如果元素不在数组中,则返回0;如果在,则返回1。 `NULL` 值的处理。 -``` -SELECT has([1, 2, NULL], NULL) + SELECT has([1, 2, NULL], NULL) -┌─has([1, 2, NULL], NULL)─┐ -│ 1 │ -└─────────────────────────┘ -``` + ┌─has([1, 2, NULL], NULL)─┐ + │ 1 │ + └─────────────────────────┘ -## hasAll +## hasAll {#hasall} 检查一个数组是否是另一个数组的子集。 -``` -hasAll(set, subset) -``` + hasAll(set, subset) **参数** -- `set` – 具有一组元素的任何类型的数组。 -- `subset` – 任何类型的数组,其元素应该被测试为`set`的子集。 +- `set` – 具有一组元素的任何类型的数组。 +- `subset` – 任何类型的数组,其元素应该被测试为`set`的子集。 **返回值** -- `1`, 如果`set`包含`subset`中的所有元素。 -- `0`, 否则。 +- `1`, 如果`set`包含`subset`中的所有元素。 +- `0`, 否则。 **特殊的定义** -- 空数组是任何数组的子集。 -- “Null”作为数组中的元素值进行处理。 -- 忽略两个数组中的元素值的顺序。 +- 空数组是任何数组的子集。 +- «Null»作为数组中的元素值进行处理。 +- 忽略两个数组中的元素值的顺序。 **示例** @@ -129,28 +123,26 @@ hasAll(set, subset) `SELECT hasAll([[1, 2], [3, 4]], [[1, 2], [3, 5]])` returns 0. -## hasAny +## hasAny {#hasany} 检查两个数组是否存在交集。 -``` -hasAny(array1, array2) -``` + hasAny(array1, array2) **参数** -- `array1` – 具有一组元素的任何类型的数组。 -- `array2` – 具有一组元素的任何类型的数组。 +- `array1` – 具有一组元素的任何类型的数组。 +- `array2` – 具有一组元素的任何类型的数组。 **返回值** -- `1`, 如果`array1`和`array2`存在交集。 -- `0`, 否则。 +- `1`, 如果`array1`和`array2`存在交集。 +- `0`, 否则。 **特殊的定义** -- “Null”作为数组中的元素值进行处理。 -- 忽略两个数组中的元素值的顺序。 +- «Null»作为数组中的元素值进行处理。 +- 忽略两个数组中的元素值的顺序。 **示例** @@ -164,43 +156,39 @@ hasAny(array1, array2) `SELECT hasAll([[1, 2], [3, 4]], [[1, 2], [1, 2]])` returns `1`. 
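Taken together, `hasAll` tests for a subset relationship while `hasAny` only tests for a non-empty intersection; a small side-by-side sketch:

``` sql
SELECT
    hasAll([1, 2, 3, 4], [2, 4]) AS is_subset,    -- 1: every element of [2, 4] occurs in the first array
    hasAll([1, 2, 3, 4], [2, 5]) AS not_subset,   -- 0: 5 is missing
    hasAny([1, 2, 3], [3, 7])    AS intersects,   -- 1: the arrays share the element 3
    hasAny([1, 2, 3], [7, 8])    AS disjoint      -- 0: no common element
```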
-## indexOf(arr, x) +## indexOf(arr, x) {#indexofarr-x} 返回数组中第一个‘x’元素的索引(从1开始),如果‘x’元素不存在在数组中,则返回0。 示例: -``` -:) SELECT indexOf([1,3,NULL,NULL],NULL) + :) SELECT indexOf([1,3,NULL,NULL],NULL) -SELECT indexOf([1, 3, NULL, NULL], NULL) + SELECT indexOf([1, 3, NULL, NULL], NULL) -┌─indexOf([1, 3, NULL, NULL], NULL)─┐ -│ 3 │ -└───────────────────────────────────┘ -``` + ┌─indexOf([1, 3, NULL, NULL], NULL)─┐ + │ 3 │ + └───────────────────────────────────┘ -设置为“NULL”的元素将作为普通的元素值处理。 +设置为«NULL»的元素将作为普通的元素值处理。 -## countEqual(arr, x) +## countEqual(arr, x) {#countequalarr-x} -返回数组中等于x的元素的个数。相当于arrayCount(elem - > elem = x,arr)。 +返回数组中等于x的元素的个数。相当于arrayCount(elem - \> elem = x,arr)。 `NULL`值将作为单独的元素值处理。 示例: -``` -SELECT countEqual([1, 2, NULL, NULL], NULL) + SELECT countEqual([1, 2, NULL, NULL], NULL) -┌─countEqual([1, 2, NULL, NULL], NULL)─┐ -│ 2 │ -└──────────────────────────────────────┘ -``` + ┌─countEqual([1, 2, NULL, NULL], NULL)─┐ + │ 2 │ + └──────────────────────────────────────┘ -## arrayEnumerate(arr) {#array_functions-arrayenumerate} +## arrayEnumerate(arr) {#array-functions-arrayenumerate} -返回 Array \[1, 2, 3, ..., length (arr) \] +返回 Array \[1, 2, 3, …, length (arr) \] 此功能通常与ARRAY JOIN一起使用。它允许在应用ARRAY JOIN后为每个数组计算一次。例如: @@ -216,11 +204,9 @@ WHERE CounterID = 160656 LIMIT 10 ``` -``` -┌─Reaches─┬──Hits─┐ -│ 95606 │ 31406 │ -└─────────┴───────┘ -``` + ┌─Reaches─┬──Hits─┐ + │ 95606 │ 31406 │ + └─────────┴───────┘ 在此示例中,Reaches是转换次数(应用ARRAY JOIN后接收的字符串),Hits是浏览量(ARRAY JOIN之前的字符串)。在这种特殊情况下,您可以更轻松地获得相同的结果: @@ -232,18 +218,16 @@ FROM test.hits WHERE (CounterID = 160656) AND notEmpty(GoalsReached) ``` -``` -┌─Reaches─┬──Hits─┐ -│ 95606 │ 31406 │ -└─────────┴───────┘ -``` + ┌─Reaches─┬──Hits─┐ + │ 95606 │ 31406 │ + └─────────┴───────┘ 此功能也可用于高阶函数。例如,您可以使用它来获取与条件匹配的元素的数组索引。 -## arrayEnumerateUniq(arr, ...) 
+## arrayEnumerateUniq(arr, …) {#arrayenumerateuniqarr} 返回与源数组大小相同的数组,其中每个元素表示与其下标对应的源数组元素在源数组中出现的次数。 -例如:arrayEnumerateUniq(\ [10,20,10,30 \])= \ [1,1,2,1 \]。 +例如:arrayEnumerateUniq( \[10,20,10,30 \])=  \[1,1,2,1 \]。 使用ARRAY JOIN和数组元素的聚合时,此函数很有用。 @@ -264,20 +248,18 @@ ORDER BY Reaches DESC LIMIT 10 ``` -``` -┌──GoalID─┬─Reaches─┬─Visits─┐ -│ 53225 │ 3214 │ 1097 │ -│ 2825062 │ 3188 │ 1097 │ -│ 56600 │ 2803 │ 488 │ -│ 1989037 │ 2401 │ 365 │ -│ 2830064 │ 2396 │ 910 │ -│ 1113562 │ 2372 │ 373 │ -│ 3270895 │ 2262 │ 812 │ -│ 1084657 │ 2262 │ 345 │ -│ 56599 │ 2260 │ 799 │ -│ 3271094 │ 2256 │ 812 │ -└─────────┴─────────┴────────┘ -``` + ┌──GoalID─┬─Reaches─┬─Visits─┐ + │ 53225 │ 3214 │ 1097 │ + │ 2825062 │ 3188 │ 1097 │ + │ 56600 │ 2803 │ 488 │ + │ 1989037 │ 2401 │ 365 │ + │ 2830064 │ 2396 │ 910 │ + │ 1113562 │ 2372 │ 373 │ + │ 3270895 │ 2262 │ 812 │ + │ 1084657 │ 2262 │ 345 │ + │ 56599 │ 2260 │ 799 │ + │ 3271094 │ 2256 │ 812 │ + └─────────┴─────────┴────────┘ 在此示例中,每个GoalID都计算转换次数(目标嵌套数据结构中的每个元素都是达到的目标,我们称之为转换)和会话数。如果没有ARRAY JOIN,我们会将会话数计为总和(Sign)。但在这种特殊情况下,行乘以嵌套的Goals结构,因此为了在此之后计算每个会话一次,我们将一个条件应用于arrayEnumerateUniq(Goals.ID)函数的值。 @@ -287,25 +269,21 @@ arrayEnumerateUniq函数可以使用与参数大小相同的多个数组。在 SELECT arrayEnumerateUniq([1, 1, 1, 2, 2, 2], [1, 1, 2, 1, 1, 2]) AS res ``` -``` -┌─res───────────┐ -│ [1,2,1,1,2,1] │ -└───────────────┘ -``` + ┌─res───────────┐ + │ [1,2,1,1,2,1] │ + └───────────────┘ 当使用带有嵌套数据结构的ARRAY JOIN并在此结构中跨多个元素进一步聚合时,这是必需的。 -## arrayPopBack +## arrayPopBack {#arraypopback} 从数组中删除最后一项。 -``` -arrayPopBack(array) -``` + arrayPopBack(array) **参数** -- `array` – 数组。 +- `array` – 数组。 **示例** @@ -313,23 +291,19 @@ arrayPopBack(array) SELECT arrayPopBack([1, 2, 3]) AS res ``` -``` -┌─res───┐ -│ [1,2] │ -└───────┘ -``` + ┌─res───┐ + │ [1,2] │ + └───────┘ -## arrayPopFront +## arrayPopFront {#arraypopfront} 从数组中删除第一项。 -``` -arrayPopFront(array) -``` + arrayPopFront(array) **参数** -- `array` – 数组。 +- `array` – 数组。 **示例** @@ -337,24 +311,20 @@ arrayPopFront(array) SELECT arrayPopFront([1, 2, 3]) AS res ``` -``` -┌─res───┐ -│ [2,3] │ -└───────┘ -``` + ┌─res───┐ + │ [2,3] │ + └───────┘ -## arrayPushBack +## arrayPushBack {#arraypushback} 添加一个元素到数组的末尾。 -``` -arrayPushBack(array, single_value) -``` + arrayPushBack(array, single_value) **参数** -- `array` – 数组。 -- `single_value` – 单个值。只能将数字添加到带数字的数组中,并且只能将字符串添加到字符串数组中。添加数字时,ClickHouse会自动为数组的数据类型设置`single_value`类型。有关ClickHouse中数据类型的更多信息,请参阅“[数据类型](../../data_types/index.md#data_types)”。可以是'NULL`。该函数向数组添加一个“NULL”元素,数组元素的类型转换为`Nullable`。 +- `array` – 数组。 +- `single_value` – 单个值。只能将数字添加到带数字的数组中,并且只能将字符串添加到字符串数组中。添加数字时,ClickHouse会自动为数组的数据类型设置`single_value`类型。有关ClickHouse中数据类型的更多信息,请参阅«[数据类型](../../data_types/index.md#data_types)»。可以是’NULL`。该函数向数组添加一个«NULL»元素,数组元素的类型转换为`Nullable\`。 **示例** @@ -362,24 +332,20 @@ arrayPushBack(array, single_value) SELECT arrayPushBack(['a'], 'b') AS res ``` -``` -┌─res───────┐ -│ ['a','b'] │ -└───────────┘ -``` + ┌─res───────┐ + │ ['a','b'] │ + └───────────┘ -## arrayPushFront +## arrayPushFront {#arraypushfront} 将一个元素添加到数组的开头。 -``` -arrayPushFront(array, single_value) -``` + arrayPushFront(array, single_value) **参数** -- `array` – 数组。 -- `single_value` – 单个值。只能将数字添加到带数字的数组中,并且只能将字符串添加到字符串数组中。添加数字时,ClickHouse会自动为数组的数据类型设置`single_value`类型。有关ClickHouse中数据类型的更多信息,请参阅“[数据类型](../../data_types/index.md#data_types)”。可以是'NULL`。该函数向数组添加一个“NULL”元素,数组元素的类型转换为`Nullable`。 +- `array` – 数组。 +- `single_value` – 
单个值。只能将数字添加到带数字的数组中,并且只能将字符串添加到字符串数组中。添加数字时,ClickHouse会自动为数组的数据类型设置`single_value`类型。有关ClickHouse中数据类型的更多信息,请参阅«[数据类型](../../data_types/index.md#data_types)»。可以是’NULL`。该函数向数组添加一个«NULL»元素,数组元素的类型转换为`Nullable\`。 **示例** @@ -387,27 +353,23 @@ arrayPushFront(array, single_value) SELECT arrayPushFront(['b'], 'a') AS res ``` -``` -┌─res───────┐ -│ ['a','b'] │ -└───────────┘ -``` + ┌─res───────┐ + │ ['a','b'] │ + └───────────┘ -## arrayResize +## arrayResize {#arrayresize} 更改数组的长度。 -``` -arrayResize(array, size[, extender]) -``` + arrayResize(array, size[, extender]) **参数:** -- `array` — 数组. -- `size` — 数组所需的长度。 - - 如果`size`小于数组的原始大小,则数组将从右侧截断。 -- 如果`size`大于数组的初始大小,则使用`extender`值或数组项的数据类型的默认值将数组扩展到右侧。 -- `extender` — 扩展数组的值。可以是'NULL`。 +- `array` — 数组. +- `size` — 数组所需的长度。 + - 如果`size`小于数组的原始大小,则数组将从右侧截断。 +- 如果`size`大于数组的初始大小,则使用`extender`值或数组项的数据类型的默认值将数组扩展到右侧。 +- `extender` — 扩展数组的值。可以是’NULL\`。 **返回值:** @@ -415,35 +377,29 @@ arrayResize(array, size[, extender]) **调用示例** -``` -SELECT arrayResize([1], 3) + SELECT arrayResize([1], 3) -┌─arrayResize([1], 3)─┐ -│ [1,0,0] │ -└─────────────────────┘ -``` + ┌─arrayResize([1], 3)─┐ + │ [1,0,0] │ + └─────────────────────┘ -``` -SELECT arrayResize([1], 3, NULL) + SELECT arrayResize([1], 3, NULL) -┌─arrayResize([1], 3, NULL)─┐ -│ [1,NULL,NULL] │ -└───────────────────────────┘ -``` + ┌─arrayResize([1], 3, NULL)─┐ + │ [1,NULL,NULL] │ + └───────────────────────────┘ -## arraySlice +## arraySlice {#arrayslice} 返回一个子数组,包含从指定位置的指定长度的元素。 -``` -arraySlice(array, offset[, length]) -``` + arraySlice(array, offset[, length]) **参数** -- `array` – 数组。 -- `offset` – 数组的偏移。正值表示左侧的偏移量,负值表示右侧的缩进值。数组下标从1开始。 -- `length` - 子数组的长度。如果指定负值,则该函数返回`[offset,array_length - length`。如果省略该值,则该函数返回`[offset,the_end_of_array]`。 +- `array` – 数组。 +- `offset` – 数组的偏移。正值表示左侧的偏移量,负值表示右侧的缩进值。数组下标从1开始。 +- `length` - 子数组的长度。如果指定负值,则该函数返回`[offset,array_length - length`。如果省略该值,则该函数返回`[offset,the_end_of_array]`。 **示例** @@ -451,15 +407,13 @@ arraySlice(array, offset[, length]) SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res ``` -``` -┌─res────────┐ -│ [2,NULL,4] │ -└────────────┘ -``` + ┌─res────────┐ + │ [2,NULL,4] │ + └────────────┘ -设置为“NULL”的数组元素作为普通的数组元素值处理。 +设置为«NULL»的数组元素作为普通的数组元素值处理。 -## arraySort(\[func,\] arr, ...) 
{#array_functions-reverse-sort} +## arraySort(\[func,\] arr, …) {#array-functions-reverse-sort} 以升序对`arr`数组的元素进行排序。如果指定了`func`函数,则排序顺序由`func`函数的调用结果决定。如果`func`接受多个参数,那么`arraySort`函数也将解析与`func`函数参数相同数量的数组参数。更详细的示例在`arraySort`的末尾。 @@ -468,38 +422,35 @@ SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res ``` sql SELECT arraySort([1, 3, 3, 0]); ``` -``` -┌─arraySort([1, 3, 3, 0])─┐ -│ [0,1,3,3] │ -└─────────────────────────┘ -``` + + ┌─arraySort([1, 3, 3, 0])─┐ + │ [0,1,3,3] │ + └─────────────────────────┘ 字符串排序示例: ``` sql SELECT arraySort(['hello', 'world', '!']); ``` -``` -┌─arraySort(['hello', 'world', '!'])─┐ -│ ['!','hello','world'] │ -└────────────────────────────────────┘ -``` + + ┌─arraySort(['hello', 'world', '!'])─┐ + │ ['!','hello','world'] │ + └────────────────────────────────────┘ `NULL`,`NaN`和`Inf`的排序顺序: ``` sql SELECT arraySort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]); ``` -``` -┌─arraySort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf])─┐ -│ [-inf,-4,1,2,3,inf,nan,nan,NULL,NULL] │ -└───────────────────────────────────────────────────────────┘ -``` -- `-Inf` 是数组中的第一个。 -- `NULL` 是数组中的最后一个。 -- `NaN` 在`NULL`的前面。 -- `Inf` 在`NaN`的前面。 + ┌─arraySort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf])─┐ + │ [-inf,-4,1,2,3,inf,nan,nan,NULL,NULL] │ + └───────────────────────────────────────────────────────────┘ + +- `-Inf` 是数组中的第一个。 +- `NULL` 是数组中的最后一个。 +- `NaN` 在`NULL`的前面。 +- `Inf` 在`NaN`的前面。 注意:`arraySort`是[高阶函数](higher_order_functions.md)。您可以将lambda函数作为第一个参数传递给它。在这种情况下,排序顺序由lambda函数的调用结果决定。 @@ -508,13 +459,12 @@ SELECT arraySort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]); ``` sql SELECT arraySort((x) -> -x, [1, 2, 3]) as res; ``` -``` -┌─res─────┐ -│ [3,2,1] │ -└─────────┘ -``` -对于源数组的每个元素,lambda函数返回排序键,即[1 -> -1, 2 -> -2, 3 -> -3]。由于`arraySort`函数按升序对键进行排序,因此结果为[3,2,1]。因此,`(x) -> -x` lambda函数将排序设置为[降序](#array_functions-reverse-sort)。 + ┌─res─────┐ + │ [3,2,1] │ + └─────────┘ + +对于源数组的每个元素,lambda函数返回排序键,即\[1 -\> -1, 2 -\> -2, 3 -\> -3\]。由于`arraySort`函数按升序对键进行排序,因此结果为\[3,2,1\]。因此,`(x) -> -x` lambda函数将排序设置为[降序](#array_functions-reverse-sort)。 lambda函数可以接受多个参数。在这种情况下,您需要为`arraySort`传递与lambda参数个数相同的数组。函数使用第一个输入的数组中的元素组成返回结果;使用接下来传入的数组作为排序键。例如: @@ -522,19 +472,18 @@ lambda函数可以接受多个参数。在这种情况下,您需要为`arraySo SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; ``` -``` -┌─res────────────────┐ -│ ['world', 'hello'] │ -└────────────────────┘ -``` + ┌─res────────────────┐ + │ ['world', 'hello'] │ + └────────────────────┘ -这里,在第二个数组([2, 1])中定义了第一个数组(['hello','world'])的相应元素的排序键,即['hello' -> 2,'world' -> 1]。 由于lambda函数中没有使用`x`,因此源数组中的实际值不会影响结果的顺序。所以,‘world’将是结果中的第一个元素,‘hello’将是结果中的第二个元素。 +这里,在第二个数组(\[2, 1\])中定义了第一个数组(\[‘hello’,‘world’\])的相应元素的排序键,即\[‘hello’ -\> 2,‘world’ -\> 1\]。 由于lambda函数中没有使用`x`,因此源数组中的实际值不会影响结果的顺序。所以,‘world’将是结果中的第一个元素,‘hello’将是结果中的第二个元素。 其他示例如下所示。 ``` sql SELECT arraySort((x, y) -> y, [0, 1, 2], ['c', 'b', 'a']) as res; ``` + ``` sql ┌─res─────┐ │ [2,1,0] │ @@ -544,16 +493,17 @@ SELECT arraySort((x, y) -> y, [0, 1, 2], ['c', 'b', 'a']) as res; ``` sql SELECT arraySort((x, y) -> -y, [0, 1, 2], [1, 2, 3]) as res; ``` + ``` sql ┌─res─────┐ │ [2,1,0] │ └─────────┘ ``` -!!! 注意 +!!! 注意 "注意" 为了提高排序效率, 使用了[Schwartzian transform](https://en.wikipedia.org/wiki/Schwartzian_transform)。 -## arrayReverseSort([func,] arr, ...) 
{#array_functions-reverse-sort} +## arrayReverseSort(\[func,\] arr, …) {#array-functions-reverse-sort} 以降序对`arr`数组的元素进行排序。如果指定了`func`函数,则排序顺序由`func`函数的调用结果决定。如果`func`接受多个参数,那么`arrayReverseSort`函数也将解析与`func`函数参数相同数量的数组作为参数。更详细的示例在`arrayReverseSort`的末尾。 @@ -562,61 +512,60 @@ SELECT arraySort((x, y) -> -y, [0, 1, 2], [1, 2, 3]) as res; ``` sql SELECT arrayReverseSort([1, 3, 3, 0]); ``` -``` -┌─arrayReverseSort([1, 3, 3, 0])─┐ -│ [3,3,1,0] │ -└────────────────────────────────┘ -``` + + ┌─arrayReverseSort([1, 3, 3, 0])─┐ + │ [3,3,1,0] │ + └────────────────────────────────┘ 字符串排序示例: ``` sql SELECT arrayReverseSort(['hello', 'world', '!']); ``` -``` -┌─arrayReverseSort(['hello', 'world', '!'])─┐ -│ ['world','hello','!'] │ -└───────────────────────────────────────────┘ -``` + + ┌─arrayReverseSort(['hello', 'world', '!'])─┐ + │ ['world','hello','!'] │ + └───────────────────────────────────────────┘ `NULL`,`NaN`和`Inf`的排序顺序: ``` sql SELECT arrayReverseSort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]) as res; ``` + ``` sql ┌─res───────────────────────────────────┐ │ [inf,3,2,1,-4,-inf,nan,nan,NULL,NULL] │ └───────────────────────────────────────┘ ``` -- `Inf` 是数组中的第一个。 -- `NULL` 是数组中的最后一个。 -- `NaN` 在`NULL`的前面。 -- `-Inf` 在`NaN`的前面。 +- `Inf` 是数组中的第一个。 +- `NULL` 是数组中的最后一个。 +- `NaN` 在`NULL`的前面。 +- `-Inf` 在`NaN`的前面。 注意:`arraySort`是[高阶函数](higher_order_functions.md)。您可以将lambda函数作为第一个参数传递给它。如下示例所示。 ``` sql SELECT arrayReverseSort((x) -> -x, [1, 2, 3]) as res; ``` -``` -┌─res─────┐ -│ [1,2,3] │ -└─────────┘ -``` + + ┌─res─────┐ + │ [1,2,3] │ + └─────────┘ 数组按以下方式排序: The array is sorted in the following way: -1. 首先,根据lambda函数的调用结果对源数组([1, 2, 3])进行排序。 结果是[3, 2, 1]。 -2. 反转上一步获得的数组。 所以,最终的结果是[1, 2, 3]。 +1. 首先,根据lambda函数的调用结果对源数组(\[1, 2, 3\])进行排序。 结果是\[3, 2, 1\]。 +2. 反转上一步获得的数组。 所以,最终的结果是\[1, 2, 3\]。 lambda函数可以接受多个参数。在这种情况下,您需要为`arrayReverseSort`传递与lambda参数个数相同的数组。函数使用第一个输入的数组中的元素组成返回结果;使用接下来传入的数组作为排序键。例如: ``` sql SELECT arrayReverseSort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; ``` + ``` sql ┌─res───────────────┐ │ ['hello','world'] │ @@ -625,97 +574,92 @@ SELECT arrayReverseSort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; 在这个例子中,数组按以下方式排序: -1. 首先,根据lambda函数的调用结果对源数组(['hello','world'])进行排序。 其中,在第二个数组([2,1])中定义了源数组中相应元素的排序键。 所以,排序结果['world','hello']。 -2. 反转上一步骤中获得的排序数组。 所以,最终的结果是['hello','world']。 +1. 首先,根据lambda函数的调用结果对源数组(\[‘hello’,‘world’\])进行排序。 其中,在第二个数组(\[2,1\])中定义了源数组中相应元素的排序键。 所以,排序结果\[‘world’,‘hello’\]。 +2. 反转上一步骤中获得的排序数组。 所以,最终的结果是\[‘hello’,‘world’\]。 其他示例如下所示。 ``` sql SELECT arrayReverseSort((x, y) -> y, [4, 3, 5], ['a', 'b', 'c']) AS res; ``` + ``` sql ┌─res─────┐ │ [5,3,4] │ └─────────┘ ``` + ``` sql SELECT arrayReverseSort((x, y) -> -y, [4, 3, 5], [1, 2, 3]) AS res; ``` + ``` sql ┌─res─────┐ │ [4,3,5] │ └─────────┘ ``` -## arrayUniq(arr, ...) 
+## arrayUniq(arr, …) {#arrayuniqarr} 如果传递一个参数,则计算数组中不同元素的数量。 如果传递了多个参数,则它计算多个数组中相应位置的不同元素元组的数量。 -如果要获取数组中唯一项的列表,可以使用arrayReduce('groupUniqArray',arr)。 +如果要获取数组中唯一项的列表,可以使用arrayReduce(‘groupUniqArray’,arr)。 -## arrayJoin(arr) {#array_functions-join} +## arrayJoin(arr) {#array-functions-join} -一个特殊的功能。请参见[“ArrayJoin函数”](array_join.md#functions_arrayjoin)部分。 +一个特殊的功能。请参见[«ArrayJoin函数»](array_join.md#functions_arrayjoin)部分。 -## arrayDifference(arr) +## arrayDifference(arr) {#arraydifferencearr} 返回一个数组,其中包含所有相邻元素对之间的差值。例如: -```sql +``` sql SELECT arrayDifference([1, 2, 3, 4]) ``` -``` -┌─arrayDifference([1, 2, 3, 4])─┐ -│ [0,1,1,1] │ -└───────────────────────────────┘ -``` + ┌─arrayDifference([1, 2, 3, 4])─┐ + │ [0,1,1,1] │ + └───────────────────────────────┘ -## arrayDistinct(arr) +## arrayDistinct(arr) {#arraydistinctarr} 返回一个包含所有数组中不同元素的数组。例如: -```sql +``` sql SELECT arrayDistinct([1, 2, 2, 3, 1]) ``` -``` -┌─arrayDistinct([1, 2, 2, 3, 1])─┐ -│ [1,2,3] │ -└────────────────────────────────┘ -``` + ┌─arrayDistinct([1, 2, 2, 3, 1])─┐ + │ [1,2,3] │ + └────────────────────────────────┘ -## arrayEnumerateDense(arr) +## arrayEnumerateDense(arr) {#arrayenumeratedensearr} -返回与源数组大小相同的数组,指示每个元素首次出现在源数组中的位置。例如:arrayEnumerateDense([10,20,10,30])= [1,2,1,3]。 +返回与源数组大小相同的数组,指示每个元素首次出现在源数组中的位置。例如:arrayEnumerateDense(\[10,20,10,30\])= \[1,2,1,3\]。 -## arrayIntersect(arr) +## arrayIntersect(arr) {#arrayintersectarr} 返回所有数组元素的交集。例如: -```sql +``` sql SELECT arrayIntersect([1, 2], [1, 3], [2, 3]) AS no_intersect, arrayIntersect([1, 2], [1, 3], [1, 4]) AS intersect ``` -``` -┌─no_intersect─┬─intersect─┐ -│ [] │ [1] │ -└──────────────┴───────────┘ -``` + ┌─no_intersect─┬─intersect─┐ + │ [] │ [1] │ + └──────────────┴───────────┘ -## arrayReduce(agg_func, arr1, ...) +## arrayReduce(agg\_func, arr1, …) {#arrayreduceagg-func-arr1} 将聚合函数应用于数组并返回其结果。如果聚合函数具有多个参数,则此函数可应用于相同大小的多个数组。 -arrayReduce('agg_func',arr1,...) 
- 将聚合函数`agg_func`应用于数组`arr1 ...`。如果传递了多个数组,则相应位置上的元素将作为多个参数传递给聚合函数。例如:SELECT arrayReduce('max',[1,2,3])= 3 +arrayReduce(‘agg\_func’,arr1,…) - 将聚合函数`agg_func`应用于数组`arr1 ...`。如果传递了多个数组,则相应位置上的元素将作为多个参数传递给聚合函数。例如:SELECT arrayReduce(‘max’,\[1,2,3\])= 3 -## arrayReverse(arr) +## arrayReverse(arr) {#arrayreversearr} 返回与源数组大小相同的数组,包含反转源数组的所有元素的结果。 - - [来源文章](https://clickhouse.tech/docs/en/query_language/functions/array_functions/) diff --git a/docs/zh/query_language/functions/array_join.md b/docs/zh/query_language/functions/array_join.md index 400230d60a2..3017828a702 100644 --- a/docs/zh/query_language/functions/array_join.md +++ b/docs/zh/query_language/functions/array_join.md @@ -1,10 +1,10 @@ -# arrayJoin函数 {#functions_arrayjoin} +# arrayJoin函数 {#functions-arrayjoin} 这是一个非常有用的函数。 普通函数不会更改结果集的行数,而只是计算每行中的值(map)。 聚合函数将多行压缩到一行中(fold或reduce)。 -'arrayJoin'函数获取每一行并将他们展开到多行(unfold)。 +’arrayJoin’函数获取每一行并将他们展开到多行(unfold)。 此函数将数组作为参数,并将该行在结果集中复制数组元素个数。 除了应用此函数的列中的值之外,简单地复制列中的所有值;它被替换为相应的数组值。 @@ -19,13 +19,10 @@ SELECT arrayJoin([1, 2, 3] AS src) AS dst, 'Hello', src ``` -``` -┌─dst─┬─\'Hello\'─┬─src─────┐ -│ 1 │ Hello │ [1,2,3] │ -│ 2 │ Hello │ [1,2,3] │ -│ 3 │ Hello │ [1,2,3] │ -└─────┴───────────┴─────────┘ -``` - + ┌─dst─┬─\'Hello\'─┬─src─────┐ + │ 1 │ Hello │ [1,2,3] │ + │ 2 │ Hello │ [1,2,3] │ + │ 3 │ Hello │ [1,2,3] │ + └─────┴───────────┴─────────┘ [来源文章](https://clickhouse.tech/docs/en/query_language/functions/array_join/) diff --git a/docs/zh/query_language/functions/bit_functions.md b/docs/zh/query_language/functions/bit_functions.md index 0f8e8ad9e6c..a9ded6b0930 100644 --- a/docs/zh/query_language/functions/bit_functions.md +++ b/docs/zh/query_language/functions/bit_functions.md @@ -1,31 +1,29 @@ -# 位操作函数 +# 位操作函数 {#wei-cao-zuo-han-shu} 位操作函数适用于UInt8,UInt16,UInt32,UInt64,Int8,Int16,Int32,Int64,Float32或Float64中的任何类型。 结果类型是一个整数,其位数等于其参数的最大位。如果至少有一个参数为有符数字,则结果为有符数字。如果参数是浮点数,则将其强制转换为Int64。 -## bitAnd(a, b) +## bitAnd(a, b) {#bitanda-b} -## bitOr(a, b) +## bitOr(a, b) {#bitora-b} -## bitXor(a, b) +## bitXor(a, b) {#bitxora-b} -## bitNot(a) +## bitNot(a) {#bitnota} -## bitShiftLeft(a, b) +## bitShiftLeft(a, b) {#bitshiftlefta-b} -## bitShiftRight(a, b) +## bitShiftRight(a, b) {#bitshiftrighta-b} -## bitRotateLeft(a, b) +## bitRotateLeft(a, b) {#bitrotatelefta-b} -## bitRotateRight(a, b) +## bitRotateRight(a, b) {#bitrotaterighta-b} -## bitTest(a, b) - -## bitTestAll(a, b) - -## bitTestAny(a, b) +## bitTest(a, b) {#bittesta-b} +## bitTestAll(a, b) {#bittestalla-b} +## bitTestAny(a, b) {#bittestanya-b} [来源文章](https://clickhouse.tech/docs/en/query_language/functions/bit_functions/) diff --git a/docs/zh/query_language/functions/bitmap_functions.md b/docs/zh/query_language/functions/bitmap_functions.md index e8ad2645a1d..498212bc1fe 100644 --- a/docs/zh/query_language/functions/bitmap_functions.md +++ b/docs/zh/query_language/functions/bitmap_functions.md @@ -1,456 +1,384 @@ -# 位图函数 - -位图函数用于对两个位图对象进行计算,对于任何一个位图函数,它都将返回一个位图对象,例如and,or,xor,not等等。 - -位图对象有两种构造方法。一个是由聚合函数groupBitmapState构造的,另一个是由Array Object构造的。同时还可以将位图对象转化为数组对象。 - -我们使用RoaringBitmap实际存储位图对象,当基数小于或等于32时,它使用Set保存。当基数大于32时,它使用RoaringBitmap保存。这也是为什么低基数集的存储更快的原因。 - -有关RoaringBitmap的更多信息,请参阅:[CRoaring](https://github.com/RoaringBitmap/CRoaring)。 - - -## bitmapBuild - -从无符号整数数组构建位图对象。 - -``` -bitmapBuild(array) -``` - -**参数** - -- `array` – 无符号整数数组. - -**示例** - -``` sql -SELECT bitmapBuild([1, 2, 3, 4, 5]) AS res -``` - -## bitmapToArray - -将位图转换为整数数组。 - -``` -bitmapToArray(bitmap) -``` - -**参数** - -- `bitmap` – 位图对象. 
- -**示例** - -``` sql -SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res -``` - -``` -┌─res─────────┐ -│ [1,2,3,4,5] │ -└─────────────┘ -``` - -## bitmapSubsetInRange - -将位图指定范围(不包含range_end)转换为另一个位图。 - -``` -bitmapSubsetInRange(bitmap, range_start, range_end) -``` - -**参数** - -- `bitmap` – 位图对象. -- `range_start` – 范围起始点(含). -- `range_end` – 范围结束点(不含). - -**示例** - -``` sql -SELECT bitmapToArray(bitmapSubsetInRange(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res -``` - -``` -┌─res───────────────┐ -│ [30,31,32,33,100] │ -└───────────────────┘ -``` - -## bitmapSubsetLimit - -将位图指定范围(起始点和数目上限)转换为另一个位图。 - -``` -bitmapSubsetLimit(bitmap, range_start, limit) -``` - -**参数** - -- `bitmap` – 位图对象. -- `range_start` – 范围起始点(含). -- `limit` – 子位图基数上限. - -**示例** - -``` sql -SELECT bitmapToArray(bitmapSubsetInRange(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res -``` - -``` -┌─res───────────────────────┐ -│ [30,31,32,33,100,200,500] │ -└───────────────────────────┘ -``` - -## bitmapContains - -检查位图是否包含指定元素。 - -``` -bitmapContains(haystack, needle) -``` - -**参数** - -- `haystack` – 位图对象. -- `needle` – 元素,类型UInt32. - -**示例** - -``` sql -SELECT bitmapContains(bitmapBuild([1,5,7,9]), toUInt32(9)) AS res -``` -```text -┌─res─┐ -│ 1 │ -└─────┘ -``` - -## bitmapHasAny - -与`hasAny(array,array)`类似,如果位图有任何公共元素则返回1,否则返回0。 -对于空位图,返回0。 - -``` -bitmapHasAny(bitmap,bitmap) -``` - -**参数** - -- `bitmap` – bitmap对象。 - -**示例** - -``` sql -SELECT bitmapHasAny(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res -``` - -``` -┌─res─┐ -│ 1 │ -└─────┘ -``` - -## bitmapHasAll - -与`hasAll(array,array)`类似,如果第一个位图包含第二个位图的所有元素,则返回1,否则返回0。 -如果第二个参数是空位图,则返回1。 - -``` -bitmapHasAll(bitmap,bitmap) -``` - -**参数** - -- `bitmap` – bitmap 对象。 - -**示例** - -``` sql -SELECT bitmapHasAll(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res -``` - -``` -┌─res─┐ -│ 0 │ -└─────┘ -``` - -## bitmapAnd - -为两个位图对象进行与操作,返回一个新的位图对象。 - -``` -bitmapAnd(bitmap1,bitmap2) -``` - -**参数** - -- `bitmap1` – 位图对象。 -- `bitmap2` – 位图对象。 - -**示例** - -``` sql -SELECT bitmapToArray(bitmapAnd(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res -``` - -``` -┌─res─┐ -│ [3] │ -└─────┘ -``` - - -## bitmapOr - -为两个位图对象进行或操作,返回一个新的位图对象。 - -``` -bitmapOr(bitmap1,bitmap2) -``` - -**Parameters** - -- `bitmap1` – 位图对象。 -- `bitmap2` – 位图对象。 - -**示例** - -``` sql -SELECT bitmapToArray(bitmapOr(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res -``` - -``` -┌─res─────────┐ -│ [1,2,3,4,5] │ -└─────────────┘ -``` - -## bitmapXor - -为两个位图对象进行异或操作,返回一个新的位图对象。 - -``` -bitmapXor(bitmap1,bitmap2) -``` - -**参数** - -- `bitmap1` – 位图对象。 -- `bitmap2` – 位图对象。 - -**示例** - -``` sql -SELECT bitmapToArray(bitmapXor(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res -``` - -``` -┌─res───────┐ -│ [1,2,4,5] │ -└───────────┘ -``` - -## bitmapAndnot - -计算两个位图的差异,返回一个新的位图对象。 - -``` -bitmapAndnot(bitmap1,bitmap2) -``` - -**参数** - -- `bitmap1` – 位图对象。 -- `bitmap2` – 位图对象。 - -**示例** - -``` sql -SELECT bitmapToArray(bitmapAndnot(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res -``` - -``` -┌─res───┐ -│ [1,2] │ -└───────┘ -``` - -## bitmapCardinality - -返回一个UInt64类型的数值,表示位图对象的基数。 - -``` -bitmapCardinality(bitmap) -``` - -**Parameters** - -- `bitmap` – 位图对象。 - -**示例** - -``` sql -SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res -``` - -``` -┌─res─┐ -│ 5 │ -└─────┘ -``` - -## bitmapMin 
- -返回一个UInt64类型的数值,表示位图中的最小值。如果位图为空则返回UINT32_MAX。 - -``` -bitmapMin(bitmap) -``` - -**Parameters** - -- `bitmap` – 位图对象。 - -**示例** - -``` sql -SELECT bitmapMin(bitmapBuild([1, 2, 3, 4, 5])) AS res -``` - -``` -┌─res─┐ -│ 1 │ -└─────┘ -``` - -## bitmapMax - -返回一个UInt64类型的数值,表示位图中的最大值。如果位图为空则返回0。 - -``` -bitmapMax(bitmap) -``` - -**Parameters** - -- `bitmap` – 位图对象。 - -**示例** - -``` sql -SELECT bitmapMax(bitmapBuild([1, 2, 3, 4, 5])) AS res -``` - -``` -┌─res─┐ -│ 5 │ -└─────┘ -``` - -## bitmapAndCardinality - -为两个位图对象进行与操作,返回结果位图的基数。 - -``` -bitmapAndCardinality(bitmap1,bitmap2) -``` - -**参数** - -- `bitmap1` – 位图对象。 -- `bitmap2` – 位图对象。 - -**示例** - -``` sql -SELECT bitmapAndCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; -``` - -``` -┌─res─┐ -│ 1 │ -└─────┘ -``` - - -## bitmapOrCardinality - -为两个位图进行或运算,返回结果位图的基数。 - -``` -bitmapOrCardinality(bitmap1,bitmap2) -``` - -**参数** - -- `bitmap1` – 位图对象。 -- `bitmap2` – 位图对象。 - -**示例** - -``` sql -SELECT bitmapOrCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; -``` - -``` -┌─res─┐ -│ 5 │ -└─────┘ -``` - -## bitmapXorCardinality - -为两个位图进行异或运算,返回结果位图的基数。 - -``` -bitmapXorCardinality(bitmap1,bitmap2) -``` - -**参数** - -- `bitmap1` – 位图对象。 -- `bitmap2` – 位图对象。 - -**示例** - -``` sql -SELECT bitmapXorCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; -``` - -``` -┌─res─┐ -│ 4 │ -└─────┘ -``` - - -## bitmapAndnotCardinality - -计算两个位图的差异,返回结果位图的基数。 - -``` -bitmapAndnotCardinality(bitmap1,bitmap2) -``` - -**参数** - -- `bitmap1` – 位图对象。 -- `bitmap2` - 位图对象。 - -**示例** - -``` sql -SELECT bitmapAndnotCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; -``` - -``` -┌─res─┐ -│ 2 │ -└─────┘ -``` - - -[来源文章](https://clickhouse.tech/docs/en/query_language/functions/bitmap_functions/) +# 位图函数 {#wei-tu-han-shu} + +位图函数用于对两个位图对象进行计算,对于任何一个位图函数,它都将返回一个位图对象,例如and,or,xor,not等等。 + +位图对象有两种构造方法。一个是由聚合函数groupBitmapState构造的,另一个是由Array Object构造的。同时还可以将位图对象转化为数组对象。 + +我们使用RoaringBitmap实际存储位图对象,当基数小于或等于32时,它使用Set保存。当基数大于32时,它使用RoaringBitmap保存。这也是为什么低基数集的存储更快的原因。 + +有关RoaringBitmap的更多信息,请参阅:[CRoaring](https://github.com/RoaringBitmap/CRoaring)。 + +## bitmapBuild {#bitmapbuild} + +从无符号整数数组构建位图对象。 + + bitmapBuild(array) + +**参数** + +- `array` – 无符号整数数组. + +**示例** + +``` sql +SELECT bitmapBuild([1, 2, 3, 4, 5]) AS res +``` + +## bitmapToArray {#bitmaptoarray} + +将位图转换为整数数组。 + + bitmapToArray(bitmap) + +**参数** + +- `bitmap` – 位图对象. + +**示例** + +``` sql +SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res +``` + + ┌─res─────────┐ + │ [1,2,3,4,5] │ + └─────────────┘ + +## bitmapSubsetInRange {#bitmapsubsetinrange} + +将位图指定范围(不包含range\_end)转换为另一个位图。 + + bitmapSubsetInRange(bitmap, range_start, range_end) + +**参数** + +- `bitmap` – 位图对象. +- `range_start` – 范围起始点(含). +- `range_end` – 范围结束点(不含). + +**示例** + +``` sql +SELECT bitmapToArray(bitmapSubsetInRange(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res +``` + + ┌─res───────────────┐ + │ [30,31,32,33,100] │ + └───────────────────┘ + +## bitmapSubsetLimit {#bitmapsubsetlimit} + +将位图指定范围(起始点和数目上限)转换为另一个位图。 + + bitmapSubsetLimit(bitmap, range_start, limit) + +**参数** + +- `bitmap` – 位图对象. +- `range_start` – 范围起始点(含). +- `limit` – 子位图基数上限. 
+ +**示例** + +``` sql +SELECT bitmapToArray(bitmapSubsetInRange(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res +``` + + ┌─res───────────────────────┐ + │ [30,31,32,33,100,200,500] │ + └───────────────────────────┘ + +## bitmapContains {#bitmapcontains} + +检查位图是否包含指定元素。 + + bitmapContains(haystack, needle) + +**参数** + +- `haystack` – 位图对象. +- `needle` – 元素,类型UInt32. + +**示例** + +``` sql +SELECT bitmapContains(bitmapBuild([1,5,7,9]), toUInt32(9)) AS res +``` + +``` text +┌─res─┐ +│ 1 │ +└─────┘ +``` + +## bitmapHasAny {#bitmaphasany} + +与`hasAny(array,array)`类似,如果位图有任何公共元素则返回1,否则返回0。 +对于空位图,返回0。 + + bitmapHasAny(bitmap,bitmap) + +**参数** + +- `bitmap` – bitmap对象。 + +**示例** + +``` sql +SELECT bitmapHasAny(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res +``` + + ┌─res─┐ + │ 1 │ + └─────┘ + +## bitmapHasAll {#bitmaphasall} + +与`hasAll(array,array)`类似,如果第一个位图包含第二个位图的所有元素,则返回1,否则返回0。 +如果第二个参数是空位图,则返回1。 + + bitmapHasAll(bitmap,bitmap) + +**参数** + +- `bitmap` – bitmap 对象。 + +**示例** + +``` sql +SELECT bitmapHasAll(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res +``` + + ┌─res─┐ + │ 0 │ + └─────┘ + +## bitmapAnd {#bitmapand} + +为两个位图对象进行与操作,返回一个新的位图对象。 + + bitmapAnd(bitmap1,bitmap2) + +**参数** + +- `bitmap1` – 位图对象。 +- `bitmap2` – 位图对象。 + +**示例** + +``` sql +SELECT bitmapToArray(bitmapAnd(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res +``` + + ┌─res─┐ + │ [3] │ + └─────┘ + +## bitmapOr {#bitmapor} + +为两个位图对象进行或操作,返回一个新的位图对象。 + + bitmapOr(bitmap1,bitmap2) + +**Parameters** + +- `bitmap1` – 位图对象。 +- `bitmap2` – 位图对象。 + +**示例** + +``` sql +SELECT bitmapToArray(bitmapOr(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res +``` + + ┌─res─────────┐ + │ [1,2,3,4,5] │ + └─────────────┘ + +## bitmapXor {#bitmapxor} + +为两个位图对象进行异或操作,返回一个新的位图对象。 + + bitmapXor(bitmap1,bitmap2) + +**参数** + +- `bitmap1` – 位图对象。 +- `bitmap2` – 位图对象。 + +**示例** + +``` sql +SELECT bitmapToArray(bitmapXor(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res +``` + + ┌─res───────┐ + │ [1,2,4,5] │ + └───────────┘ + +## bitmapAndnot {#bitmapandnot} + +计算两个位图的差异,返回一个新的位图对象。 + + bitmapAndnot(bitmap1,bitmap2) + +**参数** + +- `bitmap1` – 位图对象。 +- `bitmap2` – 位图对象。 + +**示例** + +``` sql +SELECT bitmapToArray(bitmapAndnot(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res +``` + + ┌─res───┐ + │ [1,2] │ + └───────┘ + +## bitmapCardinality {#bitmapcardinality} + +返回一个UInt64类型的数值,表示位图对象的基数。 + + bitmapCardinality(bitmap) + +**Parameters** + +- `bitmap` – 位图对象。 + +**示例** + +``` sql +SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res +``` + + ┌─res─┐ + │ 5 │ + └─────┘ + +## bitmapMin {#bitmapmin} + +返回一个UInt64类型的数值,表示位图中的最小值。如果位图为空则返回UINT32\_MAX。 + + bitmapMin(bitmap) + +**Parameters** + +- `bitmap` – 位图对象。 + +**示例** + +``` sql +SELECT bitmapMin(bitmapBuild([1, 2, 3, 4, 5])) AS res +``` + + ┌─res─┐ + │ 1 │ + └─────┘ + +## bitmapMax {#bitmapmax} + +返回一个UInt64类型的数值,表示位图中的最大值。如果位图为空则返回0。 + + bitmapMax(bitmap) + +**Parameters** + +- `bitmap` – 位图对象。 + +**示例** + +``` sql +SELECT bitmapMax(bitmapBuild([1, 2, 3, 4, 5])) AS res +``` + + ┌─res─┐ + │ 5 │ + └─────┘ + +## bitmapAndCardinality {#bitmapandcardinality} + +为两个位图对象进行与操作,返回结果位图的基数。 + + bitmapAndCardinality(bitmap1,bitmap2) + +**参数** + +- `bitmap1` – 位图对象。 +- `bitmap2` – 位图对象。 + +**示例** + +``` sql +SELECT bitmapAndCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; +``` + + ┌─res─┐ + │ 1 │ + └─────┘ + +## bitmapOrCardinality {#bitmaporcardinality} + +为两个位图进行或运算,返回结果位图的基数。 + 
+ bitmapOrCardinality(bitmap1,bitmap2) + +**参数** + +- `bitmap1` – 位图对象。 +- `bitmap2` – 位图对象。 + +**示例** + +``` sql +SELECT bitmapOrCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; +``` + + ┌─res─┐ + │ 5 │ + └─────┘ + +## bitmapXorCardinality {#bitmapxorcardinality} + +为两个位图进行异或运算,返回结果位图的基数。 + + bitmapXorCardinality(bitmap1,bitmap2) + +**参数** + +- `bitmap1` – 位图对象。 +- `bitmap2` – 位图对象。 + +**示例** + +``` sql +SELECT bitmapXorCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; +``` + + ┌─res─┐ + │ 4 │ + └─────┘ + +## bitmapAndnotCardinality {#bitmapandnotcardinality} + +计算两个位图的差异,返回结果位图的基数。 + + bitmapAndnotCardinality(bitmap1,bitmap2) + +**参数** + +- `bitmap1` – 位图对象。 +- `bitmap2` - 位图对象。 + +**示例** + +``` sql +SELECT bitmapAndnotCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; +``` + + ┌─res─┐ + │ 2 │ + └─────┘ + +[来源文章](https://clickhouse.tech/docs/en/query_language/functions/bitmap_functions/) diff --git a/docs/zh/query_language/functions/comparison_functions.md b/docs/zh/query_language/functions/comparison_functions.md index 68067a5586a..ce8c3728b5b 100644 --- a/docs/zh/query_language/functions/comparison_functions.md +++ b/docs/zh/query_language/functions/comparison_functions.md @@ -1,13 +1,13 @@ -# 比较函数 +# 比较函数 {#bi-jiao-han-shu} 比较函数始终返回0或1(UInt8)。 可以比较以下类型: - - Numbers - - String 和 FixedString - - Date - - DateTime +- Numbers +- String 和 FixedString +- Date +- DateTime 以上每个组内的类型均可互相比较,但是对于不同组的类型间不能够进行比较。 @@ -17,17 +17,16 @@ 注意。直到1.1.54134版本,有符号和无符号数字的比较方式与C++相同。换句话说,在SELECT 9223372036854775807 &gt; -1 等情况下,您可能会得到错误的结果。 此行为在版本1.1.54134中已更改,现在在数学上是正确的。 -## equals, a = b and a == b operator +## equals, a = b and a == b operator {#equals-a-b-and-a-b-operator} -## notEquals, a ! operator= b and a `<>` b +## notEquals, a ! operator= b and a `<>` b {#notequals-a-operator-b-and-a-b} -## less, `< operator` +## less, `< operator` {#less-operator} -## greater, `> operator` +## greater, `> operator` {#greater-operator} -## lessOrEquals, `<= operator` - -## greaterOrEquals, `>= operator` +## lessOrEquals, `<= operator` {#lessorequals-operator} +## greaterOrEquals, `>= operator` {#greaterorequals-operator} [来源文章](https://clickhouse.tech/docs/en/query_language/functions/comparison_functions/) diff --git a/docs/zh/query_language/functions/conditional_functions.md b/docs/zh/query_language/functions/conditional_functions.md index 89babefd338..175656b8374 100644 --- a/docs/zh/query_language/functions/conditional_functions.md +++ b/docs/zh/query_language/functions/conditional_functions.md @@ -1,50 +1,44 @@ -# 条件函数 +# 条件函数 {#tiao-jian-han-shu} -## if(cond, then, else), cond ? operator then : else +## if(cond, then, else), cond ? 
operator then : else {#ifcond-then-else-cond-operator-then-else} 如果`cond != 0`则返回`then`,如果`cond = 0`则返回`else`。 `cond`必须是`UInt8`类型,`then`和`else`必须存在最低的共同类型。 `then`和`else`可以是`NULL` -## multiIf +## multiIf {#multiif} 允许您在查询中更紧凑地编写[CASE](../operators.md#operator_case)运算符。 -``` -multiIf(cond_1, then_1, cond_2, then_2...else) -``` + multiIf(cond_1, then_1, cond_2, then_2...else) **参数:** -- `cond_N` — 函数返回`then_N`的条件。 -- `then_N` — 执行时函数的结果。 -- `else` — 如果没有满足任何条件,则为函数的结果。 +- `cond_N` — 函数返回`then_N`的条件。 +- `then_N` — 执行时函数的结果。 +- `else` — 如果没有满足任何条件,则为函数的结果。 该函数接受`2N + 1`参数。 **返回值** -该函数返回值“then_N”或“else”之一,具体取决于条件`cond_N`。 +该函数返回值«then\_N»或«else»之一,具体取决于条件`cond_N`。 **示例** 存在如下一张表 -``` -┌─x─┬────y─┐ -│ 1 │ ᴺᵁᴸᴸ │ -│ 2 │ 3 │ -└───┴──────┘ -``` + ┌─x─┬────y─┐ + │ 1 │ ᴺᵁᴸᴸ │ + │ 2 │ 3 │ + └───┴──────┘ 执行查询 `SELECT multiIf(isNull(y) x, y < 3, y, NULL) FROM t_null`。结果: -``` -┌─multiIf(isNull(y), x, less(y, 3), y, NULL)─┐ -│ 1 │ -│ ᴺᵁᴸᴸ │ -└────────────────────────────────────────────┘ -``` + ┌─multiIf(isNull(y), x, less(y, 3), y, NULL)─┐ + │ 1 │ + │ ᴺᵁᴸᴸ │ + └────────────────────────────────────────────┘ [来源文章](https://clickhouse.tech/docs/en/query_language/functions/conditional_functions/) diff --git a/docs/zh/query_language/functions/date_time_functions.md b/docs/zh/query_language/functions/date_time_functions.md index 91f81757830..fe9961d7658 100644 --- a/docs/zh/query_language/functions/date_time_functions.md +++ b/docs/zh/query_language/functions/date_time_functions.md @@ -1,4 +1,4 @@ -# 时间日期函数 +# 时间日期函数 {#shi-jian-ri-qi-han-shu} 支持时区。 @@ -12,199 +12,199 @@ SELECT toString(time, 'US/Samoa') AS time_samoa ``` -``` -┌────────────────time─┬─date_local─┬─date_yekat─┬─time_samoa──────────┐ -│ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-16 │ 2016-06-15 09:00:00 │ -└─────────────────────┴────────────┴────────────┴─────────────────────┘ -``` + ┌────────────────time─┬─date_local─┬─date_yekat─┬─time_samoa──────────┐ + │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-16 │ 2016-06-15 09:00:00 │ + └─────────────────────┴────────────┴────────────┴─────────────────────┘ 仅支持与UTC相差一整小时的时区。 -## toTimeZone +## toTimeZone {#totimezone} 将Date或DateTime转换为指定的时区。 -## toYear +## toYear {#toyear} 将Date或DateTime转换为包含年份编号(AD)的UInt16类型的数字。 -## toQuarter +## toQuarter {#toquarter} 将Date或DateTime转换为包含季度编号的UInt8类型的数字。 -## toMonth +## toMonth {#tomonth} 将Date或DateTime转换为包含月份编号(1-12)的UInt8类型的数字。 -## toDayOfYear +## toDayOfYear {#todayofyear} 将Date或DateTime转换为包含一年中的某一天的编号的UInt16(1-366)类型的数字。 -## toDayOfMonth +## toDayOfMonth {#todayofmonth} 将Date或DateTime转换为包含一月中的某一天的编号的UInt8(1-31)类型的数字。 -## toDayOfWeek +## toDayOfWeek {#todayofweek} 将Date或DateTime转换为包含一周中的某一天的编号的UInt8(周一是1, 周日是7)类型的数字。 -## toHour +## toHour {#tohour} 将DateTime转换为包含24小时制(0-23)小时数的UInt8数字。 这个函数假设如果时钟向前移动,它是一个小时,发生在凌晨2点,如果时钟被移回,它是一个小时,发生在凌晨3点(这并非总是如此 - 即使在莫斯科时钟在不同的时间两次改变)。 -## toMinute +## toMinute {#tominute} 将DateTime转换为包含一小时中分钟数(0-59)的UInt8数字。 -## toSecond +## toSecond {#tosecond} 将DateTime转换为包含一分钟中秒数(0-59)的UInt8数字。 闰秒不计算在内。 -## toUnixTimestamp +## toUnixTimestamp {#tounixtimestamp} 将DateTime转换为unix时间戳。 -## toStartOfYear +## toStartOfYear {#tostartofyear} 将Date或DateTime向前取整到本年的第一天。 返回Date类型。 -## toStartOfISOYear +## toStartOfISOYear {#tostartofisoyear} 将Date或DateTime向前取整到ISO本年的第一天。 返回Date类型。 -## toStartOfQuarter +## toStartOfQuarter {#tostartofquarter} 将Date或DateTime向前取整到本季度的第一天。 返回Date类型。 -## toStartOfMonth +## toStartOfMonth {#tostartofmonth} 将Date或DateTime向前取整到本月的第一天。 返回Date类型。 -!!! 注意 -    解析不正确日期的行为是特定于实现的。 ClickHouse可能会返回零日期,抛出异常或执行“natural”溢出。 +!!! 
注意 "注意" +     解析不正确日期的行为是特定于实现的。 ClickHouse可能会返回零日期,抛出异常或执行«natural»溢出。 -## toMonday +## toMonday {#tomonday} 将Date或DateTime向前取整到本周的星期一。 返回Date类型。 -## toStartOfDay +## toStartOfDay {#tostartofday} 将DateTime向前取整到当日的开始。 -## toStartOfHour +## toStartOfHour {#tostartofhour} 将DateTime向前取整到当前小时的开始。 -## toStartOfMinute +## toStartOfMinute {#tostartofminute} 将DateTime向前取整到当前分钟的开始。 -## toStartOfFiveMinute +## toStartOfFiveMinute {#tostartoffiveminute} 将DateTime以五分钟为单位向前取整到最接近的时间点。 -## toStartOfTenMinutes +## toStartOfTenMinutes {#tostartoftenminutes} + 将DateTime以十分钟为单位向前取整到最接近的时间点。 -## toStartOfFifteenMinutes +## toStartOfFifteenMinutes {#tostartoffifteenminutes} 将DateTime以十五分钟为单位向前取整到最接近的时间点。 -## toStartOfInterval(time_or_data, INTERVAL x unit [, time_zone]) +## toStartOfInterval(time\_or\_data, INTERVAL x unit \[, time\_zone\]) {#tostartofintervaltime-or-data-interval-x-unit-time-zone} + 这是名为`toStartOf*`的所有函数的通用函数。例如, `toStartOfInterval(t,INTERVAL 1 year)`返回与`toStartOfYear(t)`相同的结果, `toStartOfInterval(t,INTERVAL 1 month)`返回与`toStartOfMonth(t)`相同的结果, `toStartOfInterval(t,INTERVAL 1 day)`返回与`toStartOfDay(t)`相同的结果, `toStartOfInterval(t,INTERVAL 15 minute)`返回与`toStartOfFifteenMinutes(t)`相同的结果。 -## toTime +## toTime {#totime} 将DateTime中的日期转换为一个固定的日期,同时保留时间部分。 -## toRelativeYearNum +## toRelativeYearNum {#torelativeyearnum} 将Date或DateTime转换为年份的编号,从过去的某个固定时间点开始。 -## toRelativeQuarterNum +## toRelativeQuarterNum {#torelativequarternum} 将Date或DateTime转换为季度的数字,从过去的某个固定时间点开始。 -## toRelativeMonthNum +## toRelativeMonthNum {#torelativemonthnum} 将Date或DateTime转换为月份的编号,从过去的某个固定时间点开始。 -## toRelativeWeekNum +## toRelativeWeekNum {#torelativeweeknum} 将Date或DateTime转换为星期数,从过去的某个固定时间点开始。 -## toRelativeDayNum +## toRelativeDayNum {#torelativedaynum} 将Date或DateTime转换为当天的编号,从过去的某个固定时间点开始。 -## toRelativeHourNum +## toRelativeHourNum {#torelativehournum} 将DateTime转换为小时数,从过去的某个固定时间点开始。 -## toRelativeMinuteNum +## toRelativeMinuteNum {#torelativeminutenum} 将DateTime转换为分钟数,从过去的某个固定时间点开始。 -## toRelativeSecondNum +## toRelativeSecondNum {#torelativesecondnum} 将DateTime转换为秒数,从过去的某个固定时间点开始。 -## toISOYear +## toISOYear {#toisoyear} 将Date或DateTime转换为包含ISO年份的UInt16类型的编号。 -## toISOWeek +## toISOWeek {#toisoweek} 将Date或DateTime转换为包含ISO周数的UInt8类型的编号。 -## now +## now {#now} 不接受任何参数并在请求执行时的某一刻返回当前时间(DateTime)。 此函数返回一个常量,即时请求需要很长时间能够完成。 -## today +## today {#today} 不接受任何参数并在请求执行时的某一刻返回当前日期(Date)。 -其功能与'toDate(now())'相同。 +其功能与’toDate(now())’相同。 -## yesterday +## yesterday {#yesterday} 不接受任何参数并在请求执行时的某一刻返回昨天的日期(Date)。 -其功能与'today() - 1'相同。 +其功能与’today() - 1’相同。 -## timeSlot +## timeSlot {#timeslot} 将时间向前取整半小时。 此功能用于Yandex.Metrica,因为如果跟踪标记显示单个用户的连续综合浏览量在时间上严格超过此数量,则半小时是将会话分成两个会话的最短时间。这意味着(tag id,user id,time slot)可用于搜索相应会话中包含的综合浏览量。 -## toYYYYMM +## toYYYYMM {#toyyyymm} -将Date或DateTime转换为包含年份和月份编号的UInt32类型的数字(YYYY * 100 + MM)。 +将Date或DateTime转换为包含年份和月份编号的UInt32类型的数字(YYYY \* 100 + MM)。 -## toYYYYMMDD +## toYYYYMMDD {#toyyyymmdd} -将Date或DateTime转换为包含年份和月份编号的UInt32类型的数字(YYYY * 10000 + MM * 100 + DD)。 +将Date或DateTime转换为包含年份和月份编号的UInt32类型的数字(YYYY \* 10000 + MM \* 100 + DD)。 -## toYYYYMMDDhhmmss +## toYYYYMMDDhhmmss {#toyyyymmddhhmmss} -将Date或DateTime转换为包含年份和月份编号的UInt64类型的数字(YYYY * 10000000000 + MM * 100000000 + DD * 1000000 + hh * 10000 + mm * 100 + ss)。 +将Date或DateTime转换为包含年份和月份编号的UInt64类型的数字(YYYY \* 10000000000 + MM \* 100000000 + DD \* 1000000 + hh \* 10000 + mm \* 100 + ss)。 -## addYears, addMonths, addWeeks, addDays, addHours, addMinutes, addSeconds, addQuarters +## addYears, addMonths, addWeeks, addDays, addHours, addMinutes, addSeconds, 
addQuarters {#addyears-addmonths-addweeks-adddays-addhours-addminutes-addseconds-addquarters} 函数将一段时间间隔添加到Date/DateTime,然后返回Date/DateTime。例如: -```sql +``` sql WITH toDate('2018-01-01') AS date, toDateTime('2018-01-01 00:00:00') AS date_time @@ -213,17 +213,15 @@ SELECT addYears(date_time, 1) AS add_years_with_date_time ``` -``` -┌─add_years_with_date─┬─add_years_with_date_time─┐ -│ 2019-01-01 │ 2019-01-01 00:00:00 │ -└─────────────────────┴──────────────────────────┘ -``` + ┌─add_years_with_date─┬─add_years_with_date_time─┐ + │ 2019-01-01 │ 2019-01-01 00:00:00 │ + └─────────────────────┴──────────────────────────┘ -## subtractYears, subtractMonths, subtractWeeks, subtractDays, subtractHours, subtractMinutes, subtractSeconds, subtractQuarters +## subtractYears, subtractMonths, subtractWeeks, subtractDays, subtractHours, subtractMinutes, subtractSeconds, subtractQuarters {#subtractyears-subtractmonths-subtractweeks-subtractdays-subtracthours-subtractminutes-subtractseconds-subtractquarters} 函数将Date/DateTime减去一段时间间隔,然后返回Date/DateTime。例如: -```sql +``` sql WITH toDate('2019-01-01') AS date, toDateTime('2019-01-01 00:00:00') AS date_time @@ -232,65 +230,63 @@ SELECT subtractYears(date_time, 1) AS subtract_years_with_date_time ``` -``` -┌─subtract_years_with_date─┬─subtract_years_with_date_time─┐ -│ 2018-01-01 │ 2018-01-01 00:00:00 │ -└──────────────────────────┴───────────────────────────────┘ -``` + ┌─subtract_years_with_date─┬─subtract_years_with_date_time─┐ + │ 2018-01-01 │ 2018-01-01 00:00:00 │ + └──────────────────────────┴───────────────────────────────┘ -## dateDiff('unit', t1, t2, \[timezone\]) +## dateDiff(‘unit’, t1, t2, \[timezone\]) {#datediffunit-t1-t2-timezone} -返回以'unit'为单位表示的两个时间之间的差异,例如`'hours'`。 't1'和't2'可以是Date或DateTime,如果指定'timezone',它将应用于两个参数。如果不是,则使用来自数据类型't1'和't2'的时区。如果时区不相同,则结果将是未定义的。 +返回以’unit’为单位表示的两个时间之间的差异,例如`'hours'`。 ‘t1’和’t2’可以是Date或DateTime,如果指定’timezone’,它将应用于两个参数。如果不是,则使用来自数据类型’t1’和’t2’的时区。如果时区不相同,则结果将是未定义的。 Supported unit values: -| unit | -| ------ | -|second | -|minute | -|hour | -|day | -|week | -|month | -|quarter | -|year | +| unit | +|---------| +| second | +| minute | +| hour | +| day | +| week | +| month | +| quarter | +| year | -## timeSlots(StartTime, Duration,\[, Size\]) +## timeSlots(StartTime, Duration,\[, Size\]) {#timeslotsstarttime-duration-size} -它返回一个时间数组,其中包括从从“StartTime”开始到“StartTime + Duration 秒”内的所有符合“size”(以秒为单位)步长的时间点。其中“size”是一个可选参数,默认为1800。 +它返回一个时间数组,其中包括从从«StartTime»开始到«StartTime + Duration 秒»内的所有符合«size»(以秒为单位)步长的时间点。其中«size»是一个可选参数,默认为1800。 例如,`timeSlots(toDateTime('2012-01-01 12:20:00'),600) = [toDateTime('2012-01-01 12:00:00'),toDateTime('2012-01-01 12:30:00' )]`。 这对于搜索在相应会话中综合浏览量是非常有用的。 -## formatDateTime(Time, Format\[, Timezone\]) +## formatDateTime(Time, Format\[, Timezone\]) {#formatdatetimetime-format-timezone} 函数根据给定的格式字符串来格式化时间。请注意:格式字符串必须是常量表达式,例如:单个结果列不能有多种格式字符串。 支持的格式修饰符: -("Example" 列是对`2018-01-02 22:33:44`的格式化结果) +(«Example» 列是对`2018-01-02 22:33:44`的格式化结果) -| Modifier | Description | Example | -| ----------- | -------- | --------------- | -|%C|year divided by 100 and truncated to integer (00-99)|20 -|%d|day of the month, zero-padded (01-31)|02 -|%D|Short MM/DD/YY date, equivalent to %m/%d/%y|01/02/2018| -|%e|day of the month, space-padded ( 1-31)| 2| -|%F|short YYYY-MM-DD date, equivalent to %Y-%m-%d|2018-01-02 -|%H|hour in 24h format (00-23)|22| -|%I|hour in 12h format (01-12)|10| -|%j|day of the year (001-366)|002| -|%m|month as a decimal number (01-12)|01| -|%M|minute (00-59)|33| -|%n|new-line character ('\n')|| 
-|%p|AM or PM designation|PM| -|%R|24-hour HH:MM time, equivalent to %H:%M|22:33| -|%S|second (00-59)|44| -|%t|horizontal-tab character ('\t')|| -|%T|ISO 8601 time format (HH:MM:SS), equivalent to %H:%M:%S|22:33:44| -|%u|ISO 8601 weekday as number with Monday as 1 (1-7)|2| -|%V|ISO 8601 week number (01-53)|01| -|%w|weekday as a decimal number with Sunday as 0 (0-6)|2| -|%y|Year, last two digits (00-99)|18| -|%Y|Year|2018| -|%%|a % sign|%| +| Modifier | Description | Example | +|----------|---------------------------------------------------------|------------| +| %C | year divided by 100 and truncated to integer (00-99) | 20 | +| %d | day of the month, zero-padded (01-31) | 02 | +| %D | Short MM/DD/YY date, equivalent to %m/%d/%y | 01/02/2018 | +| %e | day of the month, space-padded ( 1-31) | 2 | +| %F | short YYYY-MM-DD date, equivalent to %Y-%m-%d | 2018-01-02 | +| %H | hour in 24h format (00-23) | 22 | +| %I | hour in 12h format (01-12) | 10 | +| %j | day of the year (001-366) | 002 | +| %m | month as a decimal number (01-12) | 01 | +| %M | minute (00-59) | 33 | +| %n | new-line character (‘’) | | +| %p | AM or PM designation | PM | +| %R | 24-hour HH:MM time, equivalent to %H:%M | 22:33 | +| %S | second (00-59) | 44 | +| %t | horizontal-tab character (’) | | +| %T | ISO 8601 time format (HH:MM:SS), equivalent to %H:%M:%S | 22:33:44 | +| %u | ISO 8601 weekday as number with Monday as 1 (1-7) | 2 | +| %V | ISO 8601 week number (01-53) | 01 | +| %w | weekday as a decimal number with Sunday as 0 (0-6) | 2 | +| %y | Year, last two digits (00-99) | 18 | +| %Y | Year | 2018 | +| %% | a % sign | % | [来源文章](https://clickhouse.tech/docs/en/query_language/functions/date_time_functions/) diff --git a/docs/zh/query_language/functions/encoding_functions.md b/docs/zh/query_language/functions/encoding_functions.md index 4cc6afd39c7..589edd75450 100644 --- a/docs/zh/query_language/functions/encoding_functions.md +++ b/docs/zh/query_language/functions/encoding_functions.md @@ -1,29 +1,28 @@ -# 编码函数 +# 编码函数 {#bian-ma-han-shu} -## hex +## hex {#hex} -接受`String`,`unsigned integer`,`Date`或`DateTime`类型的参数。返回包含参数的十六进制表示的字符串。使用大写字母`A-F`。不使用`0x`前缀或`h`后缀。对于字符串,所有字节都简单地编码为两个十六进制数字。数字转换为大端(“易阅读”)格式。对于数字,去除其中较旧的零,但仅限整个字节。例如,`hex(1)='01'`。 `Date`被编码为自Unix时间开始以来的天数。 `DateTime`编码为自Unix时间开始以来的秒数。 +接受`String`,`unsigned integer`,`Date`或`DateTime`类型的参数。返回包含参数的十六进制表示的字符串。使用大写字母`A-F`。不使用`0x`前缀或`h`后缀。对于字符串,所有字节都简单地编码为两个十六进制数字。数字转换为大端(«易阅读»)格式。对于数字,去除其中较旧的零,但仅限整个字节。例如,`hex(1)='01'`。 `Date`被编码为自Unix时间开始以来的天数。 `DateTime`编码为自Unix时间开始以来的秒数。 -## unhex(str) +## unhex(str) {#unhexstr} 接受包含任意数量的十六进制数字的字符串,并返回包含相应字节的字符串。支持大写和小写字母A-F。十六进制数字的数量不必是偶数。如果是奇数,则最后一位数被解释为00-0F字节的低位。如果参数字符串包含除十六进制数字以外的任何内容,则返回一些实现定义的结果(不抛出异常)。 -如果要将结果转换为数字,可以使用“reverse”和“reinterpretAsType”函数。 +如果要将结果转换为数字,可以使用«reverse»和«reinterpretAsType»函数。 -## UUIDStringToNum(str) +## UUIDStringToNum(str) {#uuidstringtonumstr} -接受包含36个字符的字符串,格式为“123e4567-e89b-12d3-a456-426655440000”,并将其转化为FixedString(16)返回。 +接受包含36个字符的字符串,格式为«123e4567-e89b-12d3-a456-426655440000»,并将其转化为FixedString(16)返回。 -## UUIDNumToString(str) +## UUIDNumToString(str) {#uuidnumtostringstr} 接受FixedString(16)值。返回包含36个字符的文本格式的字符串。 -## bitmaskToList(num) +## bitmaskToList(num) {#bitmasktolistnum} 接受一个整数。返回一个字符串,其中包含一组2的幂列表,其列表中的所有值相加等于这个整数。列表使用逗号分割,按升序排列。 -## bitmaskToArray(num) +## bitmaskToArray(num) {#bitmasktoarraynum} 接受一个整数。返回一个UInt64类型数组,其中包含一组2的幂列表,其列表中的所有值相加等于这个整数。数组中的数字按升序排列。 - [来源文章](https://clickhouse.tech/docs/en/query_language/functions/encoding_functions/) diff --git 
a/docs/zh/query_language/functions/ext_dict_functions.md b/docs/zh/query_language/functions/ext_dict_functions.md index fca1eeb6cc9..9541fa355d7 100644 --- a/docs/zh/query_language/functions/ext_dict_functions.md +++ b/docs/zh/query_language/functions/ext_dict_functions.md @@ -1,47 +1,46 @@ -# 字典函数 +# 字典函数 {#zi-dian-han-shu} 有关连接和配置外部词典的信息,请参阅[外部词典](../dicts/external_dicts.md)。 -## dictGetUInt8, dictGetUInt16, dictGetUInt32, dictGetUInt64 +## dictGetUInt8, dictGetUInt16, dictGetUInt32, dictGetUInt64 {#dictgetuint8-dictgetuint16-dictgetuint32-dictgetuint64} -## dictGetInt8, dictGetInt16, dictGetInt32, dictGetInt64 +## dictGetInt8, dictGetInt16, dictGetInt32, dictGetInt64 {#dictgetint8-dictgetint16-dictgetint32-dictgetint64} -## dictGetFloat32, dictGetFloat64 +## dictGetFloat32, dictGetFloat64 {#dictgetfloat32-dictgetfloat64} -## dictGetDate, dictGetDateTime +## dictGetDate, dictGetDateTime {#dictgetdate-dictgetdatetime} -## dictGetUUID +## dictGetUUID {#dictgetuuid} -## dictGetString +## dictGetString {#dictgetstring} `dictGetT('dict_name', 'attr_name', id)` -- 使用'id'键获取dict_name字典中attr_name属性的值。`dict_name`和`attr_name`是常量字符串。`id`必须是UInt64。 -如果字典中没有`id`键,则返回字典描述中指定的默认值。 +- 使用’id’键获取dict\_name字典中attr\_name属性的值。`dict_name`和`attr_name`是常量字符串。`id`必须是UInt64。 + 如果字典中没有`id`键,则返回字典描述中指定的默认值。 -## dictGetTOrDefault {#ext_dict_functions_dictGetTOrDefault} +## dictGetTOrDefault {#ext-dict-functions-dictgettordefault} `dictGetTOrDefault('dict_name', 'attr_name', id, default)` 与`dictGetT`函数相同,但默认值取自函数的最后一个参数。 -## dictIsIn +## dictIsIn {#dictisin} `dictIsIn ('dict_name', child_id, ancestor_id)` -- 对于'dict_name'分层字典,查找'child_id'键是否位于'ancestor_id'内(或匹配'ancestor_id')。返回UInt8。 +- 对于’dict\_name’分层字典,查找’child\_id’键是否位于’ancestor\_id’内(或匹配’ancestor\_id’)。返回UInt8。 -## dictGetHierarchy +## dictGetHierarchy {#dictgethierarchy} `dictGetHierarchy('dict_name', id)` -- 对于'dict_name'分层字典,返回从'id'开始并沿父元素链继续的字典键数组。返回Array(UInt64) +- 对于’dict\_name’分层字典,返回从’id’开始并沿父元素链继续的字典键数组。返回Array(UInt64) -## dictHas +## dictHas {#dicthas} `dictHas('dict_name', id)` -- 检查字典是否存在指定的`id`。如果不存在,则返回0;如果存在,则返回1。 - +- 检查字典是否存在指定的`id`。如果不存在,则返回0;如果存在,则返回1。 [来源文章](https://clickhouse.tech/docs/en/query_language/functions/ext_dict_functions/) diff --git a/docs/zh/query_language/functions/functions_for_nulls.md b/docs/zh/query_language/functions/functions_for_nulls.md index 71bd7a2fe88..d6db2906e92 100644 --- a/docs/zh/query_language/functions/functions_for_nulls.md +++ b/docs/zh/query_language/functions/functions_for_nulls.md @@ -1,182 +1,156 @@ -# Nullable处理函数 +# Nullable处理函数 {#nullablechu-li-han-shu} -## isNull +## isNull {#isnull} 检查参数是否为[NULL](../syntax.md#null)。 -``` -isNull(x) -``` + isNull(x) **参数** -- `x` — 一个非复合数据类型的值。 +- `x` — 一个非复合数据类型的值。 **返回值** -- `1` 如果`x`为`NULL`。 -- `0` 如果`x`不为`NULL`。 +- `1` 如果`x`为`NULL`。 +- `0` 如果`x`不为`NULL`。 **示例** 存在以下内容的表 -``` -┌─x─┬────y─┐ -│ 1 │ ᴺᵁᴸᴸ │ -│ 2 │ 3 │ -└───┴──────┘ -``` + ┌─x─┬────y─┐ + │ 1 │ ᴺᵁᴸᴸ │ + │ 2 │ 3 │ + └───┴──────┘ 对其进行查询 -``` -:) SELECT x FROM t_null WHERE isNull(y) + :) SELECT x FROM t_null WHERE isNull(y) -SELECT x -FROM t_null -WHERE isNull(y) + SELECT x + FROM t_null + WHERE isNull(y) -┌─x─┐ -│ 1 │ -└───┘ + ┌─x─┐ + │ 1 │ + └───┘ -1 rows in set. Elapsed: 0.010 sec. -``` + 1 rows in set. Elapsed: 0.010 sec. -## isNotNull +## isNotNull {#isnotnull} 检查参数是否不为 [NULL](../syntax.md#null). 
-``` -isNotNull(x) -``` + isNotNull(x) **参数:** -- `x` — 一个非复合数据类型的值。 +- `x` — 一个非复合数据类型的值。 **返回值** -- `0` 如果`x`为`NULL`。 -- `1` 如果`x`不为`NULL`。 +- `0` 如果`x`为`NULL`。 +- `1` 如果`x`不为`NULL`。 **示例** 存在以下内容的表 -``` -┌─x─┬────y─┐ -│ 1 │ ᴺᵁᴸᴸ │ -│ 2 │ 3 │ -└───┴──────┘ -``` + ┌─x─┬────y─┐ + │ 1 │ ᴺᵁᴸᴸ │ + │ 2 │ 3 │ + └───┴──────┘ 对其进行查询 -``` -:) SELECT x FROM t_null WHERE isNotNull(y) + :) SELECT x FROM t_null WHERE isNotNull(y) -SELECT x -FROM t_null -WHERE isNotNull(y) + SELECT x + FROM t_null + WHERE isNotNull(y) -┌─x─┐ -│ 2 │ -└───┘ + ┌─x─┐ + │ 2 │ + └───┘ -1 rows in set. Elapsed: 0.010 sec. -``` + 1 rows in set. Elapsed: 0.010 sec. -## coalesce +## coalesce {#coalesce} -检查从左到右是否传递了“NULL”参数并返回第一个非`'NULL`参数。 +检查从左到右是否传递了«NULL»参数并返回第一个非`'NULL`参数。 -``` -coalesce(x,...) -``` + coalesce(x,...) **参数:** -- 任何数量的非复合类型的参数。所有参数必须与数据类型兼容。 +- 任何数量的非复合类型的参数。所有参数必须与数据类型兼容。 **返回值** -- 第一个非'NULL`参数。 -- `NULL`,如果所有参数都是'NULL`。 +- 第一个非’NULL\`参数。 +- `NULL`,如果所有参数都是’NULL\`。 **示例** 考虑可以指定多种联系客户的方式的联系人列表。 -``` -┌─name─────┬─mail─┬─phone─────┬──icq─┐ -│ client 1 │ ᴺᵁᴸᴸ │ 123-45-67 │ 123 │ -│ client 2 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -└──────────┴──────┴───────────┴──────┘ -``` + ┌─name─────┬─mail─┬─phone─────┬──icq─┐ + │ client 1 │ ᴺᵁᴸᴸ │ 123-45-67 │ 123 │ + │ client 2 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ + └──────────┴──────┴───────────┴──────┘ `mail`和`phone`字段是String类型,但`icq`字段是`UInt32`,所以它需要转换为`String`。 从联系人列表中获取客户的第一个可用联系方式: -``` -:) SELECT coalesce(mail, phone, CAST(icq,'Nullable(String)')) FROM aBook + :) SELECT coalesce(mail, phone, CAST(icq,'Nullable(String)')) FROM aBook -SELECT coalesce(mail, phone, CAST(icq, 'Nullable(String)')) -FROM aBook + SELECT coalesce(mail, phone, CAST(icq, 'Nullable(String)')) + FROM aBook -┌─name─────┬─coalesce(mail, phone, CAST(icq, 'Nullable(String)'))─┐ -│ client 1 │ 123-45-67 │ -│ client 2 │ ᴺᵁᴸᴸ │ -└──────────┴──────────────────────────────────────────────────────┘ + ┌─name─────┬─coalesce(mail, phone, CAST(icq, 'Nullable(String)'))─┐ + │ client 1 │ 123-45-67 │ + │ client 2 │ ᴺᵁᴸᴸ │ + └──────────┴──────────────────────────────────────────────────────┘ -2 rows in set. Elapsed: 0.006 sec. -``` + 2 rows in set. Elapsed: 0.006 sec. -## ifNull +## ifNull {#ifnull} -如果第一个参数为“NULL”,则返回第二个参数的值。 +如果第一个参数为«NULL»,则返回第二个参数的值。 -``` -ifNull(x,alt) -``` + ifNull(x,alt) **参数:** -- `x` — 要检查“NULL”的值。 -- `alt` — 如果`x`为'NULL`,函数返回的值。 +- `x` — 要检查«NULL»的值。 +- `alt` — 如果`x`为’NULL\`,函数返回的值。 **返回值** -- The value `x`, if `x` is not `NULL`. -- The value `alt`, if `x` is `NULL`. +- The value `x`, if `x` is not `NULL`. +- The value `alt`, if `x` is `NULL`. 
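除了下面使用常量参数的示例外,`ifNull`也常用于为Nullable列补默认值。下面是一个简单示意,假设使用前文`isNull`示例中的`t_null`表(`x Int8, y Nullable(Int8)`,其中`y`分别为`NULL`和`3`):

``` sql
-- 对 y 为 NULL 的行返回 -1,其余行返回 y 本身
-- 预期结果:(1, -1) 和 (2, 3)
SELECT x, ifNull(y, -1) AS y_or_default
FROM t_null
```
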
**示例** -``` -SELECT ifNull('a', 'b') + SELECT ifNull('a', 'b') -┌─ifNull('a', 'b')─┐ -│ a │ -└──────────────────┘ -``` + ┌─ifNull('a', 'b')─┐ + │ a │ + └──────────────────┘ -``` -SELECT ifNull(NULL, 'b') + SELECT ifNull(NULL, 'b') -┌─ifNull(NULL, 'b')─┐ -│ b │ -└───────────────────┘ -``` + ┌─ifNull(NULL, 'b')─┐ + │ b │ + └───────────────────┘ -## nullIf +## nullIf {#nullif} 如果参数相等,则返回`NULL`。 -``` -nullIf(x, y) -``` + nullIf(x, y) **参数:** @@ -184,114 +158,95 @@ nullIf(x, y) **返回值** -- 如果参数相等,则为`NULL`。 -- 如果参数不相等,则为`x`值。 +- 如果参数相等,则为`NULL`。 +- 如果参数不相等,则为`x`值。 **示例** -``` -SELECT nullIf(1, 1) + SELECT nullIf(1, 1) -┌─nullIf(1, 1)─┐ -│ ᴺᵁᴸᴸ │ -└──────────────┘ -``` + ┌─nullIf(1, 1)─┐ + │ ᴺᵁᴸᴸ │ + └──────────────┘ -``` -SELECT nullIf(1, 2) + SELECT nullIf(1, 2) -┌─nullIf(1, 2)─┐ -│ 1 │ -└──────────────┘ -``` + ┌─nullIf(1, 2)─┐ + │ 1 │ + └──────────────┘ -## assumeNotNull +## assumeNotNull {#assumenotnull} 将[Nullable](../../data_types/nullable.md)类型的值转换为非`Nullable`类型的值。 -``` -assumeNotNull(x) -``` + assumeNotNull(x) **参数:** -- `x` — 原始值。 +- `x` — 原始值。 **返回值** -- 如果`x`不为`NULL`,返回非`Nullable`类型的原始值。 -- 如果`x`为`NULL`,返回对应非`Nullable`类型的默认值。 +- 如果`x`不为`NULL`,返回非`Nullable`类型的原始值。 +- 如果`x`为`NULL`,返回对应非`Nullable`类型的默认值。 **示例** 存在如下`t_null`表。 -``` -SHOW CREATE TABLE t_null + SHOW CREATE TABLE t_null -┌─statement─────────────────────────────────────────────────────────────────┐ -│ CREATE TABLE default.t_null ( x Int8, y Nullable(Int8)) ENGINE = TinyLog │ -└───────────────────────────────────────────────────────────────────────────┘ -``` + ┌─statement─────────────────────────────────────────────────────────────────┐ + │ CREATE TABLE default.t_null ( x Int8, y Nullable(Int8)) ENGINE = TinyLog │ + └───────────────────────────────────────────────────────────────────────────┘ -``` -┌─x─┬────y─┐ -│ 1 │ ᴺᵁᴸᴸ │ -│ 2 │ 3 │ -└───┴──────┘ -``` + ┌─x─┬────y─┐ + │ 1 │ ᴺᵁᴸᴸ │ + │ 2 │ 3 │ + └───┴──────┘ 将列`y`作为`assumeNotNull`函数的参数。 -``` -SELECT assumeNotNull(y) FROM t_null + SELECT assumeNotNull(y) FROM t_null -┌─assumeNotNull(y)─┐ -│ 0 │ -│ 3 │ -└──────────────────┘ -``` + ┌─assumeNotNull(y)─┐ + │ 0 │ + │ 3 │ + └──────────────────┘ -``` -SELECT toTypeName(assumeNotNull(y)) FROM t_null + SELECT toTypeName(assumeNotNull(y)) FROM t_null -┌─toTypeName(assumeNotNull(y))─┐ -│ Int8 │ -│ Int8 │ -└──────────────────────────────┘ -``` + ┌─toTypeName(assumeNotNull(y))─┐ + │ Int8 │ + │ Int8 │ + └──────────────────────────────┘ -## toNullable +## toNullable {#tonullable} 将参数的类型转换为`Nullable`。 -``` -toNullable(x) -``` + toNullable(x) **参数:** -- `x` — 任何非复合类型的值。 +- `x` — 任何非复合类型的值。 **返回值** -- 输入的值,但其类型为`Nullable`。 +- 输入的值,但其类型为`Nullable`。 **示例** -``` -SELECT toTypeName(10) + SELECT toTypeName(10) -┌─toTypeName(10)─┐ -│ UInt8 │ -└────────────────┘ + ┌─toTypeName(10)─┐ + │ UInt8 │ + └────────────────┘ -SELECT toTypeName(toNullable(10)) - -┌─toTypeName(toNullable(10))─┐ -│ Nullable(UInt8) │ -└────────────────────────────┘ -``` + SELECT toTypeName(toNullable(10)) + ┌─toTypeName(toNullable(10))─┐ + │ Nullable(UInt8) │ + └────────────────────────────┘ [来源文章](https://clickhouse.tech/docs/en/query_language/functions/functions_for_nulls/) diff --git a/docs/zh/query_language/functions/geo.md b/docs/zh/query_language/functions/geo.md index a9d30b91596..3e6e6aa6b64 100644 --- a/docs/zh/query_language/functions/geo.md +++ b/docs/zh/query_language/functions/geo.md @@ -1,19 +1,19 @@ -# GEO函数 +# GEO函数 {#geohan-shu} -## greatCircleDistance +## greatCircleDistance {#greatcircledistance} 使用[great-circle 
distance公式](https://en.wikipedia.org/wiki/Great-circle_distance)计算地球表面两点之间的距离。 -```sql +``` sql greatCircleDistance(lon1Deg, lat1Deg, lon2Deg, lat2Deg) ``` **输入参数** -- `lon1Deg` — 第一个点的经度,单位:度,范围: `[-180°, 180°]`。 -- `lat1Deg` — 第一个点的纬度,单位:度,范围: `[-90°, 90°]`。 -- `lon2Deg` — 第二个点的经度,单位:度,范围: `[-180°, 180°]`。 -- `lat2Deg` — 第二个点的纬度,单位:度,范围: `[-90°, 90°]`。 +- `lon1Deg` — 第一个点的经度,单位:度,范围: `[-180°, 180°]`。 +- `lat1Deg` — 第一个点的纬度,单位:度,范围: `[-90°, 90°]`。 +- `lon2Deg` — 第二个点的经度,单位:度,范围: `[-180°, 180°]`。 +- `lat2Deg` — 第二个点的纬度,单位:度,范围: `[-90°, 90°]`。 正值对应北纬和东经,负值对应南纬和西经。 @@ -25,30 +25,30 @@ greatCircleDistance(lon1Deg, lat1Deg, lon2Deg, lat2Deg) **示例** -```sql +``` sql SELECT greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673) ``` -```text +``` text ┌─greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673)─┐ │ 14132374.194975413 │ └───────────────────────────────────────────────────────────────────┘ ``` -## pointInEllipses +## pointInEllipses {#pointinellipses} 检查指定的点是否至少包含在指定的一个椭圆中。 下述中的坐标是几何图形在笛卡尔坐标系中的位置。 -```sql +``` sql pointInEllipses(x, y, x₀, y₀, a₀, b₀,...,xₙ, yₙ, aₙ, bₙ) ``` **输入参数** -- `x, y` — 平面上某个点的坐标。 -- `xᵢ, yᵢ` — 第i个椭圆的中心坐标。 -- `aᵢ, bᵢ` — 以x, y坐标为单位的第i个椭圆的轴。 +- `x, y` — 平面上某个点的坐标。 +- `xᵢ, yᵢ` — 第i个椭圆的中心坐标。 +- `aᵢ, bᵢ` — 以x, y坐标为单位的第i个椭圆的轴。 输入参数的个数必须是`2+4⋅n`,其中`n`是椭圆的数量。 @@ -58,29 +58,29 @@ pointInEllipses(x, y, x₀, y₀, a₀, b₀,...,xₙ, yₙ, aₙ, bₙ) **示例** -```sql +``` sql SELECT pointInEllipses(55.755831, 37.617673, 55.755831, 37.617673, 1.0, 2.0) ``` -```text +``` text ┌─pointInEllipses(55.755831, 37.617673, 55.755831, 37.617673, 1., 2.)─┐ │ 1 │ └─────────────────────────────────────────────────────────────────────┘ ``` -## pointInPolygon +## pointInPolygon {#pointinpolygon} 检查指定的点是否包含在指定的多边形中。 -```sql +``` sql pointInPolygon((x, y), [(a, b), (c, d) ...], ...) ``` **输入参数** -- `(x, y)` — 平面上某个点的坐标。[Tuple](../../data_types/tuple.md)类型,包含坐标的两个数字。 -- `[(a, b), (c, d) ...]` — 多边形的顶点。[Array](../../data_types/array.md)类型。每个顶点由一对坐标`(a, b)`表示。顶点可以按顺时针或逆时针指定。顶点的个数应该大于等于3。同时只能是常量的。 -- 该函数还支持镂空的多边形(切除部分)。如果需要,可以使用函数的其他参数定义需要切除部分的多边形。(The function does not support non-simply-connected polygons.) +- `(x, y)` — 平面上某个点的坐标。[Tuple](../../data_types/tuple.md)类型,包含坐标的两个数字。 +- `[(a, b), (c, d) ...]` — 多边形的顶点。[Array](../../data_types/array.md)类型。每个顶点由一对坐标`(a, b)`表示。顶点可以按顺时针或逆时针指定。顶点的个数应该大于等于3。同时只能是常量的。 +- 该函数还支持镂空的多边形(切除部分)。如果需要,可以使用函数的其他参数定义需要切除部分的多边形。(The function does not support non-simply-connected polygons.) **返回值** @@ -89,129 +89,131 @@ pointInPolygon((x, y), [(a, b), (c, d) ...], ...) 
**示例** -```sql +``` sql SELECT pointInPolygon((3., 3.), [(6, 0), (8, 4), (5, 8), (0, 2)]) AS res ``` -```text +``` text ┌─res─┐ │ 1 │ └─────┘ ``` -## geohashEncode +## geohashEncode {#geohashencode} 将经度和纬度编码为geohash-string,请参阅(http://geohash.org/,https://en.wikipedia.org/wiki/Geohash)。 -```sql + +``` sql geohashEncode(longitude, latitude, [precision]) ``` **输入值** -- longitude - 要编码的坐标的经度部分。其值应在`[-180°,180°]`范围内 -- latitude - 要编码的坐标的纬度部分。其值应在`[-90°,90°]`范围内 -- precision - 可选,生成的geohash-string的长度,默认为`12`。取值范围为`[1,12]`。任何小于`1`或大于`12`的值都会默认转换为`12`。 +- longitude - 要编码的坐标的经度部分。其值应在`[-180°,180°]`范围内 +- latitude - 要编码的坐标的纬度部分。其值应在`[-90°,90°]`范围内 +- precision - 可选,生成的geohash-string的长度,默认为`12`。取值范围为`[1,12]`。任何小于`1`或大于`12`的值都会默认转换为`12`。 **返回值** -- 坐标编码的字符串(使用base32编码的修改版本)。 +- 坐标编码的字符串(使用base32编码的修改版本)。 **示例** -```sql +``` sql SELECT geohashEncode(-5.60302734375, 42.593994140625, 0) AS res ``` -```text +``` text ┌─res──────────┐ │ ezs42d000000 │ └──────────────┘ ``` -## geohashDecode +## geohashDecode {#geohashdecode} 将任何geohash编码的字符串解码为经度和纬度。 **输入值** -- encoded string - geohash编码的字符串。 +- encoded string - geohash编码的字符串。 **返回值** -- (longitude, latitude) - 经度和纬度的`Float64`值的2元组。 +- (longitude, latitude) - 经度和纬度的`Float64`值的2元组。 **示例** -```sql +``` sql SELECT geohashDecode('ezs42') AS res ``` -```text +``` text ┌─res─────────────────────────────┐ │ (-5.60302734375,42.60498046875) │ └─────────────────────────────────┘ ``` - -## geoToH3 +## geoToH3 {#geotoh3} 计算指定的分辨率的[H3](https://uber.github.io/h3/#/documentation/overview/introduction)索引`(lon, lat)`。 -```sql +``` sql geoToH3(lon, lat, resolution) ``` **输入值** -- `lon` — 经度。 [Float64](../../data_types/float.md)类型。 -- `lat` — 纬度。 [Float64](../../data_types/float.md)类型。 -- `resolution` — 索引的分辨率。 取值范围为: `[0, 15]`。 [UInt8](../../data_types/int_uint.md)类型。 +- `lon` — 经度。 [Float64](../../data_types/float.md)类型。 +- `lat` — 纬度。 [Float64](../../data_types/float.md)类型。 +- `resolution` — 索引的分辨率。 取值范围为: `[0, 15]`。 [UInt8](../../data_types/int_uint.md)类型。 **返回值** -- H3中六边形的索引值。 -- 发生异常时返回0。 +- H3中六边形的索引值。 +- 发生异常时返回0。 [UInt64](../../data_types/int_uint.md)类型。 **示例** -```sql +``` sql SELECT geoToH3(37.79506683, 55.71290588, 15) as h3Index ``` -```text + +``` text ┌────────────h3Index─┐ │ 644325524701193974 │ └────────────────────┘ ``` -## geohashesInBox +## geohashesInBox {#geohashesinbox} 计算在指定精度下计算最小包含指定的经纬范围的最小图形的geohash数组。 **输入值** -- longitude_min - 最小经度。其值应在`[-180°,180°]`范围内 -- latitude_min - 最小纬度。其值应在`[-90°,90°]`范围内 -- longitude_max - 最大经度。其值应在`[-180°,180°]`范围内 -- latitude_max - 最大纬度。其值应在`[-90°,90°]`范围内 -- precision - geohash的精度。其值应在`[1, 12]`内的`UInt8`类型的数字 +- longitude\_min - 最小经度。其值应在`[-180°,180°]`范围内 +- latitude\_min - 最小纬度。其值应在`[-90°,90°]`范围内 +- longitude\_max - 最大经度。其值应在`[-180°,180°]`范围内 +- latitude\_max - 最大纬度。其值应在`[-90°,90°]`范围内 +- precision - geohash的精度。其值应在`[1, 12]`内的`UInt8`类型的数字 请注意,上述所有的坐标参数必须同为`Float32`或`Float64`中的一种类型。 **返回值** -- 包含指定范围内的指定精度的geohash字符串数组。注意,您不应该依赖返回数组中geohash的顺序。 -- [] - 当传入的最小经纬度大于最大经纬度时将返回一个空数组。 +- 包含指定范围内的指定精度的geohash字符串数组。注意,您不应该依赖返回数组中geohash的顺序。 +- \[\] - 当传入的最小经纬度大于最大经纬度时将返回一个空数组。 请注意,如果生成的数组长度超过10000时,则函数将抛出异常。 **示例** -```sql +``` sql SELECT geohashesInBox(24.48, 40.56, 24.785, 40.81, 4) AS thasos ``` -```text + +``` text ┌─thasos──────────────────────────────────────┐ │ ['sx1q','sx1r','sx32','sx1w','sx1x','sx38'] │ └─────────────────────────────────────────────┘ diff --git a/docs/zh/query_language/functions/hash_functions.md b/docs/zh/query_language/functions/hash_functions.md index 98ed40d51dd..90ad491a9ff 100644 --- 
a/docs/zh/query_language/functions/hash_functions.md +++ b/docs/zh/query_language/functions/hash_functions.md @@ -1,33 +1,33 @@ -# Hash函数 +# Hash函数 {#hashhan-shu} Hash函数可以用于将元素不可逆的伪随机打乱。 -## halfMD5 +## halfMD5 {#halfmd5} 计算字符串的MD5。然后获取结果的前8个字节并将它们作为UInt64(大端)返回。 此函数相当低效(500万个短字符串/秒/核心)。 如果您不需要一定使用MD5,请使用‘sipHash64’函数。 -## MD5 +## MD5 {#md5} 计算字符串的MD5并将结果放入FixedString(16)中返回。 如果您只是需要一个128位的hash,同时不需要一定使用MD5,请使用‘sipHash128’函数。 如果您要获得与md5sum程序相同的输出结果,请使用lower(hex(MD5(s)))。 -## sipHash64 +## sipHash64 {#siphash64} 计算字符串的SipHash。 接受String类型的参数,返回UInt64。 SipHash是一种加密哈希函数。它的处理性能至少比MD5快三倍。 -有关详细信息,请参阅链接: +有关详细信息,请参阅链接:https://131002.net/siphash/ -## sipHash128 {#hash_functions-siphash128} +## sipHash128 {#hash-functions-siphash128} 计算字符串的SipHash。 接受String类型的参数,返回FixedString(16)。 与sipHash64函数的不同在于它的最终计算结果为128位。 -## cityHash64 +## cityHash64 {#cityhash64} 计算任意数量字符串的CityHash64或使用特定实现的Hash函数计算任意数量其他类型的Hash。 对于字符串,使用CityHash算法。 这是一个快速的非加密哈希函数,用于字符串。 @@ -35,77 +35,77 @@ SipHash是一种加密哈希函数。它的处理性能至少比MD5快三倍。 如果传递了多个参数,则使用CityHash组合这些参数的Hash结果。 例如,您可以计算整个表的checksum,其结果取决于行的顺序:`SELECT sum(cityHash64(*)) FROM table`。 -## intHash32 +## intHash32 {#inthash32} 为任何类型的整数计算32位的哈希。 这是相对高效的非加密Hash函数。 -## intHash64 +## intHash64 {#inthash64} 从任何类型的整数计算64位哈希码。 它的工作速度比intHash32函数快。 -## SHA1 +## SHA1 {#sha1} -## SHA224 +## SHA224 {#sha224} -## SHA256 +## SHA256 {#sha256} 计算字符串的SHA-1,SHA-224或SHA-256,并将结果字节集返回为FixedString(20),FixedString(28)或FixedString(32)。 该函数相当低效(SHA-1大约500万个短字符串/秒/核心,而SHA-224和SHA-256大约220万个短字符串/秒/核心)。 我们建议仅在必须使用这些Hash函数且无法更改的情况下使用这些函数。 即使在这些情况下,我们仍建议将函数采用在写入数据时使用预计算的方式将其计算完毕。而不是在SELECT中计算它们。 -## URLHash(url\[, N\]) +## URLHash(url\[, N\]) {#urlhashurl-n} 一种快速的非加密哈希函数,用于规范化的从URL获得的字符串。 `URLHash(s)` - 从一个字符串计算一个哈希,如果结尾存在尾随符号`/`,`?`或`#`则忽略。 `URLHash(s,N)` - 计算URL层次结构中字符串到N级别的哈希值,如果末尾存在尾随符号`/`,`?`或`#`则忽略。 URL的层级与URLHierarchy中的层级相同。 此函数被用于Yandex.Metrica。 -## farmHash64 +## farmHash64 {#farmhash64} 计算字符串的FarmHash64。 接受一个String类型的参数。返回UInt64。 有关详细信息,请参阅链接:[FarmHash64](https://github.com/google/farmhash) -## javaHash {#hash_functions-javahash} +## javaHash {#hash-functions-javahash} 计算字符串的JavaHash。 接受一个String类型的参数。返回Int32。 有关更多信息,请参阅链接:[JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) -## hiveHash +## hiveHash {#hivehash} 计算字符串的HiveHash。 接受一个String类型的参数。返回Int32。 与[JavaHash](#hash_functions-javahash)相同,但不会返回负数。 -## metroHash64 +## metroHash64 {#metrohash64} 计算字符串的MetroHash。 接受一个String类型的参数。返回UInt64。 有关详细信息,请参阅链接:[MetroHash64](http://www.jandrewrogers.com/2015/05/27/metrohash/) -## jumpConsistentHash +## jumpConsistentHash {#jumpconsistenthash} 计算UInt64的JumpConsistentHash。 接受UInt64类型的参数。返回Int32。 有关更多信息,请参见链接:[JumpConsistentHash](https://arxiv.org/pdf/1406.2294.pdf) -## murmurHash2_32, murmurHash2_64 +## murmurHash2\_32, murmurHash2\_64 {#murmurhash2-32-murmurhash2-64} 计算字符串的MurmurHash2。 接受一个String类型的参数。返回UInt64或UInt32。 有关更多信息,请参阅链接:[MurmurHash2](https://github.com/aappleby/smhasher) -## murmurHash3_32, murmurHash3_64, murmurHash3_128 +## murmurHash3\_32, murmurHash3\_64, murmurHash3\_128 {#murmurhash3-32-murmurhash3-64-murmurhash3-128} 计算字符串的MurmurHash3。 接受一个String类型的参数。返回UInt64或UInt32或FixedString(16)。 有关更多信息,请参阅链接:[MurmurHash3](https://github.com/aappleby/smhasher) -## xxHash32, xxHash64 +## xxHash32, xxHash64 {#xxhash32-xxhash64} 计算字符串的xxHash。 接受一个String类型的参数。返回UInt64或UInt32。 diff --git a/docs/zh/query_language/functions/higher_order_functions.md b/docs/zh/query_language/functions/higher_order_functions.md index fff702ab0ec..6199320e547 100644 
--- a/docs/zh/query_language/functions/higher_order_functions.md +++ b/docs/zh/query_language/functions/higher_order_functions.md @@ -1,6 +1,6 @@ -# 高阶函数 +# 高阶函数 {#gao-jie-han-shu} -## `->` 运算符, lambda(params, expr) 函数 +## `->` 运算符, lambda(params, expr) 函数 {#yun-suan-fu-lambdaparams-expr-han-shu} 用于描述一个lambda函数用来传递给其他高阶函数。箭头的左侧有一个形式参数,它可以是一个标识符或多个标识符所组成的元祖。箭头的右侧是一个表达式,在这个表达式中可以使用形式参数列表中的任何一个标识符或表的任何一个列名。 @@ -10,17 +10,17 @@ 高阶函数可以接受多个参数的lambda函数作为其参数,在这种情况下,高阶函数需要同时传递几个长度相等的数组,这些数组将被传递给lambda参数。 -除了'arrayMap'和'arrayFilter'以外的所有其他函数,都可以省略第一个参数(lambda函数)。在这种情况下,默认返回数组元素本身。 +除了’arrayMap’和’arrayFilter’以外的所有其他函数,都可以省略第一个参数(lambda函数)。在这种情况下,默认返回数组元素本身。 -### arrayMap(func, arr1, ...) {#higher_order_functions-array-map} +### arrayMap(func, arr1, …) {#higher-order-functions-array-map} 将arr -将从'func'函数的原始应用程序获得的数组返回到'arr'数组中的每个元素。 -Returns an array obtained from the original application of the 'func' function to each element in the 'arr' array. +将从’func’函数的原始应用程序获得的数组返回到’arr’数组中的每个元素。 +Returns an array obtained from the original application of the ‘func’ function to each element in the ‘arr’ array. -### arrayFilter(func, arr1, ...) +### arrayFilter(func, arr1, …) {#arrayfilterfunc-arr1} -Returns an array containing only the elements in 'arr1' for which 'func' returns something other than 0. +Returns an array containing only the elements in ‘arr1’ for which ‘func’ returns something other than 0. 示例: @@ -28,11 +28,9 @@ Returns an array containing only the elements in 'arr1' for which 'func' returns SELECT arrayFilter(x -> x LIKE '%World%', ['Hello', 'abc World']) AS res ``` -``` -┌─res───────────┐ -│ ['abc World'] │ -└───────────────┘ -``` + ┌─res───────────┐ + │ ['abc World'] │ + └───────────────┘ ``` sql SELECT @@ -43,37 +41,35 @@ SELECT AS res ``` -``` -┌─res─┐ -│ [2] │ -└─────┘ -``` + ┌─res─┐ + │ [2] │ + └─────┘ -### arrayCount(\[func,\] arr1, ...) +### arrayCount(\[func,\] arr1, …) {#arraycountfunc-arr1} 返回数组arr中非零元素的数量,如果指定了‘func’,则通过‘func’的返回值确定元素是否为非零元素。 -### arrayExists(\[func,\] arr1, ...) +### arrayExists(\[func,\] arr1, …) {#arrayexistsfunc-arr1} 返回数组‘arr’中是否存在非零元素,如果指定了‘func’,则使用‘func’的返回值确定元素是否为非零元素。 -### arrayAll(\[func,\] arr1, ...) +### arrayAll(\[func,\] arr1, …) {#arrayallfunc-arr1} 返回数组‘arr’中是否存在为零的元素,如果指定了‘func’,则使用‘func’的返回值确定元素是否为零元素。 -### arraySum(\[func,\] arr1, ...) +### arraySum(\[func,\] arr1, …) {#arraysumfunc-arr1} 计算arr数组的总和,如果指定了‘func’,则通过‘func’的返回值计算数组的总和。 -### arrayFirst(func, arr1, ...) +### arrayFirst(func, arr1, …) {#arrayfirstfunc-arr1} 返回数组中第一个匹配的元素,函数使用‘func’匹配所有元素,直到找到第一个匹配的元素。 -### arrayFirstIndex(func, arr1, ...) +### arrayFirstIndex(func, arr1, …) {#arrayfirstindexfunc-arr1} 返回数组中第一个匹配的元素的下标索引,函数使用‘func’匹配所有元素,直到找到第一个匹配的元素。 -### arrayCumSum(\[func,\] arr1, ...) +### arrayCumSum(\[func,\] arr1, …) {#arraycumsumfunc-arr1} 返回源数组部分数据的总和,如果指定了`func`函数,则使用`func`的返回值计算总和。 @@ -83,13 +79,11 @@ SELECT SELECT arrayCumSum([1, 1, 1, 1]) AS res ``` -``` -┌─res──────────┐ -│ [1, 2, 3, 4] │ -└──────────────┘ -``` + ┌─res──────────┐ + │ [1, 2, 3, 4] │ + └──────────────┘ -### arrayCumSumNonNegative(arr) +### arrayCumSumNonNegative(arr) {#arraycumsumnonnegativearr} 与arrayCumSum相同,返回源数组部分数据的总和。不同于arrayCumSum,当返回值包含小于零的值时,该值替换为零,后续计算使用零继续计算。例如: @@ -97,13 +91,11 @@ SELECT arrayCumSum([1, 1, 1, 1]) AS res SELECT arrayCumSumNonNegative([1, 1, -4, 1]) AS res ``` -``` -┌─res───────┐ -│ [1,2,0,1] │ -└───────────┘ -``` + ┌─res───────┐ + │ [1,2,0,1] │ + └───────────┘ -### arraySort(\[func,\] arr1, ...) 
+### arraySort(\[func,\] arr1, …) {#arraysortfunc-arr1} 返回升序排序`arr1`的结果。如果指定了`func`函数,则排序顺序由`func`的结果决定。 @@ -115,24 +107,21 @@ SELECT arrayCumSumNonNegative([1, 1, -4, 1]) AS res SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]); ``` -``` -┌─res────────────────┐ -│ ['world', 'hello'] │ -└────────────────────┘ -``` + ┌─res────────────────┐ + │ ['world', 'hello'] │ + └────────────────────┘ 请注意,NULL和NaN在最后(NaN在NULL之前)。例如: ``` sql SELECT arraySort([1, nan, 2, NULL, 3, nan, 4, NULL]) ``` -``` -┌─arraySort([1, nan, 2, NULL, 3, nan, 4, NULL])─┐ -│ [1,2,3,4,nan,nan,NULL,NULL] │ -└───────────────────────────────────────────────┘ -``` -### arrayReverseSort(\[func,\] arr1, ...) + ┌─arraySort([1, nan, 2, NULL, 3, nan, 4, NULL])─┐ + │ [1,2,3,4,nan,nan,NULL,NULL] │ + └───────────────────────────────────────────────┘ + +### arrayReverseSort(\[func,\] arr1, …) {#arrayreversesortfunc-arr1} 返回降序排序`arr1`的结果。如果指定了`func`函数,则排序顺序由`func`的结果决定。 @@ -141,13 +130,9 @@ SELECT arraySort([1, nan, 2, NULL, 3, nan, 4, NULL]) ``` sql SELECT arrayReverseSort([1, nan, 2, NULL, 3, nan, 4, NULL]) ``` -``` -┌─arrayReverseSort([1, nan, 2, NULL, 3, nan, 4, NULL])─┐ -│ [4,3,2,1,nan,nan,NULL,NULL] │ -└──────────────────────────────────────────────────────┘ -``` - - + ┌─arrayReverseSort([1, nan, 2, NULL, 3, nan, 4, NULL])─┐ + │ [4,3,2,1,nan,nan,NULL,NULL] │ + └──────────────────────────────────────────────────────┘ [来源文章](https://clickhouse.tech/docs/en/query_language/functions/higher_order_functions/) diff --git a/docs/zh/query_language/functions/in_functions.md b/docs/zh/query_language/functions/in_functions.md index 3f26d0ce500..60df3e25db1 100644 --- a/docs/zh/query_language/functions/in_functions.md +++ b/docs/zh/query_language/functions/in_functions.md @@ -1,20 +1,19 @@ -# IN运算符相关函数 +# IN运算符相关函数 {#inyun-suan-fu-xiang-guan-han-shu} -## in, notIn, globalIn, globalNotIn +## in, notIn, globalIn, globalNotIn {#in-notin-globalin-globalnotin} 请参阅[IN 运算符](../select.md#select-in-operators)部分。 -## tuple(x, y, ...), operator (x, y, ...) 
+## tuple(x, y, …), operator (x, y, …) {#tuplex-y-operator-x-y} 函数用于对多个列进行分组。 -对于具有类型T1,T2,...的列,它返回包含这些列的元组(T1,T2,...)。 执行该函数没有任何成本。 +对于具有类型T1,T2,…的列,它返回包含这些列的元组(T1,T2,…)。 执行该函数没有任何成本。 元组通常用作IN运算符的中间参数值,或用于创建lambda函数的形参列表。 元组不能写入表。 -## tupleElement(tuple, n), operator x.N +## tupleElement(tuple, n), operator x.N {#tupleelementtuple-n-operator-x-n} 函数用于从元组中获取列。 -'N'是列索引,从1开始。N必须是常量正整数常数,并且不大于元组的大小。 +’N’是列索引,从1开始。N必须是常量正整数常数,并且不大于元组的大小。 执行该函数没有任何成本。 - [来源文章](https://clickhouse.tech/docs/en/query_language/functions/in_functions/) diff --git a/docs/zh/query_language/functions/index.md b/docs/zh/query_language/functions/index.md index ad287b49062..8d178592e92 100644 --- a/docs/zh/query_language/functions/index.md +++ b/docs/zh/query_language/functions/index.md @@ -1,67 +1,66 @@ -# 函数 +# 函数 {#han-shu} -ClickHouse中至少存在两种类型的函数 - 常规函数(它们称之为“函数”)和聚合函数。 常规函数的工作就像分别为每一行执行一次函数计算一样(对于每一行,函数的结果不依赖于其他行)。 聚合函数则从各行累积一组值(即函数的结果以来整个结果集)。 +ClickHouse中至少存在两种类型的函数 - 常规函数(它们称之为«函数»)和聚合函数。 常规函数的工作就像分别为每一行执行一次函数计算一样(对于每一行,函数的结果不依赖于其他行)。 聚合函数则从各行累积一组值(即函数的结果以来整个结果集)。 -在本节中,我们将讨论常规函数。 有关聚合函数,请参阅“聚合函数”一节。 +在本节中,我们将讨论常规函数。 有关聚合函数,请参阅«聚合函数»一节。 -\ * - 'arrayJoin'函数与表函数均属于第三种类型的函数。\ * + \* - ’arrayJoin’函数与表函数均属于第三种类型的函数。 \* -## 强类型 +## 强类型 {#qiang-lei-xing} 与标准SQL相比,ClickHouse具有强类型。 换句话说,它不会在类型之间进行隐式转换。 每个函数适用于特定的一组类型。 这意味着有时您需要使用类型转换函数。 -## 常见的子表达式消除 +## 常见的子表达式消除 {#chang-jian-de-zi-biao-da-shi-xiao-chu} 查询中具有相同AST(相同语句或语法分析结果相同)的所有表达式都被视为具有相同的值。 这样的表达式被连接并执行一次。 通过这种方式也可以消除相同的子查询。 -## 结果类型 +## 结果类型 {#jie-guo-lei-xing} 所有函数都只能够返回一个返回值。 结果类型通常由参数的类型决定。 但tupleElement函数(a.N运算符)和toFixedString函数是例外的。 -## 常量 +## 常量 {#chang-liang} 为了简单起见,某些函数的某些参数只能是常量。 例如,LIKE运算符的右参数必须是常量。 几乎所有函数都为常量参数返回常量。 除了用于生成随机数的函数。 -'now'函数为在不同时间运行的查询返回不同的值,但结果被视为常量,因为常量在单个查询中很重要。 +’now’函数为在不同时间运行的查询返回不同的值,但结果被视为常量,因为常量在单个查询中很重要。 常量表达式也被视为常量(例如,LIKE运算符的右半部分可以由多个常量构造)。 对于常量和非常量参数,可以以不同方式实现函数(执行不同的代码)。 但是,对于包含相同数据的常量和非常量参数它们的结果应该是一致的。 -## NULL值处理 +## NULL值处理 {#nullzhi-chu-li} 函数具有以下行为: - - 如果函数的参数至少一个是“NULL”,则函数结果也是“NULL”。 - - 在每个函数的描述中单独指定的特殊行为。在ClickHouse源代码中,这些函数具有“UseDefaultImplementationForNulls = false”。 +- 如果函数的参数至少一个是«NULL»,则函数结果也是«NULL»。 +- 在每个函数的描述中单独指定的特殊行为。在ClickHouse源代码中,这些函数具有«UseDefaultImplementationForNulls = false»。 -## 不可变性 +## 不可变性 {#bu-ke-bian-xing} 函数不能更改其参数的值 - 任何更改都将作为结果返回。因此,计算单独函数的结果不依赖于在查询中写入函数的顺序。 -## 错误处理 +## 错误处理 {#cuo-wu-chu-li} 如果数据无效,某些函数可能会抛出异常。在这种情况下,将取消查询并将错误信息返回给客户端。对于分布式处理,当其中一个服务器发生异常时,其他服务器也会尝试中止查询。 -## 表达式参数的计算 +## 表达式参数的计算 {#biao-da-shi-can-shu-de-ji-suan} 在几乎所有编程语言中,某些函数可能无法预先计算其中一个参数。这通常是运算符`&&`,`||`和`? 
:`。 但是在ClickHouse中,函数(运算符)的参数总是被预先计算。这是因为一次评估列的整个部分,而不是分别计算每一行。 -## 执行分布式查询处理的功能 +## 执行分布式查询处理的功能 {#zhi-xing-fen-bu-shi-cha-xun-chu-li-de-gong-neng} 对于分布式查询处理,在远程服务器上执行尽可能多的查询处理阶段,并且在请求者服务器上执行其余阶段(合并中间结果和之后的所有内容)。 这意味着可以在不同的服务器上执行功能。 例如,在查询`SELECT f(sum(g(x)))FROM distributed_table GROUP BY h(y)中,` - - 如果`distributed_table`至少有两个分片,则在远程服务器上执行函数'g'和'h',并在请求服务器上执行函数'f'。 - - 如果`distributed_table`只有一个分片,则在该分片的服务器上执行所有'f','g'和'h'功能。 +- 如果`distributed_table`至少有两个分片,则在远程服务器上执行函数’g’和’h’,并在请求服务器上执行函数’f’。 +- 如果`distributed_table`只有一个分片,则在该分片的服务器上执行所有’f’,’g’和’h’功能。 函数的结果通常不依赖于它在哪个服务器上执行。但是,有时这很重要。 例如,使用字典的函数时将使用运行它们的服务器上存在的字典。 另一个例子是`hostName`函数,它返回运行它的服务器的名称,以便在`SELECT`查询中对服务器进行`GROUP BY`。 -如果查询中的函数在请求服务器上执行,但您需要在远程服务器上执行它,则可以将其包装在“any”聚合函数中,或将其添加到“GROUP BY”中。 - +如果查询中的函数在请求服务器上执行,但您需要在远程服务器上执行它,则可以将其包装在«any»聚合函数中,或将其添加到«GROUP BY»中。 [来源文章](https://clickhouse.tech/docs/en/query_language/functions/) diff --git a/docs/zh/query_language/functions/ip_address_functions.md b/docs/zh/query_language/functions/ip_address_functions.md index ad4212b0e36..0e012f90f84 100644 --- a/docs/zh/query_language/functions/ip_address_functions.md +++ b/docs/zh/query_language/functions/ip_address_functions.md @@ -1,14 +1,14 @@ -# IP函数 +# IP函数 {#iphan-shu} -## IPv4NumToString(num) +## IPv4NumToString(num) {#ipv4numtostringnum} 接受一个UInt32(大端)表示的IPv4的地址,返回相应IPv4的字符串表现形式,格式为A.B.C.D(以点分割的十进制数字)。 -## IPv4StringToNum(s) +## IPv4StringToNum(s) {#ipv4stringtonums} 与IPv4NumToString函数相反。如果IPv4地址格式无效,则返回0。 -## IPv4NumToStringClassC(num) +## IPv4NumToStringClassC(num) {#ipv4numtostringclasscnum} 与IPv4NumToString类似,但使用xxx替换最后一个字节。 @@ -24,24 +24,22 @@ ORDER BY c DESC LIMIT 10 ``` -``` -┌─k──────────────┬─────c─┐ -│ 83.149.9.xxx │ 26238 │ -│ 217.118.81.xxx │ 26074 │ -│ 213.87.129.xxx │ 25481 │ -│ 83.149.8.xxx │ 24984 │ -│ 217.118.83.xxx │ 22797 │ -│ 78.25.120.xxx │ 22354 │ -│ 213.87.131.xxx │ 21285 │ -│ 78.25.121.xxx │ 20887 │ -│ 188.162.65.xxx │ 19694 │ -│ 83.149.48.xxx │ 17406 │ -└────────────────┴───────┘ -``` + ┌─k──────────────┬─────c─┐ + │ 83.149.9.xxx │ 26238 │ + │ 217.118.81.xxx │ 26074 │ + │ 213.87.129.xxx │ 25481 │ + │ 83.149.8.xxx │ 24984 │ + │ 217.118.83.xxx │ 22797 │ + │ 78.25.120.xxx │ 22354 │ + │ 213.87.131.xxx │ 21285 │ + │ 78.25.121.xxx │ 20887 │ + │ 188.162.65.xxx │ 19694 │ + │ 83.149.48.xxx │ 17406 │ + └────────────────┴───────┘ -由于使用'xxx'是不规范的,因此将来可能会更改。我们建议您不要依赖此格式。 +由于使用’xxx’是不规范的,因此将来可能会更改。我们建议您不要依赖此格式。 -### IPv6NumToString(x) +### IPv6NumToString(x) {#ipv6numtostringx} 接受FixedString(16)类型的二进制格式的IPv6地址。以文本格式返回此地址的字符串。 IPv6映射的IPv4地址以::ffff:111.222.33。例如: @@ -50,11 +48,9 @@ IPv6映射的IPv4地址以::ffff:111.222.33。例如: SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr ``` -``` -┌─addr─────────┐ -│ 2a02:6b8::11 │ -└──────────────┘ -``` + ┌─addr─────────┐ + │ 2a02:6b8::11 │ + └──────────────┘ ``` sql SELECT @@ -67,20 +63,18 @@ ORDER BY c DESC LIMIT 10 ``` -``` -┌─IPv6NumToString(ClientIP6)──────────────┬─────c─┐ -│ 2a02:2168:aaa:bbbb::2 │ 24695 │ -│ 2a02:2698:abcd:abcd:abcd:abcd:8888:5555 │ 22408 │ -│ 2a02:6b8:0:fff::ff │ 16389 │ -│ 2a01:4f8:111:6666::2 │ 16016 │ -│ 2a02:2168:888:222::1 │ 15896 │ -│ 2a01:7e00::ffff:ffff:ffff:222 │ 14774 │ -│ 2a02:8109:eee:ee:eeee:eeee:eeee:eeee │ 14443 │ -│ 2a02:810b:8888:888:8888:8888:8888:8888 │ 14345 │ -│ 2a02:6b8:0:444:4444:4444:4444:4444 │ 14279 │ -│ 2a01:7e00::ffff:ffff:ffff:ffff │ 13880 │ -└─────────────────────────────────────────┴───────┘ -``` + ┌─IPv6NumToString(ClientIP6)──────────────┬─────c─┐ + │ 2a02:2168:aaa:bbbb::2 │ 24695 │ + │ 
2a02:2698:abcd:abcd:abcd:abcd:8888:5555 │ 22408 │ + │ 2a02:6b8:0:fff::ff │ 16389 │ + │ 2a01:4f8:111:6666::2 │ 16016 │ + │ 2a02:2168:888:222::1 │ 15896 │ + │ 2a01:7e00::ffff:ffff:ffff:222 │ 14774 │ + │ 2a02:8109:eee:ee:eeee:eeee:eeee:eeee │ 14443 │ + │ 2a02:810b:8888:888:8888:8888:8888:8888 │ 14345 │ + │ 2a02:6b8:0:444:4444:4444:4444:4444 │ 14279 │ + │ 2a01:7e00::ffff:ffff:ffff:ffff │ 13880 │ + └─────────────────────────────────────────┴───────┘ ``` sql SELECT @@ -93,27 +87,25 @@ ORDER BY c DESC LIMIT 10 ``` -``` -┌─IPv6NumToString(ClientIP6)─┬──────c─┐ -│ ::ffff:94.26.111.111 │ 747440 │ -│ ::ffff:37.143.222.4 │ 529483 │ -│ ::ffff:5.166.111.99 │ 317707 │ -│ ::ffff:46.38.11.77 │ 263086 │ -│ ::ffff:79.105.111.111 │ 186611 │ -│ ::ffff:93.92.111.88 │ 176773 │ -│ ::ffff:84.53.111.33 │ 158709 │ -│ ::ffff:217.118.11.22 │ 154004 │ -│ ::ffff:217.118.11.33 │ 148449 │ -│ ::ffff:217.118.11.44 │ 148243 │ -└────────────────────────────┴────────┘ -``` + ┌─IPv6NumToString(ClientIP6)─┬──────c─┐ + │ ::ffff:94.26.111.111 │ 747440 │ + │ ::ffff:37.143.222.4 │ 529483 │ + │ ::ffff:5.166.111.99 │ 317707 │ + │ ::ffff:46.38.11.77 │ 263086 │ + │ ::ffff:79.105.111.111 │ 186611 │ + │ ::ffff:93.92.111.88 │ 176773 │ + │ ::ffff:84.53.111.33 │ 158709 │ + │ ::ffff:217.118.11.22 │ 154004 │ + │ ::ffff:217.118.11.33 │ 148449 │ + │ ::ffff:217.118.11.44 │ 148243 │ + └────────────────────────────┴────────┘ -## IPv6StringToNum(s) +## IPv6StringToNum(s) {#ipv6stringtonums} 与IPv6NumToString的相反。如果IPv6地址格式无效,则返回空字节字符串。 十六进制可以是大写的或小写的。 -## IPv4ToIPv6(x) +## IPv4ToIPv6(x) {#ipv4toipv6x} 接受一个UInt32类型的IPv4地址,返回FixedString(16)类型的IPv6地址。例如: @@ -121,64 +113,52 @@ LIMIT 10 SELECT IPv6NumToString(IPv4ToIPv6(IPv4StringToNum('192.168.0.1'))) AS addr ``` -``` -┌─addr───────────────┐ -│ ::ffff:192.168.0.1 │ -└────────────────────┘ -``` + ┌─addr───────────────┐ + │ ::ffff:192.168.0.1 │ + └────────────────────┘ -## cutIPv6(x, bitsToCutForIPv6, bitsToCutForIPv4) +## cutIPv6(x, bitsToCutForIPv6, bitsToCutForIPv4) {#cutipv6x-bitstocutforipv6-bitstocutforipv4} 接受一个FixedString(16)类型的IPv6地址,返回一个String,这个String中包含了删除指定位之后的地址的文本格式。例如: -```sql +``` sql WITH IPv6StringToNum('2001:0DB8:AC10:FE01:FEED:BABE:CAFE:F00D') AS ipv6, IPv4ToIPv6(IPv4StringToNum('192.168.0.1')) AS ipv4 SELECT cutIPv6(ipv6, 2, 0), cutIPv6(ipv4, 0, 2) - ``` -``` -┌─cutIPv6(ipv6, 2, 0)─────────────────┬─cutIPv6(ipv4, 0, 2)─┐ -│ 2001:db8:ac10:fe01:feed:babe:cafe:0 │ ::ffff:192.168.0.0 │ -└─────────────────────────────────────┴─────────────────────┘ -``` + ┌─cutIPv6(ipv6, 2, 0)─────────────────┬─cutIPv6(ipv4, 0, 2)─┐ + │ 2001:db8:ac10:fe01:feed:babe:cafe:0 │ ::ffff:192.168.0.0 │ + └─────────────────────────────────────┴─────────────────────┘ -## IPv4CIDRToRange(ipv4, cidr), +## IPv4CIDRToRange(ipv4, cidr), {#ipv4cidrtorangeipv4-cidr} 接受一个IPv4地址以及一个UInt8类型的CIDR。返回包含子网最低范围以及最高范围的元组。 - -```sql +``` sql SELECT IPv4CIDRToRange(toIPv4('192.168.5.2'), 16) ``` -``` -┌─IPv4CIDRToRange(toIPv4('192.168.5.2'), 16)─┐ -│ ('192.168.0.0','192.168.255.255') │ -└────────────────────────────────────────────┘ -``` + ┌─IPv4CIDRToRange(toIPv4('192.168.5.2'), 16)─┐ + │ ('192.168.0.0','192.168.255.255') │ + └────────────────────────────────────────────┘ - -## IPv6CIDRToRange(ipv6, cidr), +## IPv6CIDRToRange(ipv6, cidr), {#ipv6cidrtorangeipv6-cidr} 接受一个IPv6地址以及一个UInt8类型的CIDR。返回包含子网最低范围以及最高范围的元组。 - -```sql +``` sql SELECT IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32); ``` -``` -┌─IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32)─┐ -│ 
('2001:db8::','2001:db8:ffff:ffff:ffff:ffff:ffff:ffff') │ -└────────────────────────────────────────────────────────────────────────┘ -``` + ┌─IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32)─┐ + │ ('2001:db8::','2001:db8:ffff:ffff:ffff:ffff:ffff:ffff') │ + └────────────────────────────────────────────────────────────────────────┘ -## toIPv4(string) +## toIPv4(string) {#toipv4string} `IPv4StringToNum()`的别名,它采用字符串形式的IPv4地址并返回[IPv4](../../data_types/domains/ipv4.md)类型的值,该二进制值等于`IPv4StringToNum()`返回的值。 @@ -190,11 +170,9 @@ SELECT toTypeName(toIPv4(IPv4_string)) ``` -``` -┌─toTypeName(IPv4StringToNum(IPv4_string))─┬─toTypeName(toIPv4(IPv4_string))─┐ -│ UInt32 │ IPv4 │ -└──────────────────────────────────────────┴─────────────────────────────────┘ -``` + ┌─toTypeName(IPv4StringToNum(IPv4_string))─┬─toTypeName(toIPv4(IPv4_string))─┐ + │ UInt32 │ IPv4 │ + └──────────────────────────────────────────┴─────────────────────────────────┘ ``` sql WITH @@ -204,13 +182,11 @@ SELECT hex(toIPv4(IPv4_string)) ``` -``` -┌─hex(IPv4StringToNum(IPv4_string))─┬─hex(toIPv4(IPv4_string))─┐ -│ ABE1822D │ ABE1822D │ -└───────────────────────────────────┴──────────────────────────┘ -``` + ┌─hex(IPv4StringToNum(IPv4_string))─┬─hex(toIPv4(IPv4_string))─┐ + │ ABE1822D │ ABE1822D │ + └───────────────────────────────────┴──────────────────────────┘ -## toIPv6(string) +## toIPv6(string) {#toipv6string} `IPv6StringToNum()`的别名,它采用字符串形式的IPv6地址并返回[IPv6](../../data_types/domains/ipv6.md)类型的值,该二进制值等于`IPv6StringToNum()`返回的值。 @@ -222,11 +198,9 @@ SELECT toTypeName(toIPv6(IPv6_string)) ``` -``` -┌─toTypeName(IPv6StringToNum(IPv6_string))─┬─toTypeName(toIPv6(IPv6_string))─┐ -│ FixedString(16) │ IPv6 │ -└──────────────────────────────────────────┴─────────────────────────────────┘ -``` + ┌─toTypeName(IPv6StringToNum(IPv6_string))─┬─toTypeName(toIPv6(IPv6_string))─┐ + │ FixedString(16) │ IPv6 │ + └──────────────────────────────────────────┴─────────────────────────────────┘ ``` sql WITH @@ -236,10 +210,8 @@ SELECT hex(toIPv6(IPv6_string)) ``` -``` -┌─hex(IPv6StringToNum(IPv6_string))─┬─hex(toIPv6(IPv6_string))─────────┐ -│ 20010438FFFF000000000000407D1BC1 │ 20010438FFFF000000000000407D1BC1 │ -└───────────────────────────────────┴──────────────────────────────────┘ -``` + ┌─hex(IPv6StringToNum(IPv6_string))─┬─hex(toIPv6(IPv6_string))─────────┐ + │ 20010438FFFF000000000000407D1BC1 │ 20010438FFFF000000000000407D1BC1 │ + └───────────────────────────────────┴──────────────────────────────────┘ [来源文章](https://clickhouse.tech/docs/en/query_language/functions/ip_address_functions/) diff --git a/docs/zh/query_language/functions/json_functions.md b/docs/zh/query_language/functions/json_functions.md index 7d205b1ea5f..5203ae91291 100644 --- a/docs/zh/query_language/functions/json_functions.md +++ b/docs/zh/query_language/functions/json_functions.md @@ -1,63 +1,59 @@ -# JSON函数 +# JSON函数 {#jsonhan-shu} 在Yandex.Metrica中,用户使用JSON作为访问参数。为了处理这些JSON,实现了一些函数。(尽管在大多数情况下,JSON是预先进行额外处理的,并将结果值放在单独的列中。)所有的这些函数都进行了尽可能的假设。以使函数能够尽快的完成工作。 我们对JSON格式做了如下假设: -1. 字段名称(函数的参数)必须使常量。 -2. 字段名称必须使用规范的编码。例如:`visitParamHas('{"abc":"def"}', 'abc') = 1`,但是 `visitParamHas('{"\\u0061\\u0062\\u0063":"def"}', 'abc') = 0` -3. 函数可以随意的在多层嵌套结构下查找字段。如果存在多个匹配字段,则返回第一个匹配字段。 -4. JSON除字符串文本外不存在空格字符。 +1. 字段名称(函数的参数)必须使常量。 +2. 字段名称必须使用规范的编码。例如:`visitParamHas('{"abc":"def"}', 'abc') = 1`,但是 `visitParamHas('{"\\u0061\\u0062\\u0063":"def"}', 'abc') = 0` +3. 函数可以随意的在多层嵌套结构下查找字段。如果存在多个匹配字段,则返回第一个匹配字段。 +4. 
JSON除字符串文本外不存在空格字符。 -## visitParamHas(params, name) +## visitParamHas(params, name) {#visitparamhasparams-name} -检查是否存在“name”名称的字段 +检查是否存在«name»名称的字段 -## visitParamExtractUInt(params, name) +## visitParamExtractUInt(params, name) {#visitparamextractuintparams-name} -将名为“name”的字段的值解析成UInt64。如果这是一个字符串字段,函数将尝试从字符串的开头解析一个数字。如果该字段不存在,或无法从它中解析到数字,则返回0。 +将名为«name»的字段的值解析成UInt64。如果这是一个字符串字段,函数将尝试从字符串的开头解析一个数字。如果该字段不存在,或无法从它中解析到数字,则返回0。 -## visitParamExtractInt(params, name) +## visitParamExtractInt(params, name) {#visitparamextractintparams-name} 与visitParamExtractUInt相同,但返回Int64。 -## visitParamExtractFloat(params, name) +## visitParamExtractFloat(params, name) {#visitparamextractfloatparams-name} 与visitParamExtractUInt相同,但返回Float64。 -## visitParamExtractBool(params, name) +## visitParamExtractBool(params, name) {#visitparamextractboolparams-name} 解析true/false值。其结果是UInt8类型的。 -## visitParamExtractRaw(params, name) +## visitParamExtractRaw(params, name) {#visitparamextractrawparams-name} 返回字段的值,包含空格符。 示例: -``` -visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"' -visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}' -``` + visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"' + visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}' -## visitParamExtractString(params, name) +## visitParamExtractString(params, name) {#visitparamextractstringparams-name} 使用双引号解析字符串。这个值没有进行转义。如果转义失败,它将返回一个空白字符串。 示例: -``` -visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' -visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺' -visitParamExtractString('{"abc":"\\u263"}', 'abc') = '' -visitParamExtractString('{"abc":"hello}', 'abc') = '' -``` + visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' + visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺' + visitParamExtractString('{"abc":"\\u263"}', 'abc') = '' + visitParamExtractString('{"abc":"hello}', 'abc') = '' 目前不支持`\uXXXX\uYYYY`这些字符编码,这些编码不在基本多文种平面中(它们被转化为CESU-8而不是UTF-8)。 以下函数基于[simdjson](https://github.com/lemire/simdjson),专为更复杂的JSON解析要求而设计。但上述假设2仍然适用。 -## JSONHas(json[, indices_or_keys]...) 
+## JSONHas(json\[, indices\_or\_keys\]…) {#jsonhasjson-indices-or-keys} 如果JSON中存在该值,则返回`1`。 @@ -65,30 +61,26 @@ visitParamExtractString('{"abc":"hello}', 'abc') = '' 示例: -``` -select JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 1 -select JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4) = 0 -``` + select JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 1 + select JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4) = 0 `indices_or_keys`可以是零个或多个参数的列表,每个参数可以是字符串或整数。 -* String = 按成员名称访问JSON对象成员。 -* 正整数 = 从头开始访问第n个成员/成员名称。 -* 负整数 = 从末尾访问第n个成员/成员名称。 +- String = 按成员名称访问JSON对象成员。 +- 正整数 = 从头开始访问第n个成员/成员名称。 +- 负整数 = 从末尾访问第n个成员/成员名称。 您可以使用整数来访问JSON数组和JSON对象。 例如: -``` -select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'a' -select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 2) = 'b' -select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -1) = 'b' -select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'a' -select JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' -``` + select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'a' + select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 2) = 'b' + select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -1) = 'b' + select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'a' + select JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' -## JSONLength(json[, indices_or_keys]...) +## JSONLength(json\[, indices\_or\_keys\]…) {#jsonlengthjson-indices-or-keys} 返回JSON数组或JSON对象的长度。 @@ -96,12 +88,10 @@ select JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' 示例: -``` -select JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3 -select JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 -``` + select JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3 + select JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 -## JSONType(json[, indices_or_keys]...) +## JSONType(json\[, indices\_or\_keys\]…) {#jsontypejson-indices-or-keys} 返回JSON值的类型。 @@ -109,16 +99,17 @@ select JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 示例: -``` -select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}') = 'Object' -select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String' -select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' -``` + select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}') = 'Object' + select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String' + select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' -## JSONExtractUInt(json[, indices_or_keys]...) -## JSONExtractInt(json[, indices_or_keys]...) -## JSONExtractFloat(json[, indices_or_keys]...) -## JSONExtractBool(json[, indices_or_keys]...) 
+## JSONExtractUInt(json\[, indices\_or\_keys\]…) {#jsonextractuintjson-indices-or-keys} + +## JSONExtractInt(json\[, indices\_or\_keys\]…) {#jsonextractintjson-indices-or-keys} + +## JSONExtractFloat(json\[, indices\_or\_keys\]…) {#jsonextractfloatjson-indices-or-keys} + +## JSONExtractBool(json\[, indices\_or\_keys\]…) {#jsonextractbooljson-indices-or-keys} 解析JSON并提取值。这些函数类似于`visitParam*`函数。 @@ -126,13 +117,11 @@ select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' 示例: -``` -select JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100 -select JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200.0 -select JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 -``` + select JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100 + select JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200.0 + select JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 -## JSONExtractString(json[, indices_or_keys]...) +## JSONExtractString(json\[, indices\_or\_keys\]…) {#jsonextractstringjson-indices-or-keys} 解析JSON并提取字符串。此函数类似于`visitParamExtractString`函数。 @@ -142,46 +131,37 @@ select JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 示例: -``` -select JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'hello' -select JSONExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' -select JSONExtractString('{"abc":"\\u263a"}', 'abc') = '☺' -select JSONExtractString('{"abc":"\\u263"}', 'abc') = '' -select JSONExtractString('{"abc":"hello}', 'abc') = '' -``` + select JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'hello' + select JSONExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' + select JSONExtractString('{"abc":"\\u263a"}', 'abc') = '☺' + select JSONExtractString('{"abc":"\\u263"}', 'abc') = '' + select JSONExtractString('{"abc":"hello}', 'abc') = '' -## JSONExtract(json[, indices_or_keys...], return_type) +## JSONExtract(json\[, indices\_or\_keys…\], return\_type) {#jsonextractjson-indices-or-keys-return-type} 解析JSON并提取给定ClickHouse数据类型的值。 -这是以前的`JSONExtract函数的变体。 -这意味着 -`JSONExtract(..., 'String')`返回与`JSONExtractString()`返回完全相同。 -`JSONExtract(..., 'Float64')`返回于`JSONExtractFloat()`返回完全相同。 +这是以前的`JSONExtract函数的变体。 这意味着`JSONExtract(…, ‘String’)`返回与`JSONExtractString()`返回完全相同。`JSONExtract(…, ‘Float64’)`返回于`JSONExtractFloat()\`返回完全相同。 示例: -``` -SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(String, Array(Float64))') = ('hello',[-100,200,300]) -SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(b Array(Float64), a String)') = ([-100,200,300],'hello') -SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 'Array(Nullable(Int8))') = [-100, NULL, NULL] -SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4, 'Nullable(Int64)') = NULL -SELECT JSONExtract('{"passed": true}', 'passed', 'UInt8') = 1 -SELECT JSONExtract('{"day": "Thursday"}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Thursday' -SELECT JSONExtract('{"day": 5}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Friday' -``` + SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(String, Array(Float64))') = ('hello',[-100,200,300]) + SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 
'Tuple(b Array(Float64), a String)') = ([-100,200,300],'hello') + SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 'Array(Nullable(Int8))') = [-100, NULL, NULL] + SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4, 'Nullable(Int64)') = NULL + SELECT JSONExtract('{"passed": true}', 'passed', 'UInt8') = 1 + SELECT JSONExtract('{"day": "Thursday"}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Thursday' + SELECT JSONExtract('{"day": 5}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \'Tuesday\' = 2, \'Wednesday\' = 3, \'Thursday\' = 4, \'Friday\' = 5, \'Saturday\' = 6)') = 'Friday' -## JSONExtractKeysAndValues(json[, indices_or_keys...], value_type) +## JSONExtractKeysAndValues(json\[, indices\_or\_keys…\], value\_type) {#jsonextractkeysandvaluesjson-indices-or-keys-value-type} 从JSON中解析键值对,其中值是给定的ClickHouse数据类型。 示例: -``` -SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8') = [('a',5),('b',7),('c',11)]; -``` + SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8') = [('a',5),('b',7),('c',11)]; -## JSONExtractRaw(json[, indices_or_keys]...) +## JSONExtractRaw(json\[, indices\_or\_keys\]…) {#jsonextractrawjson-indices-or-keys} 返回JSON的部分。 @@ -189,8 +169,6 @@ SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8') 示例: -``` -select JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]' -``` + select JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]' [来源文章](https://clickhouse.tech/docs/en/query_language/functions/json_functions/) diff --git a/docs/zh/query_language/functions/logical_functions.md b/docs/zh/query_language/functions/logical_functions.md index c71c8c5f2b8..2f2a61f57a6 100644 --- a/docs/zh/query_language/functions/logical_functions.md +++ b/docs/zh/query_language/functions/logical_functions.md @@ -1,16 +1,15 @@ -# 逻辑函数 +# 逻辑函数 {#luo-ji-han-shu} 逻辑函数可以接受任何数字类型的参数,并返回UInt8类型的0或1。 -当向函数传递零时,函数将判定为“false”,否则,任何其他非零的值都将被判定为“true”。 +当向函数传递零时,函数将判定为«false»,否则,任何其他非零的值都将被判定为«true»。 -## and, AND operator +## and, AND operator {#and-and-operator} -## or, OR operator +## or, OR operator {#or-or-operator} -## not, NOT operator - -## xor +## not, NOT operator {#not-not-operator} +## xor {#xor} [来源文章](https://clickhouse.tech/docs/en/query_language/functions/logical_functions/) diff --git a/docs/zh/query_language/functions/machine_learning_functions.md b/docs/zh/query_language/functions/machine_learning_functions.md index c80c4a3fbfe..74bf1c5fcfc 100644 --- a/docs/zh/query_language/functions/machine_learning_functions.md +++ b/docs/zh/query_language/functions/machine_learning_functions.md @@ -1,15 +1,15 @@ -# 机器学习函数 +# 机器学习函数 {#ji-qi-xue-xi-han-shu} -## evalMLMethod (prediction) {#machine_learning_methods-evalmlmethod} +## evalMLMethod (prediction) {#machine-learning-methods-evalmlmethod} 使用拟合回归模型的预测请使用`evalMLMethod`函数。 请参阅`linearRegression`中的链接。 -## Stochastic Linear Regression +## Stochastic Linear Regression {#stochastic-linear-regression} `stochasticLinearRegression`聚合函数使用线性模型和MSE损失函数实现随机梯度下降法。 使用`evalMLMethod`来预测新数据。 请参阅示例和注释[此处](../agg_functions/reference.md#agg_functions-stochasticlinearregression)。 -## Stochastic Logistic Regression +## Stochastic Logistic Regression {#stochastic-logistic-regression} `stochasticLogisticRegression`聚合函数实现了二元分类问题的随机梯度下降法。 使用`evalMLMethod`来预测新数据。 
请参阅示例和注释[此处](../agg_functions/reference.md#agg_functions-stochasticlogisticregression)。 diff --git a/docs/zh/query_language/functions/math_functions.md b/docs/zh/query_language/functions/math_functions.md index 93b55728aee..38b3115e396 100644 --- a/docs/zh/query_language/functions/math_functions.md +++ b/docs/zh/query_language/functions/math_functions.md @@ -1,50 +1,50 @@ -# 数学函数 +# 数学函数 {#shu-xue-han-shu} 以下所有的函数都返回一个Float64类型的数值。返回结果总是以尽可能最大精度返回,但还是可能与机器中可表示最接近该值的数字不同。 -## e() +## e() {#e} 返回一个接近数学常量e的Float64数字。 -## pi() +## pi() {#pi} 返回一个接近数学常量π的Float64数字。 -## exp(x) +## exp(x) {#expx} 接受一个数值类型的参数并返回它的指数。 -## log(x), ln(x) +## log(x), ln(x) {#logx-lnx} 接受一个数值类型的参数并返回它的自然对数。 -## exp2(x) +## exp2(x) {#exp2x} 接受一个数值类型的参数并返回它的2的x次幂。 -## log2(x) +## log2(x) {#log2x} 接受一个数值类型的参数并返回它的底2对数。 -## exp10(x) +## exp10(x) {#exp10x} 接受一个数值类型的参数并返回它的10的x次幂。 -## log10(x) +## log10(x) {#log10x} 接受一个数值类型的参数并返回它的底10对数。 -## sqrt(x) +## sqrt(x) {#sqrtx} 接受一个数值类型的参数并返回它的平方根。 -## cbrt(x) +## cbrt(x) {#cbrtx} 接受一个数值类型的参数并返回它的立方根。 -## erf(x) +## erf(x) {#erfx} -如果'x'是非负数,那么erf(x / σ√2)是具有正态分布且标准偏差为“σ”的随机变量的值与预期值之间的距离大于“x”。 +如果’x’是非负数,那么erf(x / σ√2)是具有正态分布且标准偏差为«σ»的随机变量的值与预期值之间的距离大于«x»。 示例 (三西格玛准则): @@ -52,57 +52,55 @@ SELECT erf(3 / sqrt(2)) ``` -``` -┌─erf(divide(3, sqrt(2)))─┐ -│ 0.9973002039367398 │ -└─────────────────────────┘ -``` + ┌─erf(divide(3, sqrt(2)))─┐ + │ 0.9973002039367398 │ + └─────────────────────────┘ -## erfc(x) +## erfc(x) {#erfcx} -接受一个数值参数并返回一个接近1 - erf(x)的Float64数字,但不会丢失大“x”值的精度。 +接受一个数值参数并返回一个接近1 - erf(x)的Float64数字,但不会丢失大«x»值的精度。 -## lgamma(x) +## lgamma(x) {#lgammax} 返回x的绝对值的自然对数的伽玛函数。 -## tgamma(x) +## tgamma(x) {#tgammax} 返回x的伽玛函数。 -## sin(x) +## sin(x) {#sinx} 返回x的三角正弦值。 -## cos(x) +## cos(x) {#cosx} 返回x的三角余弦值。 -## tan(x) +## tan(x) {#tanx} 返回x的三角正切值。 -## asin(x) +## asin(x) {#asinx} 返回x的反三角正弦值。 -## acos(x) +## acos(x) {#acosx} 返回x的反三角余弦值。 -## atan(x) +## atan(x) {#atanx} 返回x的反三角正切值。 -## pow(x, y), power(x, y) +## pow(x, y), power(x, y) {#powx-y-powerx-y} 接受x和y两个参数。返回x的y次方。 -## intExp2 +## intExp2 {#intexp2} 接受一个数值类型的参数并返回它的2的x次幂(UInt64)。 -## intExp10 +## intExp10 {#intexp10} 接受一个数值类型的参数并返回它的10的x次幂(UInt64)。 diff --git a/docs/zh/query_language/functions/other_functions.md b/docs/zh/query_language/functions/other_functions.md index 613a13e48be..f4c89d9af3f 100644 --- a/docs/zh/query_language/functions/other_functions.md +++ b/docs/zh/query_language/functions/other_functions.md @@ -1,59 +1,62 @@ -# 其他函数 +# 其他函数 {#qi-ta-han-shu} -## hostName() +## hostName() {#hostname} 返回一个字符串,其中包含执行此函数的主机的名称。 对于分布式处理,如果在远程服务器上执行此函数,则将返回远程服务器主机的名称。 -## basename +## basename {#basename} 在最后一个斜杠或反斜杠后的字符串文本。 此函数通常用于从路径中提取文件名。 -``` -basename( expr ) -``` + basename( expr ) **参数** -- `expr` — 任何一个返回[String](../../data_types/string.md)结果的表达式。[String](../../data_types/string.md) +- `expr` — 任何一个返回[String](../../data_types/string.md)结果的表达式。[String](../../data_types/string.md) **返回值** 一个String类型的值,其包含: -- 在最后一个斜杠或反斜杠后的字符串文本内容。 +- 在最后一个斜杠或反斜杠后的字符串文本内容。 - 如果输入的字符串以斜杆或反斜杆结尾,例如:`/`或`c:\`,函数将返回一个空字符串。 + 如果输入的字符串以斜杆或反斜杆结尾,例如:`/`或`c:\`,函数将返回一个空字符串。 -- 如果输入的字符串中不包含斜杆或反斜杠,函数返回输入字符串本身。 +- 如果输入的字符串中不包含斜杆或反斜杠,函数返回输入字符串本身。 **示例** -```sql +``` sql SELECT 'some/long/path/to/file' AS a, basename(a) ``` -```text + +``` text ┌─a──────────────────────┬─basename('some\\long\\path\\to\\file')─┐ │ some\long\path\to\file │ file │ └────────────────────────┴────────────────────────────────────────┘ ``` -```sql + +``` sql SELECT 'some\\long\\path\\to\\file' AS a, basename(a) ``` -```text + +``` text 
┌─a──────────────────────┬─basename('some\\long\\path\\to\\file')─┐ │ some\long\path\to\file │ file │ └────────────────────────┴────────────────────────────────────────┘ ``` -```sql + +``` sql SELECT 'some-file-name' AS a, basename(a) ``` -```text + +``` text ┌─a──────────────┬─basename('some-file-name')─┐ │ some-file-name │ some-file-name │ └────────────────┴────────────────────────────┘ ``` -## visibleWidth(x) +## visibleWidth(x) {#visiblewidthx} 以文本格式(以制表符分隔)向控制台输出值时,计算近似宽度。 系统使用此函数实现Pretty格式。 @@ -62,15 +65,13 @@ This function is used by the system for implementing Pretty formats. `NULL` is represented as a string corresponding to `NULL` in `Pretty` formats. -``` -SELECT visibleWidth(NULL) + SELECT visibleWidth(NULL) -┌─visibleWidth(NULL)─┐ -│ 4 │ -└────────────────────┘ -``` + ┌─visibleWidth(NULL)─┐ + │ 4 │ + └────────────────────┘ -## toTypeName(x) +## toTypeName(x) {#totypenamex} 返回包含参数的类型名称的字符串。 @@ -81,42 +82,42 @@ SELECT visibleWidth(NULL) 获取Block的大小。 在ClickHouse中,查询始终工作在Block(包含列的部分的集合)上。此函数允许您获取调用其的块的大小。 -## materialize(x) +## materialize(x) {#materializex} 将一个常量列变为一个非常量列。 在ClickHouse中,非常量列和常量列在内存中的表示方式不同。尽管函数对于常量列和非常量总是返回相同的结果,但它们的工作方式可能完全不同(执行不同的代码)。此函数用于调试这种行为。 -## ignore(...) +## ignore(…) {#ignore} 接受任何参数,包括`NULL`。始终返回0。 但是,函数的参数总是被计算的。该函数可以用于基准测试。 -## sleep(seconds) +## sleep(seconds) {#sleepseconds} -在每个Block上休眠'seconds'秒。可以是整数或浮点数。 +在每个Block上休眠’seconds’秒。可以是整数或浮点数。 -## sleepEachRow(seconds) +## sleepEachRow(seconds) {#sleepeachrowseconds} -在每行上休眠'seconds'秒。可以是整数或浮点数。 +在每行上休眠’seconds’秒。可以是整数或浮点数。 -## currentDatabase() +## currentDatabase() {#currentdatabase} 返回当前数据库的名称。 当您需要在CREATE TABLE中的表引擎参数中指定数据库,您可以使用此函数。 -## isFinite(x) +## isFinite(x) {#isfinitex} 接受Float32或Float64类型的参数,如果参数不是infinite且不是NaN,则返回1,否则返回0。 -## isInfinite(x) +## isInfinite(x) {#isinfinitex} 接受Float32或Float64类型的参数,如果参数是infinite,则返回1,否则返回0。注意NaN返回0。 -## isNaN(x) +## isNaN(x) {#isnanx} 接受Float32或Float64类型的参数,如果参数是Nan,则返回1,否则返回0。 -## hasColumnInTable(\['hostname'\[, 'username'\[, 'password'\]\],\] 'database', 'table', 'column') +## hasColumnInTable(\[‘hostname’\[, ‘username’\[, ‘password’\]\],\] ‘database’, ‘table’, ‘column’) {#hascolumnintablehostname-username-password-database-table-column} Accepts constant strings: database name, table name, and column name. Returns a UInt8 constant expression equal to 1 if there is a column, otherwise 0. If the hostname parameter is set, the test will run on a remote server. The function throws an exception if the table does not exist. 
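A minimal usage sketch (the `system.one` table and its `dummy` column are assumed to be present, as they are in a default ClickHouse installation; the expected results are given as comments):

``` sql
-- Expected: has_dummy = 1 (system.one has a column named 'dummy'),
--           has_missing = 0 (the column is absent; a missing table would throw instead)
SELECT
    hasColumnInTable('system', 'one', 'dummy') AS has_dummy,
    hasColumnInTable('system', 'one', 'no_such_column') AS has_missing
```
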
@@ -130,9 +131,9 @@ For elements in a nested data structure, the function checks for the existence o 参数: -- `x` — 要显示的尺寸。 -- `min, max` — 整数常量,该值必须是`Int64`。 -- `width` — 常量,可以是正整数或小数。 +- `x` — 要显示的尺寸。 +- `min, max` — 整数常量,该值必须是`Int64`。 +- `width` — 常量,可以是正整数或小数。 字符带的绘制精度是符号的八分之一。 @@ -148,42 +149,39 @@ GROUP BY h ORDER BY h ASC ``` -``` -┌──h─┬──────c─┬─bar────────────────┐ -│ 0 │ 292907 │ █████████▋ │ -│ 1 │ 180563 │ ██████ │ -│ 2 │ 114861 │ ███▋ │ -│ 3 │ 85069 │ ██▋ │ -│ 4 │ 68543 │ ██▎ │ -│ 5 │ 78116 │ ██▌ │ -│ 6 │ 113474 │ ███▋ │ -│ 7 │ 170678 │ █████▋ │ -│ 8 │ 278380 │ █████████▎ │ -│ 9 │ 391053 │ █████████████ │ -│ 10 │ 457681 │ ███████████████▎ │ -│ 11 │ 493667 │ ████████████████▍ │ -│ 12 │ 509641 │ ████████████████▊ │ -│ 13 │ 522947 │ █████████████████▍ │ -│ 14 │ 539954 │ █████████████████▊ │ -│ 15 │ 528460 │ █████████████████▌ │ -│ 16 │ 539201 │ █████████████████▊ │ -│ 17 │ 523539 │ █████████████████▍ │ -│ 18 │ 506467 │ ████████████████▊ │ -│ 19 │ 520915 │ █████████████████▎ │ -│ 20 │ 521665 │ █████████████████▍ │ -│ 21 │ 542078 │ ██████████████████ │ -│ 22 │ 493642 │ ████████████████▍ │ -│ 23 │ 400397 │ █████████████▎ │ -└────┴────────┴────────────────────┘ -``` + ┌──h─┬──────c─┬─bar────────────────┐ + │ 0 │ 292907 │ █████████▋ │ + │ 1 │ 180563 │ ██████ │ + │ 2 │ 114861 │ ███▋ │ + │ 3 │ 85069 │ ██▋ │ + │ 4 │ 68543 │ ██▎ │ + │ 5 │ 78116 │ ██▌ │ + │ 6 │ 113474 │ ███▋ │ + │ 7 │ 170678 │ █████▋ │ + │ 8 │ 278380 │ █████████▎ │ + │ 9 │ 391053 │ █████████████ │ + │ 10 │ 457681 │ ███████████████▎ │ + │ 11 │ 493667 │ ████████████████▍ │ + │ 12 │ 509641 │ ████████████████▊ │ + │ 13 │ 522947 │ █████████████████▍ │ + │ 14 │ 539954 │ █████████████████▊ │ + │ 15 │ 528460 │ █████████████████▌ │ + │ 16 │ 539201 │ █████████████████▊ │ + │ 17 │ 523539 │ █████████████████▍ │ + │ 18 │ 506467 │ ████████████████▊ │ + │ 19 │ 520915 │ █████████████████▎ │ + │ 20 │ 521665 │ █████████████████▍ │ + │ 21 │ 542078 │ ██████████████████ │ + │ 22 │ 493642 │ ████████████████▍ │ + │ 23 │ 400397 │ █████████████▎ │ + └────┴────────┴────────────────────┘ - -## transform +## transform {#transform} 根据定义,将某些元素转换为其他元素。 此函数有两种使用方式: -1. `transform(x, array_from, array_to, default)` +1. `transform(x, array_from, array_to, default)` `x` – 要转换的值。 @@ -203,7 +201,7 @@ ORDER BY h ASC 对于相同的字母(T或U),如果数值类型,那么它们不可不完全匹配的,只需要具备共同的类型即可。 例如,第一个参数是Int64类型,第二个参数是Array(UInt16)类型。 -如果'x'值等于'array_from'数组中的一个元素,它将从'array_to'数组返回一个对应的元素(下标相同)。否则,它返回'default'。如果'array_from'匹配到了多个元素,则返回第一个匹配的元素。 +如果’x’值等于’array\_from’数组中的一个元素,它将从’array\_to’数组返回一个对应的元素(下标相同)。否则,它返回’default’。如果’array\_from’匹配到了多个元素,则返回第一个匹配的元素。 示例: @@ -217,18 +215,16 @@ GROUP BY title ORDER BY c DESC ``` -``` -┌─title─────┬──────c─┐ -│ Yandex │ 498635 │ -│ Google │ 229872 │ -│ Other │ 104472 │ -└───────────┴────────┘ -``` + ┌─title─────┬──────c─┐ + │ Yandex │ 498635 │ + │ Google │ 229872 │ + │ Other │ 104472 │ + └───────────┴────────┘ -2. `transform(x, array_from, array_to)` +1. 
`transform(x, array_from, array_to)` -与第一种不同在于省略了'default'参数。 -如果'x'值等于'array_from'数组中的一个元素,它将从'array_to'数组返回相应的元素(下标相同)。 否则,它返回'x'。 +与第一种不同在于省略了’default’参数。 +如果’x’值等于’array\_from’数组中的一个元素,它将从’array\_to’数组返回相应的元素(下标相同)。 否则,它返回’x’。 类型约束: @@ -246,21 +242,19 @@ ORDER BY count() DESC LIMIT 10 ``` -``` -┌─s──────────────┬───────c─┐ -│ │ 2906259 │ -│ www.yandex │ 867767 │ -│ ███████.ru │ 313599 │ -│ mail.yandex.ru │ 107147 │ -│ ██████.ru │ 100355 │ -│ █████████.ru │ 65040 │ -│ news.yandex.ru │ 64515 │ -│ ██████.net │ 59141 │ -│ example.com │ 57316 │ -└────────────────┴─────────┘ -``` + ┌─s──────────────┬───────c─┐ + │ │ 2906259 │ + │ www.yandex │ 867767 │ + │ ███████.ru │ 313599 │ + │ mail.yandex.ru │ 107147 │ + │ ██████.ru │ 100355 │ + │ █████████.ru │ 65040 │ + │ news.yandex.ru │ 64515 │ + │ ██████.net │ 59141 │ + │ example.com │ 57316 │ + └────────────────┴─────────┘ -## formatReadableSize(x) +## formatReadableSize(x) {#formatreadablesizex} 接受大小(字节数)。返回带有后缀(KiB, MiB等)的字符串。 @@ -272,36 +266,34 @@ SELECT formatReadableSize(filesize_bytes) AS filesize ``` -``` -┌─filesize_bytes─┬─filesize───┐ -│ 1 │ 1.00 B │ -│ 1024 │ 1.00 KiB │ -│ 1048576 │ 1.00 MiB │ -│ 192851925 │ 183.92 MiB │ -└────────────────┴────────────┘ -``` + ┌─filesize_bytes─┬─filesize───┐ + │ 1 │ 1.00 B │ + │ 1024 │ 1.00 KiB │ + │ 1048576 │ 1.00 MiB │ + │ 192851925 │ 183.92 MiB │ + └────────────────┴────────────┘ -## least(a, b) +## least(a, b) {#leasta-b} 返回a和b中的最小值。 -## greatest(a, b) +## greatest(a, b) {#greatesta-b} 返回a和b的最大值。 -## uptime() +## uptime() {#uptime} 返回服务正常运行的秒数。 -## version() +## version() {#version} 以字符串形式返回服务器的版本。 -## timezone() +## timezone() {#timezone} 返回服务器的时区。 -## blockNumber +## blockNumber {#blocknumber} 返回行所在的Block的序列号。 @@ -309,11 +301,11 @@ SELECT 返回行所在Block中行的序列号。 针对不同的Block始终重新计算。 -## rowNumberInAllBlocks() +## rowNumberInAllBlocks() {#rownumberinallblocks} 返回行所在结果集中的序列号。此函数仅考虑受影响的Block。 -## runningDifference(x) {#other_functions-runningdifference} +## runningDifference(x) {#other-functions-runningdifference} 计算数据块中相邻行的值之间的差异。 对于第一行返回0,并为每个后续行返回与前一行的差异。 @@ -340,204 +332,181 @@ FROM ) ``` -``` -┌─EventID─┬───────────EventTime─┬─delta─┐ -│ 1106 │ 2016-11-24 00:00:04 │ 0 │ -│ 1107 │ 2016-11-24 00:00:05 │ 1 │ -│ 1108 │ 2016-11-24 00:00:05 │ 0 │ -│ 1109 │ 2016-11-24 00:00:09 │ 4 │ -│ 1110 │ 2016-11-24 00:00:10 │ 1 │ -└─────────┴─────────────────────┴───────┘ -``` + ┌─EventID─┬───────────EventTime─┬─delta─┐ + │ 1106 │ 2016-11-24 00:00:04 │ 0 │ + │ 1107 │ 2016-11-24 00:00:05 │ 1 │ + │ 1108 │ 2016-11-24 00:00:05 │ 0 │ + │ 1109 │ 2016-11-24 00:00:09 │ 4 │ + │ 1110 │ 2016-11-24 00:00:10 │ 1 │ + └─────────┴─────────────────────┴───────┘ -## runningDifferenceStartingWithFirstValue +## runningDifferenceStartingWithFirstValue {#runningdifferencestartingwithfirstvalue} 与[runningDifference](./other_functions.md#other_functions-runningdifference)相同,区别在于第一行返回第一行的值,后续每个后续行返回与上一行的差值。 -## MACNumToString(num) +## MACNumToString(num) {#macnumtostringnum} 接受一个UInt64类型的数字。 将其解释为big endian的MAC地址。 返回包含相应MAC地址的字符串,格式为AA:BB:CC:DD:EE:FF(以冒号分隔的十六进制形式的数字)。 -## MACStringToNum(s) +## MACStringToNum(s) {#macstringtonums} 与MACNumToString相反。 如果MAC地址格式无效,则返回0。 -## MACStringToOUI(s) +## MACStringToOUI(s) {#macstringtoouis} 接受格式为AA:BB:CC:DD:EE:FF(十六进制形式的冒号分隔数字)的MAC地址。 返回前三个八位字节作为UInt64编号。 如果MAC地址格式无效,则返回0。 -## getSizeOfEnumType +## getSizeOfEnumType {#getsizeofenumtype} 返回[Enum](../../data_types/enum.md)中的枚举数量。 -``` -getSizeOfEnumType(value) -``` + getSizeOfEnumType(value) **参数:** -- `value` — `Enum`类型的值。 +- `value` — `Enum`类型的值。 **返回值** -- 
`Enum`的枚举数量。 -- 如果类型不是`Enum`,则抛出异常。 +- `Enum`的枚举数量。 +- 如果类型不是`Enum`,则抛出异常。 **示例** -``` -SELECT getSizeOfEnumType( CAST('a' AS Enum8('a' = 1, 'b' = 2) ) ) AS x + SELECT getSizeOfEnumType( CAST('a' AS Enum8('a' = 1, 'b' = 2) ) ) AS x -┌─x─┐ -│ 2 │ -└───┘ -``` + ┌─x─┐ + │ 2 │ + └───┘ -## toColumnTypeName +## toColumnTypeName {#tocolumntypename} 返回在RAM中列的数据类型的名称。 -``` -toColumnTypeName(value) -``` + toColumnTypeName(value) **参数:** -- `value` — 任何类型的值。 +- `value` — 任何类型的值。 **返回值** -- 一个字符串,其内容是`value`在RAM中的类型名称。 +- 一个字符串,其内容是`value`在RAM中的类型名称。 -**` toTypeName ' 与 ' toColumnTypeName`的区别示例** +**`toTypeName ' 与 ' toColumnTypeName`的区别示例** -``` -:) select toTypeName(cast('2018-01-01 01:02:03' AS DateTime)) + :) select toTypeName(cast('2018-01-01 01:02:03' AS DateTime)) -SELECT toTypeName(CAST('2018-01-01 01:02:03', 'DateTime')) + SELECT toTypeName(CAST('2018-01-01 01:02:03', 'DateTime')) -┌─toTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ -│ DateTime │ -└─────────────────────────────────────────────────────┘ + ┌─toTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ + │ DateTime │ + └─────────────────────────────────────────────────────┘ -1 rows in set. Elapsed: 0.008 sec. + 1 rows in set. Elapsed: 0.008 sec. -:) select toColumnTypeName(cast('2018-01-01 01:02:03' AS DateTime)) + :) select toColumnTypeName(cast('2018-01-01 01:02:03' AS DateTime)) -SELECT toColumnTypeName(CAST('2018-01-01 01:02:03', 'DateTime')) + SELECT toColumnTypeName(CAST('2018-01-01 01:02:03', 'DateTime')) -┌─toColumnTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ -│ Const(UInt32) │ -└───────────────────────────────────────────────────────────┘ -``` + ┌─toColumnTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ + │ Const(UInt32) │ + └───────────────────────────────────────────────────────────┘ 该示例显示`DateTime`数据类型作为`Const(UInt32)`存储在内存中。 -## dumpColumnStructure +## dumpColumnStructure {#dumpcolumnstructure} 输出在RAM中的数据结果的详细信息。 -``` -dumpColumnStructure(value) -``` + dumpColumnStructure(value) **参数:** -- `value` — 任何类型的值. +- `value` — 任何类型的值. 
**返回值** -- 一个字符串,其内容是`value`在RAM中的数据结构的详细描述。 +- 一个字符串,其内容是`value`在RAM中的数据结构的详细描述。 **示例** -``` -SELECT dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime')) + SELECT dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime')) -┌─dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ -│ DateTime, Const(size = 1, UInt32(size = 1)) │ -└──────────────────────────────────────────────────────────────┘ -``` + ┌─dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ + │ DateTime, Const(size = 1, UInt32(size = 1)) │ + └──────────────────────────────────────────────────────────────┘ -## defaultValueOfArgumentType +## defaultValueOfArgumentType {#defaultvalueofargumenttype} 输出数据类型的默认值。 不包括用户设置的自定义列的默认值。 -``` -defaultValueOfArgumentType(expression) -``` + defaultValueOfArgumentType(expression) **参数:** -- `expression` — 任意类型的值或导致任意类型值的表达式。 +- `expression` — 任意类型的值或导致任意类型值的表达式。 **返回值** -- 数值类型返回`0`。 -- 字符串类型返回空的字符串。 -- [Nullable](../../data_types/nullable.md)类型返回`ᴺᵁᴸᴸ`。 +- 数值类型返回`0`。 +- 字符串类型返回空的字符串。 +- [Nullable](../../data_types/nullable.md)类型返回`ᴺᵁᴸᴸ`。 **示例** -``` -:) SELECT defaultValueOfArgumentType( CAST(1 AS Int8) ) + :) SELECT defaultValueOfArgumentType( CAST(1 AS Int8) ) -SELECT defaultValueOfArgumentType(CAST(1, 'Int8')) + SELECT defaultValueOfArgumentType(CAST(1, 'Int8')) -┌─defaultValueOfArgumentType(CAST(1, 'Int8'))─┐ -│ 0 │ -└─────────────────────────────────────────────┘ + ┌─defaultValueOfArgumentType(CAST(1, 'Int8'))─┐ + │ 0 │ + └─────────────────────────────────────────────┘ -1 rows in set. Elapsed: 0.002 sec. + 1 rows in set. Elapsed: 0.002 sec. -:) SELECT defaultValueOfArgumentType( CAST(1 AS Nullable(Int8) ) ) + :) SELECT defaultValueOfArgumentType( CAST(1 AS Nullable(Int8) ) ) -SELECT defaultValueOfArgumentType(CAST(1, 'Nullable(Int8)')) + SELECT defaultValueOfArgumentType(CAST(1, 'Nullable(Int8)')) -┌─defaultValueOfArgumentType(CAST(1, 'Nullable(Int8)'))─┐ -│ ᴺᵁᴸᴸ │ -└───────────────────────────────────────────────────────┘ + ┌─defaultValueOfArgumentType(CAST(1, 'Nullable(Int8)'))─┐ + │ ᴺᵁᴸᴸ │ + └───────────────────────────────────────────────────────┘ -1 rows in set. Elapsed: 0.002 sec. -``` + 1 rows in set. Elapsed: 0.002 sec. - -## replicate +## replicate {#replicate} 使用单个值填充一个数组。 用于[arrayJoin](array_join.md#functions_arrayjoin)的内部实现。 -``` -replicate(x, arr) -``` + replicate(x, arr) **参数:** -- `arr` — 原始数组。 ClickHouse创建一个与原始数据长度相同的新数组,并用值`x`填充它。 -- `x` — 生成的数组将被填充的值。 +- `arr` — 原始数组。 ClickHouse创建一个与原始数据长度相同的新数组,并用值`x`填充它。 +- `x` — 生成的数组将被填充的值。 **输出** -- 一个被`x`填充的数组。 +- 一个被`x`填充的数组。 **示例** -``` -SELECT replicate(1, ['a', 'b', 'c']) + SELECT replicate(1, ['a', 'b', 'c']) -┌─replicate(1, ['a', 'b', 'c'])─┐ -│ [1,1,1] │ -└───────────────────────────────┘ -``` + ┌─replicate(1, ['a', 'b', 'c'])─┐ + │ [1,1,1] │ + └───────────────────────────────┘ -## filesystemAvailable +## filesystemAvailable {#filesystemavailable} 返回磁盘的剩余空间信息(以字节为单位)。使用配置文件中的path配置评估此信息。 -## filesystemCapacity +## filesystemCapacity {#filesystemcapacity} 返回磁盘的容量信息,以字节为单位。使用配置文件中的path配置评估此信息。 @@ -552,15 +521,16 @@ SELECT replicate(1, ['a', 'b', 'c']) 例如,获取聚合函数的状态(示例runningAccumulate(uniqState(UserID))),对于数据块的每一行,返回所有先前行和当前行的状态合并后的聚合函数的结果。 因此,函数的结果取决于分区中数据块的顺序以及数据块中行的顺序。 -## joinGet('join_storage_table_name', 'get_column', join_key) +## joinGet(‘join\_storage\_table\_name’, ‘get\_column’, join\_key) {#joingetjoin-storage-table-name-get-column-join-key} 使用指定的连接键从Join类型引擎的表中获取数据。 -## modelEvaluate(model_name, ...) 
{#function-modelevaluate} +## modelEvaluate(model\_name, …) {#function-modelevaluate} + 使用外部模型计算。 接受模型的名称以及模型的参数。返回Float64类型的值。 -## throwIf(x) +## throwIf(x) {#throwifx} 如果参数不为零则抛出异常。 diff --git a/docs/zh/query_language/functions/random_functions.md b/docs/zh/query_language/functions/random_functions.md index cd457257143..31283cce08a 100644 --- a/docs/zh/query_language/functions/random_functions.md +++ b/docs/zh/query_language/functions/random_functions.md @@ -1,4 +1,4 @@ -# 随机函数 +# 随机函数 {#sui-ji-han-shu} 随机函数使用非加密方式生成伪随机数字。 @@ -6,15 +6,15 @@ 您可以向它传递任何类型的参数,但传递的参数将不会使用在任何随机数生成过程中。 此参数的唯一目的是防止公共子表达式消除,以便在相同的查询中使用相同的随机函数生成不同的随机数。 -## rand +## rand {#rand} 返回一个UInt32类型的随机数字,所有UInt32类型的数字被生成的概率均相等。此函数线性同于的方式生成随机数。 -## rand64 +## rand64 {#rand64} 返回一个UInt64类型的随机数字,所有UInt64类型的数字被生成的概率均相等。此函数线性同于的方式生成随机数。 -## randConstant +## randConstant {#randconstant} 返回一个UInt32类型的随机数字,该函数不同之处在于仅为每个数据块参数一个随机数。 diff --git a/docs/zh/query_language/functions/rounding_functions.md b/docs/zh/query_language/functions/rounding_functions.md index c6e9de0ea15..4f18f3e983e 100644 --- a/docs/zh/query_language/functions/rounding_functions.md +++ b/docs/zh/query_language/functions/rounding_functions.md @@ -1,6 +1,6 @@ -# 取整函数 +# 取整函数 {#qu-zheng-han-shu} -## floor(x\[, N\]) +## floor(x\[, N\]) {#floorx-n} 返回小于或等于x的最大舍入数。该函数使用参数乘1/10N,如果1/10N不精确,则选择最接近的精确的适当数据类型的数。 ‘N’是一个整数常量,可选参数。默认为0,这意味着不对其进行舍入。 @@ -9,84 +9,77 @@ 示例: `floor(123.45, 1) = 123.4, floor(123.45, -1) = 120.` `x`是任何数字类型。结果与其为相同类型。 -对于整数参数,使用负‘N’值进行舍入是有意义的(对于非负“N”,该函数不执行任何操作)。 +对于整数参数,使用负‘N’值进行舍入是有意义的(对于非负«N»,该函数不执行任何操作)。 如果取整导致溢出(例如,floor(-128,-1)),则返回特定于实现的结果。 -## ceil(x\[, N\]), ceiling(x\[, N\]) +## ceil(x\[, N\]), ceiling(x\[, N\]) {#ceilx-n-ceilingx-n} -返回大于或等于'x'的最小舍入数。在其他方面,它与'floor'功能相同(见上文)。 +返回大于或等于’x’的最小舍入数。在其他方面,它与’floor’功能相同(见上文)。 -## round(x[, N]) {#rounding_functions-round} +## round(x\[, N\]) {#rounding-functions-round} 将值取整到指定的小数位数。 该函数按顺序返回最近的数字。如果给定数字包含多个最近数字,则函数返回其中最接近偶数的数字(银行的取整方式)。 -``` -round(expression [, decimal_places]) -``` + round(expression [, decimal_places]) **参数:** -- `expression` — 要进行取整的数字。可以是任何返回数字[类型](../../data_types/index.md#data_types)的[表达式](../syntax.md#syntax-expressions)。 -- `decimal-places` — 整数类型。 - - 如果`decimal-places > 0`,则该函数将值舍入小数点右侧。 - - 如果`decimal-places < 0`,则该函数将小数点左侧的值四舍五入。 - - 如果`decimal-places = 0`,则该函数将该值舍入为整数。在这种情况下,可以省略参数。 +- `expression` — 要进行取整的数字。可以是任何返回数字[类型](../../data_types/index.md#data_types)的[表达式](../syntax.md#syntax-expressions)。 +- `decimal-places` — 整数类型。 + - 如果`decimal-places > 0`,则该函数将值舍入小数点右侧。 + - 如果`decimal-places < 0`,则该函数将小数点左侧的值四舍五入。 + - 如果`decimal-places = 0`,则该函数将该值舍入为整数。在这种情况下,可以省略参数。 **返回值:** 与输入数字相同类型的取整后的数字。 -### 示例 +### 示例 {#shi-li} **使用示例** ``` sql SELECT number / 2 AS x, round(x) FROM system.numbers LIMIT 3 ``` -``` -┌───x─┬─round(divide(number, 2))─┐ -│ 0 │ 0 │ -│ 0.5 │ 0 │ -│ 1 │ 1 │ -└─────┴──────────────────────────┘ -``` + + ┌───x─┬─round(divide(number, 2))─┐ + │ 0 │ 0 │ + │ 0.5 │ 0 │ + │ 1 │ 1 │ + └─────┴──────────────────────────┘ **取整的示例** 取整到最近的数字。 -``` -round(3.2, 0) = 3 -round(4.1267, 2) = 4.13 -round(22,-1) = 20 -round(467,-2) = 500 -round(-467,-2) = -500 -``` + round(3.2, 0) = 3 + round(4.1267, 2) = 4.13 + round(22,-1) = 20 + round(467,-2) = 500 + round(-467,-2) = -500 银行的取整。 -``` -round(3.5) = 4 -round(4.5) = 4 -round(3.55, 1) = 3.6 -round(3.65, 1) = 3.6 -``` + round(3.5) = 4 + round(4.5) = 4 + round(3.55, 1) = 3.6 + round(3.65, 1) = 3.6 -## roundToExp2(num) +## roundToExp2(num) {#roundtoexp2num} 接受一个数字。如果数字小于1,则返回0。否则,它将数字向下舍入到最接近的(整个非负)2的x次幂。 -## 
roundDuration(num) +## roundDuration(num) {#rounddurationnum} 接受一个数字。如果数字小于1,则返回0。否则,它将数字向下舍入为集合中的数字:1,10,30,60,120,180,240,300,600,1200,1800,3600,7200,18000,36000。此函数用于Yandex.Metrica报表中计算会话的持续时长。 -## roundAge(num) +## roundAge(num) {#roundagenum} 接受一个数字。如果数字小于18,则返回0。否则,它将数字向下舍入为集合中的数字:18,25,35,45,55。此函数用于Yandex.Metrica报表中用户年龄的计算。 -## roundDown(num, arr) +## roundDown(num, arr) {#rounddownnum-arr} 接受一个数字,将其向下舍入到指定数组中的元素。如果该值小于数组中的最低边界,则返回最低边界。 diff --git a/docs/zh/query_language/functions/splitting_merging_functions.md b/docs/zh/query_language/functions/splitting_merging_functions.md index 6193ad5f6e7..7477e89441e 100644 --- a/docs/zh/query_language/functions/splitting_merging_functions.md +++ b/docs/zh/query_language/functions/splitting_merging_functions.md @@ -1,30 +1,29 @@ -# 字符串拆分合并函数 +# 字符串拆分合并函数 {#zi-fu-chuan-chai-fen-he-bing-han-shu} -## splitByChar(separator, s) +## splitByChar(separator, s) {#splitbycharseparator-s} 将字符串以‘separator’拆分成多个子串。‘separator’必须为仅包含一个字符的字符串常量。 返回拆分后的子串的数组。 如果分隔符出现在字符串的开头或结尾,或者如果有多个连续的分隔符,则将在对应位置填充空的子串。 -## splitByString(separator, s) +## splitByString(separator, s) {#splitbystringseparator-s} 与上面相同,但它使用多个字符的字符串作为分隔符。 该字符串必须为非空。 -## arrayStringConcat(arr\[, separator\]) +## arrayStringConcat(arr\[, separator\]) {#arraystringconcatarr-separator} 使用separator将数组中列出的字符串拼接起来。‘separator’是一个可选参数:一个常量字符串,默认情况下设置为空字符串。 返回拼接后的字符串。 -## alphaTokens(s) +## alphaTokens(s) {#alphatokenss} 从范围a-z和A-Z中选择连续字节的子字符串。返回子字符串数组。 **示例:** -``` -SELECT alphaTokens('abca1abc') + SELECT alphaTokens('abca1abc') + + ┌─alphaTokens('abca1abc')─┐ + │ ['abca','abc'] │ + └─────────────────────────┘ -┌─alphaTokens('abca1abc')─┐ -│ ['abca','abc'] │ -└─────────────────────────┘ -``` [来源文章](https://clickhouse.tech/docs/en/query_language/functions/splitting_merging_functions/) diff --git a/docs/zh/query_language/functions/string_functions.md b/docs/zh/query_language/functions/string_functions.md index c2db80e8c51..a2b5355ae8c 100644 --- a/docs/zh/query_language/functions/string_functions.md +++ b/docs/zh/query_language/functions/string_functions.md @@ -1,103 +1,102 @@ -# 字符串函数 +# 字符串函数 {#zi-fu-chuan-han-shu} -## empty {#string_functions-empty} +## empty {#string-functions-empty} 对于空字符串返回1,对于非空字符串返回0。 结果类型是UInt8。 如果字符串包含至少一个字节,则该字符串被视为非空字符串,即使这是一个空格或空字符。 该函数也适用于数组。 -## notEmpty +## notEmpty {#notempty} 对于空字符串返回0,对于非空字符串返回1。 结果类型是UInt8。 该函数也适用于数组。 -## length +## length {#length} 返回字符串的字节长度。 结果类型是UInt64。 该函数也适用于数组。 -## lengthUTF8 +## lengthUTF8 {#lengthutf8} 假定字符串以UTF-8编码组成的文本,返回此字符串的Unicode字符长度。如果传入的字符串不是UTF-8编码,则函数可能返回一个预期外的值(不会抛出异常)。 结果类型是UInt64。 -## char_length, CHAR_LENGTH +## char\_length, CHAR\_LENGTH {#char-length-char-length} 假定字符串以UTF-8编码组成的文本,返回此字符串的Unicode字符长度。如果传入的字符串不是UTF-8编码,则函数可能返回一个预期外的值(不会抛出异常)。 结果类型是UInt64。 -## character_length, CHARACTER_LENGTH +## character\_length, CHARACTER\_LENGTH {#character-length-character-length} 假定字符串以UTF-8编码组成的文本,返回此字符串的Unicode字符长度。如果传入的字符串不是UTF-8编码,则函数可能返回一个预期外的值(不会抛出异常)。 结果类型是UInt64。 -## lower, lcase +## lower, lcase {#lower-lcase} 将字符串中的ASCII转换为小写。 -## upper, ucase +## upper, ucase {#upper-ucase} 将字符串中的ASCII转换为大写。 -## lowerUTF8 +## lowerUTF8 {#lowerutf8} 将字符串转换为小写,函数假设字符串是以UTF-8编码文本的字符集。 同时函数不检测语言。因此对土耳其人来说,结果可能不完全正确。 如果UTF-8字节序列的长度对于代码点的大写和小写不同,则该代码点的结果可能不正确。 如果字符串包含一组非UTF-8的字节,则将引发未定义行为。 -## upperUTF8 +## upperUTF8 {#upperutf8} 将字符串转换为大写,函数假设字符串是以UTF-8编码文本的字符集。 同时函数不检测语言。因此对土耳其人来说,结果可能不完全正确。 如果UTF-8字节序列的长度对于代码点的大写和小写不同,则该代码点的结果可能不正确。 如果字符串包含一组非UTF-8的字节,则将引发未定义行为。 -## isValidUTF8 +## isValidUTF8 {#isvalidutf8} 检查字符串是否为有效的UTF-8编码,是则返回1,否则返回0。 -## 
toValidUTF8 +## toValidUTF8 {#tovalidutf8} 用`�`(U+FFFD)字符替换无效的UTF-8字符。所有连续的无效字符都会被替换为一个替换字符。 -``` -toValidUTF8( input_string ) -``` + toValidUTF8( input_string ) 参数: -- input_string — 任何一个[String](../../data_types/string.md)类型的对象。 +- input\_string — 任何一个[String](../../data_types/string.md)类型的对象。 返回值: 有效的UTF-8字符串。 -### 示例 +### 示例 {#shi-li} -```sql +``` sql SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') ``` -```text + +``` text ┌─toValidUTF8('a����b')─┐ │ a�b │ └───────────────────────┘ ``` -## reverse +## reverse {#reverse} 反转字符串。 -## reverseUTF8 +## reverseUTF8 {#reverseutf8} 以Unicode字符为单位反转UTF-8编码的字符串。如果字符串不是UTF-8编码,则可能获取到一个非预期的结果(不会抛出异常)。 -## format(pattern, s0, s1, ...) +## format(pattern, s0, s1, …) {#formatpattern-s0-s1} -使用常量字符串`pattern`格式化其他参数。`pattern`字符串中包含由大括号`{}`包围的“替换字段”。 未被包含在大括号中的任何内容都被视为文本内容,它将原样保留在返回值中。 如果你需要在文本内容中包含一个大括号字符,它可以通过加倍来转义:`{{`和`}}`。 字段名称可以是数字(从零开始)或空(然后将它们视为连续数字) +使用常量字符串`pattern`格式化其他参数。`pattern`字符串中包含由大括号`{}`包围的«替换字段»。 未被包含在大括号中的任何内容都被视为文本内容,它将原样保留在返回值中。 如果你需要在文本内容中包含一个大括号字符,它可以通过加倍来转义:`{{ '{{' }}`和`{{ '{{' }} '}}' }}`。 字段名称可以是数字(从零开始)或空(然后将它们视为连续数字) -```sql +``` sql SELECT format('{1} {0} {1}', 'World', 'Hello') ┌─format('{1} {0} {1}', 'World', 'Hello')─┐ @@ -111,56 +110,59 @@ SELECT format('{} {}', 'Hello', 'World') └───────────────────────────────────┘ ``` -## concat(s1, s2, ...) +## concat(s1, s2, …) {#concats1-s2} 将参数中的多个字符串拼接,不带分隔符。 -## concatAssumeInjective(s1, s2, ...) +## concatAssumeInjective(s1, s2, …) {#concatassumeinjectives1-s2} -与[concat](./string_functions.md#concat-s1-s2)相同,区别在于,你需要保证concat(s1, s2, s3) -> s4是单射的,它将用于GROUP BY的优化。 +与[concat](./string_functions.md#concat-s1-s2)相同,区别在于,你需要保证concat(s1, s2, s3) -\> s4是单射的,它将用于GROUP BY的优化。 -## substring(s, offset, length), mid(s, offset, length), substr(s, offset, length) +## substring(s, offset, length), mid(s, offset, length), substr(s, offset, length) {#substrings-offset-length-mids-offset-length-substrs-offset-length} 以字节为单位截取指定位置字符串,返回以‘offset’位置为开头,长度为‘length’的子串。‘offset’从1开始(与标准SQL相同)。‘offset’和‘length’参数必须是常量。 -## substringUTF8(s, offset, length) +## substringUTF8(s, offset, length) {#substringutf8s-offset-length} 与‘substring’相同,但其操作单位为Unicode字符,函数假设字符串是以UTF-8进行编码的文本。如果不是则可能返回一个预期外的结果(不会抛出异常)。 -## appendTrailingCharIfAbsent(s, c) +## appendTrailingCharIfAbsent(s, c) {#appendtrailingcharifabsents-c} 如果‘s’字符串非空并且末尾不包含‘c’字符,则将‘c’字符附加到末尾。 -## convertCharset(s, from, to) +## convertCharset(s, from, to) {#convertcharsets-from-to} 返回从‘from’中的编码转换为‘to’中的编码的字符串‘s’。 -## base64Encode(s) +## base64Encode(s) {#base64encodes} + 将字符串‘s’编码成base64 -## base64Decode(s) +## base64Decode(s) {#base64decodes} + 使用base64将字符串解码成原始字符串。如果失败则抛出异常。 -## tryBase64Decode(s) +## tryBase64Decode(s) {#trybase64decodes} + 使用base64将字符串解码成原始字符串。但如果出现错误,将返回空字符串。 -## endsWith(s, suffix) +## endsWith(s, suffix) {#endswiths-suffix} 返回是否以指定的后缀结尾。如果字符串以指定的后缀结束,则返回1,否则返回0。 -## startsWith(s, prefix) +## startsWith(s, prefix) {#startswiths-prefix} 返回是否以指定的前缀开头。如果字符串以指定的前缀开头,则返回1,否则返回0。 -## trimLeft(s) +## trimLeft(s) {#trimlefts} 返回一个字符串,用于删除左侧的空白字符。 -## trimRight(s) +## trimRight(s) {#trimrights} 返回一个字符串,用于删除右侧的空白字符。 -## trimBoth(s) +## trimBoth(s) {#trimboths} 返回一个字符串,用于删除任一侧的空白字符。 diff --git a/docs/zh/query_language/functions/string_replace_functions.md b/docs/zh/query_language/functions/string_replace_functions.md index 78a2820358f..e70dcade3a0 100644 --- a/docs/zh/query_language/functions/string_replace_functions.md +++ b/docs/zh/query_language/functions/string_replace_functions.md @@ -1,20 +1,20 @@ -# 字符串替换函数 +# 字符串替换函数 
{#zi-fu-chuan-ti-huan-han-shu} -## replaceOne(haystack, pattern, replacement) +## replaceOne(haystack, pattern, replacement) {#replaceonehaystack-pattern-replacement} 用‘replacement’子串替换‘haystack’中与‘pattern’子串第一个匹配的匹配项(如果存在)。 ‘pattern’和‘replacement’必须是常量。 -## replaceAll(haystack, pattern, replacement), replace(haystack, pattern, replacement) +## replaceAll(haystack, pattern, replacement), replace(haystack, pattern, replacement) {#replaceallhaystack-pattern-replacement-replacehaystack-pattern-replacement} 用‘replacement’子串替换‘haystack’中出现的所有‘pattern’子串。 -## replaceRegexpOne(haystack, pattern, replacement) +## replaceRegexpOne(haystack, pattern, replacement) {#replaceregexponehaystack-pattern-replacement} 使用‘pattern’正则表达式替换。 ‘pattern’可以是任意一个有效的re2正则表达式。 如果存在与正则表达式匹配的匹配项,仅替换第一个匹配项。 同时‘replacement’可以指定为正则表达式中的捕获组。可以包含`\0-\9`。 -在这种情况下,函数将使用正则表达式的整个匹配项替换‘\0’。使用其他与之对应的子模式替换对应的‘\1-\9’。要在模版中使用‘\’字符,请使用‘\’将其转义。 +在这种情况下,函数将使用正则表达式的整个匹配项替换‘\\0’。使用其他与之对应的子模式替换对应的‘\\1-\\9’。要在模版中使用‘’字符,请使用‘’将其转义。 另外还请记住,字符串文字需要额外的转义。 示例1.将日期转换为美国格式: @@ -28,15 +28,13 @@ LIMIT 7 FORMAT TabSeparated ``` -``` -2014-03-17 03/17/2014 -2014-03-18 03/18/2014 -2014-03-19 03/19/2014 -2014-03-20 03/20/2014 -2014-03-21 03/21/2014 -2014-03-22 03/22/2014 -2014-03-23 03/23/2014 -``` + 2014-03-17 03/17/2014 + 2014-03-18 03/18/2014 + 2014-03-19 03/19/2014 + 2014-03-20 03/20/2014 + 2014-03-21 03/21/2014 + 2014-03-22 03/22/2014 + 2014-03-23 03/23/2014 示例2.复制字符串十次: @@ -44,13 +42,11 @@ FORMAT TabSeparated SELECT replaceRegexpOne('Hello, World!', '.*', '\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0') AS res ``` -``` -┌─res────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World! │ -└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` + ┌─res────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ + │ Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World! │ + └────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ -## replaceRegexpAll(haystack, pattern, replacement) +## replaceRegexpAll(haystack, pattern, replacement) {#replaceregexpallhaystack-pattern-replacement} 与replaceRegexpOne相同,但会替换所有出现的匹配项。例如: @@ -58,11 +54,9 @@ SELECT replaceRegexpOne('Hello, World!', '.*', '\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0') SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res ``` -``` -┌─res────────────────────────┐ -│ HHeelllloo,, WWoorrlldd!! │ -└────────────────────────────┘ -``` + ┌─res────────────────────────┐ + │ HHeelllloo,, WWoorrlldd!! │ + └────────────────────────────┘ 例外的是,如果使用正则表达式捕获空白子串,则仅会进行一次替换。 示例: @@ -71,18 +65,15 @@ SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res SELECT replaceRegexpAll('Hello, World!', '^', 'here: ') AS res ``` -``` -┌─res─────────────────┐ -│ here: Hello, World! │ -└─────────────────────┘ -``` + ┌─res─────────────────┐ + │ here: Hello, World! 
│ + └─────────────────────┘ - -## regexpQuoteMeta(s) +## regexpQuoteMeta(s) {#regexpquotemetas} 该函数用于在字符串中的某些预定义字符之前添加反斜杠。 -预定义字符:'0','\\','|','(',')','^','$','。','[',']','?','* ','+','{',':',' - '。 -这个实现与re2 :: RE2 :: QuoteMeta略有不同。它以\0而不是\x00转义零字节,它只转义所需的字符。 -有关详细信息,请参阅链接:[RE2](https://github.com/google/re2/blob/master/re2/re2.cc#L473) +预定义字符:‘0’,‘\\’,‘\|’,‘(’,‘)’,‘^’,‘$’,‘。’,‘\[’,’\]’,‘?’,‘\*’,‘+’,‘{’,‘:’,’ - ’。 +这个实现与re2 :: RE2 :: QuoteMeta略有不同。它以\\0而不是00转义零字节,它只转义所需的字符。 +有关详细信息,请参阅链接:\[RE2\](https://github.com/google/re2/blob/master/re2/re2.cc\#L473) [来源文章](https://clickhouse.tech/docs/en/query_language/functions/string_replace_functions/) diff --git a/docs/zh/query_language/functions/string_search_functions.md b/docs/zh/query_language/functions/string_search_functions.md index 97edd104700..8a27c460966 100644 --- a/docs/zh/query_language/functions/string_search_functions.md +++ b/docs/zh/query_language/functions/string_search_functions.md @@ -1,39 +1,39 @@ -# 字符串搜索函数 +# 字符串搜索函数 {#zi-fu-chuan-sou-suo-han-shu} 下列所有函数在默认的情况下区分大小写。对于不区分大小写的搜索,存在单独的变体。 -## position(haystack, needle), locate(haystack, needle) +## position(haystack, needle), locate(haystack, needle) {#positionhaystack-needle-locatehaystack-needle} 在字符串`haystack`中搜索子串`needle`。 返回子串的位置(以字节为单位),从1开始,如果未找到子串,则返回0。 对于不区分大小写的搜索,请使用函数`positionCaseInsensitive`。 -## positionUTF8(haystack, needle) +## positionUTF8(haystack, needle) {#positionutf8haystack-needle} 与`position`相同,但位置以Unicode字符返回。此函数工作在UTF-8编码的文本字符集中。如非此编码的字符集,则返回一些非预期结果(他不会抛出异常)。 对于不区分大小写的搜索,请使用函数`positionCaseInsensitiveUTF8`。 -## multiSearchAllPositions(haystack, [needle1, needle2, ..., needlen]) +## multiSearchAllPositions(haystack, \[needle1, needle2, …, needlen\]) {#multisearchallpositionshaystack-needle1-needle2-needlen} 与`position`相同,但函数返回一个数组,其中包含所有匹配needlei的位置。 对于不区分大小写的搜索或/和UTF-8格式,使用函数`multiSearchAllPositionsCaseInsensitive,multiSearchAllPositionsUTF8,multiSearchAllPositionsCaseInsensitiveUTF8`。 -## multiSearchFirstPosition(haystack, [needle1, needle2, ..., needlen]) +## multiSearchFirstPosition(haystack, \[needle1, needle2, …, needlen\]) {#multisearchfirstpositionhaystack-needle1-needle2-needlen} 与`position`相同,但返回在`haystack`中与needles字符串匹配的最左偏移。 对于不区分大小写的搜索或/和UTF-8格式,使用函数`multiSearchFirstPositionCaseInsensitive,multiSearchFirstPositionUTF8,multiSearchFirstPositionCaseInsensitiveUTF8`。 -## multiSearchFirstIndex(haystack, [needle1, needle2, ..., needlen]) +## multiSearchFirstIndex(haystack, \[needle1, needle2, …, needlen\]) {#multisearchfirstindexhaystack-needle1-needle2-needlen} 返回在字符串`haystack`中最先查找到的needlei的索引`i`(从1开始),没有找到任何匹配项则返回0。 对于不区分大小写的搜索或/和UTF-8格式,使用函数`multiSearchFirstIndexCaseInsensitive,multiSearchFirstIndexUTF8,multiSearchFirstIndexCaseInsensitiveUTF8`。 -## multiSearchAny(haystack, [needle1, needle2, ..., needlen]) +## multiSearchAny(haystack, \[needle1, needle2, …, needlen\]) {#multisearchanyhaystack-needle1-needle2-needlen} 如果`haystack`中至少存在一个needlei匹配则返回1,否则返回0。 @@ -42,7 +42,7 @@ !!! 
note "注意" 在所有`multiSearch*`函数中,由于实现规范,needles的数量应小于28。 -## match(haystack, pattern) +## match(haystack, pattern) {#matchhaystack-pattern} 检查字符串是否与`pattern`正则表达式匹配。`pattern`可以是一个任意的`re2`正则表达式。 `re2`正则表达式的[语法](https://github.com/google/re2/wiki/Syntax)比Perl正则表达式的语法存在更多限制。 @@ -51,24 +51,24 @@ 请注意,反斜杠符号(`\`)用于在正则表达式中转义。由于字符串中采用相同的符号来进行转义。因此,为了在正则表达式中转义符号,必须在字符串文字中写入两个反斜杠(\\)。 正则表达式与字符串一起使用,就像它是一组字节一样。正则表达式中不能包含空字节。 -对于在字符串中搜索子字符串的模式,最好使用LIKE或“position”,因为它们更加高效。 +对于在字符串中搜索子字符串的模式,最好使用LIKE或«position»,因为它们更加高效。 -## multiMatchAny(haystack, [pattern1, pattern2, ..., patternn]) +## multiMatchAny(haystack, \[pattern1, pattern2, …, patternn\]) {#multimatchanyhaystack-pattern1-pattern2-patternn} -与`match`相同,但如果所有正则表达式都不匹配,则返回0;如果任何模式匹配,则返回1。它使用[hyperscan](https://github.com/intel/hyperscan)库。对于在字符串中搜索子字符串的模式,最好使用“multisearchany”,因为它更高效。 +与`match`相同,但如果所有正则表达式都不匹配,则返回0;如果任何模式匹配,则返回1。它使用[hyperscan](https://github.com/intel/hyperscan)库。对于在字符串中搜索子字符串的模式,最好使用«multisearchany»,因为它更高效。 !!! note "注意" - 任何`haystack`字符串的长度必须小于232字节,否则抛出异常。这种限制是因为hyperscan API而产生的。 + 任何`haystack`字符串的长度必须小于232\字节,否则抛出异常。这种限制是因为hyperscan API而产生的。 -## multiMatchAnyIndex(haystack, [pattern1, pattern2, ..., patternn]) +## multiMatchAnyIndex(haystack, \[pattern1, pattern2, …, patternn\]) {#multimatchanyindexhaystack-pattern1-pattern2-patternn} 与`multiMatchAny`相同,但返回与haystack匹配的任何内容的索引位置。 -## multiFuzzyMatchAny(haystack, distance, [pattern1, pattern2, ..., patternn]) +## multiFuzzyMatchAny(haystack, distance, \[pattern1, pattern2, …, patternn\]) {#multifuzzymatchanyhaystack-distance-pattern1-pattern2-patternn} 与`multiMatchAny`相同,但如果在haystack能够查找到任何模式匹配能够在指定的[编辑距离](https://en.wikipedia.org/wiki/Edit_distance)内进行匹配,则返回1。此功能也处于实验模式,可能非常慢。有关更多信息,请参阅[hyperscan文档](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching)。 -## multiFuzzyMatchAnyIndex(haystack, distance, [pattern1, pattern2, ..., patternn]) +## multiFuzzyMatchAnyIndex(haystack, distance, \[pattern1, pattern2, …, patternn\]) {#multifuzzymatchanyindexhaystack-distance-pattern1-pattern2-patternn} 与`multiFuzzyMatchAny`相同,但返回匹配项的匹配能容的索引位置。 @@ -78,15 +78,15 @@ !!! 
note "注意" 如要关闭所有hyperscan函数的使用,请设置`SET allow_hyperscan = 0;`。 -## extract(haystack, pattern) +## extract(haystack, pattern) {#extracthaystack-pattern} 使用正则表达式截取字符串。如果‘haystack’与‘pattern’不匹配,则返回空字符串。如果正则表达式中不包含子模式,它将获取与整个正则表达式匹配的子串。否则,它将获取与第一个子模式匹配的子串。 -## extractAll(haystack, pattern) +## extractAll(haystack, pattern) {#extractallhaystack-pattern} 使用正则表达式提取字符串的所有片段。如果‘haystack’与‘pattern’正则表达式不匹配,则返回一个空字符串。否则返回所有与正则表达式匹配的字符串数组。通常,行为与‘extract’函数相同(它采用第一个子模式,如果没有子模式,则采用整个表达式)。 -## like(haystack, pattern), haystack LIKE pattern operator +## like(haystack, pattern), haystack LIKE pattern operator {#likehaystack-pattern-haystack-like-pattern-operator} 检查字符串是否与简单正则表达式匹配。 正则表达式可以包含的元符号有`%`和`_`。 @@ -95,22 +95,22 @@ `_` 表示任何一个字节。 -可以使用反斜杠(`\`)来对元符号进行转义。请参阅“match”函数说明中有关转义的说明。 +可以使用反斜杠(`\`)来对元符号进行转义。请参阅«match»函数说明中有关转义的说明。 对于像`%needle%`这样的正则表达式,改函数与`position`函数一样快。 对于其他正则表达式,函数与‘match’函数相同。 -## notLike(haystack, pattern), haystack NOT LIKE pattern operator +## notLike(haystack, pattern), haystack NOT LIKE pattern operator {#notlikehaystack-pattern-haystack-not-like-pattern-operator} 与‘like’函数返回相反的结果。 -## ngramDistance(haystack, needle) +## ngramDistance(haystack, needle) {#ngramdistancehaystack-needle} -基于4-gram计算`haystack`和`needle`之间的距离:计算两个4-gram集合之间的对称差异,并用它们的基数和对其进行归一化。返回0到1之间的任何浮点数 -- 越接近0则表示越多的字符串彼此相似。如果常量的`needle`或`haystack`超过32KB,函数将抛出异常。如果非常量的`haystack`或`needle`字符串超过32Kb,则距离始终为1。 +基于4-gram计算`haystack`和`needle`之间的距离:计算两个4-gram集合之间的对称差异,并用它们的基数和对其进行归一化。返回0到1之间的任何浮点数 – 越接近0则表示越多的字符串彼此相似。如果常量的`needle`或`haystack`超过32KB,函数将抛出异常。如果非常量的`haystack`或`needle`字符串超过32Kb,则距离始终为1。 对于不区分大小写的搜索或/和UTF-8格式,使用函数`ngramDistanceCaseInsensitive,ngramDistanceUTF8,ngramDistanceCaseInsensitiveUTF8`。 -## ngramSearch(haystack, needle) +## ngramSearch(haystack, needle) {#ngramsearchhaystack-needle} 与`ngramDistance`相同,但计算`needle`和`haystack`之间的非对称差异——`needle`的n-gram减去`needle`归一化n-gram。可用于模糊字符串搜索。 @@ -119,5 +119,4 @@ !!! 
note "注意" 对于UTF-8,我们使用3-gram。所有这些都不是完全公平的n-gram距离。我们使用2字节哈希来散列n-gram,然后计算这些哈希表之间的(非)对称差异 - 可能会发生冲突。对于UTF-8不区分大小写的格式,我们不使用公平的`tolower`函数 - 我们将每个Unicode字符字节的第5位(从零开始)和字节的第一位归零 - 这适用于拉丁语,主要用于所有西里尔字母。 - [来源文章](https://clickhouse.tech/docs/en/query_language/functions/string_search_functions/) diff --git a/docs/zh/query_language/functions/type_conversion_functions.md b/docs/zh/query_language/functions/type_conversion_functions.md index 5a53fddf0c8..53da98a69d6 100644 --- a/docs/zh/query_language/functions/type_conversion_functions.md +++ b/docs/zh/query_language/functions/type_conversion_functions.md @@ -1,18 +1,18 @@ -# 类型转换函数 +# 类型转换函数 {#lei-xing-zhuan-huan-han-shu} -## toUInt8, toUInt16, toUInt32, toUInt64 +## toUInt8, toUInt16, toUInt32, toUInt64 {#touint8-touint16-touint32-touint64} -## toInt8, toInt16, toInt32, toInt64 +## toInt8, toInt16, toInt32, toInt64 {#toint8-toint16-toint32-toint64} -## toFloat32, toFloat64 +## toFloat32, toFloat64 {#tofloat32-tofloat64} -## toDate, toDateTime +## toDate, toDateTime {#todate-todatetime} -## toUInt8OrZero, toUInt16OrZero, toUInt32OrZero, toUInt64OrZero, toInt8OrZero, toInt16OrZero, toInt32OrZero, toInt64OrZero, toFloat32OrZero, toFloat64OrZero, toDateOrZero, toDateTimeOrZero +## toUInt8OrZero, toUInt16OrZero, toUInt32OrZero, toUInt64OrZero, toInt8OrZero, toInt16OrZero, toInt32OrZero, toInt64OrZero, toFloat32OrZero, toFloat64OrZero, toDateOrZero, toDateTimeOrZero {#touint8orzero-touint16orzero-touint32orzero-touint64orzero-toint8orzero-toint16orzero-toint32orzero-toint64orzero-tofloat32orzero-tofloat64orzero-todateorzero-todatetimeorzero} -## toUInt8OrNull, toUInt16OrNull, toUInt32OrNull, toUInt64OrNull, toInt8OrNull, toInt16OrNull, toInt32OrNull, toInt64OrNull, toFloat32OrNull, toFloat64OrNull, toDateOrNull, toDateTimeOrNull +## toUInt8OrNull, toUInt16OrNull, toUInt32OrNull, toUInt64OrNull, toInt8OrNull, toInt16OrNull, toInt32OrNull, toInt64OrNull, toFloat32OrNull, toFloat64OrNull, toDateOrNull, toDateTimeOrNull {#touint8ornull-touint16ornull-touint32ornull-touint64ornull-toint8ornull-toint16ornull-toint32ornull-toint64ornull-tofloat32ornull-tofloat64ornull-todateornull-todatetimeornull} -## toString +## toString {#tostring} 这些函数用于在数字、字符串(不包含FixedString)、Date以及DateTime之间互相转换。 所有的函数都接受一个参数。 @@ -24,12 +24,10 @@ toDate/toDateTime函数的日期和日期时间格式定义如下: -``` -YYYY-MM-DD -YYYY-MM-DD hh:mm:ss -``` + YYYY-MM-DD + YYYY-MM-DD hh:mm:ss -例外的是,如果将UInt32、Int32、UInt64或Int64类型的数值转换为Date类型,并且其对应的值大于等于65536,则该数值将被解析成unix时间戳(而不是对应的天数)。这意味着允许写入‘toDate(unix_timestamp)’这种常见情况,否则这将是错误的,并且需要便携更加繁琐的‘toDate(toDateTime(unix_timestamp))’。 +例外的是,如果将UInt32、Int32、UInt64或Int64类型的数值转换为Date类型,并且其对应的值大于等于65536,则该数值将被解析成unix时间戳(而不是对应的天数)。这意味着允许写入‘toDate(unix\_timestamp)’这种常见情况,否则这将是错误的,并且需要便携更加繁琐的‘toDate(toDateTime(unix\_timestamp))’。 Date与DateTime之间的转换以更为自然的方式进行:通过添加空的time或删除time。 @@ -43,24 +41,22 @@ SELECT toString(now(), 'Asia/Yekaterinburg') AS now_yekat ``` -``` -┌───────────now_local─┬─now_yekat───────────┐ -│ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ -└─────────────────────┴─────────────────────┘ -``` + ┌───────────now_local─┬─now_yekat───────────┐ + │ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ + └─────────────────────┴─────────────────────┘ 另请参阅`toUnixTimestamp`函数。 -## toDecimal32(value, S), toDecimal64(value, S), toDecimal128(value, S) +## toDecimal32(value, S), toDecimal64(value, S), toDecimal128(value, S) {#todecimal32value-s-todecimal64value-s-todecimal128value-s} 将`value`转换为精度为`S`的[Decimal](../../data_types/decimal.md)。`value`可以是数字或字符串。`S`参数为指定的小数位数。 -## toFixedString(s, N) 
+## toFixedString(s, N) {#tofixedstrings-n} 将String类型的参数转换为FixedString(N)类型的值(具有固定长度N的字符串)。N必须是一个常量。 如果字符串的字节数少于N,则向右填充空字节。如果字符串的字节数多于N,则抛出异常。 -## toStringCutToZero(s) +## toStringCutToZero(s) {#tostringcuttozeros} 接受String或FixedString参数。返回String,其内容在找到的第一个零字节处被截断。 @@ -70,41 +66,37 @@ SELECT SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut ``` -``` -┌─s─────────────┬─s_cut─┐ -│ foo\0\0\0\0\0 │ foo │ -└───────────────┴───────┘ -``` + ┌─s─────────────┬─s_cut─┐ + │ foo\0\0\0\0\0 │ foo │ + └───────────────┴───────┘ ``` sql SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut ``` -``` -┌─s──────────┬─s_cut─┐ -│ foo\0bar\0 │ foo │ -└────────────┴───────┘ -``` + ┌─s──────────┬─s_cut─┐ + │ foo\0bar\0 │ foo │ + └────────────┴───────┘ -## reinterpretAsUInt8, reinterpretAsUInt16, reinterpretAsUInt32, reinterpretAsUInt64 +## reinterpretAsUInt8, reinterpretAsUInt16, reinterpretAsUInt32, reinterpretAsUInt64 {#reinterpretasuint8-reinterpretasuint16-reinterpretasuint32-reinterpretasuint64} -## reinterpretAsInt8, reinterpretAsInt16, reinterpretAsInt32, reinterpretAsInt64 +## reinterpretAsInt8, reinterpretAsInt16, reinterpretAsInt32, reinterpretAsInt64 {#reinterpretasint8-reinterpretasint16-reinterpretasint32-reinterpretasint64} -## reinterpretAsFloat32, reinterpretAsFloat64 +## reinterpretAsFloat32, reinterpretAsFloat64 {#reinterpretasfloat32-reinterpretasfloat64} -## reinterpretAsDate, reinterpretAsDateTime +## reinterpretAsDate, reinterpretAsDateTime {#reinterpretasdate-reinterpretasdatetime} 这些函数接受一个字符串,并将放在字符串开头的字节解释为主机顺序中的数字(little endian)。如果字符串不够长,则函数就像使用必要数量的空字节填充字符串一样。如果字符串比需要的长,则忽略额外的字节。Date被解释为Unix时间戳的天数,DateTime被解释为Unix时间戳。 -## reinterpretAsString +## reinterpretAsString {#reinterpretasstring} 此函数接受数字、Date或DateTime,并返回一个字符串,其中包含表示主机顺序(小端)的相应值的字节。从末尾删除空字节。例如,UInt32类型值255是一个字节长的字符串。 -## reinterpretAsFixedString +## reinterpretAsFixedString {#reinterpretasfixedstring} 此函数接受数字、Date或DateTime,并返回包含表示主机顺序(小端)的相应值的字节的FixedString。从末尾删除空字节。例如,UInt32类型值255是一个长度为一个字节的FixedString。 -## CAST(x, t) {#type_conversion_function-cast} +## CAST(x, t) {#type-conversion-function-cast} 将‘x’转换为‘t’数据类型。还支持语法CAST(x AS t) @@ -119,38 +111,34 @@ SELECT CAST(timestamp, 'FixedString(22)') AS fixed_string ``` -``` -┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ -│ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ -└─────────────────────┴─────────────────────┴────────────┴─────────────────────┴───────────────────────────┘ -``` + ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ + │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ + └─────────────────────┴─────────────────────┴────────────┴─────────────────────┴───────────────────────────┘ 将参数转换为FixedString(N),仅适用于String或FixedString(N)类型的参数。 支持将数据转换为[Nullable](../../data_types/nullable.md)。例如: -``` -SELECT toTypeName(x) FROM t_null + SELECT toTypeName(x) FROM t_null -┌─toTypeName(x)─┐ -│ Int8 │ -│ Int8 │ -└───────────────┘ + ┌─toTypeName(x)─┐ + │ Int8 │ + │ Int8 │ + └───────────────┘ -SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null + SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null -┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ -│ Nullable(UInt16) │ -│ Nullable(UInt16) │ -└─────────────────────────────────────────┘ -``` + ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ + │ Nullable(UInt16) │ + │ 
Nullable(UInt16) │ + └─────────────────────────────────────────┘ ## toIntervalYear, toIntervalQuarter, toIntervalMonth, toIntervalWeek, toIntervalDay, toIntervalHour, toIntervalMinute, toIntervalSecond {#function-tointerval} 将数字类型参数转换为Interval类型(时间区间)。 Interval类型实际上是非常有用的,您可以使用此类型的数据直接与Date或DateTime执行算术运算。同时,ClickHouse为Interval类型数据的声明提供了更方便的语法。例如: -```sql +``` sql WITH toDate('2019-01-01') AS date, INTERVAL 1 WEEK AS interval_week, @@ -160,23 +148,21 @@ SELECT date + interval_to_week ``` -``` -┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ -│ 2019-01-08 │ 2019-01-08 │ -└───────────────────────────┴──────────────────────────────┘ -``` + ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ + │ 2019-01-08 │ 2019-01-08 │ + └───────────────────────────┴──────────────────────────────┘ -## parseDateTimeBestEffort {#type_conversion_functions-parsedatetimebesteffort} +## parseDateTimeBestEffort {#type-conversion-functions-parsedatetimebesteffort} 将数字类型参数解析为Date或DateTime类型。 与toDate和toDateTime不同,parseDateTimeBestEffort可以进行更复杂的日期格式。 有关详细信息,请参阅链接:[复杂日期格式](https://xkcd.com/1179/)。 -## parseDateTimeBestEffortOrNull +## parseDateTimeBestEffortOrNull {#parsedatetimebesteffortornull} 与[parseDateTimeBestEffort](#type_conversion_functions-parsedatetimebesteffort)相同,但它遇到无法处理的日期格式时返回null。 -## parseDateTimeBestEffortOrZero +## parseDateTimeBestEffortOrZero {#parsedatetimebesteffortorzero} 与[parseDateTimeBestEffort](#type_conversion_functions-parsedatetimebesteffort)相同,但它遇到无法处理的日期格式时返回零Date或零DateTime。 diff --git a/docs/zh/query_language/functions/url_functions.md b/docs/zh/query_language/functions/url_functions.md index 7639bcf0580..df8b1cb69c4 100644 --- a/docs/zh/query_language/functions/url_functions.md +++ b/docs/zh/query_language/functions/url_functions.md @@ -1,86 +1,84 @@ -# URL函数 +# URL函数 {#urlhan-shu} 所有这些功能都不遵循RFC。它们被最大程度简化以提高性能。 -## URL截取函数 +## URL截取函数 {#urljie-qu-han-shu} 如果URL中没有要截取的内容则返回空字符串。 -### protocol +### protocol {#protocol} -返回URL的协议。例如: http、ftp、mailto、magnet... +返回URL的协议。例如: http、ftp、mailto、magnet… -### domain +### domain {#domain} 获取域名。 -### domainWithoutWWW +### domainWithoutWWW {#domainwithoutwww} 返回域名并删除第一个‘www.’。 -### topLevelDomain +### topLevelDomain {#topleveldomain} 返回顶级域名。例如:.ru。 -### firstSignificantSubdomain +### firstSignificantSubdomain {#firstsignificantsubdomain} -返回“第一个有效子域名”。这并不是一个标准概念,仅用于Yandex.Metrica。如果顶级域名为‘com’,‘net’,‘org’或者‘co’则第一个有效子域名为二级域名。否则则返回三级域名。例如,irstSignificantSubdomain ('') = 'yandex ', firstSignificantSubdomain ('') = 'yandex '。一些实现细节在未来可能会进行改变。 +返回«第一个有效子域名»。这并不是一个标准概念,仅用于Yandex.Metrica。如果顶级域名为‘com’,‘net’,‘org’或者‘co’则第一个有效子域名为二级域名。否则则返回三级域名。例如,irstSignificantSubdomain (’https://news.yandex.ru/‘) = ’yandex’, firstSignificantSubdomain (‘https://news.yandex.com.tr/’) = ‘yandex’。一些实现细节在未来可能会进行改变。 -### cutToFirstSignificantSubdomain +### cutToFirstSignificantSubdomain {#cuttofirstsignificantsubdomain} 返回包含顶级域名与第一个有效子域名之间的内容(请参阅上面的内容)。 例如, `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`. 
-### path +### path {#path} 返回URL路径。例如:`/top/news.html`,不包含请求参数。 -### pathFull +### pathFull {#pathfull} 与上面相同,但包括请求参数和fragment。例如:/top/news.html?page=2\#comments -### queryString +### queryString {#querystring} 返回请求参数。例如:page=1&lr=213。请求参数不包含问号已经\# 以及\# 之后所有的内容。 -### fragment +### fragment {#fragment} 返回URL的fragment标识。fragment不包含\#。 -### queryStringAndFragment +### queryStringAndFragment {#querystringandfragment} 返回请求参数和fragment标识。例如:page=1\#29390。 -### extractURLParameter(URL, name) +### extractURLParameter(URL, name) {#extracturlparameterurl-name} 返回URL请求参数中名称为‘name’的参数。如果不存在则返回一个空字符串。如果存在多个匹配项则返回第一个相匹配的。此函数假设参数名称与参数值在url中的编码方式相同。 -### extractURLParameters(URL) +### extractURLParameters(URL) {#extracturlparametersurl} 返回一个数组,其中以name=value的字符串形式返回url的所有请求参数。不以任何编码解析任何内容。 -### extractURLParameterNames(URL) +### extractURLParameterNames(URL) {#extracturlparameternamesurl} 返回一个数组,其中包含url的所有请求参数的名称。不以任何编码解析任何内容。 -### URLHierarchy(URL) +### URLHierarchy(URL) {#urlhierarchyurl} 返回一个数组,其中包含以/切割的URL的所有内容。?将被包含在URL路径以及请求参数中。连续的分割符号被记为一个。 -### URLPathHierarchy(URL) +### URLPathHierarchy(URL) {#urlpathhierarchyurl} 与上面相同,但结果不包含协议和host部分。 /element(root)不包括在内。该函数用于在Yandex.Metric中实现导出URL的树形结构。 -``` -URLPathHierarchy('https://example.com/browse/CONV-6788') = -[ - '/browse/', - '/browse/CONV-6788' -] -``` + URLPathHierarchy('https://example.com/browse/CONV-6788') = + [ + '/browse/', + '/browse/CONV-6788' + ] -### decodeURLComponent(URL) +### decodeURLComponent(URL) {#decodeurlcomponenturl} 返回已经解码的URL。 例如: @@ -89,35 +87,32 @@ URLPathHierarchy('https://example.com/browse/CONV-6788') = SELECT decodeURLComponent('http://127.0.0.1:8123/?query=SELECT%201%3B') AS DecodedURL; ``` -``` -┌─DecodedURL─────────────────────────────┐ -│ http://127.0.0.1:8123/?query=SELECT 1; │ -└────────────────────────────────────────┘ -``` + ┌─DecodedURL─────────────────────────────┐ + │ http://127.0.0.1:8123/?query=SELECT 1; │ + └────────────────────────────────────────┘ -## 删除URL中的部分内容 +## 删除URL中的部分内容 {#shan-chu-urlzhong-de-bu-fen-nei-rong} 如果URL中不包含指定的部分,则URL不变。 -### cutWWW +### cutWWW {#cutwww} -删除开始的第一个'www.'。 +删除开始的第一个’www.’。 -### cutQueryString +### cutQueryString {#cutquerystring} 删除请求参数。问号也将被删除。 -### cutFragment +### cutFragment {#cutfragment} 删除fragment标识。\#同样也会被删除。 -### cutQueryStringAndFragment +### cutQueryStringAndFragment {#cutquerystringandfragment} 删除请求参数以及fragment标识。问号以及\#也会被删除。 -### cutURLParameter(URL, name) +### cutURLParameter(URL, name) {#cuturlparameterurl-name} 删除URL中名称为‘name’的参数。改函数假设参数名称以及参数值经过URL相同的编码。 - [来源文章](https://clickhouse.tech/docs/en/query_language/functions/url_functions/) diff --git a/docs/zh/query_language/functions/uuid_functions.md b/docs/zh/query_language/functions/uuid_functions.md index e398847c149..2cb2ff30872 100644 --- a/docs/zh/query_language/functions/uuid_functions.md +++ b/docs/zh/query_language/functions/uuid_functions.md @@ -1,12 +1,12 @@ -# UUID函数 +# UUID函数 {#uuidhan-shu} 下面列出了所有UUID的相关函数 -## generateUUIDv4 {#uuid_function-generate} +## generateUUIDv4 {#uuid-function-generate} 生成一个UUID([版本4](https://tools.ietf.org/html/rfc4122#section-4.4))。 -```sql +``` sql generateUUIDv4() ``` @@ -30,11 +30,11 @@ UUID类型的值。 └──────────────────────────────────────┘ ``` -## toUUID (x) +## toUUID (x) {#touuid-x} 将String类型的值转换为UUID类型的值。 -```sql +``` sql toUUID(String) ``` @@ -52,7 +52,7 @@ UUID类型的值 └──────────────────────────────────────┘ ``` -## UUIDStringToNum +## UUIDStringToNum {#uuidstringtonum} 
接受一个String类型的值,其中包含36个字符且格式为`xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`,将其转换为UUID的数值并以[FixedString(16)](../../data_types/fixedstring.md)将其返回。 @@ -76,7 +76,7 @@ FixedString(16) └──────────────────────────────────────┴──────────────────┘ ``` -## UUIDNumToString +## UUIDNumToString {#uuidnumtostring} 接受一个[FixedString(16)](../../data_types/fixedstring.md)类型的值,返回其对应的String表现形式。 @@ -100,8 +100,8 @@ SELECT └──────────────────┴──────────────────────────────────────┘ ``` -## 另请参阅 +## 另请参阅 {#ling-qing-can-yue} -- [dictGetUUID](ext_dict_functions.md) +- [dictGetUUID](ext_dict_functions.md) [来源文章](https://clickhouse.tech/docs/en/query_language/functions/uuid_function/) diff --git a/docs/zh/query_language/functions/ym_dict_functions.md b/docs/zh/query_language/functions/ym_dict_functions.md index 130490b2c7a..6d03ae228e8 100644 --- a/docs/zh/query_language/functions/ym_dict_functions.md +++ b/docs/zh/query_language/functions/ym_dict_functions.md @@ -1,38 +1,36 @@ -# Functions for working with Yandex.Metrica dictionaries +# Functions for working with Yandex.Metrica dictionaries {#functions-for-working-with-yandex-metrica-dictionaries} -In order for the functions below to work, the server config must specify the paths and addresses for getting all the Yandex.Metrica dictionaries. The dictionaries are loaded at the first call of any of these functions. If the reference lists can't be loaded, an exception is thrown. +In order for the functions below to work, the server config must specify the paths and addresses for getting all the Yandex.Metrica dictionaries. The dictionaries are loaded at the first call of any of these functions. If the reference lists can’t be loaded, an exception is thrown. -For information about creating reference lists, see the section "Dictionaries". +For information about creating reference lists, see the section «Dictionaries». -## Multiple geobases +## Multiple geobases {#multiple-geobases} ClickHouse supports working with multiple alternative geobases (regional hierarchies) simultaneously, in order to support various perspectives on which countries certain regions belong to. -The 'clickhouse-server' config specifies the file with the regional hierarchy::`/opt/geo/regions_hierarchy.txt` +The ‘clickhouse-server’ config specifies the file with the regional hierarchy::`/opt/geo/regions_hierarchy.txt` -Besides this file, it also searches for files nearby that have the _ symbol and any suffix appended to the name (before the file extension). +Besides this file, it also searches for files nearby that have the \_ symbol and any suffix appended to the name (before the file extension). For example, it will also find the file `/opt/geo/regions_hierarchy_ua.txt`, if present. `ua` is called the dictionary key. For a dictionary without a suffix, the key is an empty string. -All the dictionaries are re-loaded in runtime (once every certain number of seconds, as defined in the builtin_dictionaries_reload_interval config parameter, or once an hour by default). However, the list of available dictionaries is defined one time, when the server starts. +All the dictionaries are re-loaded in runtime (once every certain number of seconds, as defined in the builtin\_dictionaries\_reload\_interval config parameter, or once an hour by default). However, the list of available dictionaries is defined one time, when the server starts. All functions for working with regions have an optional argument at the end – the dictionary key. It is referred to as the geobase. 
Example: -``` -regionToCountry(RegionID) – Uses the default dictionary: /opt/geo/regions_hierarchy.txt -regionToCountry(RegionID, '') – Uses the default dictionary: /opt/geo/regions_hierarchy.txt -regionToCountry(RegionID, 'ua') – Uses the dictionary for the 'ua' key: /opt/geo/regions_hierarchy_ua.txt -``` + regionToCountry(RegionID) – Uses the default dictionary: /opt/geo/regions_hierarchy.txt + regionToCountry(RegionID, '') – Uses the default dictionary: /opt/geo/regions_hierarchy.txt + regionToCountry(RegionID, 'ua') – Uses the dictionary for the 'ua' key: /opt/geo/regions_hierarchy_ua.txt -### regionToCity(id[, geobase]) +### regionToCity(id\[, geobase\]) {#regiontocityid-geobase} Accepts a UInt32 number – the region ID from the Yandex geobase. If this region is a city or part of a city, it returns the region ID for the appropriate city. Otherwise, returns 0. -### regionToArea(id\[, geobase\]) +### regionToArea(id\[, geobase\]) {#regiontoareaid-geobase} -Converts a region to an area (type 5 in the geobase). In every other way, this function is the same as 'regionToCity'. +Converts a region to an area (type 5 in the geobase). In every other way, this function is the same as ‘regionToCity’. ``` sql SELECT DISTINCT regionToName(regionToArea(toUInt32(number), 'ua')) @@ -40,29 +38,27 @@ FROM system.numbers LIMIT 15 ``` -``` -┌─regionToName(regionToArea(toUInt32(number), \'ua\'))─┐ -│ │ -│ Moscow and Moscow region │ -│ St. Petersburg and Leningrad region │ -│ Belgorod region │ -│ Ivanovsk region │ -│ Kaluga region │ -│ Kostroma region │ -│ Kursk region │ -│ Lipetsk region │ -│ Orlov region │ -│ Ryazan region │ -│ Smolensk region │ -│ Tambov region │ -│ Tver region │ -│ Tula region │ -└──────────────────────────────────────────────────────┘ -``` + ┌─regionToName(regionToArea(toUInt32(number), \'ua\'))─┐ + │ │ + │ Moscow and Moscow region │ + │ St. Petersburg and Leningrad region │ + │ Belgorod region │ + │ Ivanovsk region │ + │ Kaluga region │ + │ Kostroma region │ + │ Kursk region │ + │ Lipetsk region │ + │ Orlov region │ + │ Ryazan region │ + │ Smolensk region │ + │ Tambov region │ + │ Tver region │ + │ Tula region │ + └──────────────────────────────────────────────────────┘ -### regionToDistrict(id[, geobase]) +### regionToDistrict(id\[, geobase\]) {#regiontodistrictid-geobase} -Converts a region to a federal district (type 4 in the geobase). In every other way, this function is the same as 'regionToCity'. +Converts a region to a federal district (type 4 in the geobase). In every other way, this function is the same as ‘regionToCity’. 
``` sql SELECT DISTINCT regionToName(regionToDistrict(toUInt32(number), 'ua')) @@ -70,58 +66,55 @@ FROM system.numbers LIMIT 15 ``` -``` -┌─regionToName(regionToDistrict(toUInt32(number), \'ua\'))─┐ -│ │ -│ Central federal district │ -│ Northwest federal district │ -│ South federal district │ -│ North Caucases federal district │ -│ Privolga federal district │ -│ Ural federal district │ -│ Siberian federal district │ -│ Far East federal district │ -│ Scotland │ -│ Faroe Islands │ -│ Flemish region │ -│ Brussels capital region │ -│ Wallonia │ -│ Federation of Bosnia and Herzegovina │ -└──────────────────────────────────────────────────────────┘ -``` + ┌─regionToName(regionToDistrict(toUInt32(number), \'ua\'))─┐ + │ │ + │ Central federal district │ + │ Northwest federal district │ + │ South federal district │ + │ North Caucases federal district │ + │ Privolga federal district │ + │ Ural federal district │ + │ Siberian federal district │ + │ Far East federal district │ + │ Scotland │ + │ Faroe Islands │ + │ Flemish region │ + │ Brussels capital region │ + │ Wallonia │ + │ Federation of Bosnia and Herzegovina │ + └──────────────────────────────────────────────────────────┘ -### regionToCountry(id[, geobase]) +### regionToCountry(id\[, geobase\]) {#regiontocountryid-geobase} -Converts a region to a country. In every other way, this function is the same as 'regionToCity'. +Converts a region to a country. In every other way, this function is the same as ‘regionToCity’. Example: `regionToCountry(toUInt32(213)) = 225` converts Moscow (213) to Russia (225). -### regionToContinent(id[, geobase]) +### regionToContinent(id\[, geobase\]) {#regiontocontinentid-geobase} -Converts a region to a continent. In every other way, this function is the same as 'regionToCity'. +Converts a region to a continent. In every other way, this function is the same as ‘regionToCity’. Example: `regionToContinent(toUInt32(213)) = 10001` converts Moscow (213) to Eurasia (10001). -### regionToPopulation(id[, geobase]) +### regionToPopulation(id\[, geobase\]) {#regiontopopulationid-geobase} Gets the population for a region. -The population can be recorded in files with the geobase. See the section "External dictionaries". +The population can be recorded in files with the geobase. See the section «External dictionaries». If the population is not recorded for the region, it returns 0. In the Yandex geobase, the population might be recorded for child regions, but not for parent regions. -### regionIn(lhs, rhs[, geobase]) +### regionIn(lhs, rhs\[, geobase\]) {#regioninlhs-rhs-geobase} -Checks whether a 'lhs' region belongs to a 'rhs' region. Returns a UInt8 number equal to 1 if it belongs, or 0 if it doesn't belong. +Checks whether a ‘lhs’ region belongs to a ‘rhs’ region. Returns a UInt8 number equal to 1 if it belongs, or 0 if it doesn’t belong. The relationship is reflexive – any region also belongs to itself. -### regionHierarchy(id\[, geobase\]) +### regionHierarchy(id\[, geobase\]) {#regionhierarchyid-geobase} Accepts a UInt32 number – the region ID from the Yandex geobase. Returns an array of region IDs consisting of the passed region and all parents along the chain. Example: `regionHierarchy(toUInt32(213)) = [213,1,3,225,10001,10000]`. -### regionToName(id\[, lang\]) +### regionToName(id\[, lang\]) {#regiontonameid-lang} -Accepts a UInt32 number – the region ID from the Yandex geobase. A string with the name of the language can be passed as a second argument. Supported languages are: ru, en, ua, uk, by, kz, tr. 
If the second argument is omitted, the language 'ru' is used. If the language is not supported, an exception is thrown. Returns a string – the name of the region in the corresponding language. If the region with the specified ID doesn't exist, an empty string is returned. +Accepts a UInt32 number – the region ID from the Yandex geobase. A string with the name of the language can be passed as a second argument. Supported languages are: ru, en, ua, uk, by, kz, tr. If the second argument is omitted, the language ‘ru’ is used. If the language is not supported, an exception is thrown. Returns a string – the name of the region in the corresponding language. If the region with the specified ID doesn’t exist, an empty string is returned. `ua` and `uk` both mean Ukrainian. - [Original article](https://clickhouse.tech/docs/en/query_language/functions/ym_dict_functions/) diff --git a/docs/zh/query_language/insert_into.md b/docs/zh/query_language/insert_into.md index 7e91baa5afc..16289f1e1ba 100644 --- a/docs/zh/query_language/insert_into.md +++ b/docs/zh/query_language/insert_into.md @@ -1,5 +1,4 @@ - -## INSERT +## INSERT {#insert} INSERT查询主要用于向系统中添加数据. @@ -11,10 +10,10 @@ INSERT INTO [db.]table [(c1, c2, c3)] VALUES (v11, v12, v13), (v21, v22, v23), . 您可以在查询中指定插入的列的列表,如:`[(c1, c2, c3)]`。对于存在于表结构中但不存在于插入列表中的列,它们将会按照如下方式填充数据: -- 如果存在`DEFAULT`表达式,根据`DEFAULT`表达式计算被填充的值。 -- 如果没有定义`DEFAULT`表达式,则填充零或空字符串。 +- 如果存在`DEFAULT`表达式,根据`DEFAULT`表达式计算被填充的值。 +- 如果没有定义`DEFAULT`表达式,则填充零或空字符串。 -如果 [strict_insert_defaults=1](../operations/settings/settings.md),你必须在查询中列出所有没有定义`DEFAULT`表达式的列。 +如果 [strict\_insert\_defaults=1](../operations/settings/settings.md),你必须在查询中列出所有没有定义`DEFAULT`表达式的列。 数据可以以ClickHouse支持的任何 [输入输出格式](../interfaces/formats.md#formats) 传递给INSERT。格式的名称必须显示的指定在查询中: @@ -22,7 +21,7 @@ INSERT INTO [db.]table [(c1, c2, c3)] VALUES (v11, v12, v13), (v21, v22, v23), . INSERT INTO [db.]table [(c1, c2, c3)] FORMAT format_name data_set ``` -例如,下面的查询所使用的输入格式就与上面INSERT ... VALUES的中使用的输入格式相同: +例如,下面的查询所使用的输入格式就与上面INSERT … VALUES的中使用的输入格式相同: ``` sql INSERT INTO [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23), ... @@ -38,10 +37,9 @@ INSERT INTO t FORMAT TabSeparated 22 Qwerty ``` -在使用命令行客户端或HTTP客户端时,你可以将具体的查询语句与数据分开发送。更多具体信息,请参考“[客户端](../interfaces/index.md#interfaces)”部分。 +在使用命令行客户端或HTTP客户端时,你可以将具体的查询语句与数据分开发送。更多具体信息,请参考«[客户端](../interfaces/index.md#interfaces)»部分。 - -### 使用`SELECT`的结果写入 {#insert_query_insert-select} +### 使用`SELECT`的结果写入 {#insert-query-insert-select} ``` sql INSERT INTO [db.]table [(c1, c2, c3)] SELECT ... @@ -54,16 +52,16 @@ INSERT INTO [db.]table [(c1, c2, c3)] SELECT ... 系统不支持的其他用于修改数据的查询:`UPDATE`, `DELETE`, `REPLACE`, `MERGE`, `UPSERT`, `INSERT UPDATE`。 但是,您可以使用 `ALTER TABLE ... 
DROP PARTITION`查询来删除一些旧的数据。 -### 性能的注意事项 +### 性能的注意事项 {#xing-neng-de-zhu-yi-shi-xiang} 在进行`INSERT`时将会对写入的数据进行一些处理,按照主键排序,按照月份对数据进行分区等。所以如果在您的写入数据中包含多个月份的混合数据时,将会显著的降低`INSERT`的性能。为了避免这种情况: -- 数据总是以尽量大的batch进行写入,如每次写入100,000行。 -- 数据在写入ClickHouse前预先的对数据进行分组。 +- 数据总是以尽量大的batch进行写入,如每次写入100,000行。 +- 数据在写入ClickHouse前预先的对数据进行分组。 在以下的情况下,性能不会下降: -- 数据总是被实时的写入。 -- 写入的数据已经按照时间排序。 +- 数据总是被实时的写入。 +- 写入的数据已经按照时间排序。 [来源文章](https://clickhouse.tech/docs/en/query_language/insert_into/) diff --git a/docs/zh/query_language/operators.md b/docs/zh/query_language/operators.md index 8b53c0953eb..a8fb0a703db 100644 --- a/docs/zh/query_language/operators.md +++ b/docs/zh/query_language/operators.md @@ -1,32 +1,32 @@ -# 操作符 +# 操作符 {#cao-zuo-fu} 所有的操作符(运算符)都会在查询时依据他们的优先级及其结合顺序在被解析时转换为对应的函数。下面按优先级从高到低列出各组运算符及其对应的函数: -## 下标运算符 +## 下标运算符 {#xia-biao-yun-suan-fu} `a[N]` – 数组中的第N个元素; 对应函数 `arrayElement(a, N)` `a.N` – 元组中第N个元素; 对应函数 `tupleElement(a, N)` -## 负号 +## 负号 {#fu-hao} -`-a` – 对应函数 `negate(a)` +`-a` – 对应函数 `negate(a)` -## 乘号、除号和取余 +## 乘号、除号和取余 {#cheng-hao-chu-hao-he-qu-yu} -`a * b` – 对应函数 `multiply(a, b)` +`a * b` – 对应函数 `multiply(a, b)` -`a / b` – 对应函数 `divide(a, b)` +`a / b` – 对应函数 `divide(a, b)` `a % b` – 对应函数 `modulo(a, b)` -## 加号和减号 +## 加号和减号 {#jia-hao-he-jian-hao} `a + b` – 对应函数 `plus(a, b)` -`a - b` – 对应函数 `minus(a, b)` +`a - b` – 对应函数 `minus(a, b)` -## 关系运算符 +## 关系运算符 {#guan-xi-yun-suan-fu} `a = b` – 对应函数 `equals(a, b)` @@ -48,9 +48,9 @@ `a NOT LIKE s` – 对应函数 `notLike(a, b)` -`a BETWEEN b AND c` – 等价于 `a >= b AND a <= c` +`a BETWEEN b AND c` – 等价于 `a >= b AND a <= c` -## 集合关系运算符 +## 集合关系运算符 {#ji-he-guan-xi-yun-suan-fu} *详见此节 [IN 相关操作符](select.md#select-in-operators) 。* @@ -62,19 +62,19 @@ `a GLOBAL NOT IN ...` – 对应函数 `globalNotIn(a, b)` -## 逻辑非 +## 逻辑非 {#luo-ji-fei} `NOT a` – 对应函数 `not(a)` -## 逻辑与 +## 逻辑与 {#luo-ji-yu} `a AND b` – 对应函数`and(a, b)` -## 逻辑或 +## 逻辑或 {#luo-ji-huo} `a OR b` – 对应函数 `or(a, b)` -## 条件运算符 +## 条件运算符 {#tiao-jian-yun-suan-fu} `a ? b : c` – 对应函数 `if(a, b, c)` @@ -82,12 +82,11 @@ 条件运算符会先计算表达式b和表达式c的值,再根据表达式a的真假,返回相应的值。如果表达式b和表达式c是 [arrayJoin()](functions/array_join.md#functions_arrayjoin) 函数,则不管表达式a是真是假,每行都会被复制展开。 - ## Operators for Working with Dates and Times {#operators-datetime} ### EXTRACT {#operator-extract} -```sql +``` sql EXTRACT(part FROM date); ``` @@ -95,12 +94,12 @@ Extracts a part from a given date. For example, you can retrieve a month from a The `part` parameter specifies which part of the date to retrieve. The following values are available: -- `DAY` — The day of the month. Possible values: 1–31. -- `MONTH` — The number of a month. Possible values: 1–12. -- `YEAR` — The year. -- `SECOND` — The second. Possible values: 0–59. -- `MINUTE` — The minute. Possible values: 0–59. -- `HOUR` — The hour. Possible values: 0–23. +- `DAY` — The day of the month. Possible values: 1–31. +- `MONTH` — The number of a month. Possible values: 1–12. +- `YEAR` — The year. +- `SECOND` — The second. Possible values: 0–59. +- `MINUTE` — The minute. Possible values: 0–59. +- `HOUR` — The hour. Possible values: 0–23. The `part` parameter is case-insensitive. @@ -108,7 +107,7 @@ The `date` parameter specifies the date or the time to process. Either [Date](.. Examples: -```sql +``` sql SELECT EXTRACT(DAY FROM toDate('2017-06-15')); SELECT EXTRACT(MONTH FROM toDate('2017-06-15')); SELECT EXTRACT(YEAR FROM toDate('2017-06-15')); @@ -116,7 +115,7 @@ SELECT EXTRACT(YEAR FROM toDate('2017-06-15')); In the following example we create a table and insert into it a value with the `DateTime` type. 
-```sql +``` sql CREATE TABLE test.Orders ( OrderId UInt64, @@ -126,10 +125,11 @@ CREATE TABLE test.Orders ENGINE = Log; ``` -```sql +``` sql INSERT INTO test.Orders VALUES (1, 'Jarlsberg Cheese', toDateTime('2008-10-11 13:23:44')); ``` -```sql + +``` sql SELECT toYear(OrderDate) AS OrderYear, toMonth(OrderDate) AS OrderMonth, @@ -139,7 +139,8 @@ SELECT toSecond(OrderDate) AS OrderSecond FROM test.Orders; ``` -```text + +``` text ┌─OrderYear─┬─OrderMonth─┬─OrderDay─┬─OrderHour─┬─OrderMinute─┬─OrderSecond─┐ │ 2008 │ 10 │ 11 │ 13 │ 23 │ 44 │ └───────────┴────────────┴──────────┴───────────┴─────────────┴─────────────┘ @@ -153,10 +154,11 @@ Creates an [Interval](../data_types/special_data_types/interval.md)-type value t Example: -```sql +``` sql SELECT now() AS current_date_time, current_date_time + INTERVAL 4 DAY + INTERVAL 3 HOUR ``` -```text + +``` text ┌───current_date_time─┬─plus(plus(now(), toIntervalDay(4)), toIntervalHour(3))─┐ │ 2019-10-23 11:16:28 │ 2019-10-27 14:16:28 │ └─────────────────────┴────────────────────────────────────────────────────────┘ @@ -164,11 +166,10 @@ SELECT now() AS current_date_time, current_date_time + INTERVAL 4 DAY + INTERVAL **See Also** -- [Interval](../data_types/special_data_types/interval.md) data type -- [toInterval](functions/type_conversion_functions.md#function-tointerval) type convertion functions +- [Interval](../data_types/special_data_types/interval.md) data type +- [toInterval](functions/type_conversion_functions.md#function-tointerval) type convertion functions - -## CASE条件表达式 {#operator_case} +## CASE条件表达式 {#operator-case} ``` sql CASE [x] @@ -184,43 +185,45 @@ END 但 `transform` 函数不支持 `NULL` -## 连接运算符 +## 连接运算符 {#lian-jie-yun-suan-fu} `s1 || s2` – 对应函数 `concat(s1, s2)` -## 创建 Lambda 函数 +## 创建 Lambda 函数 {#chuang-jian-lambda-han-shu} `x -> expr` – 对应函数 `lambda(x, expr)` 接下来的这些操作符因为其本身是括号没有优先级: -## 创建数组 +## 创建数组 {#chuang-jian-shu-zu} `[x1, ...]` – 对应函数 `array(x1, ...)` -## 创建元组 +## 创建元组 {#chuang-jian-yuan-zu} `(x1, x2, ...)` – 对应函数 `tuple(x2, x2, ...)` -## 结合方式 +## 结合方式 {#jie-he-fang-shi} 所有的同级操作符从左到右结合。例如, `1 + 2 + 3` 会转换成 `plus(plus(1, 2), 3)`。 所以,有时他们会跟我们预期的不太一样。例如, `SELECT 4 > 2 > 3` 的结果是0。 为了高效, `and` 和 `or` 函数支持任意多参数,一连串的 `AND` 和 `OR` 运算符会转换成其对应的单个函数。 -## 判断是否为 `NULL` +## 判断是否为 `NULL` {#pan-duan-shi-fou-wei-null} ClickHouse 支持 `IS NULL` 和 `IS NOT NULL` 。 ### IS NULL {#operator-is-null} -- 对于 [Nullable](../data_types/nullable.md) 类型的值, `IS NULL` 会返回: - - `1` 值为 `NULL` - - `0` 否则 -- 对于其他类型的值, `IS NULL` 总会返回 `0` +- 对于 [Nullable](../data_types/nullable.md) 类型的值, `IS NULL` 会返回: + - `1` 值为 `NULL` + - `0` 否则 +- 对于其他类型的值, `IS NULL` 总会返回 `0` -```bash + + +``` bash :) SELECT x+100 FROM t_null WHERE y IS NULL SELECT x + 100 @@ -234,15 +237,16 @@ WHERE isNull(y) 1 rows in set. Elapsed: 0.002 sec. 
``` +### IS NOT NULL {#is-not-null} -### IS NOT NULL +- 对于 [Nullable](../data_types/nullable.md) 类型的值, `IS NOT NULL` 会返回: + - `0` 值为 `NULL` + - `1` 否则 +- 对于其他类型的值,`IS NOT NULL` 总会返回 `1` -- 对于 [Nullable](../data_types/nullable.md) 类型的值, `IS NOT NULL` 会返回: - - `0` 值为 `NULL` - - `1` 否则 -- 对于其他类型的值,`IS NOT NULL` 总会返回 `1` + -```bash +``` bash :) SELECT * FROM t_null WHERE y IS NOT NULL SELECT * diff --git a/docs/zh/query_language/select.md b/docs/zh/query_language/select.md index 6f2f1a81145..8400e963f3c 100644 --- a/docs/zh/query_language/select.md +++ b/docs/zh/query_language/select.md @@ -1,4 +1,4 @@ -# SELECT 查询语法 +# SELECT 查询语法 {#select-cha-xun-yu-fa} `SELECT` 语句用于执行数据的检索。 @@ -20,11 +20,11 @@ SELECT [DISTINCT] expr_list [FORMAT format] ``` -所有的子句都是可选的,除了SELECT之后的表达式列表(expr_list)。 +所有的子句都是可选的,除了SELECT之后的表达式列表(expr\_list)。 下面将按照查询运行的顺序逐一对各个子句进行说明。 如果查询中不包含`DISTINCT`,`GROUP BY`,`ORDER BY`子句以及`IN`和`JOIN`子查询,那么它将仅使用O(1)数量的内存来完全流式的处理查询 -否则,这个查询将消耗大量的内存,除非你指定了这些系统配置:`max_memory_usage`, `max_rows_to_group_by`, `max_rows_to_sort`, `max_rows_in_distinct`, `max_bytes_in_distinct`, `max_rows_in_set`, `max_bytes_in_set`, `max_rows_in_join`, `max_bytes_in_join`, `max_bytes_before_external_sort`, `max_bytes_before_external_group_by`。它们规定了可以使用外部排序(将临时表存储到磁盘中)以及外部聚合,`目前系统不存在关于Join的配置`,更多关于它们的信息,可以参见“配置”部分。 +否则,这个查询将消耗大量的内存,除非你指定了这些系统配置:`max_memory_usage`, `max_rows_to_group_by`, `max_rows_to_sort`, `max_rows_in_distinct`, `max_bytes_in_distinct`, `max_rows_in_set`, `max_bytes_in_set`, `max_rows_in_join`, `max_bytes_in_join`, `max_bytes_before_external_sort`, `max_bytes_before_external_group_by`。它们规定了可以使用外部排序(将临时表存储到磁盘中)以及外部聚合,`目前系统不存在关于Join的配置`,更多关于它们的信息,可以参见«配置»部分。 ### FROM 子句 {#select-from} @@ -37,21 +37,21 @@ FROM子句规定了将从哪个表、或子查询、或表函数中读取数据 在这种情况下,子查询的处理将会构建在外部的查询内部。 不同于SQL标准,子查询后无需指定别名。为了兼容,你可以在子查询后添加‘AS 别名’,但是指定的名字不能被使用在任何地方。 -也可以使用表函数来代替表,有关信息,参见“表函数”。 +也可以使用表函数来代替表,有关信息,参见«表函数»。 执行查询时,在查询中列出的所有列都将从对应的表中提取数据;如果你使用的是子查询的方式,则任何在外部查询中没有使用的列,子查询将从查询中忽略它们; 如果你的查询没有列出任何的列(例如,SELECT count() FROM t),则将额外的从表中提取一些列(最好的情况下是最小的列),以便计算行数。 -最后的FINAL修饰符仅能够被使用在SELECT from CollapsingMergeTree场景中。当你为FROM指定了FINAL修饰符时,你的查询结果将会在查询过程中被聚合。需要注意的是,在这种情况下,查询将在单个流中读取所有相关的主键列,同时对需要的数据进行合并。这意味着,当使用FINAL修饰符时,查询将会处理的更慢。在大多数情况下,你应该避免使用FINAL修饰符。更多信息,请参阅“CollapsingMergeTree引擎”部分。 +最后的FINAL修饰符仅能够被使用在SELECT from CollapsingMergeTree场景中。当你为FROM指定了FINAL修饰符时,你的查询结果将会在查询过程中被聚合。需要注意的是,在这种情况下,查询将在单个流中读取所有相关的主键列,同时对需要的数据进行合并。这意味着,当使用FINAL修饰符时,查询将会处理的更慢。在大多数情况下,你应该避免使用FINAL修饰符。更多信息,请参阅«CollapsingMergeTree引擎»部分。 ### SAMPLE 子句 {#select-sample-clause} -通过SAMPLE子句用户可以进行近似查询处理,近似查询处理仅能工作在MergeTree\*类型的表中,并且在创建表时需要您指定采样表达式(参见“MergeTree 引擎”部分)。 +通过SAMPLE子句用户可以进行近似查询处理,近似查询处理仅能工作在MergeTree\*类型的表中,并且在创建表时需要您指定采样表达式(参见«MergeTree 引擎»部分)。 `SAMPLE`子句可以使用`SAMPLE k`来表示,其中k可以是0到1的小数值,或者是一个足够大的正整数值。 -当k为0到1的小数时,查询将使用'k'作为百分比选取数据。例如,`SAMPLE 0.1`查询只会检索数据总量的10%。 -当k为一个足够大的正整数时,查询将使用'k'作为最大样本数。例如, `SAMPLE 10000000`查询只会检索最多10,000,000行数据。 +当k为0到1的小数时,查询将使用’k’作为百分比选取数据。例如,`SAMPLE 0.1`查询只会检索数据总量的10%。 +当k为一个足够大的正整数时,查询将使用’k’作为最大样本数。例如, `SAMPLE 10000000`查询只会检索最多10,000,000行数据。 Example: @@ -80,260 +80,242 @@ ORDER BY PageViews DESC LIMIT 1000 例如,我们可以使用采样的方式获取到与不进行采样相同的用户ID的列表。这将表明,你可以在IN子查询中使用采样,或者使用采样的结果与其他查询进行关联。 - ### ARRAY JOIN 子句 {#select-array-join-clause} ARRAY JOIN子句可以帮助查询进行与数组和nested数据类型的连接。它有点类似arrayJoin函数,但它的功能更广泛。 `ARRAY JOIN` 本质上等同于`INNERT JOIN`数组。 例如: -``` -:) CREATE TABLE arrays_test (s String, arr Array(UInt8)) ENGINE = Memory + :) CREATE TABLE arrays_test (s String, arr Array(UInt8)) ENGINE = Memory -CREATE TABLE arrays_test -( - s String, - arr Array(UInt8) 
-) ENGINE = Memory + CREATE TABLE arrays_test + ( + s String, + arr Array(UInt8) + ) ENGINE = Memory -Ok. + Ok. -0 rows in set. Elapsed: 0.001 sec. + 0 rows in set. Elapsed: 0.001 sec. -:) INSERT INTO arrays_test VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []) + :) INSERT INTO arrays_test VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []) -INSERT INTO arrays_test VALUES + INSERT INTO arrays_test VALUES -Ok. + Ok. -3 rows in set. Elapsed: 0.001 sec. + 3 rows in set. Elapsed: 0.001 sec. -:) SELECT * FROM arrays_test + :) SELECT * FROM arrays_test -SELECT * -FROM arrays_test + SELECT * + FROM arrays_test -┌─s───────┬─arr─────┐ -│ Hello │ [1,2] │ -│ World │ [3,4,5] │ -│ Goodbye │ [] │ -└─────────┴─────────┘ + ┌─s───────┬─arr─────┐ + │ Hello │ [1,2] │ + │ World │ [3,4,5] │ + │ Goodbye │ [] │ + └─────────┴─────────┘ -3 rows in set. Elapsed: 0.001 sec. + 3 rows in set. Elapsed: 0.001 sec. -:) SELECT s, arr FROM arrays_test ARRAY JOIN arr + :) SELECT s, arr FROM arrays_test ARRAY JOIN arr -SELECT s, arr -FROM arrays_test -ARRAY JOIN arr + SELECT s, arr + FROM arrays_test + ARRAY JOIN arr -┌─s─────┬─arr─┐ -│ Hello │ 1 │ -│ Hello │ 2 │ -│ World │ 3 │ -│ World │ 4 │ -│ World │ 5 │ -└───────┴─────┘ + ┌─s─────┬─arr─┐ + │ Hello │ 1 │ + │ Hello │ 2 │ + │ World │ 3 │ + │ World │ 4 │ + │ World │ 5 │ + └───────┴─────┘ -5 rows in set. Elapsed: 0.001 sec. -``` + 5 rows in set. Elapsed: 0.001 sec. 你还可以为ARRAY JOIN子句指定一个别名,这时你可以通过这个别名来访问数组中的数据,但是数据本身仍然可以通过原来的名称进行访问。例如: -``` -:) SELECT s, arr, a FROM arrays_test ARRAY JOIN arr AS a + :) SELECT s, arr, a FROM arrays_test ARRAY JOIN arr AS a -SELECT s, arr, a -FROM arrays_test -ARRAY JOIN arr AS a + SELECT s, arr, a + FROM arrays_test + ARRAY JOIN arr AS a -┌─s─────┬─arr─────┬─a─┐ -│ Hello │ [1,2] │ 1 │ -│ Hello │ [1,2] │ 2 │ -│ World │ [3,4,5] │ 3 │ -│ World │ [3,4,5] │ 4 │ -│ World │ [3,4,5] │ 5 │ -└───────┴─────────┴───┘ + ┌─s─────┬─arr─────┬─a─┐ + │ Hello │ [1,2] │ 1 │ + │ Hello │ [1,2] │ 2 │ + │ World │ [3,4,5] │ 3 │ + │ World │ [3,4,5] │ 4 │ + │ World │ [3,4,5] │ 5 │ + └───────┴─────────┴───┘ -5 rows in set. Elapsed: 0.001 sec. -``` + 5 rows in set. Elapsed: 0.001 sec. 当多个具有相同大小的数组使用逗号分割出现在ARRAY JOIN子句中时,ARRAY JOIN会将它们同时执行(直接合并,而不是它们的笛卡尔积)。例如: -``` -:) SELECT s, arr, a, num, mapped FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS mapped + :) SELECT s, arr, a, num, mapped FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS mapped -SELECT s, arr, a, num, mapped -FROM arrays_test -ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(lambda(tuple(x), plus(x, 1)), arr) AS mapped + SELECT s, arr, a, num, mapped + FROM arrays_test + ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(lambda(tuple(x), plus(x, 1)), arr) AS mapped -┌─s─────┬─arr─────┬─a─┬─num─┬─mapped─┐ -│ Hello │ [1,2] │ 1 │ 1 │ 2 │ -│ Hello │ [1,2] │ 2 │ 2 │ 3 │ -│ World │ [3,4,5] │ 3 │ 1 │ 4 │ -│ World │ [3,4,5] │ 4 │ 2 │ 5 │ -│ World │ [3,4,5] │ 5 │ 3 │ 6 │ -└───────┴─────────┴───┴─────┴────────┘ + ┌─s─────┬─arr─────┬─a─┬─num─┬─mapped─┐ + │ Hello │ [1,2] │ 1 │ 1 │ 2 │ + │ Hello │ [1,2] │ 2 │ 2 │ 3 │ + │ World │ [3,4,5] │ 3 │ 1 │ 4 │ + │ World │ [3,4,5] │ 4 │ 2 │ 5 │ + │ World │ [3,4,5] │ 5 │ 3 │ 6 │ + └───────┴─────────┴───┴─────┴────────┘ -5 rows in set. Elapsed: 0.002 sec. + 5 rows in set. Elapsed: 0.002 sec. 
-:) SELECT s, arr, a, num, arrayEnumerate(arr) FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num + :) SELECT s, arr, a, num, arrayEnumerate(arr) FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num -SELECT s, arr, a, num, arrayEnumerate(arr) -FROM arrays_test -ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num + SELECT s, arr, a, num, arrayEnumerate(arr) + FROM arrays_test + ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num -┌─s─────┬─arr─────┬─a─┬─num─┬─arrayEnumerate(arr)─┐ -│ Hello │ [1,2] │ 1 │ 1 │ [1,2] │ -│ Hello │ [1,2] │ 2 │ 2 │ [1,2] │ -│ World │ [3,4,5] │ 3 │ 1 │ [1,2,3] │ -│ World │ [3,4,5] │ 4 │ 2 │ [1,2,3] │ -│ World │ [3,4,5] │ 5 │ 3 │ [1,2,3] │ -└───────┴─────────┴───┴─────┴─────────────────────┘ + ┌─s─────┬─arr─────┬─a─┬─num─┬─arrayEnumerate(arr)─┐ + │ Hello │ [1,2] │ 1 │ 1 │ [1,2] │ + │ Hello │ [1,2] │ 2 │ 2 │ [1,2] │ + │ World │ [3,4,5] │ 3 │ 1 │ [1,2,3] │ + │ World │ [3,4,5] │ 4 │ 2 │ [1,2,3] │ + │ World │ [3,4,5] │ 5 │ 3 │ [1,2,3] │ + └───────┴─────────┴───┴─────┴─────────────────────┘ -5 rows in set. Elapsed: 0.002 sec. -``` + 5 rows in set. Elapsed: 0.002 sec. 另外ARRAY JOIN也可以工作在nested数据结构上。例如: -``` -:) CREATE TABLE nested_test (s String, nest Nested(x UInt8, y UInt32)) ENGINE = Memory + :) CREATE TABLE nested_test (s String, nest Nested(x UInt8, y UInt32)) ENGINE = Memory -CREATE TABLE nested_test -( - s String, - nest Nested( - x UInt8, - y UInt32) -) ENGINE = Memory + CREATE TABLE nested_test + ( + s String, + nest Nested( + x UInt8, + y UInt32) + ) ENGINE = Memory -Ok. + Ok. -0 rows in set. Elapsed: 0.006 sec. + 0 rows in set. Elapsed: 0.006 sec. -:) INSERT INTO nested_test VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [], []) + :) INSERT INTO nested_test VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [], []) -INSERT INTO nested_test VALUES + INSERT INTO nested_test VALUES -Ok. + Ok. -3 rows in set. Elapsed: 0.001 sec. + 3 rows in set. Elapsed: 0.001 sec. -:) SELECT * FROM nested_test + :) SELECT * FROM nested_test -SELECT * -FROM nested_test + SELECT * + FROM nested_test -┌─s───────┬─nest.x──┬─nest.y─────┐ -│ Hello │ [1,2] │ [10,20] │ -│ World │ [3,4,5] │ [30,40,50] │ -│ Goodbye │ [] │ [] │ -└─────────┴─────────┴────────────┘ + ┌─s───────┬─nest.x──┬─nest.y─────┐ + │ Hello │ [1,2] │ [10,20] │ + │ World │ [3,4,5] │ [30,40,50] │ + │ Goodbye │ [] │ [] │ + └─────────┴─────────┴────────────┘ -3 rows in set. Elapsed: 0.001 sec. + 3 rows in set. Elapsed: 0.001 sec. -:) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest + :) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest -SELECT s, `nest.x`, `nest.y` -FROM nested_test -ARRAY JOIN nest + SELECT s, `nest.x`, `nest.y` + FROM nested_test + ARRAY JOIN nest -┌─s─────┬─nest.x─┬─nest.y─┐ -│ Hello │ 1 │ 10 │ -│ Hello │ 2 │ 20 │ -│ World │ 3 │ 30 │ -│ World │ 4 │ 40 │ -│ World │ 5 │ 50 │ -└───────┴────────┴────────┘ + ┌─s─────┬─nest.x─┬─nest.y─┐ + │ Hello │ 1 │ 10 │ + │ Hello │ 2 │ 20 │ + │ World │ 3 │ 30 │ + │ World │ 4 │ 40 │ + │ World │ 5 │ 50 │ + └───────┴────────┴────────┘ -5 rows in set. Elapsed: 0.001 sec. -``` + 5 rows in set. Elapsed: 0.001 sec. 
当你在ARRAY JOIN指定nested数据类型的名称时,其作用与与包含所有数组元素的ARRAY JOIN相同,例如: -``` -:) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest.x, nest.y + :) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest.x, nest.y -SELECT s, `nest.x`, `nest.y` -FROM nested_test -ARRAY JOIN `nest.x`, `nest.y` + SELECT s, `nest.x`, `nest.y` + FROM nested_test + ARRAY JOIN `nest.x`, `nest.y` -┌─s─────┬─nest.x─┬─nest.y─┐ -│ Hello │ 1 │ 10 │ -│ Hello │ 2 │ 20 │ -│ World │ 3 │ 30 │ -│ World │ 4 │ 40 │ -│ World │ 5 │ 50 │ -└───────┴────────┴────────┘ + ┌─s─────┬─nest.x─┬─nest.y─┐ + │ Hello │ 1 │ 10 │ + │ Hello │ 2 │ 20 │ + │ World │ 3 │ 30 │ + │ World │ 4 │ 40 │ + │ World │ 5 │ 50 │ + └───────┴────────┴────────┘ -5 rows in set. Elapsed: 0.001 sec. -``` + 5 rows in set. Elapsed: 0.001 sec. 这种方式也是可以运行的: -``` -:) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest.x + :) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest.x -SELECT s, `nest.x`, `nest.y` -FROM nested_test -ARRAY JOIN `nest.x` + SELECT s, `nest.x`, `nest.y` + FROM nested_test + ARRAY JOIN `nest.x` -┌─s─────┬─nest.x─┬─nest.y─────┐ -│ Hello │ 1 │ [10,20] │ -│ Hello │ 2 │ [10,20] │ -│ World │ 3 │ [30,40,50] │ -│ World │ 4 │ [30,40,50] │ -│ World │ 5 │ [30,40,50] │ -└───────┴────────┴────────────┘ + ┌─s─────┬─nest.x─┬─nest.y─────┐ + │ Hello │ 1 │ [10,20] │ + │ Hello │ 2 │ [10,20] │ + │ World │ 3 │ [30,40,50] │ + │ World │ 4 │ [30,40,50] │ + │ World │ 5 │ [30,40,50] │ + └───────┴────────┴────────────┘ -5 rows in set. Elapsed: 0.001 sec. -``` + 5 rows in set. Elapsed: 0.001 sec. 为了方便使用原来的nested类型的数组,你可以为nested类型定义一个别名。例如: -``` -:) SELECT s, n.x, n.y, nest.x, nest.y FROM nested_test ARRAY JOIN nest AS n + :) SELECT s, n.x, n.y, nest.x, nest.y FROM nested_test ARRAY JOIN nest AS n -SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y` -FROM nested_test -ARRAY JOIN nest AS n + SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y` + FROM nested_test + ARRAY JOIN nest AS n -┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┐ -│ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ -│ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ -│ World │ 3 │ 30 │ [3,4,5] │ [30,40,50] │ -│ World │ 4 │ 40 │ [3,4,5] │ [30,40,50] │ -│ World │ 5 │ 50 │ [3,4,5] │ [30,40,50] │ -└───────┴─────┴─────┴─────────┴────────────┘ + ┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┐ + │ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ + │ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ + │ World │ 3 │ 30 │ [3,4,5] │ [30,40,50] │ + │ World │ 4 │ 40 │ [3,4,5] │ [30,40,50] │ + │ World │ 5 │ 50 │ [3,4,5] │ [30,40,50] │ + └───────┴─────┴─────┴─────────┴────────────┘ -5 rows in set. Elapsed: 0.001 sec. -``` + 5 rows in set. Elapsed: 0.001 sec. 
使用arrayEnumerate函数的示例: -``` -:) SELECT s, n.x, n.y, nest.x, nest.y, num FROM nested_test ARRAY JOIN nest AS n, arrayEnumerate(nest.x) AS num + :) SELECT s, n.x, n.y, nest.x, nest.y, num FROM nested_test ARRAY JOIN nest AS n, arrayEnumerate(nest.x) AS num -SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`, num -FROM nested_test -ARRAY JOIN nest AS n, arrayEnumerate(`nest.x`) AS num + SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`, num + FROM nested_test + ARRAY JOIN nest AS n, arrayEnumerate(`nest.x`) AS num -┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┬─num─┐ -│ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ 1 │ -│ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ 2 │ -│ World │ 3 │ 30 │ [3,4,5] │ [30,40,50] │ 1 │ -│ World │ 4 │ 40 │ [3,4,5] │ [30,40,50] │ 2 │ -│ World │ 5 │ 50 │ [3,4,5] │ [30,40,50] │ 3 │ -└───────┴─────┴─────┴─────────┴────────────┴─────┘ + ┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┬─num─┐ + │ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ 1 │ + │ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ 2 │ + │ World │ 3 │ 30 │ [3,4,5] │ [30,40,50] │ 1 │ + │ World │ 4 │ 40 │ [3,4,5] │ [30,40,50] │ 2 │ + │ World │ 5 │ 50 │ [3,4,5] │ [30,40,50] │ 3 │ + └───────┴─────┴─────┴─────────┴────────────┴─────┘ -5 rows in set. Elapsed: 0.002 sec. -``` + 5 rows in set. Elapsed: 0.002 sec. 在一个查询中只能出现一个ARRAY JOIN子句。 如果在WHERE/PREWHERE子句中使用了ARRAY JOIN子句的结果,它将优先于WHERE/PREWHERE子句执行,否则它将在WHERE/PRWHERE子句之后执行,以便减少计算。 - ### JOIN 子句 {#select-join} JOIN子句用于连接数据,作用与[SQL JOIN](https://en.wikipedia.org/wiki/Join_(SQL))的定义相同。 @@ -341,7 +323,6 @@ JOIN子句用于连接数据,作用与[SQL JOIN](https://en.wikipedia.org/wiki !!! info "注意" 与 [ARRAY JOIN](#select-array-join-clause) 没有关系. - ``` sql SELECT FROM @@ -349,14 +330,14 @@ FROM (ON )|(USING ) ... ``` -可以使用具体的表名来代替``与``。但这与使用`SELECT * FROM table`子查询的方式相同。除非你的表是[Join](../operations/table_engines/join.md +可以使用具体的表名来代替``与``。但这与使用`SELECT * FROM table`子查询的方式相同。除非你的表是\[Join\](../operations/table\_engines/join.md **支持的`JOIN`类型** -- `INNER JOIN` -- `LEFT OUTER JOIN` -- `RIGHT OUTER JOIN` -- `FULL OUTER JOIN` -- `CROSS JOIN` +- `INNER JOIN` +- `LEFT OUTER JOIN` +- `RIGHT OUTER JOIN` +- `FULL OUTER JOIN` +- `CROSS JOIN` 你可以跳过默认的`OUTER`关键字。 @@ -365,7 +346,7 @@ FROM 在使用`ALL`修饰符对JOIN进行修饰时,如果右表中存在多个与左表关联的数据,那么系统则将右表中所有可以与左表关联的数据全部返回在结果中。这与SQL标准的JOIN行为相同。 在使用`ANY`修饰符对JOIN进行修饰时,如果右表中存在多个与左表关联的数据,那么系统仅返回第一个与左表匹配的结果。如果左表与右表一一对应,不存在多余的行时,`ANY`与`ALL`的结果相同。 -你可以在会话中通过设置 [join_default_strictness](../operations/settings/settings.md) 来指定默认的JOIN修饰符。 +你可以在会话中通过设置 [join\_default\_strictness](../operations/settings/settings.md) 来指定默认的JOIN修饰符。 **`GLOBAL` distribution** @@ -407,23 +388,21 @@ ORDER BY hits DESC LIMIT 10 ``` -``` -┌─CounterID─┬───hits─┬─visits─┐ -│ 1143050 │ 523264 │ 13665 │ -│ 731962 │ 475698 │ 102716 │ -│ 722545 │ 337212 │ 108187 │ -│ 722889 │ 252197 │ 10547 │ -│ 2237260 │ 196036 │ 9522 │ -│ 23057320 │ 147211 │ 7689 │ -│ 722818 │ 90109 │ 17847 │ -│ 48221 │ 85379 │ 4652 │ -│ 19762435 │ 77807 │ 7026 │ -│ 722884 │ 77492 │ 11056 │ -└───────────┴────────┴────────┘ -``` + ┌─CounterID─┬───hits─┬─visits─┐ + │ 1143050 │ 523264 │ 13665 │ + │ 731962 │ 475698 │ 102716 │ + │ 722545 │ 337212 │ 108187 │ + │ 722889 │ 252197 │ 10547 │ + │ 2237260 │ 196036 │ 9522 │ + │ 23057320 │ 147211 │ 7689 │ + │ 722818 │ 90109 │ 17847 │ + │ 48221 │ 85379 │ 4652 │ + │ 19762435 │ 77807 │ 7026 │ + │ 722884 │ 77492 │ 11056 │ + └───────────┴────────┴────────┘ 子查询不允许您设置别名或在其他地方引用它们。 -`USING`中指定的列必须在两个子查询中具有相同的名称,而其他列必须具有不同的名称。您可以通过使用别名的方式来更改子查询中的列名(示例中就分别使用了'hits'与'visits'别名)。 +`USING`中指定的列必须在两个子查询中具有相同的名称,而其他列必须具有不同的名称。您可以通过使用别名的方式来更改子查询中的列名(示例中就分别使用了’hits’与’visits’别名)。 
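To make the `ALL` / `ANY` strictness difference described above concrete, here is a minimal sketch; the tables `t1`, `t2` and the key column `k` are hypothetical and not part of the original page:

``` sql
-- Assume t2 contains two rows with k = 1.
-- ALL returns every matching right-side row, as in standard SQL:
SELECT * FROM t1 ALL INNER JOIN t2 USING (k);

-- ANY keeps only one matching right-side row per left-side row:
SELECT * FROM t1 ANY INNER JOIN t2 USING (k);
```

If the right table has at most one row per key, both forms return the same result; the session default strictness can be changed with the `join_default_strictness` setting mentioned above.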
`USING`子句用于指定要进行链接的一个或多个列,系统会将这些列在两张表中相等的值连接起来。如果列是一个列表,不需要使用括号包裹。同时JOIN不支持其他更复杂的Join方式。 @@ -431,19 +410,18 @@ LIMIT 10 只能在查询中指定一个`JOIN`。若要运行多个`JOIN`,你可以将它们放入子查询中。 -每次运行相同的`JOIN`查询,总是会再次计算 - 没有缓存结果。 为了避免这种情况,可以使用‘Join’引擎,它是一个预处理的Join数据结构,总是保存在内存中。更多信息,参见“Join引擎”部分。 +每次运行相同的`JOIN`查询,总是会再次计算 - 没有缓存结果。 为了避免这种情况,可以使用‘Join’引擎,它是一个预处理的Join数据结构,总是保存在内存中。更多信息,参见«Join引擎»部分。 在一些场景下,使用`IN`代替`JOIN`将会得到更高的效率。在各种类型的JOIN中,最高效的是`ANY LEFT JOIN`,然后是`ANY INNER JOIN`,效率最差的是`ALL LEFT JOIN`以及`ALL INNER JOIN`。 -如果你需要使用`JOIN`来关联一些纬度表(包含纬度属性的一些相对比较小的表,例如广告活动的名称),那么`JOIN`可能不是好的选择,因为语法负责,并且每次查询都将重新访问这些表。对于这种情况,您应该使用“外部字典”的功能来替换`JOIN`。更多信息,参见 [外部字典](dicts/external_dicts.md) 部分。 +如果你需要使用`JOIN`来关联一些纬度表(包含纬度属性的一些相对比较小的表,例如广告活动的名称),那么`JOIN`可能不是好的选择,因为语法负责,并且每次查询都将重新访问这些表。对于这种情况,您应该使用«外部字典»的功能来替换`JOIN`。更多信息,参见 [外部字典](dicts/external_dicts.md) 部分。 -#### Null的处理 +#### Null的处理 {#nullde-chu-li} -JOIN的行为受 [join_use_nulls](../operations/settings/settings.md) 的影响。当`join_use_nulls=1`时,`JOIN`的工作与SQL标准相同。 +JOIN的行为受 [join\_use\_nulls](../operations/settings/settings.md) 的影响。当`join_use_nulls=1`时,`JOIN`的工作与SQL标准相同。 如果JOIN的key是 [Nullable](../data_types/nullable.md) 类型的字段,则其中至少一个存在 [NULL](syntax.md) 值的key不会被关联。 - ### WHERE 子句 {#select-where} 如果存在WHERE子句, 则在该子句中必须包含一个UInt8类型的表达式。 这个表达是通常是一个带有比较和逻辑的表达式。 @@ -451,8 +429,7 @@ JOIN的行为受 [join_use_nulls](../operations/settings/settings.md) 的影响 如果在支持索引的数据库表引擎中,这个表达式将被评估是否使用索引。 - -### PREWHERE 子句 +### PREWHERE 子句 {#prewhere-zi-ju} 这个子句与WHERE子句的意思相同。主要的不同之处在于表数据的读取。 当使用PREWHERE时,首先只读取PREWHERE表达式中需要的列。然后在根据PREWHERE执行的结果读取其他需要的列。 @@ -467,16 +444,16 @@ PREWHERE 仅支持`*MergeTree`系列引擎。 值得注意的是,PREWHERE不适合用于已经存在于索引中的列,因为当列已经存在于索引中的情况下,只有满足索引的数据块才会被读取。 -如果将'optimize_move_to_prewhere'设置为1,并且在查询中不包含PREWHERE,则系统将自动的把适合PREWHERE表达式的部分从WHERE中抽离到PREWHERE中。 +如果将’optimize\_move\_to\_prewhere’设置为1,并且在查询中不包含PREWHERE,则系统将自动的把适合PREWHERE表达式的部分从WHERE中抽离到PREWHERE中。 ### GROUP BY 子句 {#select-group-by-clause} 这是列式数据库管理系统中最重要的一部分。 -如果存在GROUP BY子句,则在该子句中必须包含一个表达式列表。其中每个表达式将会被称之为“key”。 -SELECT,HAVING,ORDER BY子句中的表达式列表必须来自于这些“key”或聚合函数。简而言之,被选择的列中不能包含非聚合函数或key之外的其他列。 +如果存在GROUP BY子句,则在该子句中必须包含一个表达式列表。其中每个表达式将会被称之为«key»。 +SELECT,HAVING,ORDER BY子句中的表达式列表必须来自于这些«key»或聚合函数。简而言之,被选择的列中不能包含非聚合函数或key之外的其他列。 -如果查询表达式列表中仅包含聚合函数,则可以省略GROUP BY子句,这时会假定将所有数据聚合成一组空“key”。 +如果查询表达式列表中仅包含聚合函数,则可以省略GROUP BY子句,这时会假定将所有数据聚合成一组空«key»。 Example: @@ -509,7 +486,7 @@ GROUP BY子句会为遇到的每一个不同的key计算一组聚合函数的值 常量不能作为聚合函数的参数传入聚合函数中。例如: sum(1)。这种情况下你可以省略常量。例如:`count()`。 -#### NULL 处理 +#### NULL 处理 {#null-chu-li} 对于GROUP BY子句,ClickHouse将 [NULL](syntax.md) 解释为一个值,并且支持`NULL=NULL`。 @@ -517,31 +494,27 @@ GROUP BY子句会为遇到的每一个不同的key计算一组聚合函数的值 假设你有这样一张表: -``` -┌─x─┬────y─┐ -│ 1 │ 2 │ -│ 2 │ ᴺᵁᴸᴸ │ -│ 3 │ 2 │ -│ 3 │ 3 │ -│ 3 │ ᴺᵁᴸᴸ │ -└───┴──────┘ -``` + ┌─x─┬────y─┐ + │ 1 │ 2 │ + │ 2 │ ᴺᵁᴸᴸ │ + │ 3 │ 2 │ + │ 3 │ 3 │ + │ 3 │ ᴺᵁᴸᴸ │ + └───┴──────┘ 运行`SELECT sum(x), y FROM t_null_big GROUP BY y`你将得到如下结果: -``` -┌─sum(x)─┬────y─┐ -│ 4 │ 2 │ -│ 3 │ 3 │ -│ 5 │ ᴺᵁᴸᴸ │ -└────────┴──────┘ -``` + ┌─sum(x)─┬────y─┐ + │ 4 │ 2 │ + │ 3 │ 3 │ + │ 5 │ ᴺᵁᴸᴸ │ + └────────┴──────┘ 你可以看到GROUP BY为`y=NULL`的聚合了x。 如果你在向`GROUP BY`中放入几个key,结果将列出所有的组合可能。就像`NULL`是一个特定的值一样。 -#### WITH TOTALS 修饰符 +#### WITH TOTALS 修饰符 {#with-totals-xiu-shi-fu} 如果你指定了WITH TOTALS修饰符,你将会在结果中得到一个被额外计算出的行。在这一行中将包含所有key的默认值(零或者空值),以及所有聚合函数对所有被选择数据行的聚合结果。 @@ -549,7 +522,7 @@ GROUP BY子句会为遇到的每一个不同的key计算一组聚合函数的值 在JSON\*输出格式中,这行将出现在Json的‘totals’字段中。在TabSeparated\*输出格式中,这行将位于其他结果之后,同时与其他结果使用空白行分隔。在Pretty\*输出格式中,这行将作为单独的表在所有结果之后输出。 -当`WITH TOTALS`与HAVING子句同时存在时,它的行为受‘totals_mode’配置的影响。 +当`WITH 
TOTALS`与HAVING子句同时存在时,它的行为受‘totals\_mode’配置的影响。 默认情况下,`totals_mode = 'before_having'`,这时`WITH TOTALS`将会在HAVING前计算最多不超过`max_rows_to_group_by`行的数据。 在`group_by_overflow_mode = 'any'`并指定了`max_rows_to_group_by`的情况下,`WITH TOTALS`的行为受`totals_mode`的影响。 @@ -558,7 +531,7 @@ GROUP BY子句会为遇到的每一个不同的key计算一组聚合函数的值 `after_having_inclusive` - 在HAVING后进行计算,计算不少于`max_rows_to_group_by`行的数据。 -`after_having_auto` - 在HAVING后进行计算,采用统计通过HAVING的行数,在超过不超过‘max_rows_to_group_by’指定值(默认为50%)的情况下,包含所有行的结果。否则排除这些结果。 +`after_having_auto` - 在HAVING后进行计算,采用统计通过HAVING的行数,在超过不超过‘max\_rows\_to\_group\_by’指定值(默认为50%)的情况下,包含所有行的结果。否则排除这些结果。 `totals_auto_threshold` - 默认 0.5,是`after_having_auto`的参数。 @@ -571,7 +544,7 @@ GROUP BY子句会为遇到的每一个不同的key计算一组聚合函数的值 你可以在GROUP BY中允许将临时数据转存到磁盘上,以限制对内存的使用。 `max_bytes_before_external_group_by`这个配置确定了在GROUP BY中启动将临时数据转存到磁盘上的内存阈值。如果你将它设置为0(这是默认值),这项功能将被禁用。 -当使用`max_bytes_before_external_group_by`时,我们建议将max_memory_usage设置为它的两倍。这是因为一个聚合需要两个阶段来完成:(1)读取数据并形成中间数据 (2)合并中间数据。临时数据的转存只会发生在第一个阶段。如果没有发生临时文件的转存,那么阶段二将最多消耗与1阶段相同的内存大小。 +当使用`max_bytes_before_external_group_by`时,我们建议将max\_memory\_usage设置为它的两倍。这是因为一个聚合需要两个阶段来完成:(1)读取数据并形成中间数据 (2)合并中间数据。临时数据的转存只会发生在第一个阶段。如果没有发生临时文件的转存,那么阶段二将最多消耗与1阶段相同的内存大小。 例如:如果将`max_memory_usage`设置为10000000000并且你想要开启外部聚合,那么你需要将`max_bytes_before_external_group_by`设置为10000000000的同时将`max_memory_usage`设置为20000000000。当外部聚合被触发时(如果刚好只形成了一份临时数据),它的内存使用量将会稍高与`max_bytes_before_external_group_by`。 @@ -584,7 +557,7 @@ GROUP BY子句会为遇到的每一个不同的key计算一组聚合函数的值 如果你在GROUP BY后面存在ORDER BY子句,并且ORDER BY后面存在一个极小限制的LIMIT,那么ORDER BY子句将不会使用太多内存。 否则请不要忘记启动外部排序(`max_bytes_before_external_sort`)。 -### LIMIT N BY 子句 +### LIMIT N BY 子句 {#limit-n-by-zi-ju} LIMIT N BY子句和LIMIT没有关系, LIMIT N BY COLUMNS 子句可以用来在每一个COLUMNS分组中求得最大的N行数据。我们可以将它们同时用在一个查询中。LIMIT N BY子句中可以包含任意多个分组字段表达式列表。 @@ -605,13 +578,12 @@ LIMIT 100 查询将会为每个`domain, device_type`的组合选出前5个访问最多的数据,但是结果最多将不超过100行(`LIMIT n BY + LIMIT`)。 -### HAVING 子句 +### HAVING 子句 {#having-zi-ju} HAVING子句可以用来过滤GROUP BY之后的数据,类似于WHERE子句。 WHERE于HAVING不同之处在于WHERE在聚合前(GROUP BY)执行,HAVING在聚合后执行。 如果不存在聚合,则不能使用HAVING。 - ### ORDER BY 子句 {#select-order-by} 如果存在ORDER BY 子句,则该子句中必须存在一个表达式列表,表达式列表中每一个表达式都可以分配一个DESC或ASC(排序的方向)。如果没有指明排序的方向,将假定以ASC的方式进行排序。其中ASC表示按照升序排序,DESC按照降序排序。示例:`ORDER BY Visits DESC, SearchPhrase` @@ -625,57 +597,53 @@ WHERE于HAVING不同之处在于WHERE在聚合前(GROUP BY)执行,HAVING在聚 `NaN` 和 `NULL` 的排序规则: -- 当使用`NULLS FIRST`修饰符时,将会先输出`NULL`,然后是`NaN`,最后才是其他值。 -- 当使用`NULLS LAST`修饰符时,将会先输出其他值,然后是`NaN`,最后才是`NULL`。 -- 默认情况下与使用`NULLS LAST`修饰符相同。 +- 当使用`NULLS FIRST`修饰符时,将会先输出`NULL`,然后是`NaN`,最后才是其他值。 +- 当使用`NULLS LAST`修饰符时,将会先输出其他值,然后是`NaN`,最后才是`NULL`。 +- 默认情况下与使用`NULLS LAST`修饰符相同。 示例: 假设存在如下一张表 -``` -┌─x─┬────y─┐ -│ 1 │ ᴺᵁᴸᴸ │ -│ 2 │ 2 │ -│ 1 │ nan │ -│ 2 │ 2 │ -│ 3 │ 4 │ -│ 5 │ 6 │ -│ 6 │ nan │ -│ 7 │ ᴺᵁᴸᴸ │ -│ 6 │ 7 │ -│ 8 │ 9 │ -└───┴──────┘ -``` + ┌─x─┬────y─┐ + │ 1 │ ᴺᵁᴸᴸ │ + │ 2 │ 2 │ + │ 1 │ nan │ + │ 2 │ 2 │ + │ 3 │ 4 │ + │ 5 │ 6 │ + │ 6 │ nan │ + │ 7 │ ᴺᵁᴸᴸ │ + │ 6 │ 7 │ + │ 8 │ 9 │ + └───┴──────┘ 运行查询 `SELECT * FROM t_null_nan ORDER BY y NULLS FIRST` 将获得如下结果: -``` -┌─x─┬────y─┐ -│ 1 │ ᴺᵁᴸᴸ │ -│ 7 │ ᴺᵁᴸᴸ │ -│ 1 │ nan │ -│ 6 │ nan │ -│ 2 │ 2 │ -│ 2 │ 2 │ -│ 3 │ 4 │ -│ 5 │ 6 │ -│ 6 │ 7 │ -│ 8 │ 9 │ -└───┴──────┘ -``` + ┌─x─┬────y─┐ + │ 1 │ ᴺᵁᴸᴸ │ + │ 7 │ ᴺᵁᴸᴸ │ + │ 1 │ nan │ + │ 6 │ nan │ + │ 2 │ 2 │ + │ 2 │ 2 │ + │ 3 │ 4 │ + │ 5 │ 6 │ + │ 6 │ 7 │ + │ 8 │ 9 │ + └───┴──────┘ 当使用浮点类型的数值进行排序时,不管排序的顺序如何,NaNs总是出现在所有值的后面。换句话说,当你使用升序排列一个浮点数值列时,NaNs好像比所有值都要大。反之,当你使用降序排列一个浮点数值列时,NaNs好像比所有值都小。 如果你在ORDER BY子句后面存在LIMIT并给定了较小的数值,则将会使用较少的内存。否则,内存的使用量将与需要排序的数据成正比。对于分布式查询,如果省略了GROUP 
BY,则在远程服务器上执行部分排序,最后在请求服务器上合并排序结果。这意味这对于分布式查询而言,要排序的数据量可以大于单台服务器的内存。 -如果没有足够的内存,可以使用外部排序(在磁盘中创建一些临时文件)。可以使用`max_bytes_before_external_sort`来设置外部排序,如果你讲它设置为0(默认),则表示禁用外部排序功能。如果启用该功能。当要排序的数据量达到所指定的字节数时,当前排序的结果会被转存到一个临时文件中去。当全部数据读取完毕后,所有的临时文件将会合并成最终输出结果。这些临时文件将会写到config文件配置的/var/lib/clickhouse/tmp/目录中(默认值,你可以通过修改'tmp_path'配置调整该目录的位置)。 +如果没有足够的内存,可以使用外部排序(在磁盘中创建一些临时文件)。可以使用`max_bytes_before_external_sort`来设置外部排序,如果你讲它设置为0(默认),则表示禁用外部排序功能。如果启用该功能。当要排序的数据量达到所指定的字节数时,当前排序的结果会被转存到一个临时文件中去。当全部数据读取完毕后,所有的临时文件将会合并成最终输出结果。这些临时文件将会写到config文件配置的/var/lib/clickhouse/tmp/目录中(默认值,你可以通过修改’tmp\_path’配置调整该目录的位置)。 -查询运行使用的内存要高于‘max_bytes_before_external_sort’,为此,这个配置必须要远远小于‘max_memory_usage’配置的值。例如,如果你的服务器有128GB的内存去运行一个查询,那么推荐你将‘max_memory_usage’设置为100GB,‘max_bytes_before_external_sort’设置为80GB。 +查询运行使用的内存要高于‘max\_bytes\_before\_external\_sort’,为此,这个配置必须要远远小于‘max\_memory\_usage’配置的值。例如,如果你的服务器有128GB的内存去运行一个查询,那么推荐你将‘max\_memory\_usage’设置为100GB,‘max\_bytes\_before\_external\_sort’设置为80GB。 外部排序效率要远低于在内存中排序。 -### SELECT 子句 +### SELECT 子句 {#select-zi-ju} 在完成上述列出的所有子句后,将对SELECT子句中的表达式进行分析。 具体来讲,如果在存在聚合函数的情况下,将对聚合函数之前的表达式进行分析。 @@ -687,15 +655,15 @@ WHERE于HAVING不同之处在于WHERE在聚合前(GROUP BY)执行,HAVING在聚 如果存在DISTINCT子句,则会对结果中的完全相同的行进行去重。 在GROUP BY不包含聚合函数,并对全部SELECT部分都包含在GROUP BY中时的作用一样。但该子句还是与GROUP BY子句存在以下几点不同: -- 可以与GROUP BY配合使用。 -- 当不存在ORDER BY子句并存在LIMIT子句时,查询将在同时满足DISTINCT与LIMIT的情况下立即停止查询。 -- 在处理数据的同时输出结果,并不是等待整个查询全部完成。 +- 可以与GROUP BY配合使用。 +- 当不存在ORDER BY子句并存在LIMIT子句时,查询将在同时满足DISTINCT与LIMIT的情况下立即停止查询。 +- 在处理数据的同时输出结果,并不是等待整个查询全部完成。 在SELECT表达式中存在Array类型的列时,不能使用DISTINCT。 `DISTINCT`可以与 [NULL](syntax.md)一起工作,就好像`NULL`仅是一个特殊的值一样,并且`NULL=NULL`。换而言之,在`DISTINCT`的结果中,与`NULL`不同的组合仅能出现一次。 -### LIMIT 子句 +### LIMIT 子句 {#limit-zi-ju} LIMIT m 用于在查询结果中选择前m行数据。 LIMIT n, m 用于在查询结果中选择从n行开始的m行数据。 @@ -704,7 +672,7 @@ LIMIT n, m 用于在查询结果中选择从n行开始的m行数据。 如果没有指定ORDER BY子句,则结果可能是任意的顺序,并且是不确定的。 -### UNION ALL 子句 +### UNION ALL 子句 {#union-all-zi-ju} UNION ALL子句可以组合任意数量的查询,例如: @@ -729,7 +697,7 @@ UNION ALL中的查询可以同时运行,它们的结果将被混合到一起 作为UNION ALL查询的部分不能包含在括号内。ORDER BY与LIMIT子句应该被应用在每个查询中,而不是最终的查询中。如果你需要做最终结果转换,你可以将UNION ALL作为一个子查询包含在FROM子句中。 -### INTO OUTFILE 子句 +### INTO OUTFILE 子句 {#into-outfile-zi-ju} `INTO OUTFILE filename` 子句用于将查询结果重定向输出到指定文件中(filename是一个字符串类型的值)。 与MySQL不同,执行的结果文件将在客户端建立,如果文件已存在,查询将会失败。 @@ -737,16 +705,15 @@ UNION ALL中的查询可以同时运行,它们的结果将被混合到一起 默认的输出格式是TabSeparated(与命令行客户端的批处理模式相同)。 -### FORMAT 子句 +### FORMAT 子句 {#format-zi-ju} -'FORMAT format' 子句用于指定返回数据的格式。 +‘FORMAT format’ 子句用于指定返回数据的格式。 你可以使用它方便的转换或创建数据的转储。 -更多信息,参见“输入输出格式”部分。 +更多信息,参见«输入输出格式»部分。 如果不存在FORMAT子句,则使用默认的格式,这将取决与DB的配置以及所使用的客户端。对于批量模式的HTTP客户端和命令行客户端而言,默认的格式是TabSeparated。对于交互模式下的命令行客户端,默认的格式是PrettyCompact(它有更加美观的格式)。 当使用命令行客户端时,数据以内部高效的格式在服务器和客户端之间进行传递。客户端将单独的解析FORMAT子句,以帮助数据格式的转换(这将减轻网络和服务器的负载)。 - ### IN 运算符 {#select-in-operators} 对于`IN`、`NOT IN`、`GLOBAL IN`、`GLOBAL NOT IN`操作符被分别实现,因为它们的功能非常丰富。 @@ -762,8 +729,8 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... 如果左侧是单个列并且是一个索引,并且右侧是一组常量时,系统将使用索引来处理查询。 -不要在列表中列出太多的值(百万)。如果数据集很大,将它们放入临时表中(可以参考“”), 然后使用子查询。 -Don't list too many values explicitly (i.e. millions). If a data set is large, put it in a temporary table (for example, see the section "External data for query processing"), then use a subquery. +不要在列表中列出太多的值(百万)。如果数据集很大,将它们放入临时表中(可以参考«»), 然后使用子查询。 +Don’t list too many values explicitly (i.e. millions). If a data set is large, put it in a temporary table (for example, see the section «External data for query processing»), then use a subquery. 
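A minimal sketch of the advice above, assuming a hypothetical `hits` table with a `UserID` column: load the large value set into a temporary table once, then reference it through a subquery instead of an explicit list.

``` sql
-- Temporary tables use the Memory engine by default.
CREATE TEMPORARY TABLE target_users (UserID UInt64);
-- INSERT INTO target_users FORMAT TabSeparated ... (load the large ID set here)

-- Reference the temporary table through a subquery instead of listing millions of literals.
SELECT count()
FROM hits
WHERE UserID IN (SELECT UserID FROM target_users);
```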
右侧可以是一个由常量表达式组成的元组列表(像上面的例子一样),或者是一个数据库中的表的名称,或是一个包含在括号中的子查询。 @@ -797,63 +764,54 @@ GROUP BY EventDate ORDER BY EventDate ASC ``` -``` -┌──EventDate─┬────ratio─┐ -│ 2014-03-17 │ 1 │ -│ 2014-03-18 │ 0.807696 │ -│ 2014-03-19 │ 0.755406 │ -│ 2014-03-20 │ 0.723218 │ -│ 2014-03-21 │ 0.697021 │ -│ 2014-03-22 │ 0.647851 │ -│ 2014-03-23 │ 0.648416 │ -└────────────┴──────────┘ -``` + ┌──EventDate─┬────ratio─┐ + │ 2014-03-17 │ 1 │ + │ 2014-03-18 │ 0.807696 │ + │ 2014-03-19 │ 0.755406 │ + │ 2014-03-20 │ 0.723218 │ + │ 2014-03-21 │ 0.697021 │ + │ 2014-03-22 │ 0.647851 │ + │ 2014-03-23 │ 0.648416 │ + └────────────┴──────────┘ 为3月17日之后的每一天计算与3月17日访问该网站的用户浏览网页的百分比。 IN子句中的子查询仅在单个服务器上运行一次。不能够是相关子查询。 -#### NULL 处理 +#### NULL 处理 {#null-chu-li-1} 在处理中,IN操作符总是假定 [NULL](syntax.md) 值的操作结果总是等于`0`,而不管`NULL`位于左侧还是右侧。`NULL`值不应该包含在任何数据集中,它们彼此不能够对应,并且不能够比较。 下面的示例中有一个`t_null`表: -``` -┌─x─┬────y─┐ -│ 1 │ ᴺᵁᴸᴸ │ -│ 2 │ 3 │ -└───┴──────┘ -``` + ┌─x─┬────y─┐ + │ 1 │ ᴺᵁᴸᴸ │ + │ 2 │ 3 │ + └───┴──────┘ 运行查询`SELECT x FROM t_null WHERE y IN (NULL,3)`将得到如下结果: -``` -┌─x─┐ -│ 2 │ -└───┘ -``` + ┌─x─┐ + │ 2 │ + └───┘ 你可以看到在查询结果中不存在`y = NULL`的结果。这是因为ClickHouse无法确定`NULL`是否包含在`(NULL,3)`数据集中,对于这次比较操作返回了`0`,并且在`SELECT`的最终输出中排除了这行。 -``` -SELECT y IN (NULL, 3) -FROM t_null - -┌─in(y, tuple(NULL, 3))─┐ -│ 0 │ -│ 1 │ -└───────────────────────┘ -``` + SELECT y IN (NULL, 3) + FROM t_null + ┌─in(y, tuple(NULL, 3))─┐ + │ 0 │ + │ 1 │ + └───────────────────────┘ #### 分布式子查询 {#select-distributed-subqueries} 对于带有子查询的(类似与JOINs)IN中,有两种选择:普通的`IN`/`JOIN`与`GLOBAL IN` / `GLOBAL JOIN`。它们对于分布式查询的处理运行方式是不同的。 -!!! 注意 +!!! 注意 "注意" 请记住,下面描述的算法可能因为根据 [settings](../operations/settings/settings.md) 配置的不同而不同。 -当使用普通的IN时,查询总是被发送到远程的服务器,并且在每个服务器中运行“IN”或“JOIN”子句中的子查询。 +当使用普通的IN时,查询总是被发送到远程的服务器,并且在每个服务器中运行«IN»或«JOIN»子句中的子查询。 当使用`GLOBAL IN` / `GLOBAL JOIN`时,首先会为`GLOBAL IN` / `GLOBAL JOIN`运行所有子查询,并将结果收集到临时表中,并将临时表发送到每个远程服务器,并使用该临时表运行查询。 @@ -861,9 +819,9 @@ FROM t_null 在分布式查询中使用`IN` / `JOIN`子句中使用子查询需要小心。 -让我们来看一些例子。假设集群中的每个服务器都存在一个正常表**local_table**。与一个分布式表**distributed_table**。 +让我们来看一些例子。假设集群中的每个服务器都存在一个正常表**local\_table**。与一个分布式表**distributed\_table**。 -对于所有查询**distributed_table**的查询,查询会被发送到所有的远程服务器并使用**local_table**表运行查询。 +对于所有查询**distributed\_table**的查询,查询会被发送到所有的远程服务器并使用**local\_table**表运行查询。 例如,查询 @@ -885,7 +843,7 @@ SELECT uniq(UserID) FROM local_table SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34) ``` -- 计算两个站点的用户交集。 +- 计算两个站点的用户交集。 此查询将被发送给所有的远程服务器 @@ -895,9 +853,9 @@ SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SEL 换句话说,IN子句中的数据集将被在每台服务器上被独立的收集,仅与每台服务器上的本地存储上的数据计算交集。 -如果您已经将数据分散到了集群的每台服务器上,并且单个UserID的数据完全分布在单个服务器上,那么这将是正确且最佳的查询方式。在这种情况下,所有需要的数据都可以在每台服务器的本地进行获取。否则,结果将是不准确的。我们将这种查询称为“local IN”。 +如果您已经将数据分散到了集群的每台服务器上,并且单个UserID的数据完全分布在单个服务器上,那么这将是正确且最佳的查询方式。在这种情况下,所有需要的数据都可以在每台服务器的本地进行获取。否则,结果将是不准确的。我们将这种查询称为«local IN»。 -为了修正这种在数据随机分布的集群中的工作,你可以在子查询中使用**distributed_table**。查询将更改为这样: +为了修正这种在数据随机分布的集群中的工作,你可以在子查询中使用**distributed\_table**。查询将更改为这样: ``` sql SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) @@ -939,25 +897,25 @@ SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID GLOBAL 这比使用普通的IN更加理想,但是,请注意以下几点: -1. 创建临时表时,数据不是唯一的,为了减少通过网络传输的数据量,请在子查询中使用DISTINCT(你不需要在普通的IN中这么做) -2. 临时表将发送到所有远程服务器。其中传输不考虑网络的拓扑结构。例如,如果你有10个远程服务器存在与请求服务器非常远的数据中心中,则数据将通过通道发送数据到远程数据中心10次。使用GLOBAL IN时应避免大数据集。 -3. 当向远程服务器发送数据时,网络带宽的限制是不可配置的,这可能会网络的负载造成压力。 -4. 
尝试将数据跨服务器分布,这样你将不需要使用GLOBAL IN。 -5. 如果你需要经常使用GLOBAL IN,请规划你的ClickHouse集群位置,以便副本之间不存在跨数据中心,并且它们之间具有快速的网络交换能力,以便查询可以完全在一个数据中心内完成。 +1. 创建临时表时,数据不是唯一的,为了减少通过网络传输的数据量,请在子查询中使用DISTINCT(你不需要在普通的IN中这么做) +2. 临时表将发送到所有远程服务器。其中传输不考虑网络的拓扑结构。例如,如果你有10个远程服务器存在与请求服务器非常远的数据中心中,则数据将通过通道发送数据到远程数据中心10次。使用GLOBAL IN时应避免大数据集。 +3. 当向远程服务器发送数据时,网络带宽的限制是不可配置的,这可能会网络的负载造成压力。 +4. 尝试将数据跨服务器分布,这样你将不需要使用GLOBAL IN。 +5. 如果你需要经常使用GLOBAL IN,请规划你的ClickHouse集群位置,以便副本之间不存在跨数据中心,并且它们之间具有快速的网络交换能力,以便查询可以完全在一个数据中心内完成。 另外,在`GLOBAL IN`子句中使用本地表也是有用的,比如,本地表仅在请求服务器上可用,并且您希望在远程服务器上使用来自本地表的数据。 -### Extreme Values +### Extreme Values {#extreme-values} 除了结果外,你还可以获得结果列的最大值与最小值,可以将**extremes**配置设置成1来做到这一点。最大值最小值的计算是针对于数字类型,日期类型进行计算的,对于其他列,将会输出默认值。 额外计算的两行结果 - 最大值与最小值,这两行额外的结果仅在JSON\*, TabSeparated\*, and Pretty\* 格式与其他行分开的输出方式输出,不支持其他输出格式。 -在JSON\*格式中,Extreme值在单独的'extremes'字段中。在TabSeparated\*格式中,在其他结果与'totals'之后输出,并使用空行与其分隔。在Pretty\* 格式中,将在其他结果与'totals'后以单独的表格输出。 +在JSON\*格式中,Extreme值在单独的’extremes’字段中。在TabSeparated\*格式中,在其他结果与’totals’之后输出,并使用空行与其分隔。在Pretty\* 格式中,将在其他结果与’totals’后以单独的表格输出。 如果在计算Extreme值的同时包含LIMIT。extremes的计算结果将包含offset跳过的行。在流式的请求中,它可能还包含多余LIMIT的少量行的值。 -### 注意事项 +### 注意事项 {#zhu-yi-shi-xiang} 不同于MySQL, `GROUP BY`与`ORDER BY`子句不支持使用列的位置信息作为参数,但这实际上是符合SQL标准的。 例如,`GROUP BY 1, 2`将被解释为按照常量进行分组(即,所有的行将会被聚合成一行)。 @@ -967,11 +925,11 @@ SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID GLOBAL 可以在查询的任何部分添加星号,而不仅仅是表达式。在分析查询时,星号被替换为所有的列(不包含`MATERIALIZED`与`ALIAS`的列)。 只有少数情况下使用星号是合理的: -- 创建表转储时。 -- 对于仅包含几个列的表,如系统表. -- 获取表中的列信息。在这种情况下应该使用`LIMIT 1`。但是,更好的办法是使用`DESC TABLE`。 -- 当使用`PREWHERE`在少数的几个列上做强过滤时。 -- 在子查询中(因为外部查询不需要的列被排除在子查询之外)。 +- 创建表转储时。 +- 对于仅包含几个列的表,如系统表. +- 获取表中的列信息。在这种情况下应该使用`LIMIT 1`。但是,更好的办法是使用`DESC TABLE`。 +- 当使用`PREWHERE`在少数的几个列上做强过滤时。 +- 在子查询中(因为外部查询不需要的列被排除在子查询之外)。 在所有的其他情况下,我们不建议使用星号,因为它是列式数据库的缺点而不是优点。 diff --git a/docs/zh/roadmap.md b/docs/zh/roadmap.md index 7ace0ce36c8..49532c046f5 100644 --- a/docs/zh/roadmap.md +++ b/docs/zh/roadmap.md @@ -1,9 +1,9 @@ -# 规划 +# 规划 {#gui-hua} -## Q1 2020 +## Q1 2020 {#q1-2020} -- 更精确的用户资源池,可以在用户之间合理分配集群资源 -- 细粒度的授权管理 -- 与外部认证服务集成 +- 更精确的用户资源池,可以在用户之间合理分配集群资源 +- 细粒度的授权管理 +- 与外部认证服务集成 [来源文章](https://clickhouse.tech/docs/en/roadmap/) diff --git a/docs/zh/security_changelog.md b/docs/zh/security_changelog.md index 1b234f53a48..e35d6a7c632 100644 --- a/docs/zh/security_changelog.md +++ b/docs/zh/security_changelog.md @@ -1,36 +1,38 @@ -## 修复于 ClickHouse Release 18.12.13, 2018-09-10 +## 修复于 ClickHouse Release 18.12.13, 2018-09-10 {#xiu-fu-yu-clickhouse-release-18-12-13-2018-09-10} -### CVE-2018-14672 +### CVE-2018-14672 {#cve-2018-14672} 加载CatBoost模型的功能,允许遍历路径并通过错误消息读取任意文件。 来源: Yandex信息安全团队的Andrey Krasichkov -## 修复于 ClickHouse Release 18.10.3, 2018-08-13 +## 修复于 ClickHouse Release 18.10.3, 2018-08-13 {#xiu-fu-yu-clickhouse-release-18-10-3-2018-08-13} -### CVE-2018-14671 +### CVE-2018-14671 {#cve-2018-14671} -unixODBC允许从文件系统加载任意共享对象,从而导致“远程执行代码”漏洞。 +unixODBC允许从文件系统加载任意共享对象,从而导致«远程执行代码»漏洞。 来源:Yandex信息安全团队的Andrey Krasichkov和Evgeny Sidorov -## 修复于 ClickHouse Release 1.1.54388, 2018-06-28 +## 修复于 ClickHouse Release 1.1.54388, 2018-06-28 {#xiu-fu-yu-clickhouse-release-1-1-54388-2018-06-28} -### CVE-2018-14668 -远程表函数功能允许在 "user", "password" 及 "default_database" 字段中使用任意符号,从而导致跨协议请求伪造攻击。 +### CVE-2018-14668 {#cve-2018-14668} + +远程表函数功能允许在 «user», «password» 及 «default\_database» 字段中使用任意符号,从而导致跨协议请求伪造攻击。 来源:Yandex信息安全团队的Andrey Krasichkov -## 修复于 ClickHouse Release 1.1.54390, 2018-07-06 +## 修复于 ClickHouse Release 1.1.54390, 2018-07-06 
{#xiu-fu-yu-clickhouse-release-1-1-54390-2018-07-06} -### CVE-2018-14669 -ClickHouse MySQL客户端启用了 "LOAD DATA LOCAL INFILE" 功能,该功能允许恶意MySQL数据库从连接的ClickHouse服务器读取任意文件。 +### CVE-2018-14669 {#cve-2018-14669} + +ClickHouse MySQL客户端启用了 «LOAD DATA LOCAL INFILE» 功能,该功能允许恶意MySQL数据库从连接的ClickHouse服务器读取任意文件。 来源:Yandex信息安全团队的Andrey Krasichkov和Evgeny Sidorov -## 修复于 ClickHouse Release 1.1.54131, 2017-01-10 +## 修复于 ClickHouse Release 1.1.54131, 2017-01-10 {#xiu-fu-yu-clickhouse-release-1-1-54131-2017-01-10} -### CVE-2018-14670 +### CVE-2018-14670 {#cve-2018-14670} deb软件包中的错误配置可能导致使用未经授权的数据库。 diff --git a/utils/compressor/decompress_perf.cpp b/utils/compressor/decompress_perf.cpp index 3e598b10876..881c1cf3918 100644 --- a/utils/compressor/decompress_perf.cpp +++ b/utils/compressor/decompress_perf.cpp @@ -1,8 +1,7 @@ #include #include #include -#include -#include +#include #include #include diff --git a/utils/compressor/mutator.cpp b/utils/compressor/mutator.cpp index 65125d073d0..13c80c292e2 100644 --- a/utils/compressor/mutator.cpp +++ b/utils/compressor/mutator.cpp @@ -1,8 +1,7 @@ #include #include #include -#include -#include +#include #include #include diff --git a/utils/convert-month-partitioned-parts/main.cpp b/utils/convert-month-partitioned-parts/main.cpp index 8f1ca05dd32..51ea87d35b9 100644 --- a/utils/convert-month-partitioned-parts/main.cpp +++ b/utils/convert-month-partitioned-parts/main.cpp @@ -1,13 +1,13 @@ -#include -#include #include +#include #include #include -#include -#include -#include #include #include +#include +#include +#include +#include #include #include @@ -73,7 +73,7 @@ void run(String part_path, String date_column, String dest_path) { /// If the file is already deleted, do nothing. } - localBackup(old_part_path, new_tmp_part_path, {}); + localBackup(disk, old_part_path.toString(), new_tmp_part_path.toString(), {}); WriteBufferFromFile count_out(new_tmp_part_path_str + "count.txt", 4096); HashingWriteBuffer count_out_hashing(count_out); diff --git a/utils/fill-factor/main.cpp b/utils/fill-factor/main.cpp index b492be1be85..305c33a91d4 100644 --- a/utils/fill-factor/main.cpp +++ b/utils/fill-factor/main.cpp @@ -5,7 +5,7 @@ #include #endif -#include +#include #include #include diff --git a/website/css/base.css b/website/css/base.css index 3d8595bb0ef..e171339f171 100644 --- a/website/css/base.css +++ b/website/css/base.css @@ -56,19 +56,6 @@ a.btn-outline-yellow { text-decoration: none; } -.stealth-link:link, .stealth-link:visited { - color: #000; - text-decoration: none; -} - -.text-light .stealth-link:link, .text-light .stealth-link:visited { - color: #fff; -} - -.text-muted .stealth-link:link, .text-muted .stealth-link:visited { - color: #999; -} - .text-red { color: #ff3939; } @@ -89,7 +76,7 @@ a.btn-outline-yellow { color: #000; } -.navbar-light .navbar-nav .nav-link:hover, .stealth-link:hover, .stealth-link:active { +.navbar-light .navbar-nav .nav-link:hover { color: #f14600; } diff --git a/website/images/index/hardware-efficient.jpg b/website/images/index/hardware-efficient.jpg deleted file mode 100644 index f6e75ed0ac9..00000000000 Binary files a/website/images/index/hardware-efficient.jpg and /dev/null differ diff --git a/website/images/index/hardware-efficient.svg b/website/images/index/hardware-efficient.svg index 4a514e82edb..f7f06d5a962 100644 --- a/website/images/index/hardware-efficient.svg +++ b/website/images/index/hardware-efficient.svg @@ -1 +1 @@ - \ No newline at end of file +215 \ No newline at end of file diff --git a/website/index.html 
b/website/index.html index 859a3b5d234..8770c8a97a9 100644 --- a/website/index.html +++ b/website/index.html @@ -1,7 +1,5 @@ {% extends "templates/base.html" %} -{% set extrajs = ["js/index.js"] %} - {% block content %} {% include "templates/index/nav.html" %} diff --git a/website/js/base.js b/website/js/base.js index 1722d2ebc2f..17f4c98b39b 100644 --- a/website/js/base.js +++ b/website/js/base.js @@ -58,3 +58,5 @@ $(document).ready(function () { } else { f(); } })(document, window, "yandex_metrika_callbacks2"); }); + +{% include "js/index.js" %} diff --git a/website/templates/footer.html b/website/templates/footer.html index 7cca9a48d90..d69f866b4dd 100644 --- a/website/templates/footer.html +++ b/website/templates/footer.html @@ -3,8 +3,8 @@
- ClickHouse source code is published under Apache 2.0 License. Software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + rel="external nofollow" target="_blank" class="text-reset"> + ClickHouse source code is published under the Apache 2.0 License. Software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/website/templates/index/community.html b/website/templates/index/community.html index ede5f84e883..26e6466d228 100644 --- a/website/templates/index/community.html +++ b/website/templates/index/community.html @@ -4,8 +4,8 @@

ClickHouse Community

-
-
+
+
-
+
@@ -150,9 +150,9 @@
diff --git a/website/templates/index/efficient.html b/website/templates/index/efficient.html index da21ce4e03c..275e0bf1c00 100644 --- a/website/templates/index/efficient.html +++ b/website/templates/index/efficient.html @@ -4,11 +4,12 @@

Hardware efficient

ClickHouse processes typical analytical queries two to three orders of magnitude faster than traditional - row-oriented systems with the same available I/O throughput and CPU capacity. The system's columnar storage format allows fitting - more hot data in RAM, which leads to a shorter typical response times.

+ row-oriented systems with the same available I/O throughput and CPU capacity. Columnar storage format allows fitting + more hot data in RAM, which leads to shorter typical response times.

+

Total cost of ownership could be further lowered by using commodity hardware with rotating disk drives instead of enterprise grade NVMe or SSD without significant sacrifices in latency for most kinds of queries.

-
- ClickHouse is hardware efficient +
+ ClickHouse is hardware efficient
@@ -20,7 +21,7 @@

Optimizes disk drive access

-

ClickHouse minimizes the number of seeks for range queries, which increases efficiency of using rotational +

ClickHouse minimizes the number of seeks for range queries, which increases the efficiency of using rotational disk drives, as it maintains locality of reference for continually stored data.

diff --git a/website/templates/index/features.html b/website/templates/index/features.html index c7ac56c8b09..30d682843b0 100644 --- a/website/templates/index/features.html +++ b/website/templates/index/features.html @@ -11,8 +11,8 @@
  • True column-oriented storage
  • Vectorized query execution
  • Parallel and distributed query execution
  • -
  • Real time query processing
  • -
  • Real time data ingestion
  • +
  • Real-time query processing
  • +
  • Real-time data ingestion
  • On-disk locality of reference
  • Data compression
  • @@ -40,11 +40,11 @@
    -
    -
    +
    + - diff --git a/website/templates/index/hero.html b/website/templates/index/hero.html index 35eba0c4ce8..5166d303ea8 100644 --- a/website/templates/index/hero.html +++ b/website/templates/index/hero.html @@ -1,21 +1,21 @@
    -
    +

    ClickHouse is a fast open-source database management system

    It is column-oriented and allows to generate analytical reports using SQL queries in real-time.

    @@ -40,7 +40,7 @@
    -
    +
    ClickHouse is capable of generating analytical data reports in real time
    diff --git a/website/templates/index/nav.html b/website/templates/index/nav.html index fa8fe780654..6d851aff2dd 100644 --- a/website/templates/index/nav.html +++ b/website/templates/index/nav.html @@ -4,8 +4,8 @@ ClickHouse -
    diff --git a/website/templates/index/reliable.html b/website/templates/index/reliable.html index 0dad88e4021..c7a7146050d 100644 --- a/website/templates/index/reliable.html +++ b/website/templates/index/reliable.html @@ -6,13 +6,13 @@

    ClickHouse has been managing petabytes of data serving a number of highload mass audience services of Yandex, Russia's - leading search provider and one of largest European IT companies. + leading search provider and one of the largest European IT companies. Since 2012, ClickHouse has been providing robust database management for the company's web analytics service, comparison e-commerce platform, public email service, online advertising platform, business intelligence tools and infrastructure monitoring.

    -

    ClickHouse can be configured as purely distributed system located on independent nodes, +

    ClickHouse can be configured as a purely distributed system located on independent nodes, without any single points of failure.

    Software and hardware failures or misconfigurations do not result in loss of data. Instead of deleting "broken" @@ -22,6 +22,6 @@

    ClickHouse offers flexible limits on query complexity and resource usage, which can be fine-tuned with settings. It is possible to simultaneously serve both a number of high priority low-latency requests and some - long-running queries with background priority.

    + long-running queries with a background priority.

    diff --git a/website/templates/index/rich.html b/website/templates/index/rich.html index a7091831018..14b2f86e75f 100644 --- a/website/templates/index/rich.html +++ b/website/templates/index/rich.html @@ -3,14 +3,14 @@
    -

    Feature rich

    +

    Feature-rich

    -
    +
    1

    User-friendly SQL dialect

    -

    ClickHouse features a user-friendly SQL query dialect with a number of built-in analytics capabilities. In additional to common functions that could by found in most DBMS, ClickHouse comes with a lot of domain-specific functions and features out of the box.

    +

    ClickHouse features a user-friendly SQL query dialect with a number of built-in analytics capabilities. In addition to common functions that could be found in most DBMS, ClickHouse comes with a lot of domain-specific functions and features out of the box.

    2
    @@ -22,13 +22,13 @@
    3

    Join distributed or co-located data

    -

    ClickHouse provides various options for joining tables. Joins could be either cluster local, they can also access data stored in external systems. There's also a external dictionaries support that provide alternative more simple syntax for accessing data from outside source.

    +

ClickHouse provides various options for joining tables. Joins can either be cluster-local or access data stored in external systems. There is also support for external dictionaries, which provides a simpler alternative syntax for accessing data from an outside source.

    4

    Approximate query processing

    -

    Users can control trade-off between result accuracy and query execution time, which is handy when dealing with multiple terabytes or petabytes of data. ClickHouse also provides probabilistic data structures for fast and memory-efficient calculation of cardinatilities and quantiles

    +

Users can control the trade-off between result accuracy and query execution time, which is handy when dealing with multiple terabytes or petabytes of data. ClickHouse also provides probabilistic data structures for fast and memory-efficient calculation of cardinalities and quantiles.

    diff --git a/website/templates/index/success.html b/website/templates/index/success.html index be14ab33e58..cd22fe3b1da 100644 --- a/website/templates/index/success.html +++ b/website/templates/index/success.html @@ -3,7 +3,7 @@

    Success stories

    - +
    - +
    - +

    Experimentation

    @@ -35,7 +35,7 @@
    - +
    - +
    - +

    Speeding up Report API

    diff --git a/website/templates/index/use.html b/website/templates/index/use.html index 3c53a54b1ff..edf4a28cf67 100644 --- a/website/templates/index/use.html +++ b/website/templates/index/use.html @@ -3,7 +3,7 @@

    When to use ClickHouse

    -

    For analytics over stream of clean, well structured and immutable events or logs. +

    For analytics over a stream of clean, well structured and immutable events or logs. It is recommended to put each such stream into a single wide fact table with pre-joined dimensions.

    @@ -32,7 +32,7 @@
    • ✕ Transactional workloads (OLTP)
    • -
    • ✕ Key-value requests with high rate
    • +
    • ✕ Key-value requests with a high rate
    diff --git a/website/templates/index/why.html b/website/templates/index/why.html index 95273ac48e0..291df2c6653 100644 --- a/website/templates/index/why.html +++ b/website/templates/index/why.html @@ -18,8 +18,8 @@ Fault tolerant
    -

    Fault tolerant

    -

    ClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows to avoid having single points of failure. Downtime of a single node or the whole datacenter won't affect the system's availability for both reads and writes.

    +

    Fault-tolerant

    +

    ClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter won't affect the system's availability for both reads and writes.

    @@ -28,15 +28,14 @@

    Linearly scalable

    -

    ClickHouse scales well both vertically and horizontally. ClickHouse is easily adaptable to perform either on - cluster with hundreds or thousands of nodes, or on a single server or even on a tiny virtual machine. Currently there are installations with more multiple trillion rows or hundreds of terabytes of data per single node.

    +

ClickHouse scales well both vertically and horizontally. ClickHouse is easily adaptable to perform either on a cluster with hundreds or thousands of nodes or on a single server or even on a tiny virtual machine. Currently, there are installations with multiple trillion rows or hundreds of terabytes of data per single node.

    Easy to use

    Easy to use

    -

    ClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows to express the desired result without involving any custom non-standard API that could be found in some DBMS.

    +

    ClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some DBMS.

    diff --git a/website/templates/install/tgz.html b/website/templates/install/tgz.html index dae588faf28..0865bfe681f 100644 --- a/website/templates/install/tgz.html +++ b/website/templates/install/tgz.html @@ -1,10 +1,10 @@ {% highlight "bash" %} -export LATEST_VERSION=$(curl https://api.github.com/repos/ClickHouse/ClickHouse/tags | \ - grep -Eo '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | head -n 1) -curl -O https://repo.clickhouse.tech/tgz/clickhouse-common-static-$LATEST_VERSION.tgz -curl -O https://repo.clickhouse.tech/tgz/clickhouse-common-static-dbg-$LATEST_VERSION.tgz -curl -O https://repo.clickhouse.tech/tgz/clickhouse-server-$LATEST_VERSION.tgz -curl -O https://repo.clickhouse.tech/tgz/clickhouse-client-$LATEST_VERSION.tgz +export LATEST_VERSION=$(curl -s https://repo.clickhouse.tech/tgz/stable/ | \ + grep -Eo '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | sort -V -r | head -n 1) +curl -O https://repo.clickhouse.tech/tgz/stable/clickhouse-common-static-$LATEST_VERSION.tgz +curl -O https://repo.clickhouse.tech/tgz/stable/clickhouse-common-static-dbg-$LATEST_VERSION.tgz +curl -O https://repo.clickhouse.tech/tgz/stable/clickhouse-server-$LATEST_VERSION.tgz +curl -O https://repo.clickhouse.tech/tgz/stable/clickhouse-client-$LATEST_VERSION.tgz tar -xzvf clickhouse-common-static-$LATEST_VERSION.tgz sudo clickhouse-common-static-$LATEST_VERSION/install/doinst.sh